Reputation: 11
We are in the middle of a conversion from MySQL to PG 9.2. However we have caught a snag with our aggregate queries in PG. They take MUCH longer to run in PG versus MySQL. The thing I believe is causing the biggest issue is our count(distinct x) in the aggregate queries. This is causing an extra sort and no matter what we set WORK_MEM to it spills onto disk. We have also tried setting the enable_sort to 'off' and enable_seqscan to 'off'. Also, we regularly run VACUUM ANALYZE. Lastly we are not able to take advantage of indexes because our query has many dim fields and most of those are concatenated. The tables are 40 mill to 80 mill. Please any suggestions would help! Here is a sample of an explain analyze.
Query
SELECT
concat(
cast(coalesce(trim(tablea.state_id),'na') as varchar),
'-',
cast(
coalesce(
CASE WHEN char_length(cast(tablea.case_num as varchar)) = 12
THEN left(cast(tablea.case_num as varchar), 2)
else left(cast(tablea.case_num as varchar), 1)
END,'na'
) as varchar
)
),
to_char(tablea.visit_date,'yyyy-mm-01') ,
coalesce(tablea.payor_code,0) ,
coalesce(tablea.los,'other') ,
case when cpt_group is null or cpt_group = ''
then '0'
else split_part(tablea.cpt_group,'-',1)
end ,
case when cpt_group is null or cpt_group = ''
then '0'
else split_part(tablea.cpt_group,'-',2)
end ,
case when (tablea.erp_code) isnull
then 'na'
else concat(
cast(coalesce(trim(tablea.state_id), 'na') as varchar),
'-',
coalesce(cast(tablea.erp_code as varchar), 'na') ,
'-',
coalesce(cast(tablea.mlp_code as varchar), 'na')
)
end ,
coalesce(tablea.erp_attnd,'n') ,
count(distinct concat(tablea.state_id,tablea.case_num)) ,
count( unique_rec_id),
sum(tablea.units) ,
sum(tablea.total_amt) ,
sum(tablea.total_rvu) ,
sum(tablea.budget_rvu),
sum(tablea.work_rvu) ,
sum(tablea.payroll_rvu) ,
sum(extract(day from charges.insert_ts - charges.visit_date))
FROM tablea
GROUP BY 1,2,3,4,5,6 ,7 ,8
Table Structure
state_id character varying(8),
unique_rec_id bigint NOT NULL,
case_num bigint,
seq_nbr smallint,
charge_code character varying(18),
cpt_type character varying(2),
cpt_code character varying(19),
cpt_version_code character varying(2),
erp_code integer,
mlp_code integer,
shared_svcs character varying(1),
visit_date date,
units numeric(6,2),
unit_amt numeric(7,2),
total_amt numeric(7,2),
unit_rvu numeric(6,2),
total_rvu numeric(6,2),
place_of_serv smallint,
insert_ts timestamp(6) without time zone,
update_ts timestamp(6) without time zone,
alw_amt numeric(8,2),
pmt_amt numeric(8,2),
adj_amt numeric(8,2),
payor_code smallint,
coder_id smallint,
erp_attnd character varying(1),
tos smallint,
etos smallint,
los character varying(5),
budget_rvu numeric(6,2),
work_rvu numeric(6,2),
payroll_rvu numeric(6,2),
cpt_group character varying(7),
CONSTRAINT charges_pkey PRIMARY KEY (unique_rec_id)
Explain output
"GroupAggregate (cost=10007828904.29..10010019592.38 rows=3519178 width=77) (actual time=1270982.096..1448667.608 rows=7848213 loops=1)"
" Output: (pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((payor_code)::integer, 0)), (COALESCE(los, 'other'::character varying)), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END), (CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(erp_attnd, 'n'::character varying)), count(DISTINCT pg_catalog.concat(state_id, case_num)), count(unique_rec_id), sum(units), sum(total_amt), sum(total_rvu), sum(budget_rvu), sum(work_rvu), sum(payroll_rvu), sum(date_part('day'::text, (insert_ts - (visit_date)::timestamp without time zone)))"
" Buffers: shared hit=866434, temp read=558843 written=558843"
" -> Sort (cost=10007828904.29..10007916883.73 rows=35191776 width=77) (actual time=1270982.019..1314712.552 rows=36729163 loops=1)"
" Output: (pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((payor_code)::integer, 0)), (COALESCE(los, 'other'::character varying)), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END), (CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(erp_attnd, 'n'::character varying)), state_id, case_num, unique_rec_id, units, total_amt, total_rvu, budget_rvu, work_rvu, payroll_rvu, insert_ts, visit_date"
" Sort Key: (pg_catalog.concat((COALESCE(btrim((charges.state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((charges.case_num)::character varying)::text) = 12) THEN "left"(((charges.case_num)::character varying)::text, 2) ELSE "left"(((charges.case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((charges.visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((charges.payor_code)::integer, 0)), (COALESCE(charges.los, 'other'::character varying)), (CASE WHEN ((charges.cpt_group IS NULL) OR ((charges.cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((charges.cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((charges.cpt_group IS NULL) OR ((charges.cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((charges.cpt_group)::text, '-'::text, 2) END), (CASE WHEN (charges.erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((charges.state_id)::text), 'na'::text))::character varying, '-', COALESCE((charges.erp_code)::character varying, 'na'::character varying), '-', COALESCE((charges.mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(charges.erp_attnd, 'n'::character varying))"
" Sort Method: external merge Disk: 4470728kB"
" Buffers: shared hit=866434, temp read=558843 written=558843"
" -> Seq Scan on public.charges (cost=10000000000.00..10003417837.76 rows=35191776 width=77) (actual time=0.062..182871.790 rows=36729163 loops=1)"
" Output: pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying), to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text), COALESCE((payor_code)::integer, 0), COALESCE(los, 'other'::character varying), CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END, CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END, CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END, COALESCE(erp_attnd, 'n'::character varying), state_id, case_num, unique_rec_id, units, total_amt, total_rvu, budget_rvu, work_rvu, payroll_rvu, insert_ts, visit_date"
" Buffers: shared hit=866434"
"Total runtime: 1450131.131 ms"
Upvotes: 1
Views: 488
Reputation: 125244
I made superficial optimizations:
SELECT
concat(coalesce(state_id, 'na'), '-', case_num),
visit_date,
coalesce(payor_code, 0),
coalesce(los, 'other'),
case when cpt_group_1 is null or cpt_group = ''
then '0'
else cpt_group_1
end,
case when cpt_group_2 is null or cpt_group = ''
then '0'
else cpt_group_2
end,
erp_code ,
coalesce(erp_attnd, 'n'),
count_case_num,
rec_id,
units,
total_amt,
total_rvu,
budget_rvu,
work_rvu,
payroll_rvu,
days
from (
select
tablea.state_id,
CASE WHEN tablea.case_num between 100000000000 and 999999999999
THEN left(cast(tablea.case_num as text), 2)
else left(cast(tablea.case_num as text), 1)
END case_num,
date_trunc('month', tablea.visit_date) visit_date,
tablea.payor_code,
tablea.los,
split_part(tablea.cpt_group, '-', 1) cpt_group_1,
split_part(tablea.cpt_group, '-', 2) cpt_group_2,
case when tablea.erp_code is null
then 'na'
else concat(
coalesce(tablea.state_id, 'na'),
'-',
coalesce(cast(tablea.erp_code as text), 'na'),
'-',
coalesce(cast(tablea.mlp_code as text), 'na')
)
end erp_code,
tablea.erp_attnd,
count(distinct (tablea.state_id, tablea.case_num)) count_case_num,
count(unique_rec_id) rec_id,
sum(tablea.units) units,
sum(tablea.total_amt) total_amt,
sum(tablea.total_rvu) total_rvu,
sum(tablea.budget_rvu) budget_rvu,
sum(tablea.work_rvu) work_rvu,
sum(tablea.payroll_rvu) payroll_rvu,
sum(extract(day from charges.insert_ts - charges.visit_date)) days
FROM tablea
GROUP BY 1, 2, 3, 4, 5, 6 ,7 ,8, 9
) s
The part you worry about I changed from:
count(distinct concat(tablea.state_id,tablea.case_num))
to:
count(distinct (tablea.state_id, tablea.case_num))
Upvotes: 1