Tara
Tara

Reputation: 11

PostgreSQL 9.2--Agg query slow with count(distinct x)

We are in the middle of a conversion from MySQL to PG 9.2. However we have caught a snag with our aggregate queries in PG. They take MUCH longer to run in PG versus MySQL. The thing I believe is causing the biggest issue is our count(distinct x) in the aggregate queries. This is causing an extra sort and no matter what we set WORK_MEM to it spills onto disk. We have also tried setting the enable_sort to 'off' and enable_seqscan to 'off'. Also, we regularly run VACUUM ANALYZE. Lastly we are not able to take advantage of indexes because our query has many dim fields and most of those are concatenated. The tables are 40 mill to 80 mill. Please any suggestions would help! Here is a sample of an explain analyze.

Query

SELECT
    concat(
        cast(coalesce(trim(tablea.state_id),'na') as varchar),
        '-',
        cast(
            coalesce(
                CASE WHEN char_length(cast(tablea.case_num as varchar)) = 12
                    THEN left(cast(tablea.case_num as varchar), 2)
                    else left(cast(tablea.case_num as varchar), 1)
                END,'na'
            ) as varchar
        )
    ),
    to_char(tablea.visit_date,'yyyy-mm-01') ,
    coalesce(tablea.payor_code,0) ,
    coalesce(tablea.los,'other') ,
    case when cpt_group is null or cpt_group = ''
        then '0'
        else split_part(tablea.cpt_group,'-',1)
    end ,
    case when cpt_group is null or cpt_group = ''
        then '0'
        else split_part(tablea.cpt_group,'-',2)
    end ,
    case when (tablea.erp_code) isnull
        then 'na'
        else concat(
            cast(coalesce(trim(tablea.state_id), 'na') as varchar),
            '-',
            coalesce(cast(tablea.erp_code as varchar), 'na') ,
            '-',
            coalesce(cast(tablea.mlp_code as varchar), 'na')
            )
    end ,
    coalesce(tablea.erp_attnd,'n') ,
    count(distinct concat(tablea.state_id,tablea.case_num)) ,
    count( unique_rec_id),
    sum(tablea.units) ,
    sum(tablea.total_amt) ,
    sum(tablea.total_rvu) ,
    sum(tablea.budget_rvu),
    sum(tablea.work_rvu) ,
    sum(tablea.payroll_rvu) ,
    sum(extract(day from charges.insert_ts - charges.visit_date))
FROM tablea
GROUP BY 1,2,3,4,5,6 ,7 ,8

Table Structure

state_id character varying(8),
unique_rec_id bigint NOT NULL,
case_num bigint,
seq_nbr smallint,
charge_code character varying(18),
cpt_type character varying(2),
cpt_code character varying(19),
cpt_version_code character varying(2),
erp_code integer,
mlp_code integer,
shared_svcs character varying(1),
visit_date date,
units numeric(6,2),
unit_amt numeric(7,2),
total_amt numeric(7,2),
unit_rvu numeric(6,2),
total_rvu numeric(6,2),
place_of_serv smallint,
insert_ts timestamp(6) without time zone,
update_ts timestamp(6) without time zone,
alw_amt numeric(8,2),
pmt_amt numeric(8,2),
adj_amt numeric(8,2),
payor_code smallint,
coder_id smallint,
erp_attnd character varying(1),
tos smallint,
etos smallint,
los character varying(5),
budget_rvu numeric(6,2),
work_rvu numeric(6,2),
payroll_rvu numeric(6,2),
cpt_group character varying(7),
CONSTRAINT charges_pkey PRIMARY KEY (unique_rec_id)

Explain output

"GroupAggregate  (cost=10007828904.29..10010019592.38 rows=3519178 width=77) (actual time=1270982.096..1448667.608 rows=7848213 loops=1)"
"  Output: (pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((payor_code)::integer, 0)), (COALESCE(los, 'other'::character varying)), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END), (CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(erp_attnd, 'n'::character varying)), count(DISTINCT pg_catalog.concat(state_id, case_num)), count(unique_rec_id), sum(units), sum(total_amt), sum(total_rvu), sum(budget_rvu), sum(work_rvu), sum(payroll_rvu), sum(date_part('day'::text, (insert_ts - (visit_date)::timestamp without time zone)))"
"  Buffers: shared hit=866434, temp read=558843 written=558843"
"  ->  Sort  (cost=10007828904.29..10007916883.73 rows=35191776 width=77) (actual time=1270982.019..1314712.552 rows=36729163 loops=1)"
"        Output: (pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((payor_code)::integer, 0)), (COALESCE(los, 'other'::character varying)), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END), (CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(erp_attnd, 'n'::character varying)), state_id, case_num, unique_rec_id, units, total_amt, total_rvu, budget_rvu, work_rvu, payroll_rvu, insert_ts, visit_date"
"        Sort Key: (pg_catalog.concat((COALESCE(btrim((charges.state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((charges.case_num)::character varying)::text) = 12) THEN "left"(((charges.case_num)::character varying)::text, 2) ELSE "left"(((charges.case_num)::character varying)::text, 1) END, 'na'::text))::character varying)), (to_char((charges.visit_date)::timestamp with time zone, 'yyyy-mm-01'::text)), (COALESCE((charges.payor_code)::integer, 0)), (COALESCE(charges.los, 'other'::character varying)), (CASE WHEN ((charges.cpt_group IS NULL) OR ((charges.cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((charges.cpt_group)::text, '-'::text, 1) END), (CASE WHEN ((charges.cpt_group IS NULL) OR ((charges.cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((charges.cpt_group)::text, '-'::text, 2) END), (CASE WHEN (charges.erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((charges.state_id)::text), 'na'::text))::character varying, '-', COALESCE((charges.erp_code)::character varying, 'na'::character varying), '-', COALESCE((charges.mlp_code)::character varying, 'na'::character varying)) END), (COALESCE(charges.erp_attnd, 'n'::character varying))"
"        Sort Method: external merge  Disk: 4470728kB"
"        Buffers: shared hit=866434, temp read=558843 written=558843"
"        ->  Seq Scan on public.charges  (cost=10000000000.00..10003417837.76 rows=35191776 width=77) (actual time=0.062..182871.790 rows=36729163 loops=1)"
"              Output: pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', (COALESCE(CASE WHEN (char_length(((case_num)::character varying)::text) = 12) THEN "left"(((case_num)::character varying)::text, 2) ELSE "left"(((case_num)::character varying)::text, 1) END, 'na'::text))::character varying), to_char((visit_date)::timestamp with time zone, 'yyyy-mm-01'::text), COALESCE((payor_code)::integer, 0), COALESCE(los, 'other'::character varying), CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 1) END, CASE WHEN ((cpt_group IS NULL) OR ((cpt_group)::text = ''::text)) THEN '0'::text ELSE split_part((cpt_group)::text, '-'::text, 2) END, CASE WHEN (erp_code IS NULL) THEN 'na'::text ELSE pg_catalog.concat((COALESCE(btrim((state_id)::text), 'na'::text))::character varying, '-', COALESCE((erp_code)::character varying, 'na'::character varying), '-', COALESCE((mlp_code)::character varying, 'na'::character varying)) END, COALESCE(erp_attnd, 'n'::character varying), state_id, case_num, unique_rec_id, units, total_amt, total_rvu, budget_rvu, work_rvu, payroll_rvu, insert_ts, visit_date"
"              Buffers: shared hit=866434"
"Total runtime: 1450131.131 ms"

Upvotes: 1

Views: 488

Answers (2)

Ati
Ati

Reputation: 1

try to add set enable_sort='off'; before your query.

Upvotes: 0

Clodoaldo Neto
Clodoaldo Neto

Reputation: 125244

I made superficial optimizations:

SELECT
    concat(coalesce(state_id, 'na'), '-', case_num),
    visit_date,
    coalesce(payor_code, 0),
    coalesce(los, 'other'),
    case when cpt_group_1 is null or cpt_group = ''
        then '0'
        else cpt_group_1
    end,
    case when cpt_group_2 is null or cpt_group = ''
        then '0'
        else cpt_group_2
    end,
    erp_code ,
    coalesce(erp_attnd, 'n'),
    count_case_num,
    rec_id,
    units,
    total_amt,
    total_rvu,
    budget_rvu,
    work_rvu,
    payroll_rvu,
    days
from (
    select
        tablea.state_id,
        CASE WHEN tablea.case_num between 100000000000 and 999999999999
            THEN left(cast(tablea.case_num as text), 2)
            else left(cast(tablea.case_num as text), 1)
        END case_num,
        date_trunc('month', tablea.visit_date) visit_date,
        tablea.payor_code,
        tablea.los,
        split_part(tablea.cpt_group, '-', 1) cpt_group_1,
        split_part(tablea.cpt_group, '-', 2) cpt_group_2,
        case when tablea.erp_code is null
            then 'na'
            else concat(
                coalesce(tablea.state_id, 'na'),
                '-',
                coalesce(cast(tablea.erp_code as text), 'na'),
                '-',
                coalesce(cast(tablea.mlp_code as text), 'na')
                )
        end erp_code,
        tablea.erp_attnd,
        count(distinct (tablea.state_id, tablea.case_num)) count_case_num,
        count(unique_rec_id) rec_id,
        sum(tablea.units) units,
        sum(tablea.total_amt) total_amt,
        sum(tablea.total_rvu) total_rvu,
        sum(tablea.budget_rvu) budget_rvu,
        sum(tablea.work_rvu) work_rvu,
        sum(tablea.payroll_rvu) payroll_rvu,
        sum(extract(day from charges.insert_ts - charges.visit_date)) days
    FROM tablea
    GROUP BY 1, 2, 3, 4, 5, 6 ,7 ,8, 9
) s

The part you worry about I changed from:

count(distinct concat(tablea.state_id,tablea.case_num))

to:

count(distinct (tablea.state_id, tablea.case_num))

Upvotes: 1

Related Questions