Reputation: 1021
I have to get rid of some unnecessary data from my Postgresql database. Here is the query which works for small data:
-- Original single-statement approach: a data-modifying CTE chain.
-- NOTE(review): PostgreSQL materializes the first CTE in full (~937k rows
-- without the LIMIT), and each ANY(ARRAY(...)) below re-expands that result
-- into an in-memory array — once per DELETE — which is a plausible cause of
-- the observed hang on large inputs.
WITH bad_row_history(survey_id, template_id) AS ((
-- Surveys whose history shows only pre-launch statuses.
SELECT row_id, (row_value->>'template_id')::INTEGER
FROM public.row_history
WHERE record_table='survey_storage'
AND row_value->>'status'IN ('Never Surveyed','Incomplete Configuration')
-- NOTE(review): NOT IN returns zero rows if the subquery ever yields a NULL
-- row_id; NOT EXISTS would be NULL-safe and usually plans as an anti-join.
AND row_id NOT IN (
SELECT row_id
FROM public.row_history
WHERE record_table='survey_storage'
AND row_value->>'status'='Ready to Launch'
)
) LIMIT 10),  -- LIMIT added only to keep the statement from hanging
delete_su AS (
DELETE FROM survey_user
WHERE survey_id = ANY(ARRAY(select survey_id FROM bad_row_history))
),
delete_slu AS(
DELETE FROM survey_library_users
WHERE survey_library_id = ANY(ARRAY(select template_id FROM bad_row_history))
),
delete_ss AS(
DELETE FROM survey_storage
WHERE id = ANY(ARRAY(select survey_id FROM bad_row_history))
),
delete_sl AS(
DELETE FROM survey_library
WHERE id = ANY(ARRAY(select template_id FROM bad_row_history))
)
-- Final (outer) statement: purge the history rows themselves.
delete FROM row_history
WHERE row_id = ANY(ARRAY(select survey_id FROM bad_row_history))
In the CTE, you will find I have added a LIMIT; otherwise the query never completes. Without the limit the CTE yields 937,147 rows. There are 5 delete statements. For each delete there could be at least one row, and maybe 3 to 5 rows at most. I have 3 questions:
I understand this will be a time-consuming job. Let it be, but at least it should work — it should not hang. Yesterday I ran it without the LIMIT and, after running a few hours, it hung and all the deletions got rolled back. But earlier, with small limits like 10 or 100, it has worked.
UPDATE: As per the suggestions, I have introduced a temp table and rewritten the deletes as separate statements with sub-queries against the temp table. Here is the script:
-- Clean-up script: collect the doomed (survey_id, template_id) pairs once
-- into a temp table, then run one plain DELETE per dependent table.

-- Fix: original line was "DROP bad_row_history if EXISTS;" — invalid syntax
-- (the TABLE keyword is mandatory, and IF EXISTS precedes the name).
DROP TABLE IF EXISTS bad_row_history;

CREATE TEMPORARY TABLE bad_row_history (
    survey_id   int8 NOT NULL,
    template_id int8 NOT NULL
);

INSERT INTO bad_row_history (survey_id, template_id)
SELECT rh.row_id, (rh.row_value->>'template_id')::INTEGER
FROM public.row_history rh
WHERE rh.record_table = 'survey_storage'
  AND rh.row_value->>'status' IN ('Never Surveyed', 'Incomplete Configuration')
  -- NOT EXISTS instead of NOT IN: NULL-safe (NOT IN yields zero rows if the
  -- subquery produces any NULL) and typically plans as a hashed anti-join.
  AND NOT EXISTS (
      SELECT 1
      FROM public.row_history launched
      WHERE launched.record_table = 'survey_storage'
        AND launched.row_value->>'status' = 'Ready to Launch'
        AND launched.row_id = rh.row_id
  );

-- Fix: ANALYZE must run AFTER the rows are inserted. The original analyzed
-- the empty table, so the planner kept zero-row statistics for every DELETE.
ANALYZE bad_row_history;

DELETE FROM survey_user
WHERE survey_id IN (SELECT survey_id FROM bad_row_history);

DELETE FROM survey_library_users
WHERE survey_library_id IN (SELECT template_id FROM bad_row_history);

DELETE FROM survey_storage
WHERE id IN (SELECT survey_id FROM bad_row_history);

DELETE FROM survey_library
WHERE id IN (SELECT template_id FROM bad_row_history);

DELETE FROM row_history
WHERE row_id IN (SELECT survey_id FROM bad_row_history);
UPDATE-2
disable_triggers.sql
-- Disable all triggers before the bulk delete to skip per-row trigger work.
-- NOTE(review): DISABLE TRIGGER ALL also disables the internal foreign-key
-- constraint triggers on these tables, so referential integrity is NOT
-- enforced until the matching ENABLE script runs — confirm this is intended.
ALTER TABLE survey_user DISABLE TRIGGER ALL;
ALTER TABLE survey_storage DISABLE TRIGGER ALL;
ALTER TABLE survey_library DISABLE TRIGGER ALL;
script
-- Final clean-up script (run with triggers disabled): gather the doomed
-- surveys into a temp table from the live survey_storage state, then delete
-- from each dependent table by joining against it.

-- Added for idempotency, matching the earlier bad_row_history script.
DROP TABLE IF EXISTS bad_survey;

CREATE TEMPORARY TABLE bad_survey (
    survey_id   int8 NOT NULL,
    template_id int8 NOT NULL
);

-- Owner-tenant surveys (tenant 5) that never progressed past launch prep.
INSERT INTO bad_survey (survey_id, template_id)
SELECT id, template_id
FROM survey_storage
WHERE status IN ('Never Surveyed', 'Incomplete Configuration', 'Ready to Launch')
  AND id = original_row_id
  AND tenant_id = owner_tenant_id
  AND tenant_id = 5;

-- Partner-side copies of those same surveys, matched through company_by_path.
INSERT INTO bad_survey (survey_id, template_id)
SELECT pss.id, pss.template_id
FROM survey_storage css
INNER JOIN company_by_path cbp
        ON css.company_by_path_id = cbp.id
       AND css.tenant_id = cbp.tenant_id
       AND cbp.relationship_type = 'partner'
INNER JOIN survey_storage pss
        ON cbp.owner_tenant_id = pss.tenant_id
       AND css.master_template_id = pss.master_template_id
       AND css.tenant_id = pss.owner_tenant_id
       AND css.source_id = pss.source_id
       AND css.tenant_id != pss.tenant_id
       AND css.template_id != pss.template_id
       AND pss.id != pss.original_row_id
WHERE css.id IN (
    SELECT id
    FROM survey_storage
    WHERE status IN ('Never Surveyed', 'Incomplete Configuration', 'Ready to Launch')
      AND id = original_row_id
      AND tenant_id = owner_tenant_id
      AND tenant_id = 5
);

-- Fix: ANALYZE moved AFTER the inserts. The original analyzed the empty
-- table first, leaving the planner with zero-row statistics for every
-- DELETE below — a recipe for bad nested-loop plans on large data sets.
ANALYZE bad_survey;

DELETE FROM survey_user su
USING bad_survey bs
WHERE su.survey_id = bs.survey_id;

DELETE FROM survey_library_users slu
USING bad_survey bs
WHERE slu.survey_library_id = bs.template_id;

DELETE FROM row_history rh
USING bad_survey bs
WHERE rh.row_id = bs.survey_id;

DELETE FROM survey_storage ss
USING bad_survey bs
WHERE ss.id = bs.survey_id;

DELETE FROM survey_library sl
USING bad_survey bs
WHERE sl.id = bs.template_id;
enable_triggers.sql
-- Re-enable all triggers (including FK constraint triggers) after the bulk
-- delete. NOTE(review): rows changed while triggers were off are NOT
-- re-validated automatically — FK consistency should be checked manually.
ALTER TABLE survey_user ENABLE TRIGGER ALL;
ALTER TABLE survey_storage ENABLE TRIGGER ALL;
ALTER TABLE survey_library ENABLE TRIGGER ALL;
Upvotes: 0
Views: 3103
Reputation: 246493
Instead of doing everything in a single statement, proceed like this:
Create a temporary table from the result of the first CTE.
ANALYZE
that temporary table.
Run one DELETE
statement per table, joining with the temporary table.
Upvotes: 1
Reputation: 10352
The problem with your query is that Postgres is materializing the CTE, i.e. computing ~1m rows and storing them in memory, then the delete queries convert that to an array 5 separate times, and that's very expensive and slow.
I think you could make it a lot faster by not converting to an array, i.e.
survey_library_id IN (select template_id FROM bad_row_history)
rather than
survey_library_id = ANY(ARRAY(select template_id FROM bad_row_history))
What I would probably do though is make bad_row_history a temporary table, with columns template_id, survey_id etc, and then run the deletes as separate statements with subselects on the temporary table. That way the optimiser should be able to work more effectively on each delete.
Upvotes: 0