Reputation: 36686
How can I select rows from a table that, when ordered, match the elements of an id array?
Supposing I have these rows as the result of a query (table token holds id and word, and table positioning holds id and position):
id | word | textblockid | sentence | position
 5 | Fear |           5 |        1 |        1
 8 | of   |           5 |        1 |        2
 6 | the  |           5 |        1 |        3
 7 | Dark |           5 |        1 |        4
 9 | is   |           5 |        1 |        5
These rows can be spread across the table with different textblockids, sentences and positions. I want to transform them into this:
id | word             | textblockid | sentence | position
10 | Fear of the Dark |           5 |        1 |        1
 9 | is               |           5 |        1 |        2
I'm writing a function that receives an array with the ids to merge, something like merge_tokens('{5,8,6,7}'). I insert the new word Fear of the Dark into the table token and get the generated id (10 in the example). This part is easy.
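For that step, something like this works (just a sketch, assuming id is filled automatically by a serial column or a sequence default):
INSERT INTO token(word)
VALUES ('Fear of the Dark')
RETURNING id;  -- returns 10 in the example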
I need to update the id of the first word (in this case, Fear) to 10 and delete the following words (of, the, Dark).
My question is how to perform these operations. I think I need to SELECT from an ordered table where the first row id matches the first element of the id array, the second row id matches the second element, and so on; after that, update the first element and remove the following ones.
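Roughly, I imagine the ordered match like this (just a sketch, with the id array written out literally):
SELECT x.rn, t.*
FROM  (SELECT rn, ('{5,8,6,7}'::int[])[rn] AS id
       FROM   generate_series(1, array_length('{5,8,6,7}'::int[], 1)) rn) x
JOIN   token t USING (id)
ORDER  BY x.rn;  -- rows come back in the order of the array elements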
I can't simply delete the other rows by id because the same ids are used in other sentences. I will only delete the of where the previous word is Fear and the following words are the and Dark. By the same rule, I can only delete the the where the two preceding words are Fear and of and the following word is Dark.
As an example, the same table can contain rows like these that must not be affected:
id | word     | textblockid | sentence | position
 6 | the      |           8 |        3 |       10
11 | sound    |           8 |        3 |       11
 8 | of       |           8 |        3 |       12
 6 | the      |           8 |        3 |       13
 7 | mountain |           8 |        3 |       14
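For this particular 4-word case, the adjacency rule boils down to a self-join like this (just a sketch; sample stands for the query result shown above):
WITH sample(id, word, textblockid, sentence, position) AS (
    VALUES (5,'Fear',5,1,1), (8,'of',5,1,2), (6,'the',5,1,3),
           (7,'Dark',5,1,4), (9,'is',5,1,5)
)
SELECT t1.textblockid, t1.sentence, t1.position AS first_position
FROM   sample t1
JOIN   sample t2 ON t2.textblockid = t1.textblockid AND t2.sentence = t1.sentence AND t2.position = t1.position + 1
JOIN   sample t3 ON t3.textblockid = t1.textblockid AND t3.sentence = t1.sentence AND t3.position = t1.position + 2
JOIN   sample t4 ON t4.textblockid = t1.textblockid AND t4.sentence = t1.sentence AND t4.position = t1.position + 3
WHERE  t1.id = 5 AND t2.id = 8 AND t3.id = 6 AND t4.id = 7;
-- t1's row would get the new id; t2, t3 and t4's rows would be deleted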
Upvotes: 0
Views: 945
Reputation: 656744
After answering most of your recent questions I have a vague idea of what you are doing. So I had a closer look at your solution and optimized it quite a bit. Mostly I simplified the code, but there are some substantial improvements, too.
Don't use = for assignment in plpgsql. Use := instead. See this related question for more info.
LOOP BEGIN? A separate code block only slows things down if you don't need it. Removed it.
Please have a look at the code side by side for some hints.
Test the two versions to see which performs faster.
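For instance, in psql (just a sketch, assuming a single-space separator; the ROLLBACK keeps the test run from changing your data):
BEGIN;
EXPLAIN ANALYZE
SELECT merge_tokens('{Fear,of,the,Dark}'::varchar[], ' ');
ROLLBACK;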
For your consideration:
CREATE OR REPLACE FUNCTION merge_tokens(words varchar[], separator varchar)
RETURNS VOID AS
$body$
DECLARE
r record;
current_id integer;
ids integer[];
generated_word varchar := ''; -- you can initialize variables at declaration time. Saves additional assignment.
BEGIN
-- get the ids and generate the word
RAISE NOTICE 'Getting ids and generating words';
generated_word := array_to_string(words, separator); -- 1 assignment is much cheaper. Also: no trim() needed.
ids := ARRAY
( SELECT t.id
FROM (
SELECT row_number() OVER () AS rn, text
FROM (SELECT unnest(words) AS text) x) y
JOIN token t USING (text)
ORDER BY rn);
RAISE NOTICE 'Generated word: %', generated_word;
-- check whether the word already exists; insert it if not
SELECT INTO current_id t.id FROM token t WHERE t.text = generated_word;
IF NOT FOUND THEN
RAISE NOTICE 'Word does not exist yet';
INSERT INTO token(text) VALUES(generated_word)
RETURNING id
INTO current_id; --get the last value without additional query.
END IF;
RAISE NOTICE 'Word id: %', current_id;
-- select the records that will be updated
RAISE NOTICE 'Getting words to be updated.';
FOR r IN
SELECT textblockid, sentence, position, tokenid, rn
FROM
( -- select the rows that are complete
SELECT textblockid, sentence, position, tokenid, rn, count(*) OVER (PARTITION BY grp) AS counting
FROM
( -- match source with lookup table
SELECT source.textblockid, source.sentence, source.position, source.tokenid, source.rn, source.grp
FROM
( -- select textblocks where words appears with row number to matching
SELECT tb.textblockid, tb.sentence, tb.position, tb.tokenid, grp
,CASE WHEN grp > 0 THEN
row_number() OVER (PARTITION BY grp ORDER BY tb.textblockid, tb.sentence, tb.position)
END AS rn
FROM
( -- create the groups to be used in partition by to generate the row numbers
SELECT tb.textblockid, tb.sentence, tb.position, tb.tokenid
,SUM(CASE WHEN tb.tokenid = ids[1] THEN 1 ELSE 0 END) OVER (ORDER BY tb.textblockid, tb.sentence, tb.position) AS grp
FROM textblockhastoken tb
JOIN
( --select the textblocks where the word appears
SELECT textblockid, sentence
FROM textblockhastoken tb
WHERE tb.tokenid = ids[1]
) res USING (textblockid, sentence)
) tb
) source
-- create the lookup table to match positions
JOIN (SELECT row_number() OVER () as rn, id FROM unnest(ids) AS id) lookup USING (rn)
WHERE source.tokenid = lookup.id
) merged
) g
WHERE g.counting = array_length(ids,1)
ORDER BY g.rn --order by row number to update first, delete and change positions after
LOOP
--check if update or delete
IF (r.rn = 1) THEN
RAISE NOTICE 'Updating word in T:% S:% P:%', r.textblockid, r.sentence, r.position;
UPDATE textblockhastoken tb SET tokenid = current_id
WHERE (tb.textblockid, tb.sentence, tb.position)
= ( r.textblockid, r.sentence, r.position);
ELSE
RAISE NOTICE 'Deleting word in T:% S:% P:%', r.textblockid, r.sentence, r.position;
DELETE FROM textblockhastoken tb
WHERE (tb.textblockid, tb.sentence, tb.position)
= ( r.textblockid, r.sentence, r.position);
END IF;
-- if this is the last word, update the positions
IF (r.rn = array_length(ids,1)) THEN
RAISE NOTICE 'Changing positions in T:% S:%', r.textblockid, r.sentence;
UPDATE textblockhastoken tb SET position = new_position
FROM
( SELECT textblockid, sentence, position
,row_number() OVER (PARTITION BY tb.textblockid, tb.sentence ORDER BY tb.position) as new_position
FROM textblockhastoken tb
WHERE tb.textblockid = r.textblockid AND tb.sentence = r.sentence
) np
WHERE (tb.textblockid, tb.sentence, tb.position)
= (np.textblockid, np.sentence, np.position)
AND tb.position <> np.new_position;
END IF;
END LOOP;
END;
$body$ LANGUAGE plpgsql;
Upvotes: 1
Reputation: 36686
This answer is for my particular case. I don't know if it is the best way, but it works for me.
I built this procedure with the answers from these questions: Is possible have different conditions for each row in a query? and How create a WINDOW in PostgreSQL until the same value appears again?
The FOREACH loop only works in PostgreSQL 9.1 and later.
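On 9.0, the same kind of iteration can be done with a FOR loop over unnest() instead; a self-contained sketch of just the looping part:
DO $$
DECLARE
    words        varchar[] := '{Fear,of,the,Dark}';
    current_word varchar;
BEGIN
    FOR current_word IN SELECT unnest(words)
    LOOP
        RAISE NOTICE 'current word: %', current_word;
    END LOOP;
END
$$;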
CREATE OR REPLACE FUNCTION merge_tokens(words VARCHAR[], separator VARCHAR)
RETURNS VOID
AS $$
DECLARE
r RECORD;
current_id INTEGER;
current_word VARCHAR;
ids INTEGER[];
generated_word VARCHAR;
BEGIN
-- get the ids and generate the word
RAISE NOTICE 'Getting ids and generating words';
generated_word = '';
FOREACH current_word IN ARRAY words
LOOP BEGIN
generated_word = generated_word || current_word;
generated_word = generated_word || separator;
SELECT t.id INTO current_id FROM token t WHERE t.text = current_word;
ids = ids || current_id;
END;
END LOOP;
-- remove leading and trailing spaces from the word
RAISE NOTICE 'Generated word: %', generated_word;
generated_word = TRIM(generated_word);
-- check whether the word already exists; insert it if not
SELECT t.id INTO current_id FROM token t WHERE t.text = generated_word;
IF (current_id IS NULL) THEN
RAISE NOTICE 'Word does not exist yet';
INSERT INTO token(id,text) VALUES(nextval('tokenidsqc'),generated_word);
current_id = lastval(); --get the last value from the sequence
END IF;
RAISE NOTICE 'Word id: %', current_id;
-- select the records that will be updated
RAISE NOTICE 'Getting words to be updated.';
FOR r IN SELECT grouping.textblockid, grouping.sentence, grouping.position, grouping.tokenid, grouping.row_number
FROM
(
-- select the rows that are complete
SELECT merged.textblockid, merged.sentence, merged.position, merged.tokenid,merged.row_number,count(*) OVER w as counting
FROM
(
-- match source with lookup table
SELECT source.textblockid, source.sentence, source.position, source.tokenid,source.row_number, source.grp
FROM
( -- select textblocks where words appears with row number to matching
SELECT tb.textblockid, tb.sentence, tb.position, tb.tokenid, grp,
CASE WHEN grp > 0 THEN
row_number() OVER (PARTITION BY grp ORDER BY tb.textblockid,tb.sentence,tb.position)
END AS row_number
FROM
( -- create the groups to be used in partition by to generate the row numbers
SELECT tb.textblockid, tb.sentence, tb.position, tb.tokenid,
SUM(CASE WHEN tb.tokenid = ids[1] THEN 1 ELSE 0 END) OVER (ORDER BY tb.textblockid,tb.sentence,tb.position) AS grp
FROM textblockhastoken tb,
( --select the textblocks where the word appears
SELECT textblockid, sentence
FROM textblockhastoken tb
WHERE tb.tokenid = ids[1]
)res
WHERE tb.textblockid = res.textblockid
AND tb.sentence = res.sentence
)tb
)source,
-- create the lookup table to match positions
(
SELECT row_number() OVER () as row_number,id FROM unnest(ids::INTEGER[]) as id
)lookup
WHERE source.tokenid = lookup.id
AND source.row_number = lookup.row_number
)merged
WINDOW w AS (PARTITION BY grp)
) grouping
WHERE grouping.counting = array_length(ids,1)
ORDER BY grouping.row_number --order by row number to update first, delete and change positions after
-- end of query and start of iterations actions
LOOP BEGIN
--check if update or delete
IF (r.row_number = 1) THEN
RAISE NOTICE 'Updating word in T:% S:% P:%', r.textblockid, r.sentence, r.position;
UPDATE textblockhastoken tb SET tokenid = current_id
WHERE tb.textblockid = r.textblockid
AND tb.sentence = r.sentence
AND tb.position = r.position;
ELSE
RAISE NOTICE 'Deleting word in T:% S:% P:%', r.textblockid, r.sentence, r.position;
DELETE FROM textblockhastoken tb
WHERE tb.textblockid = r.textblockid
AND tb.sentence = r.sentence
AND tb.position = r.position;
END IF;
-- if this is the last word, update the positions
IF (r.row_number = array_length(ids,1)) THEN
RAISE NOTICE 'Changing positions in T:% S:%', r.textblockid, r.sentence;
UPDATE textblockhastoken tb SET position = new_position
FROM
(
SELECT textblockid, sentence, position, row_number() OVER w as new_position
FROM textblockhastoken tb
WHERE tb.textblockid = r.textblockid AND tb.sentence = r.sentence
WINDOW w AS (PARTITION BY tb.textblockid, tb.sentence ORDER BY tb.position)
)new_positioning
WHERE tb.textblockid = new_positioning.textblockid
AND tb.sentence = new_positioning.sentence
AND tb.position = new_positioning.position
AND tb.position <> new_positioning.new_position;
END IF;
END;
END LOOP;
END
$$
LANGUAGE plpgsql;
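For the example above, the call then looks like this (assuming the individual words are already present in token and a single-space separator):
SELECT merge_tokens('{Fear,of,the,Dark}'::varchar[], ' ');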
Upvotes: 0
Reputation: 656744
Best do this in one transaction:
UPDATE token
SET word = (
SELECT string_agg(word, ' ' ORDER BY position)
FROM token
WHERE id = ANY('{5,8,6,7}'::int[])
)
,id = nextval('token_id_seq')
WHERE id = ('{5,8,6,7}'::int[])[1];
DELETE FROM token
WHERE id = ANY('{5,8,6,7}'::int[])
AND id <> ('{5,8,6,7}'::int[])[1];
Replace '{5,8,6,7}'::int[]
with your integer array parameter.
I get the new id from the sequence, which I assume exists.
I further assume that the ordering in the array concurs with the ordering by position. An alternative version follows below.
The id to be updated is the first element of the array.
Ordering of the words can be done inside the aggregate function (since PostgreSQL 9.0). Read about that in the manual.
Order selected rows according to sequence of array elements:
SELECT rn, t.*
FROM (
SELECT id
,row_number() OVER () AS rn
FROM (SELECT unnest('{5,8,6,7}'::int[]) id) x
) x
JOIN token t USING (id)
ORDER BY rn;
Or the following does the same with different techniques; it works in older versions of Postgres, too:
SELECT rn, t.*
FROM (
SELECT rn
,a[rn] AS id
FROM (SELECT '{5,8,6,7}'::int[] AS a
,generate_series(1, array_upper('{5,8,6,7}'::int[], 1)) rn) x
) x
JOIN token t USING (id)
ORDER BY rn;
Use that in the UPDATE statement:
UPDATE token
SET word = (
SELECT string_agg(word, ' ' ORDER BY rn)
FROM (
SELECT rn
,a[rn] AS id
FROM (
SELECT '{5,8,6,7}'::int[] AS a
,generate_series(1, array_upper('{5,8,6,7}'::int[], 1)) rn) x
) x
JOIN token t USING (id)
)
,id = nextval('token_id_seq')
WHERE id = ('{5,8,6,7}'::int[])[1];
Upvotes: 1
Reputation: 44250
This fragment does not use arrays. (I don't like arrays)
set search_path='tmp';
DROP TABLE wordlist;
CREATE TABLE wordlist
( id INTEGER NOT NULL PRIMARY KEY
, word varchar
, textblockid INTEGER NOT NULL
, sentence INTEGER NOT NULL
, postion INTEGER NOT NULL
, UNIQUE (textblockid,sentence,postion)
);
INSERT INTO wordlist(id,word,textblockid,sentence,postion) VALUES
(5 , 'Fear', 5 , 1 , 1 )
,(8 , 'of', 5 , 1 , 2 )
,(6 , 'the', 5 , 1 , 3 )
,(7 , 'Dark', 5 , 1 , 4 )
,(9 , 'is', 5 , 1 , 5 )
;
WITH RECURSIVE meuk AS (
SELECT 0 AS lev
, id,word AS words
, textblockid,sentence,postion AS lastpos
FROM wordlist
UNION
SELECT 1+ mk.lev AS lev
, wl.id
, mk.words || ' '::text || wl.word AS words
, wl.textblockid,wl.sentence
, wl.postion AS lastpos
FROM meuk mk
JOIN wordlist wl ON (wl.textblockid = mk.textblockid
AND wl.sentence = mk.sentence
AND wl.postion = mk.lastpos+1)
)
SELECT * FROM meuk
WHERE lev = 3
;
results:
SET
DROP TABLE
NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "wordlist_pkey" for table "wordlist"
NOTICE: CREATE TABLE / UNIQUE will create implicit index "wordlist_textblockid_sentence_postion_key" for table "wordlist"
CREATE TABLE
INSERT 0 5
lev | id | words | textblockid | sentence | lastpos
-----+----+------------------+-------------+----------+---------
3 | 7 | Fear of the Dark | 5 | 1 | 4
3 | 9 | of the Dark is | 5 | 1 | 5
(2 rows)
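If only the phrase starting at a known row is wanted, the non-recursive term can be anchored; a variation of the query above, using the sample's starting id 5:
WITH RECURSIVE meuk AS (
    SELECT 0 AS lev
         , id, word AS words
         , textblockid, sentence, postion AS lastpos
    FROM wordlist
    WHERE id = 5                 -- anchor: the first word of the phrase
    UNION
    SELECT 1 + mk.lev AS lev
         , wl.id
         , mk.words || ' '::text || wl.word AS words
         , wl.textblockid, wl.sentence
         , wl.postion AS lastpos
    FROM meuk mk
    JOIN wordlist wl ON (wl.textblockid = mk.textblockid
                     AND wl.sentence = mk.sentence
                     AND wl.postion = mk.lastpos + 1)
)
SELECT * FROM meuk
WHERE lev = 3;                   -- 3 words appended: yields 'Fear of the Dark' only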
Upvotes: 1
Reputation: 16955
Is this something you could do as part of your merge_tokens function? Seems like you could just have that function keep track of which records need to be updated/deleted, simply based on the provided array (first element updated, the rest deleted).
Upvotes: 0