Reputation: 3646
I've been scratching my head over this problem in PostgreSQL. I have a table `test` with 2 columns, `id` and `content`, e.g.:
-- Sample fixture: eight text fragments of varying length, keyed by id.
create table test (
    id      integer,
    content varchar(1024)
);

-- Leading and trailing blanks inside the literals are significant: they count
-- towards length(content).
insert into test (id, content)
values
    (1, 'Lorem Ipsum is simply dummy text of the printing and typesetting industry.'),
    (2, 'Lorem Ipsum has been the industrys standard dummy text '),
    (3, 'ever since the 1500s, when an unknown printer took a galley of type and scrambled it to'),
    (4, 'make a type specimen book.'),
    (5, 'It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.'),
    (6, 'It was popularised in the 1960s with the release of Letraset sheets containing Lorem '),
    (7, 'Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker'),
    (8, ' including versions of Lorem Ipsum.');
If I run the following query ...
-- Character count of every row, listed in id order.
select
    id,
    length(content) as characters
from test
order by id
... then I get: -
id | characters
---+-----------
1 | 74
2 | 55
3 | 87
4 | 26
5 | 120
6 | 85
7 | 87
8 | 35
What I want to do is group the `id` values into rows, closing each group as soon as the summed length of its `content` goes over a threshold. For example, if that threshold is 100, then the desired result would look like the following:
ids | characters
----+-----------
1,2 | 129
3,4 | 113
5 | 120
6,7 | 172
8 | 35
NOTE (1): The query doesn't need to generate a `characters` column, just the `ids`. The `characters` values are shown only to communicate that every group is over 100, except for the last row, which is 35.
NOTE (2): `ids` could be a comma-delimited string or a PostgreSQL array; the type is less important than the values.
Can I use a window function to do this, or do I need something more complex like a lateral join?
Upvotes: 7
Views: 1284
Reputation: 2774
Here I have a query which uses the LEAD() window function
-- Pairs a row with its immediate successor via LEAD() and adds the two lengths.
-- Both derived tables carry an alias (numbered, paired) because PostgreSQL
-- releases before 16 reject an unaliased subquery in FROM.
-- NOTE(review): the outer filter keeps only rows whose next_id IS NULL, so the
-- ids column (id || ',' || next_id) is NULL for every returned row. The query
-- also only ever adds a row to its direct successor; it does not accumulate
-- lengths until a threshold is crossed. Confirm the intended grouping before
-- relying on this query.
SELECT paired.id || ',' || paired.next_id AS ids,
       paired.characters + paired.next_characters AS total_characters
FROM (SELECT numbered.id,
             numbered.characters,
             numbered.row_num,
             -- Successor id, looked up only for even-positioned rows shorter than 100.
             CASE
                 WHEN numbered.row_num % 2 = 0
                      AND numbered.characters < 100 THEN Lead(numbered.id) OVER(ORDER BY numbered.id)
                 ELSE NULL
             END AS next_id,
             -- Successor length, looked up for every row the branch above skips.
             CASE
                 WHEN numbered.row_num % 2 = 0
                      AND numbered.characters < 100 THEN NULL
                 ELSE Lead(numbered.characters) OVER(ORDER BY numbered.id)
             END AS next_characters
      FROM (SELECT id,
                   Length(content) AS characters,
                   Row_number() OVER(ORDER BY id) AS row_num
            FROM test
            ORDER BY id) AS numbered) AS paired
WHERE paired.next_id IS NULL;
Hope this may help you.
Upvotes: 1
Reputation: 15624
Using a stored function sometimes lets you avoid head-scratching queries.
-- fn_foo: walks table test in id order and emits one row per group of
-- consecutive ids, closing a group as soon as its summed content length
-- exceeds the threshold.
--   threshold  (in)  length a group must exceed before it is closed; defaults
--                    to 100, so fn_foo() behaves like the fixed-threshold version
--   ids        (out) ids collected into the group, in id order
--   characters (out) summed length(content) of the group (NULL content counts as 0)
-- The final group is emitted even if it never exceeds the threshold.
-- NOTE(review): "create or replace" only replaces a function with the same
-- input argument types. If an earlier zero-argument fn_foo() is already
-- installed, drop it first so a call to fn_foo() is not ambiguous.
create or replace function fn_foo(ids out int[], characters out int, threshold int default 100) returns setof record language plpgsql as $$
declare
    r record;
begin
    -- Start with an empty group.
    ids := '{}'; characters := 0;
    for r in (
        select id, coalesce(length(content), 0) as lng
        from test order by id)
    loop
        characters := characters + r.lng;
        ids := ids || r.id;
        if characters > threshold then
            -- Emit the current values of the OUT parameters as a result row,
            -- then begin a fresh group.
            return next;
            ids := '{}'; characters := 0;
        end if;
    end loop;
    -- Flush the trailing group that stayed at or below the threshold.
    if ids <> '{}' then
        return next;
    end if;
end $$;
-- List every group produced by fn_foo with its summed length.
select
    ids,
    characters
from fn_foo();
╔═══════╤════════════╗
║ ids │ characters ║
╠═══════╪════════════╣
║ {1,2} │ 129 ║
║ {3,4} │ 113 ║
║ {5} │ 120 ║
║ {6,7} │ 172 ║
║ {8} │ 35 ║
╚═══════╧════════════╝
(5 rows)
Upvotes: 2
Reputation: 1271051
This type of problem requires a recursive CTE (or similar functionality). Here is an example:
-- Groups consecutive ids of table test, starting a new group once the running
-- length of the current group has reached 100 or more.
-- Returns one row per group: the group number and its comma-separated ids.
with recursive t as (
    -- Length of each row plus a gap-free position to step through.
    -- coalesce keeps a NULL content from turning the running total into NULL.
    select id, coalesce(length(content), 0) as len,
           row_number() over (order by id) as seqnum
    from test
),
cte(id, len, cumelen, ids, seqnum, grp) as (
    -- Anchor: the first row opens group 1.
    select id, len, len as cumelen, t.id::text, 1::int as seqnum, 1 as grp
    from t
    where seqnum = 1
    union all
    -- Step: either extend the current group or, if the previous row already
    -- closed it (cumelen >= 100), restart the total and the id list.
    select t.id,
           t.len,
           (case when cte.cumelen >= 100 then t.len else cte.cumelen + t.len end) as cumelen,
           (case when cte.cumelen >= 100 then t.id::text else cte.ids || ',' || t.id::text end) as ids,
           -- row_number() yields bigint; cast so the column type matches the anchor.
           t.seqnum::int,
           (case when cte.cumelen >= 100 then cte.grp + 1 else cte.grp end) as grp
    from t join
         cte
         on cte.seqnum = t.seqnum - 1
)
-- Within a group every ids value is a prefix of the next one, so max(ids)
-- picks the complete list.
select grp, max(ids)
from cte
group by grp
order by grp;
Here is a small working example:
-- Self-contained demo: three 4-character rows grouped with a threshold of 5.
with recursive test (id, content) as (
    values
        (1, 'abcd'::text),
        (2, 'abcd'::text),
        (3, 'abcd'::text)
),
numbered as (
    -- Length of each row plus its position in id order.
    select
        id,
        length(content) as len,
        row_number() over (order by id) as seqnum
    from test
),
walk (id, len, cumelen, ids, seqnum, grp) as (
    -- Anchor: the first row opens group 1.
    select
        n.id,
        n.len,
        n.len,
        n.id::text,
        1::int,
        1
    from numbered as n
    where n.seqnum = 1

    union all

    -- Step to the next position; restart the total, the id list and the group
    -- number whenever the previous running total had reached 5.
    select
        n.id,
        n.len,
        case when w.cumelen >= 5 then n.len else w.cumelen + n.len end,
        case when w.cumelen >= 5 then n.id::text else w.ids || ',' || n.id::text end,
        n.seqnum::int,
        case when w.cumelen >= 5 then w.grp + 1 else w.grp end
    from walk as w
    inner join numbered as n
        on n.seqnum = w.seqnum + 1
)
select
    grp,
    max(ids)
from walk
group by grp;
Upvotes: 5