Reputation: 1490
I have data that looks like this:
dt val
2020-01-01 A
2020-01-02 A
2020-01-03 B
2020-01-04 B
2020-01-05 B
2020-01-06 C
2020-01-07 C
2020-01-08 B
2020-01-09 A
2020-01-10 A
2020-01-11 A
Is it possible to define a partition by neighboring values of val
to define:
dt val block_start_dt block_end_dt prev_block_end_dt next_block_start_dt
2020-01-01 A 2020-01-01 2020-01-02 null 2020-01-03
2020-01-02 A 2020-01-01 2020-01-02 null 2020-01-03
2020-01-03 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-04 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-05 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-06 C 2020-01-06 2020-01-07 2020-01-05 2020-01-08
2020-01-07 C 2020-01-06 2020-01-07 2020-01-05 2020-01-08
2020-01-08 B 2020-01-08 2020-01-08 2020-01-07 2020-01-09
2020-01-09 A 2020-01-09 2020-01-11 2020-01-08 null
2020-01-10 A 2020-01-09 2020-01-11 2020-01-08 null
2020-01-11 A 2020-01-09 2020-01-11 2020-01-08 null
The only way that I can think of is use window functions thrice in subqueries, but I was wondering if there was any way to do it without reading in the data thrice.
Upvotes: 0
Views: 72
Reputation: 26140
it can also be done with way too many nested window functions, but zero joins like:
WITH data AS (
select * from values
('2020-01-01', 'A'),
('2020-01-02', 'A'),
('2020-01-03', 'B'),
('2020-01-04', 'B'),
('2020-01-05', 'B'),
('2020-01-06', 'C'),
('2020-01-07', 'C'),
('2020-01-08', 'B'),
('2020-01-09', 'A'),
('2020-01-10', 'A'),
('2020-01-11', 'A')
v(dt,val)
)
SELECT
dt
,val
,block_start_dt
,block_end_dt
,COALESCE(lag_block_end_dt_a,lag(lag_block_end_dt_a) ignore nulls over(order by dt)) as pre_block_end_dt
,COALESCE(lead_block_start_dt_a,lead(lead_block_start_dt_a) ignore nulls over(order by dt)) as next_block_start_dt
FROM (
SELECT b.*
,lag(block_end_dt) over (order by dt) as lag_block_end_dt
,iff(lag_block_end_dt=block_end_dt, null, lag_block_end_dt) as lag_block_end_dt_a
,lead(block_start_dt) over (order by dt) as lead_block_start_dt
,iff(lead_block_start_dt=block_start_dt, null, lead_block_start_dt) as lead_block_start_dt_a
FROM (
SELECT a.*
,first_value(dt) over (partition by val, block order by dt) as block_start_dt
,last_value(dt) over (partition by val, block order by dt) as block_end_dt
FROM (
SELECT dt
,val
,row_number() over(order by dt) - row_number() over(partition by val order by dt) as block
FROM data
) a
) b
) c
order by dt;
gives:
DT VAL BLOCK_START_DT BLOCK_END_DT PRE_BLOCK_END_DT NEXT_BLOCK_START_DT
2020-01-01 A 2020-01-01 2020-01-02 null 2020-01-03
2020-01-02 A 2020-01-01 2020-01-02 null 2020-01-03
2020-01-03 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-04 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-05 B 2020-01-03 2020-01-05 2020-01-02 2020-01-06
2020-01-06 C 2020-01-06 2020-01-07 2020-01-05 2020-01-08
2020-01-07 C 2020-01-06 2020-01-07 2020-01-05 2020-01-08
2020-01-08 B 2020-01-08 2020-01-08 2020-01-07 2020-01-09
2020-01-09 A 2020-01-09 2020-01-11 2020-01-08 null
2020-01-10 A 2020-01-09 2020-01-11 2020-01-08 null
2020-01-11 A 2020-01-09 2020-01-11 2020-01-08 null
Upvotes: 1
Reputation: 86808
I'd use a gaps and islands approach, but having first used LAG() and LEAD() to find the dates for the preceding and succeeding groups...
WITH
pre_and_post AS
(
SELECT
data.*,
CASE WHEN LAG(val) OVER (ORDER BY dt) <> val THEN LAG(dt) OVER (ORDER BY dt) END AS prev_end,
CASE WHEN LEAD(val) OVER (ORDER BY dt) <> val THEN LEAD(dt) OVER (ORDER BY dt) END AS next_start
FROM
data
),
gaps_and_islands AS
(
SELECT
*,
ROW_NUMBER() OVER ( ORDER BY dt)
-
ROW_NUMBER() OVER (PARTITION BY val ORDER BY dt) AS val_group_id
FROM
pre_and_post
)
SELECT
dt,
val,
MIN(dt ) OVER (PARTITION BY val, val_group_id) AS val_group_start,
MAX(dt ) OVER (PARTITION BY val, val_group_id) AS val_group_end,
MAX(prev_end ) OVER (PARTITION BY val, val_group_id) AS prev_val_group_end,
MAX(next_start) OVER (PARTITION BY val, val_group_id) AS next_val_group_start
FROM
gaps_and_islands
ORDER BY
dt
https://dbfiddle.uk/?rdbms=postgres_9.5&fiddle=799ca51c518e4014e5156ff369043313
Upvotes: 1
Reputation: 1271231
You can basically do what you want by aggregating the rows as a gaps-and-islands problem. The basic idea is the difference of row numbers, but you want additional information about adjacent blocks as well:
select val, min(dt) as block_start_dt, max(dt) as block_end_dt,
lag(min(dt)) over (order by min(dt)) as prev_block_start_dt,
lag(max(dt)) over (order by min(dt)) as prev_block_end_dt,
lead(min(dt)) over (order by min(dt)) as next_block_start_dt,
lead(max(dt)) over (order by min(dt)) as next_block_end_dt
from (select t.*,
row_number() over (order by dt) as seqnum,
row_number() over (partition by val order by dt) as seqnum_2
from t
) t
group by val, (seqnum - seqnum_2);
That might be sufficient for your purposes. But . . . you can join this back to your data to get detail on each row:
select t.*, x.*
from t join
(select val, min(dt) as block_start_dt, max(dt) as block_end_dt,
lag(min(dt)) over (order by min(dt)) as prev_block_start_dt,
lag(max(dt)) over (order by min(dt)) as prev_block_end_dt,
lead(min(dt)) over (order by min(dt)) as next_block_start_dt,
lead(max(dt)) over (order by min(dt)) as next_block_end_dt
from (select t.*,
row_number() over (order by dt) as seqnum,
row_number() over (partition by val order by dt) as seqnum_2
from t
) t
group by val, (seqnum - seqnum_2)
) x
on t.val = x.val and
t.dt between x.block_start_dt and x.block_end_dt;
Upvotes: 0