Reputation: 11
I am trying to create a column that represents how many days ago the most recent promotion started for each product. It should continue counting on when the promotion ended, until the next promotion starts.
For example, I want the following:
product | date | in_promo | days_since_last_promo |
---|---|---|---|
1 | 2023-10-01 | false | null |
1 | 2023-10-02 | false | null |
1 | 2023-10-03 | true | 0 |
1 | 2023-10-04 | true | 1 |
1 | 2023-10-05 | true | 2 |
1 | 2023-10-06 | false | 3 |
1 | 2023-10-07 | false | 4 |
1 | 2023-10-08 | true | 0 |
1 | 2023-10-09 | true | 1 |
1 | 2023-10-10 | false | 2 |
Particularly, I'm having trouble getting the right days_since_last_promo
for these lines:
product | date | in_promo | days_since_last_promo |
---|---|---|---|
1 | 2023-10-06 | false | 3 |
1 | 2023-10-07 | false | 4 |
I've been puzzling with lags, row_number() and partition by's but I can't figure it out. Is this even possible in SQL?
I would say it's related to this post, but we're trying to achieve something slightly different.
I tried for example
select
product
, date
, in_promo
, row_number() over (partition by recipe_id, in_promo, seqnum_u - seqnum_uo
order by date_cet
) as days_since_last_promo
from (select p.*,
row_number() over (partition by product order by date) as seqnum_u,
row_number() over (partition by product, in_promo order by date) as seqnum_uo
from product_sales_data as p
)
But that will give me
product | date | in_promo | days_since_last_promo |
---|---|---|---|
1 | 2023-10-01 | false | 1 |
1 | 2023-10-02 | false | 2 |
1 | 2023-10-03 | true | 1 |
1 | 2023-10-04 | true | 2 |
1 | 2023-10-05 | true | 3 |
1 | 2023-10-06 | false | 1 |
1 | 2023-10-07 | false | 2 |
1 | 2023-10-08 | true | 1 |
1 | 2023-10-09 | true | 2 |
1 | 2023-10-10 | false | 1 |
i.e. restarting the row_number when in_promo=false
.
Upvotes: 0
Views: 146
Reputation: 11
Thank you @p3consulting! I parsed your code into SQL server again, in case anyone else needs it:
select d.product, d.date, in_promo,
sum(ndays) over(partition by product, grp order by date) as days_since_last_promo
from (
select d.*,
case
when not in_promo and grp = 0 then null
else
coalesce(
date_diff(date,
last_value(date) over(partition by product, grp order by date
rows between unbounded preceding and 1 preceding)
, day
),
0
)
end as ndays
from (
select d.*,
sum(change) over(partition by product order by date) as grp
from (
select *,
case
when in_promo then (case
when lag(in_promo) over(partition by product order by date) then 0
else 1
end)
else 0
end as change
from data
) d
) d
) d
order by product, date
Upvotes: 0
Reputation: 3849
The general idea is that you tag each record that either originates as true (in promotion) or switches from false to true, first new occurrence of a promotion.
Then a self join to get the last date of an earlier change and take the difference. There is cleaner syntax with inline correlated subqueries that most dbms's support to avoid the self join, but I wasn't sure about bigquery.
Then adjust by 1.
This is in postgres, but I think bigquery supports all the syntax.
create table some_sample_data
( product integer,
_date date,
in_promo varchar(100)
)
insert into some_sample_data values (1, '2023-10-01', 'false');
insert into some_sample_data values (1, '2023-10-02', 'false');
insert into some_sample_data values (1, '2023-10-03', 'true');
insert into some_sample_data values (1, '2023-10-04', 'true');
insert into some_sample_data values (1, '2023-10-05', 'true');
insert into some_sample_data values (1, '2023-10-06', 'false');
insert into some_sample_data values (1, '2023-10-07', 'false');
insert into some_sample_data values (1, '2023-10-08', 'true');
insert into some_sample_data values (1, '2023-10-09', 'true');
insert into some_sample_data values (1, '2023-10-10', 'false');
insert into some_sample_data values (2, '2023-10-01', 'true');
insert into some_sample_data values (2, '2023-10-03', 'false');
insert into some_sample_data values (2, '2023-10-04', 'false');
insert into some_sample_data values (2, '2023-10-05', 'true');
with sequenced_result as (
SELECT
*,
case when in_promo = 'true'
and coalesce(lag(in_promo) OVER(PARTITION BY product ORDER BY cast(_date as date) asc),'false') = 'false'
then 1
else 0
end originated_as_or_change_to_true
FROM some_sample_data
),
earlier_snapshots as (
select t1.*,
max(t2._date) as date_of_last_change
from sequenced_result t1
left
join sequenced_result t2
on t1.product = t2.product
and t1._date >= t2._date
and t2.originated_as_or_change_to_true = 1
group
by t1.product,
t1._date,
t1.in_promo,
t1.originated_as_or_change_to_true
)
select product,
_date,
in_promo,
_date - date_of_last_change + 1 as days_since_last_promo
from earlier_snapshots
order
by product,
_date
Upvotes: 0
Reputation: 4670
Here a solution using ORACLE syntax but with standard analytic functions, assuming the start of the promotion is the first date in_promo for a sequence of rows ( in_promo* !in_promo+ ) (should be easier with MATCH_RECOGNIZE but would be ORACLE-only):
with data(product, dat, in_promo) as (
select 1, date '2023-10-01', 'false' from dual union all
select 1, date '2023-10-02', 'false' from dual union all
select 1, date '2023-10-03', 'true' from dual union all
select 1, date '2023-10-04', 'true' from dual union all
select 1, date '2023-10-05', 'true' from dual union all
select 1, date '2023-10-06', 'false' from dual union all
select 1, date '2023-10-07', 'false' from dual union all
select 1, date '2023-10-08', 'true' from dual union all
select 1, date '2023-10-09', 'true' from dual union all
select 1, date '2023-10-10', 'false' from dual
)
select d.product, d.dat,
sum(ndays) over(partition by product, grp order by dat) as days_since_last_promo
from (
select d.*,
case when in_promo = 0 and grp = 0 then null
else
nvl(
dat - last_value(dat) over(partition by product, grp order by dat
rows between unbounded preceding and 1 preceding),
0
)
end
as ndays
from (
select d.*,
sum(change) over(partition by product order by dat) as grp
from (
select d.*,
decode(in_promo,1,
decode(1,lag(in_promo) over(partition by product order by dat),0,1),
0
) as change
from (select product, dat, decode(in_promo,'true',1,0) as in_promo from data) d
) d
) d
) d
order by dat
;
1 01/10/2023 00:00:00
1 02/10/2023 00:00:00
1 03/10/2023 00:00:00 0
1 04/10/2023 00:00:00 1
1 05/10/2023 00:00:00 2
1 06/10/2023 00:00:00 3
1 07/10/2023 00:00:00 4
1 08/10/2023 00:00:00 0
1 09/10/2023 00:00:00 1
1 10/10/2023 00:00:00 2
Upvotes: 0