Reputation: 21
I am trying to get the the data that occurs as events in a fact table to be pivoted and captured in a clean format as below, any suggestions as to how this can be accomplished using SQL syntax that is compatible with snowflake ?
The events are expected to appear in below format:
1.subscription_id 2.cancel 3.survey
I need to capture the first occurrence of cancel & survey that appears right below subscription_id. There can be cases where subscription_id occurs but there can be no immediate 'cancel' or 'survey'. So in that case we need to ignore 'cancel' or survey page_name
There can also be cases where we can have multiple 'cancel' or 'survey' in that case we need to capture the first occurrence after subscription_id
Input Table format:
Session | page_name | page_value | page_num | Timestamp |
---|---|---|---|---|
1 | subscription_id | 12345 | 5 | 1/1/20 13:00 |
1 | cancel | no_interest | 6 | 1/1/20 13:05 |
1 | cancel | watch_all | 7 | 1/1/20 13:10 |
1 | Survey | Clicked | 8 | 1/1/20 13:15 |
1 | Survey | Not_clicked | 9 | 1/1/20 13:20 |
1 | subscription_id | 456 | 10 | 1/1/20 13:25 |
1 | cancel | don’t_like | 11 | 1/1/20 13:30 |
1 | cancel | bored | 12 | 1/1/20 13:40 |
1 | subscription_id | 789 | 13 | 1/1/20 13:45 |
1 | Survey | Not_clicked | 14 | 1/1/20 13:50 |
1 | cancel | watch_all | 15 | 1/1/20 13:55 |
1 | Survey | Clicked | 16 | 1/1/20 14:15 |
Expected Table format:
Session | subscription_id | cancel_reason | Cancel_time | Survey_reason | Survey_time |
---|---|---|---|---|---|
1 | 12345 | no_interest | 1/1/20 13:05 | Clicked | 1/1/20 13:15 |
1 | 456 | don’t_lke | 1/1/20 13:30 | ||
1 | 789 | Not_clicked | 1/1/20 13:50 |
Upvotes: 0
Views: 405
Reputation: 25928
So the first part of the problem is to give each thing a grouping cluster, which can be done with CONDITIONAL_TRUE_EVENT
with data(page_name, page_value, page_num, timestamp) as (
select * from values
('subscription_id', '12345', 5, '2020-01-01 13:00'::timestamp),
('cancel', 'no_interest', 6, '2020-01-01 13:05'::timestamp),
('cancel', 'watch_all', 7, '2020-01-01 13:10'::timestamp),
('Survey', 'Clicked', 8, '2020-01-01 13:15'::timestamp),
('Survey', 'Not_clicked', 9, '2020-01-01 13:20'::timestamp),
('subscription_id', '456', 10, '2020-01-01 13:25'::timestamp),
('cancel', 'don''t_like', 11, '2020-01-01 13:30'::timestamp),
('cancel', 'bored', 12, '2020-01-01 13:40'::timestamp),
('subscription_id', '789', 13, '2020-01-01 13:45'::timestamp),
('Survey', 'Not_clicked', 14, '2020-01-01 13:50'::timestamp),
('cancel', 'watch_all', 15, '2020-01-01 13:55'::timestamp),
('Survey', 'Clicked', 16, '2020-01-01 14:15'::timestamp)
)
select *
,CONDITIONAL_TRUE_EVENT(page_name='subscription_id')over(order by page_num) as event_grp
from data
PAGE_NAME | PAGE_VALUE | PAGE_NUM | TIMESTAMP | EVENT_GRP |
---|---|---|---|---|
subscription_id | 12345 | 5 | 2020-01-01 13:00:00.000 | 1 |
cancel | no_interest | 6 | 2020-01-01 13:05:00.000 | 1 |
cancel | watch_all | 7 | 2020-01-01 13:10:00.000 | 1 |
Survey | Clicked | 8 | 2020-01-01 13:15:00.000 | 1 |
Survey | Not_clicked | 9 | 2020-01-01 13:20:00.000 | 1 |
subscription_id | 456 | 10 | 2020-01-01 13:25:00.000 | 2 |
cancel | don't_like | 11 | 2020-01-01 13:30:00.000 | 2 |
cancel | bored | 12 | 2020-01-01 13:40:00.000 | 2 |
subscription_id | 789 | 13 | 2020-01-01 13:45:00.000 | 3 |
Survey | Not_clicked | 14 | 2020-01-01 13:50:00.000 | 3 |
cancel | watch_all | 15 | 2020-01-01 13:55:00.000 | 3 |
Survey | Clicked | 16 | 2020-01-01 14:15:00.000 | 3 |
Then using QUALIFY/ROW_NUMBER on that we can keep the first of each groups... but qualify cannot we used in the same block as a CONDITIONAL_TRUE_EVENT.. so nested it is:
select * from (
select *
,CONDITIONAL_TRUE_EVENT(page_name='subscription_id')over(order by page_num) as event_grp
from data
)
qualify row_number()over(partition by event_grp, page_name order by timestamp) = 1
PAGE_NAME | PAGE_VALUE | PAGE_NUM | TIMESTAMP | EVENT_GRP |
---|---|---|---|---|
subscription_id | 12345 | 5 | 2020-01-01 13:00:00.000 | 1 |
cancel | no_interest | 6 | 2020-01-01 13:05:00.000 | 1 |
Survey | Clicked | 8 | 2020-01-01 13:15:00.000 | 1 |
subscription_id | 456 | 10 | 2020-01-01 13:25:00.000 | 2 |
cancel | don't_like | 11 | 2020-01-01 13:30:00.000 | 2 |
subscription_id | 789 | 13 | 2020-01-01 13:45:00.000 | 3 |
Survey | Not_clicked | 14 | 2020-01-01 13:50:00.000 | 3 |
cancel | watch_all | 15 | 2020-01-01 13:55:00.000 | 3 |
Now to do the ORDER of event filter:
with data(session, page_name, page_value, page_num, timestamp) as (
select * from values
(1,'subscription_id', '12345', 5, '2020-01-01 13:00'::timestamp),
(1,'cancel', 'no_interest', 6, '2020-01-01 13:05'::timestamp),
(1,'cancel', 'watch_all', 7, '2020-01-01 13:10'::timestamp),
(1,'Survey', 'Clicked', 8, '2020-01-01 13:15'::timestamp),
(1,'Survey', 'Not_clicked', 9, '2020-01-01 13:20'::timestamp),
(1,'subscription_id', '456', 10, '2020-01-01 13:25'::timestamp),
(1,'cancel', 'don''t_like', 11, '2020-01-01 13:30'::timestamp),
(1,'cancel', 'bored', 12, '2020-01-01 13:40'::timestamp),
(1,'subscription_id', '789', 13, '2020-01-01 13:45'::timestamp),
(1,'Survey', 'Not_clicked', 14, '2020-01-01 13:50'::timestamp),
(1,'cancel', 'watch_all', 15, '2020-01-01 13:55'::timestamp),
(1,'Survey', 'Clicked', 16, '2020-01-01 14:15'::timestamp)
), step_one as (
select * from (
select *
,CONDITIONAL_TRUE_EVENT(page_name='subscription_id')over(partition by session order by page_num) as event_grp
from data
)
qualify row_number()over(partition by session, event_grp, page_name order by timestamp) = 1
)--, step_two as (
select *
,row_number()over(partition by session, event_grp order by timestamp) as rn
from step_one
qualify case
when page_name='subscription_id' then true
when page_name='cancel' and rn = 2 then true
when page_name='Survey' then true
else false end
;)
gives:
SESSION | PAGE_NAME | PAGE_VALUE | PAGE_NUM | TIMESTAMP | EVENT_GRP | RN |
---|---|---|---|---|---|---|
1 | subscription_id | 12345 | 5 | 2020-01-01 13:00:00.000 | 1 | 1 |
1 | cancel | no_interest | 6 | 2020-01-01 13:05:00.000 | 1 | 2 |
1 | Survey | Clicked | 8 | 2020-01-01 13:15:00.000 | 1 | 3 |
1 | subscription_id | 456 | 10 | 2020-01-01 13:25:00.000 | 2 | 1 |
1 | cancel | don't_like | 11 | 2020-01-01 13:30:00.000 | 2 | 2 |
1 | subscription_id | 789 | 13 | 2020-01-01 13:45:00.000 | 3 | 1 |
1 | Survey | Not_clicked | 14 | 2020-01-01 13:50:00.000 | 3 | 2 |
so now we can unpivot this with MAX and IFF, The iff is used to select the only value we want, and the MAX take the non-null value (which due to the ROW_NUMBER fitlering of the prior step there is only one non-null value per column)
with data(session, page_name, page_value, page_num, timestamp) as (
select * from values
(1,'subscription_id', '12345', 5, '2020-01-01 13:00'::timestamp),
(1,'cancel', 'no_interest', 6, '2020-01-01 13:05'::timestamp),
(1,'cancel', 'watch_all', 7, '2020-01-01 13:10'::timestamp),
(1,'Survey', 'Clicked', 8, '2020-01-01 13:15'::timestamp),
(1,'Survey', 'Not_clicked', 9, '2020-01-01 13:20'::timestamp),
(1,'subscription_id', '456', 10, '2020-01-01 13:25'::timestamp),
(1,'cancel', 'don''t_like', 11, '2020-01-01 13:30'::timestamp),
(1,'cancel', 'bored', 12, '2020-01-01 13:40'::timestamp),
(1,'subscription_id', '789', 13, '2020-01-01 13:45'::timestamp),
(1,'Survey', 'Not_clicked', 14, '2020-01-01 13:50'::timestamp),
(1,'cancel', 'watch_all', 15, '2020-01-01 13:55'::timestamp),
(1,'Survey', 'Clicked', 16, '2020-01-01 14:15'::timestamp)
), step_one as (
select * from (
select *
,CONDITIONAL_TRUE_EVENT(page_name='subscription_id')over(partition by session order by page_num) as event_grp
from data
)
qualify row_number()over(partition by session, event_grp, page_name order by timestamp) = 1
), step_two as (
select *
,row_number()over(partition by session, event_grp order by timestamp) as rn
from step_one
qualify case
when page_name='subscription_id' then true
when page_name='cancel' and rn = 2 then true
when page_name='Survey' then true
else false end
)
select
session
,max(iff(page_name='subscription_id',page_value, null)) as subscription_id
,max(iff(page_name='cancel',page_value, null)) as cancel_reason
,max(iff(page_name='cancel',timestamp, null)) as cancel_timestamp
,max(iff(page_name='Survey',page_value, null)) as Survey_reason
,max(iff(page_name='Survey',timestamp, null)) as Survey_timestamp
from step_two
group by session, event_grp
order by session, event_grp
gives:
SESSION | SUBSCRIPTION_ID | CANCEL_REASON | CANCEL_TIMESTAMP | SURVEY_REASON | SURVEY_TIMESTAMP |
---|---|---|---|---|---|
1 | 12345 | no_interest | 2020-01-01 13:05:00.000 | Clicked | 2020-01-01 13:15:00.000 |
1 | 456 | don't_like | 2020-01-01 13:30:00.000 | null | null |
1 | 789 | null | null | Not_clicked | 2020-01-01 13:50:00.000 |
Not how I would solve this:
with data(session, page_name, page_value, page_num, timestamp) as (
select * from values
(1,'subscription_id', '12345', 5, '2020-01-01 13:00'::timestamp),
(1,'cancel', 'no_interest', 6, '2020-01-01 13:05'::timestamp),
(1,'cancel', 'watch_all', 7, '2020-01-01 13:10'::timestamp),
(1,'Survey', 'Clicked', 8, '2020-01-01 13:15'::timestamp),
(1,'Survey', 'Not_clicked', 9, '2020-01-01 13:20'::timestamp),
(1,'subscription_id', '456', 10, '2020-01-01 13:25'::timestamp),
(1,'cancel', 'don''t_like', 11, '2020-01-01 13:30'::timestamp),
(1,'cancel', 'bored', 12, '2020-01-01 13:40'::timestamp),
(1,'subscription_id', '789', 13, '2020-01-01 13:45'::timestamp),
(1,'Survey', 'Not_clicked', 14, '2020-01-01 13:50'::timestamp),
(1,'cancel', 'watch_all', 15, '2020-01-01 13:55'::timestamp),
(1,'Survey', 'Clicked', 16, '2020-01-01 14:15'::timestamp)
), step_one as (
select *
,CONDITIONAL_TRUE_EVENT(page_name='subscription_id')over(partition by session order by page_num) as event_grp
from data
)
select
session,
iff(Survey_timestamp is null OR cancel_timestamp<Survey_timestamp, cancel_reason, null) as cancel_reason,
iff(Survey_timestamp is null OR cancel_timestamp<Survey_timestamp, cancel_timestamp, null) as cancel_timestamp,
Survey_reason,
Survey_timestamp
from (
select distinct
session, event_grp
,first_value(iff(page_name='subscription_id',page_value, null))ignore nulls over(partition by session, event_grp order by timestamp) as subscription_id
,first_value(iff(page_name='cancel',page_value, null))ignore nulls over(partition by session, event_grp order by timestamp) as cancel_reason
,first_value(iff(page_name='cancel',timestamp, null))ignore nulls over(partition by session, event_grp order by timestamp) as cancel_timestamp
,first_value(iff(page_name='Survey',page_value, null))ignore nulls over(partition by session, event_grp order by timestamp) as Survey_reason
,first_value(iff(page_name='Survey',timestamp, null))ignore nulls over(partition by session, event_grp order by timestamp) as Survey_timestamp
from step_one
)
order by session, event_grp
Upvotes: 1