Reputation: 59
I am trying to identify shopping baskets from transactions in a supermarket.
I have customer id and transaction ID and the time of that the item was picked from the shelf - the transaction. I do have a sort of basket ID but it isn't accurate. I can see transactions in the basket which are at different times of the day so I can see that they should be different basket_id's. It's a flaw in the data I can't do anything about.
What I can see if that if the difference between one transaction and another is greater than 20 mins then it's a different basket. I do this using the Lag function in SQL.
I add a flag to the transaction with value 1 where that time is greater than 20 mins. So I have basket id's which will actually be 1+ baskets.
Any ideas how I can create real_basket_id?
Many thanks
Andrew
Upvotes: 0
Views: 299
Reputation: 3906
As variant you can try to use recursion. See my example.
CREATE TABLE #baskets(
buyer_id int,
basket_id int,
trans_time datetime
)
INSERT #baskets(buyer_id,basket_id,trans_time)VALUES
(1,11,DATETIMEFROMPARTS(2017,12,14,1,0,0,0)),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,5,0,0)),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,15,0,0)),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,50,0,0)),
(2,21,DATETIMEFROMPARTS(2017,12,14,2,0,0,0)),
(2,22,DATETIMEFROMPARTS(2017,12,14,2,45,0,0))
SELECT *
FROM #baskets
ORDER BY buyer_id,trans_time
;WITH numBaskCTE AS(
SELECT
buyer_id,
basket_id,
trans_time,
ROW_NUMBER()OVER(PARTITION BY buyer_id ORDER BY trans_time) n
FROM #baskets
),
checkBaskCTE AS(
SELECT
buyer_id,
basket_id,
trans_time,
n,
basket_id real_basket_id,
trans_time prev_time
FROM numBaskCTE
WHERE n=1
UNION ALL
SELECT
n.buyer_id,
n.basket_id,
n.trans_time,
n.n,
IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.basket_id,n.basket_id),
IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.prev_time,n.trans_time) prev_time
FROM checkBaskCTE c
JOIN numBaskCTE n ON n.buyer_id=c.buyer_id AND n.n=c.n+1
)
SELECT
buyer_id,
basket_id,
trans_time,
real_basket_id
FROM checkBaskCTE
ORDER BY buyer_id,trans_time
DROP TABLE #baskets
If you have column real_basket_id
then you can use update only for new rows WHERE real_basket_id IS NULL
.
CREATE TABLE #baskets(
buyer_id int,
basket_id int,
trans_time datetime,
real_basket_id int
)
INSERT #baskets(buyer_id,basket_id,trans_time,real_basket_id)VALUES
(1,10,DATETIMEFROMPARTS(2017,12,12,21,40,0,0),10),
(1,11,DATETIMEFROMPARTS(2017,12,13,22,30,0,0),11),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,0,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,5,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,15,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,50,0,0),NULL),
(2,21,DATETIMEFROMPARTS(2017,12,14,2,0,0,0),NULL),
(2,22,DATETIMEFROMPARTS(2017,12,14,2,45,0,0),NULL),
(3,30,DATETIMEFROMPARTS(2017,12,12,21,40,0,0),30),
(3,31,DATETIMEFROMPARTS(2017,12,14,0,54,0,0),31),
(3,32,DATETIMEFROMPARTS(2017,12,14,1,0,0,0),NULL),
(3,33,DATETIMEFROMPARTS(2017,12,14,1,5,0,0),NULL)
SELECT *
FROM #baskets
WHERE real_basket_id IS NULL -- only new rows
ORDER BY buyer_id,trans_time
;WITH numBaskCTE AS(
-- all new transactions + one last transaction for each buyers
SELECT
buyer_id,
basket_id,
real_basket_id,
trans_time,
ROW_NUMBER()OVER(PARTITION BY buyer_id ORDER BY trans_time) n
FROM
(
SELECT *,LEAD(real_basket_id)OVER(PARTITION BY buyer_id ORDER BY trans_time) next_real_basket_id
FROM #baskets
) q
WHERE next_real_basket_id IS NULL
),
checkBaskCTE AS(
SELECT
buyer_id,
basket_id,
trans_time,
n,
ISNULL(real_basket_id,basket_id) real_basket_id,
trans_time prev_time,
IIF(real_basket_id IS NULL,1,0) is_new_row
FROM numBaskCTE
WHERE n=1
UNION ALL
SELECT
n.buyer_id,
n.basket_id,
n.trans_time,
n.n,
IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.basket_id,n.basket_id),
IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.prev_time,n.trans_time) prev_time,
1 is_new_row
FROM checkBaskCTE c
JOIN numBaskCTE n ON n.buyer_id=c.buyer_id AND n.n=c.n+1
)
UPDATE b
SET
b.real_basket_id=q.real_basket_id
FROM #baskets b
JOIN
(
SELECT
buyer_id,
basket_id,
trans_time,
real_basket_id
FROM checkBaskCTE
WHERE is_new_row=1
) q
ON b.buyer_id=q.buyer_id AND b.trans_time=q.trans_time
SELECT *
FROM #baskets
ORDER BY buyer_id,trans_time
DROP TABLE #baskets
Upvotes: 1