Andrew Whittam
Andrew Whittam

Reputation: 59

SQL Server SQL - Increment sequence based on specific value or flag

I am trying to identify shopping baskets from transactions in a supermarket.

I have customer id and transaction ID and the time of that the item was picked from the shelf - the transaction. I do have a sort of basket ID but it isn't accurate. I can see transactions in the basket which are at different times of the day so I can see that they should be different basket_id's. It's a flaw in the data I can't do anything about.

What I can see if that if the difference between one transaction and another is greater than 20 mins then it's a different basket. I do this using the Lag function in SQL.

I add a flag to the transaction with value 1 where that time is greater than 20 mins. So I have basket id's which will actually be 1+ baskets.

Any ideas how I can create real_basket_id?

Many thanks

Andrew

Upvotes: 0

Views: 299

Answers (1)

Sergey Menshov
Sergey Menshov

Reputation: 3906

As variant you can try to use recursion. See my example.

CREATE TABLE #baskets(
  buyer_id int,
  basket_id int,
  trans_time datetime
)

INSERT #baskets(buyer_id,basket_id,trans_time)VALUES
(1,11,DATETIMEFROMPARTS(2017,12,14,1,0,0,0)),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,5,0,0)),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,15,0,0)),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,50,0,0)),
(2,21,DATETIMEFROMPARTS(2017,12,14,2,0,0,0)),
(2,22,DATETIMEFROMPARTS(2017,12,14,2,45,0,0))

SELECT *
FROM #baskets
ORDER BY buyer_id,trans_time

;WITH numBaskCTE AS(
  SELECT
    buyer_id,
    basket_id,
    trans_time,
    ROW_NUMBER()OVER(PARTITION BY buyer_id ORDER BY trans_time) n
  FROM #baskets
),
checkBaskCTE AS(
  SELECT
    buyer_id,
    basket_id,
    trans_time,
    n,
    basket_id real_basket_id,
    trans_time prev_time
  FROM numBaskCTE
  WHERE n=1

  UNION ALL

  SELECT
    n.buyer_id,
    n.basket_id,
    n.trans_time,
    n.n,
    IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.basket_id,n.basket_id),
    IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.prev_time,n.trans_time) prev_time
  FROM checkBaskCTE c
  JOIN numBaskCTE n ON n.buyer_id=c.buyer_id AND n.n=c.n+1
)
SELECT
  buyer_id,
  basket_id,
  trans_time,
  real_basket_id
FROM checkBaskCTE
ORDER BY buyer_id,trans_time

DROP TABLE #baskets

If you have column real_basket_id then you can use update only for new rows WHERE real_basket_id IS NULL.

CREATE TABLE #baskets(
  buyer_id int,
  basket_id int,
  trans_time datetime,
  real_basket_id int
)

INSERT #baskets(buyer_id,basket_id,trans_time,real_basket_id)VALUES
(1,10,DATETIMEFROMPARTS(2017,12,12,21,40,0,0),10),
(1,11,DATETIMEFROMPARTS(2017,12,13,22,30,0,0),11),
(1,12,DATETIMEFROMPARTS(2017,12,14,1,0,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,5,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,15,0,0),NULL),
(1,13,DATETIMEFROMPARTS(2017,12,14,1,50,0,0),NULL),
(2,21,DATETIMEFROMPARTS(2017,12,14,2,0,0,0),NULL),
(2,22,DATETIMEFROMPARTS(2017,12,14,2,45,0,0),NULL),
(3,30,DATETIMEFROMPARTS(2017,12,12,21,40,0,0),30),
(3,31,DATETIMEFROMPARTS(2017,12,14,0,54,0,0),31),
(3,32,DATETIMEFROMPARTS(2017,12,14,1,0,0,0),NULL),
(3,33,DATETIMEFROMPARTS(2017,12,14,1,5,0,0),NULL)


SELECT *
FROM #baskets
WHERE real_basket_id IS NULL -- only new rows
ORDER BY buyer_id,trans_time

;WITH numBaskCTE AS(
  -- all new transactions + one last transaction for each buyers
  SELECT
    buyer_id,
    basket_id,
    real_basket_id,
    trans_time,
    ROW_NUMBER()OVER(PARTITION BY buyer_id ORDER BY trans_time) n
  FROM
    (
      SELECT *,LEAD(real_basket_id)OVER(PARTITION BY buyer_id ORDER BY trans_time) next_real_basket_id
      FROM #baskets
    ) q
  WHERE next_real_basket_id IS NULL
),
checkBaskCTE AS(
  SELECT
    buyer_id,
    basket_id,
    trans_time,
    n,
    ISNULL(real_basket_id,basket_id) real_basket_id,
    trans_time prev_time,
    IIF(real_basket_id IS NULL,1,0) is_new_row
  FROM numBaskCTE
  WHERE n=1

  UNION ALL

  SELECT
    n.buyer_id,
    n.basket_id,
    n.trans_time,
    n.n,
    IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.basket_id,n.basket_id),
    IIF(DATEDIFF(MINUTE,c.prev_time,n.trans_time)<=20,c.prev_time,n.trans_time) prev_time,
    1 is_new_row
  FROM checkBaskCTE c
  JOIN numBaskCTE n ON n.buyer_id=c.buyer_id AND n.n=c.n+1
)
UPDATE b
SET
  b.real_basket_id=q.real_basket_id
FROM #baskets b
JOIN
  (
    SELECT
      buyer_id,
      basket_id,
      trans_time,
      real_basket_id
    FROM checkBaskCTE
    WHERE is_new_row=1
  ) q
ON b.buyer_id=q.buyer_id AND b.trans_time=q.trans_time

SELECT *
FROM #baskets
ORDER BY buyer_id,trans_time

DROP TABLE #baskets

Upvotes: 1

Related Questions