Reputation: 33

SQL First Value function causes duplication

I have a use case where I need to find the first value of a column on basis of a particular partition and use that value in further calculations.

My source table looks like this :

order_date	category	previous points	event	new points
2023-03-02	AC	300	-100	200
2023-04-01	AC	200	-100	100
2023-04-02	AC	100	300	400
2023-04-03	AC	400	-100	300
2023-04-09	AC	300	200	500
2023-04-27	AC	500	-100	400

Source table #2 contains continuous weekly dates for each category:

date	category
2023-03-30	AC
2023-04-06	AC
2023-04-13	AC
2023-04-20	AC
2023-04-27	AC
2023-05-04	AC

My destination table is a join for table 1 and 2 where the order date should fall between the weekly dates. I need to capture the following -

previous_points = start value of the points for that week,

points added = sum of points added for that week,

points removed = sum of points removed for that week,

new points = previous_points + points added - points removed

The final table would look similar to this -

Category	Weekly Dates	previous points	points added	points removed	new points
AC	2023-03-30	200	0	0	200
AC	2023-04-06	200	300	200	300
AC	2023-04-13	300	200	0	500
AC	2023-04-20	500	0	0	500
AC	2023-04-27	500	0	100	400
AC	2023-05-04	400	0	0	400

I have used first value to find the start points for that week's partition. On running it, I get duplicate records in the result. Please advice a better way to reach this result.

select 
    t2.date, t2.category,
    first_value(previous_points) over (partition by t2.category, t2.date order by order_date asc) as previous_points,
    sum(case when sign(event) = 1 then abs(event_value) else 0 end) as points added,
    sum(case when sign(event) = -1 then abs(event_value) else 0 end) as points removed
from 
    table2 t2
left join 
    table1 t1 on t2.category = t1.category
              and t1.order_date between t2.date-6 and t2.date
group by t2.date, t2.category

Upvotes: 0

Answers (2)

Jaytiger

Reputation: 12264

You might consider below as well.

-- sample data
WITH table1 AS (
  SELECT DATE '2023-03-02' order_date, 'AC' catetory, 300 prev_pt, -100 event, 200 new_pt UNION ALL
  SELECT '2023-04-01' order_date, 'AC' catetory, 200 prev_pt, -100 event, 100 new_pt UNION ALL
  SELECT '2023-04-02' order_date, 'AC' catetory, 100 prev_pt, 300 event, 400 new_pt UNION ALL
  SELECT '2023-04-03' order_date, 'AC' catetory, 400 prev_pt, -100 event, 300 new_pt UNION ALL
  SELECT '2023-04-09' order_date, 'AC' catetory, 300 prev_pt, 200 event, 500 new_pt UNION ALL
  SELECT '2023-04-27' order_date, 'AC' catetory, 500 prev_pt, -100 event, 400 new_pt
),
table2 AS (
  SELECT date, 'AC' category 
    FROM UNNEST(GENERATE_DATE_ARRAY(DATE '2023-03-30', '2023-05-04', INTERVAL 1 WEEK)) date
)
-- query starts here
SELECT date, category,
       COALESCE(prev_pt, LAST_VALUE(new_pt IGNORE NULLS) OVER w0, LAST_VALUE(prev_pt IGNORE NULLS) OVER w1) AS prev_pt,
       COALESCE(added_pt, 0) AS added_pt,
       COALESCE(removed_pt, 0) AS removed_pt,
       COALESCE(new_pt, LAST_VALUE(new_pt IGNORE NULLS) OVER w0, LAST_VALUE(prev_pt IGNORE NULLS) OVER w1) AS new_pt,
  FROM table2 t2 LEFT JOIN (
    SELECT EXTRACT(WEEK(FRIDAY) FROM order_date) weekno,
           ANY_VALUE(prev_pt HAVING MIN order_date) prev_pt,
           ANY_VALUE(new_pt HAVING MAX order_date) new_pt,
           SUM(IF(event > 0, event, 0)) added_pt,
           SUM(IF(event < 0, -1 * event, 0)) removed_pt,
      FROM table1 GROUP BY 1
  ) t1 ON t1.weekno = EXTRACT(WEEK(FRIDAY) FROM t2.date)
WINDOW w0 AS (PARTITION BY category ORDER BY date),
       w1 AS (PARTITION BY category ORDER BY date DESC)
ORDER BY date;

-- query result
+------------+----------+---------+----------+------------+--------+
|    date    | category | prev_pt | added_pt | removed_pt | new_pt |
+------------+----------+---------+----------+------------+--------+
| 2023-03-30 | AC       |     200 |        0 |          0 |    200 |
| 2023-04-06 | AC       |     200 |      300 |        200 |    300 |
| 2023-04-13 | AC       |     300 |      200 |          0 |    500 |
| 2023-04-20 | AC       |     500 |        0 |          0 |    500 |
| 2023-04-27 | AC       |     500 |        0 |        100 |    400 |
| 2023-05-04 | AC       |     400 |        0 |          0 |    400 |
+------------+----------+---------+----------+------------+--------+

Upvotes: 2

ValNik

Reputation: 5846

Such a simplified solution as you expect is, in my opinion, unrealizable. For example, the calculation first_value(previous_points) will surely cause an error: column "t1.previous_points" must appear in the GROUP BY clause or be used in an aggregate function.
Try this example with test data

create table table1 (order_date date,category varchar(10),previous_points int
  ,event_value int, new_points int);
insert into table1 values
 ('2023-03-02','AC',300,    -100,   200)
,('2023-04-01','AC',200,    -100,   100)
,('2023-04-02','AC',100,    300,    400)
,('2023-04-03','AC',400,    -100,   300)
,('2023-04-09','AC',300,    200,    500)
,('2023-04-27','AC',500,    -100,   400)
;
create table table2 (weekdate date,category varchar(10));
insert into table2 values
 ('2023-03-30','AC')
,('2023-04-06','AC')
,('2023-04-13','AC')
,('2023-04-20','AC')
,('2023-04-27','AC')
,('2023-05-04','AC')
;

Test query

with weekT as(  --table of week date with some additional fields for to simplify calculations (first date of week
select *
 -- ,dateadd(d,-6,weekdate) beg_weekdate   --SQL server
  ,weekdate - interval '6 days' as  beg_weekdate  --postgres
from table2
)
,points as( -- next date for events table for join with week table
select *
  ,lead(order_date,1,cast('2900-01-01' as date))
         over(partition by category order by order_date) next_date
from table1
)
,t3 as(
select w.weekdate,w.category,order_date
  ,case when p.order_date between w.beg_weekdate and w.weekdate
      then previous_points
   else new_points end previous_points
  ,case when p.order_date between w.beg_weekdate and w.weekdate
      then event_value
   else 0 end event_value
  ,new_points
   -- row order for taking first row of week
  ,row_number()over(partition by w.category,w.weekdate  order by p.order_date ) dn
   -- row count for taking last row of week
  ,count(*)over(partition by w.category,w.weekdate) mdn
  ,next_date,beg_weekdate,previous_points as previous_points2,event_value as event_value2
from weekT w
left join points p on p.category=w.category
   and (   -- usual join table1<->table2 if order_date belongs to week
         (p.order_date between w.beg_weekdate and w.weekdate)
           -- join if order_date out of weeks or no order_date belongs to week
           -- this is main idea of solution
       or(p.order_date<w.beg_weekdate and p.next_date>w.weekdate)
      )
)
select category,weekdate,max(beg_weekdate)prev_weekdate
  ,min(case when dn=1 then previous_points end) previous_points
  ,sum(case when sign(event_value) = 1 then abs(event_value) else 0 end) as points_added
  ,sum(case when sign(event_value) = -1 then abs(event_value) else 0 end) as points_removed
  ,min(case when dn=mdn then new_points end) new_points
from t3
group by category,weekdate

I want to show an intermediate result

weekdate	cat	order_date	prev.p-s	ev.value	new.p-s	dn	mdn	next_date	beg.week
2023-03-30	AC	2023-03-02	200	0	200	1	1	2023-04-01	2023-03-24
2023-04-06	AC	2023-04-01	200	-100	100	1	3	2023-04-02	2023-03-31
2023-04-06	AC	2023-04-02	100	300	400	2	3	2023-04-03	2023-03-31
2023-04-06	AC	2023-04-03	400	-100	300	3	3	2023-04-09	2023-03-31
2023-04-13	AC	2023-04-09	300	200	500	1	1	2023-04-27	2023-04-07
2023-04-20	AC	2023-04-09	500	0	500	1	1	2023-04-27	2023-04-14
2023-04-27	AC	2023-04-27	500	-100	400	1	1	2900-01-01	2023-04-21
2023-05-04	AC	2023-04-27	400	0	400	1	1	2900-01-01	2023-04-28

Query result

category	weekdate	previous_points	points_added	points_removed	new_points
AC	2023-03-30	200	0	0	200
AC	2023-04-06	200	300	200	300
AC	2023-04-13	300	200	0	500
AC	2023-04-20	500	0	0	500
AC	2023-04-27	500	0	100	400
AC	2023-05-04	400	0	0	400

Example

Upvotes: 0

SQL First Value function causes duplication

Answers (2)

Related Questions