Reputation: 33
I have a use case where I need to find the first value of a column on basis of a particular partition and use that value in further calculations.
My source table looks like this :
order_date | category | previous points | event | new points |
---|---|---|---|---|
2023-03-02 | AC | 300 | -100 | 200 |
2023-04-01 | AC | 200 | -100 | 100 |
2023-04-02 | AC | 100 | 300 | 400 |
2023-04-03 | AC | 400 | -100 | 300 |
2023-04-09 | AC | 300 | 200 | 500 |
2023-04-27 | AC | 500 | -100 | 400 |
Source table #2 contains continuous weekly dates for each category:
date | category |
---|---|
2023-03-30 | AC |
2023-04-06 | AC |
2023-04-13 | AC |
2023-04-20 | AC |
2023-04-27 | AC |
2023-05-04 | AC |
My destination table is a join for table 1 and 2 where the order date should fall between the weekly dates. I need to capture the following -
previous_points = start value of the points for that week,
points added = sum of points added for that week,
points removed = sum of points removed for that week,
new points = previous_points + points added - points removed
The final table would look similar to this -
Category | Weekly Dates | previous points | points added | points removed | new points |
---|---|---|---|---|---|
AC | 2023-03-30 | 200 | 0 | 0 | 200 |
AC | 2023-04-06 | 200 | 300 | 200 | 300 |
AC | 2023-04-13 | 300 | 200 | 0 | 500 |
AC | 2023-04-20 | 500 | 0 | 0 | 500 |
AC | 2023-04-27 | 500 | 0 | 100 | 400 |
AC | 2023-05-04 | 400 | 0 | 0 | 400 |
I have used first value to find the start points for that week's partition. On running it, I get duplicate records in the result. Please advice a better way to reach this result.
select
t2.date, t2.category,
first_value(previous_points) over (partition by t2.category, t2.date order by order_date asc) as previous_points,
sum(case when sign(event) = 1 then abs(event_value) else 0 end) as points added,
sum(case when sign(event) = -1 then abs(event_value) else 0 end) as points removed
from
table2 t2
left join
table1 t1 on t2.category = t1.category
and t1.order_date between t2.date-6 and t2.date
group by t2.date, t2.category
Upvotes: 0
Views: 157
Reputation: 12264
You might consider below as well.
-- sample data
WITH table1 AS (
SELECT DATE '2023-03-02' order_date, 'AC' catetory, 300 prev_pt, -100 event, 200 new_pt UNION ALL
SELECT '2023-04-01' order_date, 'AC' catetory, 200 prev_pt, -100 event, 100 new_pt UNION ALL
SELECT '2023-04-02' order_date, 'AC' catetory, 100 prev_pt, 300 event, 400 new_pt UNION ALL
SELECT '2023-04-03' order_date, 'AC' catetory, 400 prev_pt, -100 event, 300 new_pt UNION ALL
SELECT '2023-04-09' order_date, 'AC' catetory, 300 prev_pt, 200 event, 500 new_pt UNION ALL
SELECT '2023-04-27' order_date, 'AC' catetory, 500 prev_pt, -100 event, 400 new_pt
),
table2 AS (
SELECT date, 'AC' category
FROM UNNEST(GENERATE_DATE_ARRAY(DATE '2023-03-30', '2023-05-04', INTERVAL 1 WEEK)) date
)
-- query starts here
SELECT date, category,
COALESCE(prev_pt, LAST_VALUE(new_pt IGNORE NULLS) OVER w0, LAST_VALUE(prev_pt IGNORE NULLS) OVER w1) AS prev_pt,
COALESCE(added_pt, 0) AS added_pt,
COALESCE(removed_pt, 0) AS removed_pt,
COALESCE(new_pt, LAST_VALUE(new_pt IGNORE NULLS) OVER w0, LAST_VALUE(prev_pt IGNORE NULLS) OVER w1) AS new_pt,
FROM table2 t2 LEFT JOIN (
SELECT EXTRACT(WEEK(FRIDAY) FROM order_date) weekno,
ANY_VALUE(prev_pt HAVING MIN order_date) prev_pt,
ANY_VALUE(new_pt HAVING MAX order_date) new_pt,
SUM(IF(event > 0, event, 0)) added_pt,
SUM(IF(event < 0, -1 * event, 0)) removed_pt,
FROM table1 GROUP BY 1
) t1 ON t1.weekno = EXTRACT(WEEK(FRIDAY) FROM t2.date)
WINDOW w0 AS (PARTITION BY category ORDER BY date),
w1 AS (PARTITION BY category ORDER BY date DESC)
ORDER BY date;
-- query result
+------------+----------+---------+----------+------------+--------+
| date | category | prev_pt | added_pt | removed_pt | new_pt |
+------------+----------+---------+----------+------------+--------+
| 2023-03-30 | AC | 200 | 0 | 0 | 200 |
| 2023-04-06 | AC | 200 | 300 | 200 | 300 |
| 2023-04-13 | AC | 300 | 200 | 0 | 500 |
| 2023-04-20 | AC | 500 | 0 | 0 | 500 |
| 2023-04-27 | AC | 500 | 0 | 100 | 400 |
| 2023-05-04 | AC | 400 | 0 | 0 | 400 |
+------------+----------+---------+----------+------------+--------+
Upvotes: 2
Reputation: 5846
Such a simplified solution as you expect is, in my opinion, unrealizable. For example, the calculation first_value(previous_points)
will surely cause an error: column "t1.previous_points" must appear in the GROUP BY clause or be used in an aggregate function.
Try this example with test data
create table table1 (order_date date,category varchar(10),previous_points int
,event_value int, new_points int);
insert into table1 values
('2023-03-02','AC',300, -100, 200)
,('2023-04-01','AC',200, -100, 100)
,('2023-04-02','AC',100, 300, 400)
,('2023-04-03','AC',400, -100, 300)
,('2023-04-09','AC',300, 200, 500)
,('2023-04-27','AC',500, -100, 400)
;
create table table2 (weekdate date,category varchar(10));
insert into table2 values
('2023-03-30','AC')
,('2023-04-06','AC')
,('2023-04-13','AC')
,('2023-04-20','AC')
,('2023-04-27','AC')
,('2023-05-04','AC')
;
Test query
with weekT as( --table of week date with some additional fields for to simplify calculations (first date of week
select *
-- ,dateadd(d,-6,weekdate) beg_weekdate --SQL server
,weekdate - interval '6 days' as beg_weekdate --postgres
from table2
)
,points as( -- next date for events table for join with week table
select *
,lead(order_date,1,cast('2900-01-01' as date))
over(partition by category order by order_date) next_date
from table1
)
,t3 as(
select w.weekdate,w.category,order_date
,case when p.order_date between w.beg_weekdate and w.weekdate
then previous_points
else new_points end previous_points
,case when p.order_date between w.beg_weekdate and w.weekdate
then event_value
else 0 end event_value
,new_points
-- row order for taking first row of week
,row_number()over(partition by w.category,w.weekdate order by p.order_date ) dn
-- row count for taking last row of week
,count(*)over(partition by w.category,w.weekdate) mdn
,next_date,beg_weekdate,previous_points as previous_points2,event_value as event_value2
from weekT w
left join points p on p.category=w.category
and ( -- usual join table1<->table2 if order_date belongs to week
(p.order_date between w.beg_weekdate and w.weekdate)
-- join if order_date out of weeks or no order_date belongs to week
-- this is main idea of solution
or(p.order_date<w.beg_weekdate and p.next_date>w.weekdate)
)
)
select category,weekdate,max(beg_weekdate)prev_weekdate
,min(case when dn=1 then previous_points end) previous_points
,sum(case when sign(event_value) = 1 then abs(event_value) else 0 end) as points_added
,sum(case when sign(event_value) = -1 then abs(event_value) else 0 end) as points_removed
,min(case when dn=mdn then new_points end) new_points
from t3
group by category,weekdate
I want to show an intermediate result
weekdate | cat | order_date | prev.p-s | ev.value | new.p-s | dn | mdn | next_date | beg.week |
---|---|---|---|---|---|---|---|---|---|
2023-03-30 | AC | 2023-03-02 | 200 | 0 | 200 | 1 | 1 | 2023-04-01 | 2023-03-24 |
2023-04-06 | AC | 2023-04-01 | 200 | -100 | 100 | 1 | 3 | 2023-04-02 | 2023-03-31 |
2023-04-06 | AC | 2023-04-02 | 100 | 300 | 400 | 2 | 3 | 2023-04-03 | 2023-03-31 |
2023-04-06 | AC | 2023-04-03 | 400 | -100 | 300 | 3 | 3 | 2023-04-09 | 2023-03-31 |
2023-04-13 | AC | 2023-04-09 | 300 | 200 | 500 | 1 | 1 | 2023-04-27 | 2023-04-07 |
2023-04-20 | AC | 2023-04-09 | 500 | 0 | 500 | 1 | 1 | 2023-04-27 | 2023-04-14 |
2023-04-27 | AC | 2023-04-27 | 500 | -100 | 400 | 1 | 1 | 2900-01-01 | 2023-04-21 |
2023-05-04 | AC | 2023-04-27 | 400 | 0 | 400 | 1 | 1 | 2900-01-01 | 2023-04-28 |
Query result
category | weekdate | previous_points | points_added | points_removed | new_points |
---|---|---|---|---|---|
AC | 2023-03-30 | 200 | 0 | 0 | 200 |
AC | 2023-04-06 | 200 | 300 | 200 | 300 |
AC | 2023-04-13 | 300 | 200 | 0 | 500 |
AC | 2023-04-20 | 500 | 0 | 0 | 500 |
AC | 2023-04-27 | 500 | 0 | 100 | 400 |
AC | 2023-05-04 | 400 | 0 | 0 | 400 |
Upvotes: 0