Reputation: 2201
Here is a toy example:
select *
from(
select 1 as row_num,298807 as id1,104 as id2,'2018-07-10' as date
union all
select 2,298807,104,'2018-08-02'
union all
select 3,298807,104,'2018-08-06'
union all
select 4,298807,104,'2018-08-08'
union all
select 5,298807,104,'2018-08-24'
union all
select 6,298807,104,'2018-09-28'
union all
select 7,298807,104,'2018-10-01'
union all
select 8,298807,104,'2018-10-28'
union all
select 9,298807,300,'2018-10-30'
union all
select 10,298807,104,'2018-11-12'
union all
select 11,298807,300,'2018-11-20'
union all
select 12,298807,104,'2018-11-30'
union all
select 13,298807,104,'2018-12-02'
union all
select 14,298807,104,'2018-12-03')
For each row, I would like to find the last row of distinct id2 within id1. For example for row #11 the output should be an array of two elements “row #9, row#10”, and for row #14 it is “row #11,row #13”.
Upvotes: 1
Views: 938
Reputation: 172993
Below is for BigQuery Standard SQL
#standardSQL
SELECT * EXCEPT(candidates),
ARRAY_TO_STRING(ARRAY(
SELECT CAST(MAX(row_num) AS STRING) row_num
FROM t.candidates
GROUP BY id2
ORDER BY row_num
), ',') AS output
FROM (
SELECT *, ARRAY_AGG(STRUCT(id2, row_num)) OVER(win) candidates
FROM `project.dataset.table`
WINDOW win AS (PARTITION BY id1 ORDER BY row_num ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
) t
-- ORDER BY row_num
If to apply to sample data from your question - output is
Row row_num id1 id2 date output
1 1 298807 104 2018-07-10
2 2 298807 104 2018-08-02 1
3 3 298807 104 2018-08-06 2
4 4 298807 104 2018-08-08 3
5 5 298807 104 2018-08-24 4
6 6 298807 104 2018-09-28 5
7 7 298807 104 2018-10-01 6
8 8 298807 104 2018-10-28 7
9 9 298807 300 2018-10-30 8
10 10 298807 104 2018-11-12 8,9
11 11 298807 300 2018-11-20 10,9
12 12 298807 104 2018-11-30 10,11
13 13 298807 104 2018-12-02 11,12
14 14 298807 104 2018-12-03 11,13
Upvotes: 2
Reputation: 1269803
You can do this using arrays. Amass a big array with all the values. Then use unnest()
and some aggregation logic to get what you want:
with t as (
select *
from(
select 1 as row_num,298807 as id1,104 as id2,'2018-07-10' as date
union all
select 2,298807,104,'2018-08-02'
union all
select 3,298807,104,'2018-08-06'
union all
select 4,298807,104,'2018-08-08'
union all
select 5,298807,104,'2018-08-24'
union all
select 6,298807,104,'2018-09-28'
union all
select 7,298807,104,'2018-10-01'
union all
select 8,298807,104,'2018-10-28'
union all
select 9,298807,300,'2018-10-30'
union all
select 10,298807,104,'2018-11-12'
union all
select 11,298807,300,'2018-11-20'
union all
select 12,298807,104,'2018-11-30'
union all
select 13,298807,104,'2018-12-02'
union all
select 14,298807,104,'2018-12-03')
)
select tt.* except (ar),
(select array_agg(row_num order by el.row_num)
from (select el.id2, max(el.row_num) as row_num
from unnest(ar) el
where el.row_num <= tt.row_num
group by el.id2
) el
) as id2s
from (select t.*,
array_agg(struct(row_num, id2)) over (partition by id1) as ar
from t
) tt;
Upvotes: 1