bnsh
bnsh

Reputation: 840

Window based averages based on ranges within another table

I have a table that has positions like so:

create or replace table data (
  pos int not null,
  val float not null,
  constraint data_pk primary key (pos)
);

And, a ranges type table like so:

create or replace table ranges (
  label varchar(32) not null,
  left int not null,
  right int not null,
  constraint ranges_pk primary key (label)
);

with ranges like

        ('alpha', 11, 13),
        ('bravo', 11, 14),
        ('charlie', 11, 15),
        ('echo', 12, 18),
        ('delta', 12, 19),
        ('foxtrot', 13, 20)

for each label, I need to lookup every possible 3 subrange within the "data" table, take these 3 subrange averages, and then average them...

I couldn't think of a good way to describe what I'm after, so I thought I'd show what I'd expect for 'charlie':

The results for charlie in the select should be:

    ('charlie', 40.111), -- avg(avg(data[pos=11], data[pos=12], data[pos=13]), avg(data[pos=12], data[pos=13], data[pos=14]), avg(data[pos=13], data[pos=14], data[pos=15]))
                         -- -> avg(avg(31, 37, 41), avg(37, 41, 43), avg(41, 43, 47))
                         -- -> avg(36.333, 40.333, 43.667) -> 40.111

(for data like)

insert into data (pos, val) values
    (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
    (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
    (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
    (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
    (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
    (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
    (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251);

Is there a way to do this within Snowflake SQL? Or must I resort to python to do this? If it helps I made a gist with more data..

Thanks!

Upvotes: 3

Views: 50

Answers (2)

Simeon Pilgrim
Simeon Pilgrim

Reputation: 25968

Here is a step by step answer:

WITH data(pos, val) AS (
    SELECT * FROM VALUES
      (1, 2), (2, 3), (3, 5), (4, 7), (5, 11), (6, 13), (7, 17), (8, 19),
      (9, 23), (10, 29), (11, 31), (12, 37), (13, 41), (14, 43), (15, 47), (16, 53),
      (17, 59), (18, 61), (19, 67), (20, 71), (21, 73), (22, 79), (23, 83), (24, 89),
      (25, 97), (26, 101), (27, 103), (28, 107), (29, 109), (30, 113), (31, 127), (32, 131),
      (33, 137), (34, 139), (35, 149), (36, 151), (37, 157), (38, 163), (39, 167), (40, 173),
      (41, 179), (42, 181), (43, 191), (44, 193), (45, 197), (46, 199), (47, 211), (48, 223),
      (49, 227), (50, 229), (51, 233), (52, 239), (53, 241), (54, 251)
), codes(name,s_val, e_val) AS (
     SELECT * FROM VALUES
        ('alpha', 11, 13),
        ('bravo', 11, 14),
        ('charlie', 11, 15),
        ('echo', 12, 18),
        ('delta', 12, 19),
        ('foxtrot', 13, 20)
), ranges as (
    SELECT row_number() over (order by null)-1 as seq 
    FROM table(generator(rowcount => 200))
), boost_codes AS (
    select c.name
      ,c.s_val + r.seq + 0 as b1
      ,c.s_val + r.seq + 2 as b3
    from codes as c
    join ranges as r 
      ON r.seq <= (e_val - s_val - 2)
), almost_there AS (
    select 
       bc.name
       ,avg(d.val) as partial
    from boost_codes as bc
    join data as d ON d.pos between bc.b1 and bc.b3
    GROUP BY 1, bc.b1
)
SELECT name
    ,round(avg(partial),3) as output
FROM almost_there
GROUP BY 1
ORDER BY 1;

which gives:

NAME OUTPUT
alpha 36.333
bravo 38.333
charlie 40.111
delta 50.778
echo 48.467
foxtrot 55.111

Upvotes: 2

Lukasz Szozda
Lukasz Szozda

Reputation: 175736

Is there a way to do this within Snowflake SQL? Or must I resort to python to do this?

SQL language is expressive enough to handle such case.

Key point here is to use windowed average with windows size of 3 and then average moving averages:

WITH cte AS (
 SELECT r.label, r.left, r.right, d.val, 
        AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos ROWS 
                        BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
 FROM ranges r
 JOIN data d
   ON d.pos BETWEEN r.left AND r.right
  QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2 
)
SELECT label, AVG(r) AS output
FROM cte
GROUP BY label
ORDER BY label;

Output:

enter image description here


Intermediate step to ilustrate:

WITH cte AS (
 SELECT r.label, r.left, r.right, d.val,
        AVG(d.val) OVER(PARTITION BY r.label ORDER BY d.pos 
                        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS r
 FROM ranges r
 JOIN data d
   ON d.pos BETWEEN r.left AND r.right
  QUALIFY ROW_NUMBER() OVER(PARTITION BY r.label ORDER BY d.pos) > 2 
)
SELECT *
FROM cte
ORDER BY label, r;

Output:

enter image description here

Upvotes: 3

Related Questions