Graham Polley
Graham Polley

Reputation: 14781

How to ignore nulls in BigQuery using LAG()?

When using LAG() (in BigQuery standard SQL), how do you skip NULL values so that it takes the first preceding value which is not NULL?

I've prepared some sample rows in the same format in the source table, but obfuscated. In the example, it only works for rows that do not have a preceding NULL value. Specifically, rows 3 & 4 should be assigned '2017-01-25 04:02:36' (as was the case for row 5), but they are NULL.

This makes sense. However, surely there is an easy way to do specify something like INGORE_NULLS?

--TEMP
with example as (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_1' as col_d 
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_3' as col_d))
--TEMP
SELECT col_a, col_b, col_c,
  case when val_1_transposed is null then LAG(val_1_transposed) over (order by col_c) else val_1_transposed end as val_1_transposed,
  case when val_2_transposed is null then LAG(val_2_transposed) over (order by col_c) else val_2_transposed end as val_2_transposed,
  case when val_3_transposed is null then LAG(val_3_transposed) over (order by col_c) else val_3_transposed end as val_3_transposed
FROM (
  SELECT col_a, col_b, col_c,
    MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
    MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
    MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
  FROM (
    SELECT col_a, col_b, col_c, col_d FROM example) GROUP BY 1,2,3) ORDER BY col_c DESC

enter image description here

Upvotes: 8

Views: 10846

Answers (4)

류홍규
류홍규

Reputation: 1

IF you use LAST_VALUE in Bigquery, you can solve these problem.

I've written that code below.

WITH example AS (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_1' as col_d 
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_3' as col_d))

, base AS (
SELECT col_a, col_b, col_c,
  MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
  MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
  MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
FROM (
  SELECT
    col_a, col_b, col_c, col_d
  FROM
    example
)
GROUP BY ALL
)

SELECT
  col_a, col_b, col_c,
  LAST_VALUE(val_1_transposed IGNORE NULLS) OVER(ORDER BY col_c) AS val_1_transposed,
  LAST_VALUE(val_2_transposed IGNORE NULLS) OVER(ORDER BY col_c) AS val_2_transposed,
  LAST_VALUE(val_3_transposed IGNORE NULLS) OVER(ORDER BY col_c) AS val_3_transposed
FROM base
ORDER BY col_c DESC;

The Question

The answer

Upvotes: 0

Mikhail Berlyant
Mikhail Berlyant

Reputation: 172944

Try below version
It is BigQuery adoption of second solution from link provided by Pentium10
Looks like it can outperform the first one - as it involves only one window aggregate function

#standardSQL
--TEMP
-- dummy data here
--TEMP
SELECT 
  col_a, col_b, col_c,
  TIMESTAMP(SUBSTR(MAX(CONCAT(STRING(col_c), STRING(val_1_transposed))) OVER(win), 23)) AS val_1_transposed,  
  TIMESTAMP(SUBSTR(MAX(CONCAT(STRING(col_c), STRING(val_2_transposed))) OVER(win), 23)) AS val_2_transposed, 
  TIMESTAMP(SUBSTR(MAX(CONCAT(STRING(col_c), STRING(val_3_transposed))) OVER(win), 23)) AS val_3_transposed  
FROM (
  SELECT 
    col_a, col_b, col_c, 
    MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
    MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
    MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
  FROM (
    SELECT col_a, col_b, col_c, col_d FROM example
  ) 
  GROUP BY 1,2,3
)
WINDOW win AS (PARTITION BY col_a, col_b ORDER BY col_c ROWS UNBOUNDED PRECEDING)
ORDER BY col_c DESC  

Upvotes: 1

Mikhail Berlyant
Mikhail Berlyant

Reputation: 172944

Try below

#standardSQL
--TEMP
WITH example AS (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 03:19:50') AS col_c, 'val_1' AS col_d 
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 03:19:50') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 03:19:50') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:23') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:23') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:23') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:59') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:59') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:01:59') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:02:36') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:02:36') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:02:36') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:02:55') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 04:02:55') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 07:16:58') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 07:16:58') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 09:35:39') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 09:35:39') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 09:46:48') AS col_c, 'val_1' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 09:46:48') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 09:46:48') AS col_c, 'val_3' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 10:47:48') AS col_c, 'val_2' AS col_d)
UNION ALL (SELECT 'some_id' AS col_a, 'foo' AS col_b, TIMESTAMP('2017-01-25 10:47:48') AS col_c, 'val_3' AS col_d))
--TEMP
SELECT 
  col_a, col_b, col_c, 
  (SELECT TIMESTAMP(s) FROM UNNEST(SPLIT(val_1_transposed)) AS s WITH OFFSET pos WHERE pos = 0) AS val_1_transposed, 
  (SELECT TIMESTAMP(s) FROM UNNEST(SPLIT(val_2_transposed)) AS s WITH OFFSET pos WHERE pos = 0) AS val_2_transposed, 
  (SELECT TIMESTAMP(s) FROM UNNEST(SPLIT(val_3_transposed)) AS s WITH OFFSET pos WHERE pos = 0) AS val_3_transposed
FROM (
  SELECT col_a, col_b, col_c,
    CASE WHEN val_1_transposed IS NULL THEN STRING_AGG(CAST(val_1_transposed AS STRING)) OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) ELSE CAST(val_1_transposed AS STRING) END AS val_1_transposed,
    CASE WHEN val_2_transposed IS NULL THEN STRING_AGG(CAST(val_2_transposed AS STRING)) OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) ELSE CAST(val_2_transposed AS STRING) END AS val_2_transposed,
    CASE WHEN val_3_transposed IS NULL THEN STRING_AGG(CAST(val_3_transposed AS STRING)) OVER (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING) ELSE CAST(val_3_transposed AS STRING) END AS val_3_transposed
  FROM (
    SELECT col_a, col_b, col_c, 
      MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
      MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
      MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
    FROM (
      SELECT col_a, col_b, col_c, col_d FROM example
    ) GROUP BY 1,2,3
  ) 
)
ORDER BY col_c DESC  

I realized above looks overly heavy so - below is slightly refactored version (which is now even lighter than your original code):

#standardSQL
CREATE TEMP FUNCTION GetFirst(list STRING) AS 
((SELECT TIMESTAMP(s) FROM UNNEST(SPLIT(list)) AS s WITH OFFSET pos WHERE pos = 0));
--TEMP
-- dummy data here ...
--TEMP
SELECT col_a, col_b, col_c,
  GetFirst(IFNULL(val_1_transposed, STRING_AGG(val_1_transposed) OVER (lookback))) AS val_1_transposed,
  GetFirst(IFNULL(val_2_transposed, STRING_AGG(val_2_transposed) OVER (lookback))) AS val_2_transposed,
  GetFirst(IFNULL(val_3_transposed, STRING_AGG(val_3_transposed) OVER (lookback))) AS val_3_transposed
FROM (
  SELECT col_a, col_b, col_c, 
    STRING(MAX(IF(col_d = 'val_1', col_c, NULL))) AS val_1_transposed,
    STRING(MAX(IF(col_d = 'val_2', col_c, NULL))) AS val_2_transposed,
    STRING(MAX(IF(col_d = 'val_3', col_c, NULL))) AS val_3_transposed
  FROM (
    SELECT col_a, col_b, col_c, col_d FROM example
  ) GROUP BY 1,2,3
) 
WINDOW lookback AS (ORDER BY col_c DESC ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
ORDER BY col_c DESC

Upvotes: 1

Pentium10
Pentium10

Reputation: 207830

There are two solutions, described in detail here: http://sqlmag.com/t-sql/last-non-null-puzzle

I adapted one of the solutions, basically it uses a MAX window aggregate function to return the maximum relevant id so far. By using ROWS UNBOUNDED PRECEDING you can constantly reach new MAX levels, than are carried over and replace the NULL lag entries.

--TEMP
with example as (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_1' as col_d 
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 03:19:50') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:23') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:01:59') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:36') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 04:02:55') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 07:16:58') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:35:39') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_1' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 09:46:48') as col_c, 'val_3' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_2' as col_d)
UNION ALL (select 'some_id' as col_a, 'foo' as col_b, timestamp('2017-01-25 10:47:48') as col_c, 'val_3' as col_d))
--TEMP
SELECT col_a, col_b, col_c,
  case when val_1_transposed is null then LAG(val_1_transposed) over (order by col_c) else val_1_transposed end as val_1_transposed,
  case when val_2_transposed is null then LAG(val_2_transposed) over (order by col_c) else val_2_transposed end as val_2_transposed,
  case when val_3_transposed is null then LAG(val_3_transposed) over (order by col_c) else val_3_transposed end as val_3_transposed,
  MAX(val_2_transposed) OVER( PARTITION BY grp ORDER BY col_a ROWS UNBOUNDED PRECEDING ) as lag_ignored_nulls
FROM (
select *, 
  MAX(CASE WHEN val_2_transposed IS NOT NULL THEN col_a END ) OVER( ORDER BY col_a ROWS UNBOUNDED PRECEDING ) AS grp
            from (
  SELECT col_a, col_b, col_c,
    MAX(IF(col_d = 'val_1', col_c, NULL)) AS val_1_transposed,
    MAX(IF(col_d = 'val_2', col_c, NULL)) AS val_2_transposed,
    MAX(IF(col_d = 'val_3', col_c, NULL)) AS val_3_transposed
  FROM (
    SELECT col_a, col_b, col_c, col_d FROM example) GROUP BY 1,2,3)) ORDER BY col_c DESC

enter image description here

Upvotes: 2

Related Questions