waltavista
waltavista

Reputation: 157

SQL: Fill missing values multiple columns

With given data and sql command

WITH
  TableItem AS (
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0004' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 4 AS Quantity UNION ALL
  # missing 2018-05-01
  # missing Item18-004
  # new Item18-0006
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 5 AS Quantity UNION ALL
  # missing Item18-0004, Item18-0005
  # new Item18-0006
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0006' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 6 AS Quantity UNION ALL
  # some missing
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  # some missing, some new
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 5 AS Quantity UNION ALL
  SELECT 'Item18-0007' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 7 AS Quantity ),
  # Cross Join to get all combinations of ItemNr and PostingDate
  TableItemNrPostingDate AS (
  SELECT
    ItemNr,
    PostingDate
  FROM (
    SELECT
      it1.ItemNr
    FROM
      TableItem it1
    GROUP BY
      it1.ItemNr ) t2
  CROSS JOIN (
    SELECT
      it2.PostingDate
    FROM
      TableItem it2
    GROUP BY
      it2.PostingDate ) t3 ),
  # Create Calender Table to get missing dates
  TableCalenderDayItemNrPostingDate AS (
  SELECT
    CalenderDay,
    TableItemNrPostingDate.ItemNr As ItemNr,
    TableItemNrPostingDate.PostingDate as PostingDate
  FROM
    UNNEST( GENERATE_DATE_ARRAY("2018-04-30", DATE_ADD(DATE_ADD(DATE_TRUNC(CURRENT_DATE(), MONTH), INTERVAL 1 MONTH), INTERVAL -1 DAY), INTERVAL 1 DAY)) AS CalenderDay
  LEFT JOIN
    TableItemNrPostingDate
  ON
    CalenderDay = DATE(TableItemNrPostingDate.PostingDate)
  ORDER BY
    CalenderDay )
SELECT
  CalenderDay,
  FIRST_VALUE(ItemNr) OVER (PARTITION BY ItemNr ORDER BY CalenderDay ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
  ItemNr,
  PostingDate
FROM
  TableCalenderDayItemNrPostingDate
ORDER BY
  CalenderDay, ItemNr

I would like get ItemNr, PostingDate, Quantity for every ItemNr and every calender date "from beginning". Missing values has to be filled with previous values.

My table will be filled / updated only with changed quantity and date. That means not all items are updated every day, new items come into the table later and at some days there are no changes at all.

For given data I would like to have the result. Quantity = ItemNr to make it easier to identfy.

Item18-0001   2018-04-30   1
Item18-0002   2018-04-30   2
Item18-0003   2018-04-30   3
Item18-0004   2018-04-30   4
Item18-0005   2018-04-30   0 (or null or empty row)
Item18-0006   2018-04-30   0 (or null or empty row)
Item18-0007   2018-04-30   0 (or null or empty row)

Item18-0001   2018-05-01   1
Item18-0002   2018-05-01   2
Item18-0003   2018-05-01   3
Item18-0004   2018-05-01   4
Item18-0005   2018-05-01   0 (or null or empty row)
Item18-0006   2018-05-01   0 (or null or empty row)
Item18-0007   2018-05-01   0 (or null or empty row)

Item18-0001   2018-05-02   1
Item18-0002   2018-05-02   2
Item18-0003   2018-05-02   3
Item18-0004   2018-05-02   4
Item18-0005   2018-05-02   5
Item18-0006   2018-05-02   0 (or null or empty row)
Item18-0007   2018-05-02   0 (or null or empty row)

Item18-0001   2018-05-03   1
Item18-0002   2018-05-03   2
Item18-0003   2018-05-03   3
Item18-0004   2018-05-03   4
Item18-0005   2018-05-03   5
Item18-0006   2018-05-03   6
Item18-0007   2018-05-03   0 (or null or empty row)

Item18-0001   2018-05-04   1
Item18-0002   2018-05-04   2
Item18-0003   2018-05-04   3
Item18-0004   2018-05-04   4
Item18-0005   2018-05-04   5
Item18-0006   2018-05-04   6
Item18-0007   2018-05-03   0 (or null or empty row)

Item18-0001   2018-05-05   1
Item18-0002   2018-05-05   2
Item18-0003   2018-05-05   3
Item18-0004   2018-05-05   4
Item18-0005   2018-05-05   5
Item18-0006   2018-05-05   6
Item18-0007   2018-05-05   7

My SQL command is not ready and wrong. I would only like to show my effort or intention.

Upvotes: 0

Views: 887

Answers (1)

Mikhail Berlyant
Mikhail Berlyant

Reputation: 172993

Below is for BigQuery StandardSQL

#standardSQL
WITH TableItemNr AS (
  SELECT DISTINCT ItemNr FROM `project.dataset.TableItem`
), TableDates AS (
  SELECT CAST(PostingDate AS TIMESTAMP) PostingDate
  FROM (
    SELECT DATE(MIN(PostingDate)) minPostingDate, DATE(MAX(PostingDate)) maxPostingDate 
    FROM `project.dataset.TableItem`
  ), UNNEST(GENERATE_DATE_ARRAY(minPostingDate, maxPostingDate)) PostingDate --  CURRENT_DATE() can be used instead of maxPostingDate depends on your needs  
)
SELECT i.ItemNr, d.PostingDate, t.Quantity, 
  IF(t.ItemNr IS NULL, 0, 1) original,
  LAST_VALUE(Quantity IGNORE NULLS) OVER(PARTITION BY ItemNr ORDER BY PostingDate) updatedQuantity
FROM TableDates d
CROSS JOIN TableItemNr i
LEFT JOIN `project.dataset.TableItem` t
USING(ItemNr, PostingDate)
-- ORDER BY PostingDate, ItemNr

You can tes / play with above using dummy data from your question

#standardSQL
WITH `project.dataset.TableItem` AS (
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0004' AS ItemNr, TIMESTAMP '2018-04-30' AS PostingDate, 4 AS Quantity UNION ALL
  # missing 2018-05-01
  # missing Item18-004
  # new Item18-0006
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-02' AS PostingDate, 5 AS Quantity UNION ALL
  # missing Item18-0004, Item18-0005
  # new Item18-0006
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0006' AS ItemNr, TIMESTAMP '2018-05-03' AS PostingDate, 6 AS Quantity UNION ALL
  # some missing
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0002' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 2 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-04' AS PostingDate, 3 AS Quantity UNION ALL
  # some missing, some new
  SELECT 'Item18-0001' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 1 AS Quantity UNION ALL
  SELECT 'Item18-0003' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 3 AS Quantity UNION ALL
  SELECT 'Item18-0005' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 5 AS Quantity UNION ALL
  SELECT 'Item18-0007' AS ItemNr, TIMESTAMP '2018-05-05' AS PostingDate, 7 AS Quantity 
), TableItemNr AS (
  SELECT DISTINCT ItemNr FROM `project.dataset.TableItem`
), TableDates AS (
  SELECT CAST(PostingDate AS TIMESTAMP) PostingDate
  FROM (
    SELECT DATE(MIN(PostingDate)) minPostingDate, DATE(MAX(PostingDate)) maxPostingDate 
    FROM `project.dataset.TableItem`
  ), UNNEST(GENERATE_DATE_ARRAY(minPostingDate, maxPostingDate)) PostingDate
)
SELECT i.ItemNr, d.PostingDate, t.Quantity, 
  IF(t.ItemNr IS NULL, 0, 1) original,
  LAST_VALUE(Quantity IGNORE NULLS) OVER(PARTITION BY ItemNr ORDER BY PostingDate) updatedQuantity
FROM TableDates d
CROSS JOIN TableItemNr i
LEFT JOIN `project.dataset.TableItem` t
USING(ItemNr, PostingDate)
ORDER BY PostingDate, ItemNr

with results as

Row ItemNr      PostingDate                 Quantity    original    updatedQuantity  
1   Item18-0001 2018-04-30 00:00:00.000 UTC 1       1   1    
2   Item18-0002 2018-04-30 00:00:00.000 UTC 2       1   2    
3   Item18-0003 2018-04-30 00:00:00.000 UTC 3       1   3    
4   Item18-0004 2018-04-30 00:00:00.000 UTC 4       1   4    
5   Item18-0005 2018-04-30 00:00:00.000 UTC null    0   null     
6   Item18-0006 2018-04-30 00:00:00.000 UTC null    0   null     
7   Item18-0007 2018-04-30 00:00:00.000 UTC null    0   null     
8   Item18-0001 2018-05-01 00:00:00.000 UTC null    0   1    
9   Item18-0002 2018-05-01 00:00:00.000 UTC null    0   2    
10  Item18-0003 2018-05-01 00:00:00.000 UTC null    0   3    
11  Item18-0004 2018-05-01 00:00:00.000 UTC null    0   4    
12  Item18-0005 2018-05-01 00:00:00.000 UTC null    0   null     
13  Item18-0006 2018-05-01 00:00:00.000 UTC null    0   null     
14  Item18-0007 2018-05-01 00:00:00.000 UTC null    0   null     
15  Item18-0001 2018-05-02 00:00:00.000 UTC 1       1   1    
16  Item18-0002 2018-05-02 00:00:00.000 UTC 1       1   1    
17  Item18-0003 2018-05-02 00:00:00.000 UTC 1       1   1    
18  Item18-0004 2018-05-02 00:00:00.000 UTC null    0   4    
19  Item18-0005 2018-05-02 00:00:00.000 UTC 5       1   5    
20  Item18-0006 2018-05-02 00:00:00.000 UTC null    0   null     
21  Item18-0007 2018-05-02 00:00:00.000 UTC null    0   null     
22  Item18-0001 2018-05-03 00:00:00.000 UTC 1       1   1    
23  Item18-0002 2018-05-03 00:00:00.000 UTC 2       1   2    
24  Item18-0003 2018-05-03 00:00:00.000 UTC 3       1   3    
25  Item18-0004 2018-05-03 00:00:00.000 UTC null    0   4    
26  Item18-0005 2018-05-03 00:00:00.000 UTC null    0   5    
27  Item18-0006 2018-05-03 00:00:00.000 UTC 6       1   6    
28  Item18-0007 2018-05-03 00:00:00.000 UTC null    0   null     
29  Item18-0001 2018-05-04 00:00:00.000 UTC null    0   1    
30  Item18-0002 2018-05-04 00:00:00.000 UTC 2       1   2    
31  Item18-0002 2018-05-04 00:00:00.000 UTC 2       1   2    
32  Item18-0003 2018-05-04 00:00:00.000 UTC 3       1   3    
33  Item18-0003 2018-05-04 00:00:00.000 UTC 3       1   3    
34  Item18-0004 2018-05-04 00:00:00.000 UTC null    0   4    
35  Item18-0005 2018-05-04 00:00:00.000 UTC null    0   5    
36  Item18-0006 2018-05-04 00:00:00.000 UTC null    0   6    
37  Item18-0007 2018-05-04 00:00:00.000 UTC null    0   null     
38  Item18-0001 2018-05-05 00:00:00.000 UTC 1       1   1    
39  Item18-0002 2018-05-05 00:00:00.000 UTC null    0   2    
40  Item18-0003 2018-05-05 00:00:00.000 UTC 3       1   3    
41  Item18-0004 2018-05-05 00:00:00.000 UTC null    0   4    
42  Item18-0005 2018-05-05 00:00:00.000 UTC 5       1   5    
43  Item18-0006 2018-05-05 00:00:00.000 UTC null    0   6    
44  Item18-0007 2018-05-05 00:00:00.000 UTC 7       1   7        

Note: for some reason you have your dates as a TIMESTAMP data type do I had to do some extra CAST'ing

Upvotes: 3

Related Questions