Preet Sangha
Preet Sangha

Reputation: 65516

How to fill in gaps in a SQL Table using a single statement as opposed to multiple steps

Imagine I have the following tables:

enter image description here

I want to fill in the gaps on table like so

enter image description here

Now I can do this using multiple steps with CTEs and temp tables building up groups, and max/min values and then inserting them.

However I as wondering if there is a more elegant (!) way of doing this in single statement, that joins the source on itself with the months.

Bonus points for doing this in ansi sql as opposed to tsql, but either is acceptable.

===

here is some sql to create a test

drop table months;
create table months (year int, month int);

insert into months(year, month) values
(2000, 200007),   (2001, 200101),  (2002, 200201),
(2000, 200008),   (2001, 200102),  (2002, 200202),
(2000, 200009),   (2001, 200103),  (2002, 200203),
(2000, 200010),   (2001, 200104),  (2002, 200204),
(2000, 200011),   (2001, 200105),  (2002, 200205),
(2000, 200012),   (2001, 200106),  (2002, 200206),
                  (2001, 200107),  (2002, 200007),
                  (2001, 200108),  (2002, 200208),
                  (2001, 200109),  (2002, 200208),
                  (2001, 200110),
                  (2001, 200111),
                  (2001, 200112)

drop table source;
create table source (name varchar(10), month int, item int, val float);

insert into source(name, month, item, val) values
('bob', 200101, 1, 1.5),
('bob', 200102, 2, 1.5),
('bob', 200103, 2, 2),
('bob', 200108, 1, 0),
('bob', 200109, 10, 6.6),
('bob', 200110, 11, 2.2),
('bob', 200111, 9, 1),
('bob', 200207, 23, 0)

Upvotes: 3

Views: 1197

Answers (2)

Pieter Geerkens
Pieter Geerkens

Reputation: 11893

Using the sample data provided,

if object_id('#months','U') is not null drop table #months;
create table #months (year int, month int);

insert into #months(year, month) values
(2000, 200007),   (2001, 200101),  (2002, 200201),
(2000, 200008),   (2001, 200102),  (2002, 200202),
(2000, 200009),   (2001, 200103),  (2002, 200203),
(2000, 200010),   (2001, 200104),  (2002, 200204),
(2000, 200011),   (2001, 200105),  (2002, 200205),
(2000, 200012),   (2001, 200106),  (2002, 200206),
                  (2001, 200107),  (2002, 200007),
                  (2001, 200108),  (2002, 200208),
                  (2001, 200109),  (2002, 200208),
                  (2001, 200110),
                  (2001, 200111),
                  (2001, 200112)

if object_id('#source','U') is not null drop table #source;
create table #source (name varchar(10), month int, item int, val float);

insert into #source(name, month, item, val) values
('bob', 200101, 1, 1.5),
('bob', 200102, 2, 1.5),
('bob', 200103, 2, 2),
('bob', 200108, 1, 0),
('bob', 200109, 10, 6.6),
('bob', 200110, 11, 2.2),
('bob', 200111, 9, 1),
('bob', 200207, 23, 0)
;

this SQL:

with source as (
    select
         this.*
        ,( select top 1 month 
           from #source 
           where month > this.month 
             and name  = this.name 
           order by month
         ) as NextMonth
    from #source        this
),
months as (
    select 
        m.month, n.name
    from #months m
    cross join (select name from #source group by name) n
    group by 
        m.month, n.name
)
select 
    s.name, 
    m.month,
    s.item, 
    case when m.month = s.month then s.val else 0 end val
from months m
left join source s
   on s.month <= m.month and m.month < s.NextMonth 
  and s.name = m.name
where s.name is not null
order by 
     s.name
    ,m.month;
go

yields as desired:

name       month       item        val
---------- ----------- ----------- ----------------------
bob        200101      1           1.5
bob        200102      2           1.5
bob        200103      2           2
bob        200104      2           2
bob        200105      2           2
bob        200106      2           2
bob        200107      2           2
bob        200108      1           0
bob        200109      10          6.6
bob        200110      11          2.2
bob        200111      9           1
bob        200112      9           1
bob        200201      9           1
bob        200202      9           1
bob        200203      9           1
bob        200204      9           1
bob        200205      9           1
bob        200206      9           1

Edit: Minor corrections to ensure val is 0 for inserted rows

Upvotes: 1

zedfoxus
zedfoxus

Reputation: 37089

SQLFiddle example solution for SQL Server 2008

Perhaps not the most elegant one but it's one way to do it

Table and data

create table sources 
(
  source varchar(20),
  month int,
  item int,
  val decimal
);

insert into sources values
('bob', 200101, 1, 1.5),
('bob', 200102, 2, 1.5),
('bob', 200104, 2, 1.5);

Notice that 200103 is not available. We will fill that up using SQL.

Code

-- Recursive CTE to create all dates between 2001-01-01 and 2001-03-02
WITH Dates as
(
  select cast('2001-01-01' as date) as CalendarDate
  union all     
  select dateadd(day , 1, CalendarDate) AS CalendarDate
  from Dates
  where dateadd (day, 1, CalendarDate) < '2001-03-02'
),

-- CTE to get distinct yyyymm
FormattedDates as
(
  select 
    distinct cast(replace(left(CalendarDate,7), '-', '') as int) dt
  from Dates
)

-- compare list of yyyymm with sources and fill missing information
select 
  'bob', 
  FormattedDates.dt, 
  (select top 1 item from sources where month < formatteddates.dt order by month desc), 
  0
from FormattedDates
left join sources on FormattedDates.dt = sources.month
where source is null

-- add sources to the mix
union
select * from sources;

How does this work

First we use recursion to list out all dates between two dates with a CTE

WITH Dates as
(
  select cast('2001-01-01' as date) as CalendarDate
  union all     
  select dateadd(day , 1, CalendarDate) AS CalendarDate
  from Dates
  where dateadd (day, 1, CalendarDate) < '2001-03-02'
)
select * from Dates;

This is a recursive CTE (common table expression) that starts with a date and recurses till it reaches the end date. It outputs dates from 2001-01-01 through 2001-03-01.

We crudely convert dates to yyyymm format using cast(replace(left(CalendarDate,7), '-', '') as int). But that would create multiple 200101, 200102, etc. So we use distinct.

WITH Dates as
(
  SELECT cast('2001-01-01' as date) as CalendarDate

  UNION ALL

  SELECT dateadd(day , 1, CalendarDate) AS CalendarDate
  FROM Dates
  WHERE dateadd (day, 1, CalendarDate) < '2001-03-02'
)
select 
  distinct cast(replace(left(CalendarDate,7), '-', '') as int) dt
from Dates;

That will list out 200101, 200102 and 200103. The following code gives the same effect:

-- Recursive CTE to create all dates between 2001-01-01 and 2001-03-02
WITH Dates as
(
  select cast('2001-01-01' as date) as CalendarDate
  union all     
  select dateadd(day , 1, CalendarDate) AS CalendarDate
  from Dates
  where dateadd (day, 1, CalendarDate) < '2001-03-02'
),

-- CTE to get distinct yyyymm
FormattedDates as
(
  select 
    distinct cast(replace(left(CalendarDate,7), '-', '') as int) dt
  from Dates
)

-- List out the distinct dates
select dt from FormattedDates;

We left join these dates with the one in our sources table and select only those records where there isn't a record in sources table.

-- Recursive CTE to create all dates between 2001-01-01 and 2001-03-02
WITH Dates as
(
  select cast('2001-01-01' as date) as CalendarDate
  union all     
  select dateadd(day , 1, CalendarDate) AS CalendarDate
  from Dates
  where dateadd (day, 1, CalendarDate) < '2001-03-02'
),

-- CTE to get distinct yyyymm
FormattedDates as
(
  select 
    distinct cast(replace(left(CalendarDate,7), '-', '') as int) dt
  from Dates
)

-- compare list of yyyymm with sources and fill missing information
select 
  'bob', 
  FormattedDates.dt, 
  (select top 1 item from sources where month < formatteddates.dt order by month desc), 
  0
from FormattedDates
left join sources on FormattedDates.dt = sources.month
where source is null;

This would list out 'bob', 200103, 2, 0. Great, so we have the missing piece of information. Let's combine it with the data from sources to get a full dataset.

All that needs to be added now is union all select * from sources; to get what you want.

Upvotes: 1

Related Questions