Alan Featherston
Alan Featherston

Reputation: 1086

Remove duplicate data in 'date-ranged' rows

I have a table like the following.

ID    StartDate    EndDate     AttributeA     AttributeB
--    ---------    -------     ----------     ----------
1     1/1/2009     2/1/2009    0              C
1     2/1/2009     3/1/2009    1              C
1     3/1/2009     4/1/2009    1              C
2     1/1/2010     2/1/2010    0              D
2     3/1/2010     4/1/2010    1              D

The date range is used to know for what time period the rest of the Attributes were valid, the problem i have is that there are several consecutive time ranges where the Attributes ramain the same, what I would like is to obtain the same data but without the duplicate rows.

From the previous example, my expected end result would be like this:

ID    StartDate    EndDate     AttributeA     AttributeB
--    ---------    -------     ----------     ----------
1     1/1/2009     2/1/2009    0              C
1     2/1/2009     4/1/2009    1              C
2     1/1/2010     2/1/2010    0              D
2     3/1/2010     4/1/2010    1              D

What I did was merge the 2nd and 3rd row into one (All attribute except the date were the same), but I kept the StartDate of the 2nd and the endDate of the 3rd row.

I first thought of grouping by the values obtaining the MAX and MIN like this

SELECT ID, MIN(StartDate), MAX(EndDate), attributeA, attributeB
FROM MyTable
Group BY ID, AttributeA, AttributeB

But as soon as I run it I realized that when the attributes change several times and go back to their original value I'd end up with overlapping intervals. I've been stuck for a while now trying to figure out how to fix this problem.

Here's an example of what I meant in my previous statement.

When the initial data looks like the following:

ID    StartDate    EndDate     AttributeA     AttributeB
--    ---------    -------     ----------     ----------
1     1/1/2009     2/1/2009    0              C
1     2/1/2009     3/1/2009    0              D
1     3/1/2009     4/1/2009    0              D
1     4/1/2009     5/1/2009    1              D
1     6/1/2010     6/1/2009    0              D

Grouping the results would end up like the following

ID    StartDate    EndDate     AttributeA     AttributeB
--    ---------    -------     ----------     ----------
1     1/1/2009     2/1/2009    0              C
1     2/1/2009     6/1/2009    0              D
1     4/1/2009     5/1/2009    1              D

And what I'd like to obtain is this

ID    StartDate    EndDate     AttributeA     AttributeB
--    ---------    -------     ----------     ----------
1     1/1/2009     2/1/2009    0              C
1     2/1/2009     4/1/2009    0              D
1     4/1/2009     5/1/2009    1              D
1     6/1/2010     6/1/2009    0              D

Any help would be welcomed :)

EDIT: I'll be uploading some sample data soon to make my problem a bit easier to understand.

EDIT2: Here's a script with some of my data. From that sample what I'd like to obtain are the following rows.

ID        StartDate     EndDate       A     B      C     D     E     F
--        ---------     -------       --    --     --    --    --    --
708513    1980-01-01    2006-07-23    15    ASDB   A     ACT   130   0
708513    2006-07-24    2009-12-08    15    ASDB   A     ACT   130   2
708513    2009-12-09    2010-01-12    0     ASDB   A     ACT   130   2
708513    2010-01-13    2079-05-30    15    ASDB   A     ACT   130   2

Upvotes: 0

Views: 2319

Answers (2)

Tech
Tech

Reputation: 661

I made a version without recursion if someone is interested. I didn't really figure out how to add extra columns not used in comparison in the previous example.

IF OBJECT_ID('tempdb..#test') IS NOT NULL drop table #test

create table #test (
    id int identity(1, 1)
    , ship nvarchar(64)
    , color nvarchar(16)
    , [length] int
    , height int
    , [type] nvarchar(16)
    , country nvarchar(16)
    , StartDate date
)

insert into #test(ship, color, [length], height, [type], country, StartDate)
values 
    ('Ship 1', 'Blue', 200, 13, 'sailboat', 'sweden', '2019-01-01')
    , ('Ship 1', 'Blue', 200, 13, 'sailboat', 'sweden', '2019-02-01')
    , ('Ship 1', 'Blue', 200, 13, 'sailboat', 'sweden', '2019-03-01')
    , ('Ship 1', 'Red', 200, 13, 'motorboat', 'sweden', '2019-11-01')
    , ('Ship 1', 'Blue', 200, 13, 'sailboat', 'sweden', '2019-12-01')
    , ('Ship 2', 'Green', 400, 27, 'RoRo', 'denmark', '2019-02-01')

;
with step1 as (
    select t.*
        , [EndDate] = dateadd(day, -1, lead(t.StartDate, 1, '9999-12-31') over(partition by t.ship order by t.StartDate))
    from #test t
    where 1 = 1
)
, step2 as (
    select t.*
        -- Check if preceeding row with same attribute has enddate between this startdate
        , [IdenticalPreceeding] = case 
                                    when t.StartDate = dateadd(day, 1, lag(t.EndDate, 1, '1900-01-01') over (partition by t.ship, t.color, t.[length], t.height, t.[type], t.country order by t.Startdate)) then 1
                                    else 0
                                end
    from step1 t
)

select t.*
    , [EndDateFinal] = dateadd(day, -1, lead(t.StartDate, 1, '9999-12-31') over(partition by t.ship order by t.StartDate))
from step2 t
where 1 = 1
-- Remove rows with identical preceeders
and t.IdenticalPreceeding = 0
order by t.ship
    , t.StartDate

Upvotes: 0

user359040
user359040

Reputation:

EDITED, following comments. Try:

;with cte as (
select m1.ID, m1.StartDate, m1.EndDate, m1.a, m1.b, m1.c, m1.d, m1.e, m1.f
from sampledata m1
where not exists
(select null from sampledata m0
 where m1.ID = m0.ID and 
       m1.a = m0.a and 
       m1.b = m0.b and 
       m1.c = m0.c and 
       m1.d = m0.d and 
       m1.e = m0.e and 
       m1.f = m0.f and 
       dateadd(day, -1, m1.StartDate) = m0.EndDate)
union all
select m1.ID, m1.StartDate, m2.EndDate, m1.a, m1.b, m1.c, m1.d, m1.e, m1.f
from cte m1
join sampledata m2 
       on m1.ID = m2.ID and 
          m1.a = m2.a and 
          m1.b = m2.b and 
          m1.c = m2.c and 
          m1.d = m2.d and 
          m1.e = m2.e and 
          m1.f = m2.f and 
          dateadd(day, 1, m1.EndDate) = m2.StartDate)
select ID, StartDate, max(EndDate) EndDate, a, b, c, d, e, f
from cte 
group by ID, StartDate, a, b, c, d, e, f
OPTION (MAXRECURSION 32767)

Upvotes: 1

Related Questions