Reputation: 51
I need to group some data based on dates at locations including identifying when a range of dates has no location. I'm some of the way there in that I have managed to produce a list of ALL the dates in the range and the location.
using a normal group by (showing min(date) and max(date) I would get something like:
But I want this:
I also need to filter out short ranges of Unknown but that's secondary.
I hope this makes sense, it looks like something that should be really easy.
Upvotes: 4
Views: 194
Reputation: 3262
Take a look into the Islands and Gaps problem and Itzik Ben-gan. There is a set based way to get the results you want.
I was looking into using ROW_NUMBER or RANK, but then I stumbled upon LAG and LEAD (introduced in SQL 2012) which are nice. I've got the solution below. It could definitely be simplified, but having it as several CTEs makes my thought process (as flawed as it may be) easier to see. I just slowly transform the data into what I want. Uncomment one select at a time if you want to see what each new CTE produces.
create table Junk
(aDate Datetime,
aLocation varchar(32))
insert into Junk values
('2000', 'Location1'),
('2001', 'Location1'),
('2002', 'Location1'),
('2004', 'Unknown'),
('2005', 'Unknown'),
('2006', 'Unknown'),
('2007', 'Location2'),
('2008', 'Location2'),
('2009', 'Location2'),
('2010', 'Location2'),
('2011', 'Location1'),
('2012', 'Location1'),
('2013', 'Location1'),
('2014', 'Location3')
;WITH StartsMiddlesAndEnds AS
(
select
aLocation,
aDate,
CASE(LAG(aLocation) OVER (ORDER BY aDate, aLocation)) WHEN aLocation THEN 0 ELSE 1 END [isStart],
CASE(LEAD(aLocation) OVER (ORDER BY aDate, aLocation)) WHEN aLocation THEN 0 ELSE 1 END [isEnd]
from Junk
)
--select * from NumberedStartsMiddlesAndEnds
,NumberedStartsAndEnds AS --let's get rid of the rows that are in the middle of consecutive date groups
(
select
aLocation,
aDate,
isStart,
isEnd,
ROW_NUMBER() OVER(ORDER BY aDate, aLocation) i
FROM StartsMiddlesAndEnds
WHERE NOT(isStart = 0 AND isEnd = 0) --it is a middle row
)
--select * from NumberedStartsAndEnds
,CombinedStartAndEnds AS --now let's put the start and end dates in the same row
(
select
rangeStart.aLocation,
rangeStart.aDate [aStart],
rangeEnd.aDate [aEnd]
FROM NumberedStartsAndEnds rangeStart
join NumberedStartsAndEnds rangeEnd ON rangeStart.aLocation = rangeEnd.aLocation
WHERE rangeStart.i = rangeEnd.i - 1 --consecutive rows
and rangeStart.isStart = 1
and rangeEnd.isEnd = 1
)
--select * from CombinedStartAndEnds
,OneDateIntervals AS --don't forget the cases where a single row is both a start and end
(
select
aLocation,
aDate [aStart],
aDate [aEnd]
FROM NumberedStartsAndEnds
WHERE isStart = 1 and isEnd = 1
)
--select * from OneDateIntervals
select aLocation, DATEPART(YEAR, aStart) [start], DATEPART(YEAR, aEnd) [end] from OneDateIntervals
UNION
select aLocation, DATEPART(YEAR, aStart) [start], DATEPART(YEAR, aEnd) [end] from CombinedStartAndEnds
ORDER BY DATEPART(YEAR, aStart)
and it produces
aLocation start end
Location1 2000 2002
Unknown 2004 2006
Location2 2007 2010
Location1 2011 2013
Location3 2014 2014
Don't have 2012? Then you can still get the same StartsMiddlesAndEnds CTE using ROW_NUMBER:
;WITH NumberedRows AS
(
SELECT aLocation, aDate, ROW_NUMBER() OVER (ORDER BY aDate, aLocation) [i] FROM Junk
)
,StartsMiddlesAndEnds AS
(
select
currentRow.aLocation,
currentRow.aDate,
CASE upperRow.aLocation WHEN currentRow.aLocation THEN 0 ELSE 1 END [isStart],
CASE lowerRow.aLocation WHEN currentRow.aLocation THEN 0 ELSE 1 END [isEnd]
from
NumberedRows currentRow
left outer join NumberedRows upperRow on upperRow.i = currentRow.i-1
left outer join NumberedRows lowerRow on lowerRow.i = currentRow.i+1
)
--select * from StartsMiddlesAndEnds
Upvotes: 1