Reputation: 2932
I have a large data set which for the purpose of this question has 3 fields:
On any given row the From Date
will always be less than the To Date
but within each group the time periods (which are in no particular order) represented by the date pairs could overlap, be contained one within another, or even be identical.
What I'd like to end up with is a query that condenses the results for each group down to just the continuous periods. For example a group that looks like this:
| Group ID | From Date | To Date |
--------------------------------------
| A | 01/01/2012 | 12/31/2012 |
| A | 12/01/2013 | 11/30/2014 |
| A | 01/01/2015 | 12/31/2015 |
| A | 01/01/2015 | 12/31/2015 |
| A | 02/01/2015 | 03/31/2015 |
| A | 01/01/2013 | 12/31/2013 |
Would result in this:
| Group ID | From Date | To Date |
--------------------------------------
| A | 01/01/2012 | 11/30/2014 |
| A | 01/01/2015 | 12/31/2015 |
I've read a number of articles on date packing but I can't quite figure out how to apply that to my data set.
How can construct a query that would give me those results?
Upvotes: 11
Views: 3678
Reputation: 5169
Another geometric apporach I have done si the following one :
WITH
T_GEO AS
(
SELECT GroupID,
geometry::STLineFromText(
CONCAT('LINESTRING ('
, DATEDIFF(second, '20000101', FromDate),
' 0,',
DATEDIFF(second, '20000101', ToDate),
' 0'
,')'), 0) AS GEO
FROM THE_TABLE
),
T_GEOAGG AS
(
SELECT GroupID,
geometry::UnionAggregate(GEO) AS UGEO
FROM T_GEO
GROUP BY GroupID
)
SELECT GroupID,
CAST(DATEADD(second, UGEO.STGeometryN(value).STPointN(1).STX, '20000101') AS DATETIME2(0)) AS FromDate,
CAST(DATEADD(second, UGEO.STGeometryN(value).STPointN(2).STX, '20000101') AS DATETIME2(0)) AS ToDate
FROM T_GEOAGG
CROSS APPLY generate_series(1, UGEO.STNumGeometries());
tested on 1000 intervals performs better than @pwilcox one...
Upvotes: 0
Reputation: 5763
Here and elsewhere I've noticed that date packing questions
don't provide a geometric approach to this problem. After all,
any range, date-ranges included, can be interpreted as a line.
So why not convert them to a sql geometry type and utilize
geometry::UnionAggregate
to merge the ranges. So I gave a stab
at it with your post.
In 'numbers':
In 'mergeLines':
In the outer query:
with
numbers as (
select row_number() over (order by (select null)) i
from @spans -- Where I put your data
),
mergeLines as (
select groupId,
lines = geometry::UnionAggregate(line)
from @spans
cross apply (select
startP = geometry::Point(convert(float,fromDate), 0, 0),
stopP = geometry::Point(convert(float,toDate) + 1, 0, 0)
) pointify
cross apply (select line = startP.STUnion(stopP).STEnvelope()) lineify
group by groupId
)
select groupId, fromDate, toDate
from mergeLines ml
join numbers n on n.i between 1 and ml.lines.STNumGeometries()
cross apply (select line = ml.lines.STGeometryN(i).STEnvelope()) l
cross apply (select
fromDate = convert(datetime, l.line.STPointN(1).STX),
toDate = convert(datetime, l.line.STPointN(3).STX) - 1
) unprepare
order by groupId, fromDate;
Upvotes: 1
Reputation: 24793
; with
cte as
(
select *, rn = row_number() over (partition by [Group ID] order by [From Date])
from tbl
),
rcte as
(
select rn, [Group ID], [From Date], [To Date], GrpNo = 1, GrpFrom = [From Date], GrpTo = [To Date]
from cte
where rn = 1
union all
select c.rn, c.[Group ID], c.[From Date], c.[To Date],
GrpNo = case when c.[From Date] between r.GrpFrom and dateadd(day, 1, r.GrpTo)
or c.[To Date] between r.GrpFrom and r.GrpTo
then r.GrpNo
else r.GrpNo + 1
end,
GrpFrom= case when c.[From Date] between r.GrpFrom and dateadd(day, 1, r.GrpTo)
or c.[To Date] between r.GrpFrom and r.GrpTo
then case when c.[From Date] > r.GrpFrom then c.[From Date] else r.GrpFrom end
else c.[From Date]
end,
GrpTo = case when c.[From Date] between r.GrpFrom and dateadd(day, 1, r.GrpTo)
or c.[To Date] between r.GrpFrom and dateadd(day, 1, r.GrpTo)
then case when c.[To Date] > r.GrpTo then c.[To Date] else r.GrpTo end
else c.[To Date]
end
from rcte r
inner join cte c on r.[Group ID] = c.[Group ID]
and r.rn = c.rn - 1
)
select [Group ID], min(GrpFrom), max(GrpTo)
from rcte
group by [Group ID], GrpNo
Upvotes: 2
Reputation: 32693
I'd use a Calendar
table. This table simply has a list of dates for several decades.
CREATE TABLE [dbo].[Calendar](
[dt] [date] NOT NULL,
CONSTRAINT [PK_Calendar] PRIMARY KEY CLUSTERED
(
[dt] ASC
))
There are many ways to populate such table.
For example, 100K rows (~270 years) from 1900-01-01:
INSERT INTO dbo.Calendar (dt)
SELECT TOP (100000)
DATEADD(day, ROW_NUMBER() OVER (ORDER BY s1.[object_id])-1, '19000101') AS dt
FROM sys.all_objects AS s1 CROSS JOIN sys.all_objects AS s2
OPTION (MAXDOP 1);
Once you have a Calendar
table, here is how to use it.
Each original row is joined with the Calendar
table to return as many rows as there are dates between From and To.
Then possible duplicates are removed.
Then classic gaps-and-islands by numbering the rows in two sequences.
Then grouping found islands together to get the new From and To.
Sample data
I added a second group.
DECLARE @T TABLE (GroupID int, FromDate date, ToDate date);
INSERT INTO @T (GroupID, FromDate, ToDate) VALUES
(1, '2012-01-01', '2012-12-31'),
(1, '2013-12-01', '2014-11-30'),
(1, '2015-01-01', '2015-12-31'),
(1, '2015-01-01', '2015-12-31'),
(1, '2015-02-01', '2015-03-31'),
(1, '2013-01-01', '2013-12-31'),
(2, '2012-01-01', '2012-12-31'),
(2, '2013-01-01', '2013-12-31');
Query
WITH
CTE_AllDates
AS
(
SELECT DISTINCT
T.GroupID
,CA.dt
FROM
@T AS T
CROSS APPLY
(
SELECT dbo.Calendar.dt
FROM dbo.Calendar
WHERE
dbo.Calendar.dt >= T.FromDate
AND dbo.Calendar.dt <= T.ToDate
) AS CA
)
,CTE_Sequences
AS
(
SELECT
GroupID
,dt
,ROW_NUMBER() OVER(PARTITION BY GroupID ORDER BY dt) AS Seq1
,DATEDIFF(day, '2001-01-01', dt) AS Seq2
,DATEDIFF(day, '2001-01-01', dt) -
ROW_NUMBER() OVER(PARTITION BY GroupID ORDER BY dt) AS IslandNumber
FROM CTE_AllDates
)
SELECT
GroupID
,MIN(dt) AS NewFromDate
,MAX(dt) AS NewToDate
FROM CTE_Sequences
GROUP BY GroupID, IslandNumber
ORDER BY GroupID, NewFromDate;
Result
+---------+-------------+------------+
| GroupID | NewFromDate | NewToDate |
+---------+-------------+------------+
| 1 | 2012-01-01 | 2014-11-30 |
| 1 | 2015-01-01 | 2015-12-31 |
| 2 | 2012-01-01 | 2013-12-31 |
+---------+-------------+------------+
Upvotes: 3
Reputation: 390
The solution from book "Microsoft® SQL Server ® 2012 High-Performance T-SQL Using Window Functions"
;with C1 as(
select GroupID, FromDate as ts, +1 as type, 1 as sub
from dbo.table_name
union all
select GroupID, dateadd(day, +1, ToDate) as ts, -1 as type, 0 as sub
from dbo.table_name),
C2 as(
select C1.*
, sum(type) over(partition by GroupID order by ts, type desc
rows between unbounded preceding and current row) - sub as cnt
from C1),
C3 as(
select GroupID, ts, floor((row_number() over(partition by GroupID order by ts) - 1) / 2 + 1) as grpnum
from C2
where cnt = 0)
select GroupID, min(ts) as FromDate, dateadd(day, -1, max(ts)) as ToDate
from C3
group by GroupID, grpnum;
Create table:
if object_id('table_name') is not null
drop table table_name
create table table_name(GroupID varchar(100), FromDate datetime,ToDate datetime)
insert into table_name
select 'A', '01/01/2012', '12/31/2012' union all
select 'A', '12/01/2013', '11/30/2014' union all
select 'A', '01/01/2015', '12/31/2015' union all
select 'A', '01/01/2015', '12/31/2015' union all
select 'A', '02/01/2015', '03/31/2015' union all
select 'A', '01/01/2013', '12/31/2013'
Upvotes: 7