Reputation: 15
I have the following dataset where person has id and their amount can change over time so changes are recorded as date ranges. Sometimes there are gaps in date ranges where there is no information available. It is fine. However I want to merge continuous date ranges where they happen to have same amount while it is spanned over multiple records like row 2 and 3 for ID = 1.
DECLARE @DataTable TABLE (
ID [int] NULL,
StartDate [date] NULL,
EndDate [date] NULL,
Amount [decimal](12,2) NULL
)
INSERT INTO @DataTable
SELECT 1, '20180101','20180513', 10.00 UNION ALL
SELECT 1, '20180630','20190301', 15.00 UNION ALL
SELECT 1, '20190302','20190615', 15.00 UNION ALL
SELECT 1, '20190616','20991231', 5.00 UNION ALL
SELECT 2, '20190101','20190331', 35.00 UNION ALL
SELECT 2, '20190401','20191031', 30.00 UNION ALL
SELECT 3, '20180505','20180930', 19.00 UNION ALL
SELECT 3, '20181001','20190228', 1.00 UNION ALL
SELECT 3, '20190501','20190815', 1.00 UNION ALL
SELECT 3, '20190819','20190827', 5.00 UNION ALL
SELECT 3, '20190828','20991231', 1.00 UNION ALL
SELECT 4, '2017-10-01', '2017-12-31', 688.96 UNION ALL
SELECT 4, '2018-01-01', '2018-04-30', 707.96 UNION ALL
SELECT 4, '2018-05-01', '2018-05-31', 783.96 UNION ALL
SELECT 4, '2018-06-01', '2018-12-31', 707.96 UNION ALL
SELECT 4, '2019-01-01', '2019-03-31', 707.96 UNION ALL
SELECT 4, '2019-04-01', '2019-04-30', 571.46 UNION ALL
SELECT 4, '2019-05-01', '2019-06-30', 707.96 UNION ALL
SELECT 4, '2019-07-01', '2099-12-31', 707.96
;
I solved it by generating date lines between start and end date using dimDate and then kept record where either amount changed from previous record or there is gap of dates for ID. Then I used next record date available to use that for end date. Query is as follows:
WITH DateList AS (
SELECT DT.*, DD.DateOnly AS RecordDate
FROM @DataTable DT
INNER JOIN dimDate DD ON DT.StartDate <= DD.DateOnly AND CASE WHEN DT.EndDate > GETDATE() THEN CONVERT(DATE,GETDATE()) ELSE DT.EndDate END >= DD.DateOnly
)
, PrevValue AS (
SELECT
*
, LAG(RecordDate) OVER (PARTITION BY ID ORDER BY RecordDate) AS PrevDate
, LAG(Amount) OVER (PARTITION BY ID ORDER BY RecordDate) AS PrevAmt
FROM DateList
)
, KeepHistory AS (
SELECT
*
FROM PrevValue
WHERE Amount <> PrevAmt OR PrevAmt IS NULL OR DATEADD(DAY,1,PrevDate) <> RecordDate OR PrevDate IS NULL
)
, FINAL AS (
SELECT
*
, LEAD(PrevDate) OVER (PARTITION BY ID ORDER BY StartDate) AS NextEndDate
FROM KeepHistory
)
SELECT
ID
, StartDate
, CASE WHEN NextEndDate > EndDate THEN NextEndDate ELSE EndDate END AS EndDate
, Amount
FROM FINAL
My question is there an alternative way to approach this without going through dimDate or generating dates between start and end date. Can I achieve this simply by using window functions like gap and island problem like here?
Please let me know if you see any issues with my current solution. Thanks.
About @Larnu response, your updated query works for most . I added ID = 4 example that seems to cause an issue when merging its 4th and 5th row. It also merges 7th row when 6th row has different amount.
ID = 4 4th, 5th and 7th are merged; 6th one has different amount
Upvotes: 1
Views: 895
Reputation: 95544
Is this what you're after?
WITH Gaps AS(
SELECT DT.ID,
DT.StartDate,
DT.EndDate,
DT.Amount,
DATEDIFF(DAY,LAG(DATEADD(DAY,1,DT.EndDate),1,DT.StartDate) OVER (PARTITION BY DT.ID, DT.Amount ORDER BY DT.StartDate ASC), DT.StartDate) AS Gap
FROM @DataTable DT),
Grps AS(
SELECT G.ID,
G.StartDate,
G.EndDate,
G.Amount,
ROW_NUMBER() OVER (PARTITION BY G.ID ORDER BY G.StartDate) -
ROW_NUMBER() OVER (PARTITION BY G.ID,Amount ORDER BY G.StartDate) + Gap AS Grp
FROM Gaps G)
SELECT G.ID,
MIN(G.StartDate) AS StartDate,
MAX(EndDate) AS EndDate,
G.Amount
FROM Grps G
GROUP BY G.ID,
G.Amount,
G.Grp
ORDER BY ID,
StartDate;
Added a Unique ID, to get around a "feature":
DECLARE @DataTable TABLE (
UniqueID int IDENTITY(1,1),
ID [int] NULL,
StartDate [date] NULL,
EndDate [date] NULL,
Amount [decimal](12,2) NULL
)
INSERT INTO @DataTable
SELECT 1, '20180101','20180513', 10.00 UNION ALL
SELECT 1, '20180630','20190301', 15.00 UNION ALL
SELECT 1, '20190302','20190615', 15.00 UNION ALL
SELECT 1, '20190616','20991231', 5.00 UNION ALL
SELECT 2, '20190101','20190331', 35.00 UNION ALL
SELECT 2, '20190401','20191031', 30.00 UNION ALL
SELECT 3, '20180505','20180930', 19.00 UNION ALL
SELECT 3, '20181001','20190228', 1.00 UNION ALL
SELECT 3, '20190501','20190815', 1.00 UNION ALL
SELECT 3, '20190819','20190827', 5.00 UNION ALL
SELECT 3, '20190828','20991231', 1.00 UNION ALL
SELECT 4, '20171001', '20171231', 688.96 UNION ALL
SELECT 4, '20180101', '20180430', 707.96 UNION ALL
SELECT 4, '20180501', '20180531', 783.96 UNION ALL
SELECT 4, '20180601', '20181231', 707.96 UNION ALL
SELECT 4, '20190101', '20190331', 707.96 UNION ALL
SELECT 4, '20190401', '20190430', 571.46 UNION ALL
SELECT 4, '20190501', '20190630', 707.96 UNION ALL
SELECT 4, '20190701', '20991231', 707.96;
--SELECT *
--FROM @DataTable;
WITH Gaps AS(
SELECT DT.UniqueID,
DT.ID,
DT.StartDate,
DT.EndDate,
DT.Amount,
DATEDIFF(DAY,LAG(DATEADD(DAY,1,DT.EndDate),1,DT.StartDate) OVER (PARTITION BY DT.ID, DT.Amount ORDER BY DT.UniqueID ASC), DT.StartDate) AS Gap
FROM @DataTable DT),
Grps AS(
SELECT G.UniqueID,
G.ID,
G.StartDate,
G.EndDate,
G.Amount,
G.Gap,
ROW_NUMBER() OVER (PARTITION BY G.ID ORDER BY G.UniqueID) -
ROW_NUMBER() OVER (PARTITION BY G.ID,Amount ORDER BY G.UniqueID) + (Gap * UniqueID) AS Grp
FROM Gaps G)
SELECT G.ID,
MIN(G.StartDate) AS StartDate,
MAX(EndDate) AS EndDate,
G.Amount
FROM Grps G
GROUP BY G.ID,
G.Amount,
G.Grp
ORDER BY ID,
StartDate;
Upvotes: 1
Reputation: 1037
You can use Recursion as follows
DECLARE @DataTable TABLE (
ID [int] NULL,
StartDate [date] NULL,
EndDate [date] NULL,
Amount [decimal](12,2) NULL
)
INSERT INTO @DataTable
SELECT 1, '20180101','20180513', 10.00 UNION ALL
SELECT 1, '20180630','20190301', 15.00 UNION ALL
SELECT 1, '20190302','20190615', 15.00 UNION ALL
SELECT 1, '20190616','20991231', 5.00 UNION ALL
SELECT 2, '20190101','20190331', 35.00 UNION ALL
SELECT 2, '20190401','20191031', 30.00 UNION ALL
SELECT 3, '20180505','20180930', 19.00 UNION ALL
SELECT 3, '20181001','20190228', 1.00 UNION ALL
SELECT 3, '20190501','20190815', 1.00 UNION ALL
SELECT 3, '20190819','20190827', 5.00 UNION ALL
SELECT 3, '20190828','20991231', 1.00;
;with cte as(
select t1.id,
t1.startDate,
t1.EndDate,
t1.Amount,
row_number() over (partition by t1.id order by t1.Id,t1.startDate) R#
from @DataTable t1
left outer join @DataTable t2 on dateadd(d,-1,t1.startdate)=t2.enddate and t1.Amount=t2.Amount
and t1.id=t2.id
where t2.amount is null),
cte1 as
(select
t1.id,
t1.startDate,
t1.EndDate,
t1.Amount,
t1.R# from cte t1
union all
select t1.id,
t2.startdate,
t1.endDate,
t1.amount,
R# from @DataTable t1 inner join cte1 t2
on dateadd(d,-1,t1.startdate)=t2.enddate and t1.Amount=t2.Amount)
select id,
min(stArtdate)[startdate],
max(enddate)[enddate],
min(amount)[amount] from cte1 GROUP by R#,id
Order by id,max(enddate)
Upvotes: 0