Christian Lundheim
Christian Lundheim

Reputation: 95

SQL table fixed interval interpolation

We have been working for a while with heavy data sets containing a sensor name, a timestamp and a value. I need these data processed into a fixed interval for reporting and calculation purposes. To keep the data input systems flexible we want to receive the data in any log interval, such as:

Sensor1    10.3    18-05-2017 10:27:50
Sensor1     9.3    18-05-2017 10:28:01
Sensor1     8.3    18-05-2017 11:28:00
Sensor1     7.3    18-05-2017 14:01:50
Sensor1    15.0    18-05-2017 15:10:00
Sensor1    18.3    18-05-2017 15:30:00
Sensor1    10.3    18-05-2017 16:30:00

Then convert it later into a fixed interval with linearly interpolated values, such as hour interval:

--Sensor--   --Timestamp--         --Value_AT--  --Value_MIN-- --Value_MAX-- --Value_AVG--
Sensor1      18-05-2017 11:00:00   8.767         8.092         8.767         x.x
Sensor1      18-05-2017 12:00:00   8.092         7.702         8.092         x.x
Sensor1      18-05-2017 13:00:00   7.702         7.312         7.702         x.x
Sensor1      18-05-2017 14:00:00   7.312         7.300        13.870         x.x
Sensor1      18-05-2017 15:00:00  13.870        13.780        18.300         x.x
Sensor1      18-05-2017 16:00:00  14.300        10.300        14.300         x.x

Column descriptions:

TimeStamp = Whole hours.
Value_AT  = Linearly interpolated value @ the given timestamp
Value_MIN = The lowest value within timestamp + 1 hour
            (if value is always increasing, this would be the same as Value_AT)
Value_MAX = The highest value within timestamp + 1 hour
            (if value is always increasing, this would be the same as Value_AT for the next hour)
Value_AVG     = The average value within timestamp + 1 hour

Want to focus on an hour for now, but also going to implement the same kind of interpolation for:

So if the code can be re-used for that, it's the ideal solution. The processed data is going to be used for the following things:

I have reviewed these options:

Currently, I have experimented with this procedure, which is working fine, but I'm not too happy with it. It has a small bug causing a slight offset on Value_MIN / Value_MAX for now. Also kind of low on performance with big datasets for obvious reasons :).

Any suggestion for an alternate approach, or optimizations that can be done for this?

-- TODO: OPTIMIZE!! Perhaps use temp tables to do that. Super-slow now....

-- This procedure takes an input table:
    -- TimeStamp        DateTime
    -- ItemID           uniqueidentifier
    -- ItemProperty     nvarchar(20)
    -- Value            decimal(18,6)

-- Result:
-- This procedure takes an input table and selects aggregated data with the following columns
    -- TimeStamp        DateTime            = A whole hour, like 2014-10-03 19:00:00.000
    -- ItemID           uniqueidentifier    = Same as the ID passed in
    -- ItemProperty     nvarchar(20)        = Same as the property passed in
    -- Value_AT         decimal(18,6)       = Linear interpolation of what the value should've been exactly at the hour
    -- Value_MIN        decimal(18,6)       = The lowest value within that hour based on linear interpolation
    -- Value_MAX        decimal(18,6)       = The highest value within the hour based on linear interpolation
    -- Value_AVG        decimal(18,6)       = The average value within the hour based on linear interpolation
    -- Value_DIFF       decimal(18,6)       = The differential value between beginning of the hour and the end of the hour

ALTER PROCEDURE [dbo].[CORE_spPerformAggregation_Hourly]
    @InputTable AS AggregationTable READONLY
AS
BEGIN

    -- Retrieve the earliest and latest time from input table. Used to limit the result time.
    --   (MUCH quicker to do it here than directly as a sub-select in WHERE- clause)
    DECLARE @EarliestTimeStampInputTable DATETIME = (SELECT TOP 1 [TimeStamp] FROM @InputTable ORDER BY [TimeStamp] ASC);
    DECLARE @LatestTimeStampInputTable DATETIME = (SELECT TOP 1 [TimeStamp] FROM @InputTable ORDER BY [TimeStamp] DESC);


    DECLARE @InputWithMissingValues AS TABLE
    (
         [TimeStamp] datetime
        ,ItemID uniqueidentifier
        ,ItemProperty nvarchar(20)
        ,[Value] decimal(18,6)
        ,InterpolatedTime datetime
        ,ForrigeVerdi decimal(18,6)
        ,ForrigeTid datetime
    );



    -- *** STEP 1 ***
    -- Create a local table with the input table with these stamps:
    --    xx:00:00 (whole hours)
    --    xx:59:59 (last part of hour)
    --    Also every missing hour from the input
    INSERT INTO @InputWithMissingValues

        -- 5.1 Inner select. Round off input table to nearest hours for later interpolation
        -- TODO: EXPLAIN AND OPTIMIZE
        -- xx:00:00 interpolations
        SELECT   [TimeStamp]
                ,ItemID
                ,ItemProperty
                ,[Value]
                ,DATEADD(hour, DATEDIFF(hour,0,[TimeStamp]),0)      AS InterpolatedTime
                ,LAG([Value],1,NULL) OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp])           AS ForrigeVerdi
                ,LAG([TimeStamp],1,NULL) OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp])           AS ForrigeTid
        FROM @InputTable

        UNION ALL

        -- 5.2 Inner select. Round off input table to nearest hours xx:59:59 for later interpolation
        -- TODO: EXPLAIN AND OPTIMIZE
        -- xx:59:59 interpolations
        SELECT [TimeStamp]
                ,[ItemID]
                ,[ItemProperty]
                ,[Value]
                ,DATEADD(second,3599,DATEADD(hour, DATEDIFF(hour,0,[TimeStamp]),0))     AS InterpolatedTime
                ,LAG([Value],1,NULL) OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp])           AS ForrigeVerdi
                ,LAG(TimeStamp,1,NULL) OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp])         AS ForrigeTid
        FROM @InputTable


        UNION ALL

        -- 5.3 Inner select. Every missing hour from input table is output here
        -- TODO: EXPLAIN AND OPTIMIZE
        -- Missing hours (both XX:00:00 and XX:59:59)
        SELECT
                NULL                            AS [TimeStamp]
            ,Test.ItemID                    AS ItemID
            ,Test.ItemProperty              AS ItemProperty
            ,NULL                           AS [Value]
            ,DATEADD(second,Test2.SecOffset,[DateTime])                     AS InterpolatedTime
            ,NULL                           AS ForrigeVerdi
            ,NULL                           AS ForrigeTid
        FROM ViewDateTimeDimensions AS DD
        CROSS JOIN (SELECT DISTINCT ItemID,ItemProperty FROM @InputTable) AS Test
        CROSS JOIN ((SELECT 0 AS SecOffset UNION SELECT 3599 AS SecOffset)) AS Test2
        LEFT JOIN @InputTable AS AKVH ON
        Test.ItemID=AKVH.ItemID
        AND DATEPART(yyyy,AKVH.TimeStamp) = DATEPART(yyyy,DD.DateTime)
        AND DATEPART(month,AKVH.TimeStamp) = DATEPART(month,DD.DateTime)
        AND DATEPART(day,AKVH.TimeStamp) = DATEPART(day,DD.DateTime)
        AND DATEPART(hh,AKVH.TimeStamp) = DATEPART(hh,DD.DateTime)
            WHERE ([Value] IS NULL)
                    -- Skip entries before the first time and after the last time as that would create lots of unnecessary data.
                    AND ([DateTime]>=@EarliestTimeStampInputTable)
                    AND ([DateTime]<=@LatestTimeStampInputTable)



    -- ** STEP 2 **
    -- 4. Filling in TimeStamp, Value, ForrigeVerdi and ForrigeTid for NULL- rows
    --    The NULL columns are the rows added by DateTimeDimensions so we can fill in those hours without values in input table
    UPDATE @InputWithMissingValues
    SET
        [TimeStamp] = GETDATE()
        ,[Value] = 0
        ,ForrigeVerdi = 0
        ,ForrigeTid = GETDATE()
    WHERE ForrigeTid IS NULL OR ForrigeVerdi IS NULL OR [Value] IS NULL OR [TimeStamp] IS NULL



    DECLARE @InputInterpolated AS TABLE
    (
         [TimeStamp] datetime
        ,ItemID uniqueidentifier
        ,ItemProperty nvarchar(20)
        ,[Value] decimal(18,6)
        ,[AveragingWeight] decimal(18,6)
        ,[Value_AT] decimal(18,6)
        ,RoundedHour datetime
    );



    -- ** STEP 3 ** 
    -- Performing the interpolation 
    INSERT INTO @InputInterpolated

        -- 2. Select calculating AveragingWeight
        SELECT
             [TimeStamp]
            ,ItemID
            ,ItemProperty
            ,[Value]

            -- Formula for AveragingWeight is: (PreviousValue+ThisValue)*(ThisTimestamp-PreviousTimestamp)*0,5
            -- This is calculated in this column
            ,(LAG([Value],1,NULL) OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp])+[Value]) * (convert(decimal(18,6),DATEDIFF(second,LAG([TimeStamp],1,NULL)OVER (PARTITION BY ItemID,ItemProperty ORDER BY [TimeStamp]), [TimeStamp]),0)) * 0.5        AS AveragingWeight

            -- Value at whole hour
            ,IIF(DATEPART(minute,[TimeStamp])=0 AND DATEPART(second,[TimeStamp])=0 AND DATEPART(ms,[TimeStamp])=0
                    ,[Value]
                    ,NULL   )       AS Value_AT

            -- Rounded hour (used for grouping)
            ,DATEADD(hour, DATEDIFF(hour, 0, [TimeStamp]),0)            AS RoundedHour

        FROM
        (

            SELECT   InterpolatedTime           AS [TimeStamp]
                    ,ItemID
                    ,ItemProperty

                    -- IIF: We can only perform interpolation if we have bigger than 0 time-difference between previous value
                    --         (prevents div by zero errors)
                    ,   IIF(    DATEDIFF(ms,[ForrigeTid],[TimeStamp])=0
                                ,[Value]

                                -- Formula for calculating interpolated value is:
                                --   ((TimeToCalcFor-PrevTime) * Value + (TimeToCalcFor-NextValue) * PrevValue) / (PrevTime-TimeToCalcFor)
                                ,
                                ( Convert(decimal,DATEDIFF(ms, [ForrigeTid], InterpolatedTime) * [Value]
                                    + CONVERT(decimal,DATEDIFF(ms,InterpolatedTime, [TimeStamp])) * [ForrigeVerdi]
                                    ) / CONVERT(decimal,DATEDIFF(ms, [ForrigeTid], [TimeStamp]))
                                    )

                            )  AS [Value]               
            FROM
            @InputWithMissingValues

            -- Avoid records that are not rounded by us here. Because we append that data in the next select.
            WHERE
            (
                DATEPART(hh,[TimeStamp])<>DATEPART(hh,[ForrigeTid])
            OR  DATEPART(dd,[TimeStamp])<>DATEPART(dd,[ForrigeTid])
            OR  DATEPART(mm,[TimeStamp])<>DATEPART(mm,[ForrigeTid])
            OR  DATEPART(yyyy,[TimeStamp])<>DATEPART(yyyy,[ForrigeTid])
            )


            -- Append every record from the input table so that we don't just get the hourly rounded records, but every record.
            UNION ALL
            SELECT    [TimeStamp]
                        ,[ItemID]
                        ,[ItemProperty]
                        ,[Value]
            FROM @InputTable
        ) AS Sub3



    -- ** STEP 4 ** 
    -- The final grouping by whole hours
    SELECT
         RoundedHour                                            AS [TimeStamp]      -- The lowest timestamp should be the whole hour, ie 15:00:00.000
        ,ItemID
        ,ItemProperty
        ,MAX(Value_AT)                                                              -- The value from the row with whole hour, ie 15:00:00.000, is the Value_AT value

        ,MIN([Value])                                           AS Value_MIN        -- The MIN value. Simply use the MIN() aggregator because we have values both at 15:00:00.000 and 15:59:59.000 and everything logged inbetween.
        ,MAX([Value])                                           AS Value_MAX        -- The MAX value. Simply use the MAX() aggregator because we have values both at 15:00:00.000 and 15:59:59.000 and everything logged inbetween.
        ,SUM(AveragingWeight) / 3599                            AS Value_AVG        -- The AVG value is based on AveragingWeight calculated in sub-query divided on number of seconds in one hour.  AveragingWeight is value*time in seconds.
        ,
        -- The DIFF value. Delta value from previous hour.
        MAX(Value_AT) - LAG(MAX(Value_AT)) OVER(PARTITION BY ItemID,ItemProperty ORDER BY RoundedHour)      AS Value_DIFF
    FROM
    @InputInterpolated AS Sub4

    -- Final group by whole hours
    GROUP BY ItemID,ItemProperty,RoundedHour

    -- Avoid the first record for hours that don't have data for the entire hour
    HAVING DATEPART(minute,MIN([TimeStamp]))=0 AND DATEPART(second,MIN([TimeStamp]))=0 AND DATEPART(ms,MIN([TimeStamp]))=0

END

Upvotes: 0

Views: 1085

Answers (1)

S&#248;ren Kongstad
S&#248;ren Kongstad

Reputation: 1440

I do not get the same results as you, but here is a try:

First I just get your test data, and then I select the period we want to see. In my example I just get the 245 hours of the day.

with 
sensors as
(select * from 
(values
('Sensor1',    10.3     ,cast('20170518 10:27:50' as datetime2))
,('Sensor1',     9.3    ,cast('20170518 10:28:01' as datetime2))
,('Sensor1',     8.3    ,cast('20170518 11:28:00' as datetime2))
,('Sensor1',     7.3    ,cast('20170518 14:01:50' as datetime2))
,('Sensor1',    15.0    ,cast('20170518 15:10:00' as datetime2))
,('Sensor1',    18.3    ,cast('20170518 15:30:00' as datetime2))
,('Sensor1',    10.3    ,cast('20170518 16:30:00' as datetime2))
) a(Sensor,Reading,TS)
)
,TimePeriods as (
select 
dateadd(hour,h,dt) dt
from
(select distinct cast(cast(ts as date) as datetime2) dt from sensors) days
cross join (values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12)
        ,(13),(14),(15),(16),(17),(18),(19),(20),(21),(22),(23)) h(h)
)

Then I get the previous measurement for each sensor measurement, and in the process I pad it out, so the first measurement is added a day before, and the last a day after.

,sensors2 as
(
select Sensor
    ,Reading
    ,TS
    ,isnull(lag(reading) over (partition by sensor order by ts), Reading) prevReading
    ,isnull(lag(ts) over (partition by sensor order by ts),dateadd(day,-1,ts)) prevts
from sensors
union all
select s.Sensor
      ,s.Reading
      ,dateadd(day,1,s.ts)
      ,s.Reading
      ,s.TS
from sensors s
inner join (select sensor,max(ts) ts from sensors group by Sensor) ms
on ms.Sensor=s.Sensor and ms.ts=s.ts
)

This gives

Sensor  Reading TS  prevReading prevts
Sensor1 10.3    2017-05-18 10:27:50.0000000 10.3    2017-05-17 10:27:50.0000000
Sensor1 9.3     2017-05-18 10:28:01.0000000 10.3    2017-05-18 10:27:50.0000000
Sensor1 8.3     2017-05-18 11:28:00.0000000 9.3     2017-05-18 10:28:01.0000000
Sensor1 7.3     2017-05-18 14:01:50.0000000 8.3     2017-05-18 11:28:00.0000000
Sensor1 15.0    2017-05-18 15:10:00.0000000 7.3     2017-05-18 14:01:50.0000000
Sensor1 18.3    2017-05-18 15:30:00.0000000 15.0    2017-05-18 15:10:00.0000000
Sensor1 10.3    2017-05-18 16:30:00.0000000 18.3    2017-05-18 15:30:00.0000000
Sensor1 10.3    2017-05-19 16:30:00.0000000 10.3    2017-05-18 16:30:00.0000000

Now i know enough to calculate the synthetic values at the full hours>

,artificial as
(
select 
    s.Sensor
    ,p.dt [TimeStamp]
    ,(cast(s.Reading-s.prevReading as float)*datediff(second,s.prevts,p.dt))/datediff(second,s.prevts,s.ts)+s.prevReading value_at
from sensors2  s
inner join TimePeriods  p
on p.dt between s.prevts and s.ts
)

Each hour will be in one interval, and using the previous reading, current reading, and prevts and ts I can calculate the value at the stroke of the hour.

For the full 24 Hours I get:

Sensor1 2017-05-18 00:00:00.0000000 10,3
Sensor1 2017-05-18 01:00:00.0000000 10,3
Sensor1 2017-05-18 02:00:00.0000000 10,3
Sensor1 2017-05-18 03:00:00.0000000 10,3
Sensor1 2017-05-18 04:00:00.0000000 10,3
Sensor1 2017-05-18 05:00:00.0000000 10,3
Sensor1 2017-05-18 06:00:00.0000000 10,3
Sensor1 2017-05-18 07:00:00.0000000 10,3
Sensor1 2017-05-18 08:00:00.0000000 10,3
Sensor1 2017-05-18 09:00:00.0000000 10,3
Sensor1 2017-05-18 10:00:00.0000000 10,3
Sensor1 2017-05-18 11:00:00.0000000 8,76679633231453
Sensor1 2017-05-18 12:00:00.0000000 8,0919826652221
Sensor1 2017-05-18 13:00:00.0000000 7,70195016251354
Sensor1 2017-05-18 14:00:00.0000000 7,31191765980498
Sensor1 2017-05-18 15:00:00.0000000 13,8704156479218
Sensor1 2017-05-18 16:00:00.0000000 14,3
Sensor1 2017-05-18 17:00:00.0000000 10,3
Sensor1 2017-05-18 18:00:00.0000000 10,3
Sensor1 2017-05-18 19:00:00.0000000 10,3
Sensor1 2017-05-18 20:00:00.0000000 10,3
Sensor1 2017-05-18 21:00:00.0000000 10,3
Sensor1 2017-05-18 22:00:00.0000000 10,3
Sensor1 2017-05-18 23:00:00.0000000 10,3

Now I need to calculate min, max and avg

I start by making a union between my artificial values and the real meaurements, and measure the time to the next value as duration:

,Allvalues as
(
select Sensor,TimeStamp,Reading,value_at 
,isnull(datediff(second,timestamp,lead(TimeStamp) over (partition by sensor order by timestamp)),3600) duration
from
(
select 
Sensor,TimeStamp,value_at Reading,value_at 
from artificial a
union
select
Sensor,TS,Reading,null value_at
from sensors) a
)

I then calculate the min, max and average. Average is duration of reading times reading, summed for the hour, and averaged out

select
a.Sensor,p.dt Timestamp,max(value_at) value_at,max(Reading) Value_Max,min(Reading) Value_Min
,sum(duration*reading)/3600 Value_Avg
from TimePeriods p
inner join Allvalues a
on a.TimeStamp >=p.dt
and a.TimeStamp < dateadd(hour,1,p.dt)
group by a.Sensor,p.dt

The result is:

Sensor  Timestamp   value_at    Value_Max   Value_Min   Value_Avg
Sensor1 2017-05-18 00:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 01:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 02:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 03:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 04:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 05:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 06:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 07:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 08:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 09:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 10:00:00.0000000 10,3    10,3    9,3 9,766944444
Sensor1 2017-05-18 11:00:00.0000000 8,766796332 8,766796332 8,3 8,517838288
Sensor1 2017-05-18 12:00:00.0000000 8,091982665 8,091982665 8,091982665 8,091982665
Sensor1 2017-05-18 13:00:00.0000000 7,701950163 7,701950163 7,701950163 7,701950163
Sensor1 2017-05-18 14:00:00.0000000 7,31191766  7,31191766  7,3 7,300364151
Sensor1 2017-05-18 15:00:00.0000000 13,87041565 18,3    13,87041565 16,46173594
Sensor1 2017-05-18 16:00:00.0000000 14,3    14,3    10,3    12,3
Sensor1 2017-05-18 17:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 18:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 19:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 20:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 21:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 22:00:00.0000000 10,3    10,3    10,3    10,3
Sensor1 2017-05-18 23:00:00.0000000 10,3    10,3    10,3    10,3

Here is the full select:

with 
sensors as
(select * from 
(values
('Sensor1',    10.3     ,cast('20170518 10:27:50' as datetime2))
,('Sensor1',     9.3    ,cast('20170518 10:28:01' as datetime2))
,('Sensor1',     8.3    ,cast('20170518 11:28:00' as datetime2))
,('Sensor1',     7.3    ,cast('20170518 14:01:50' as datetime2))
,('Sensor1',    15.0    ,cast('20170518 15:10:00' as datetime2))
,('Sensor1',    18.3    ,cast('20170518 15:30:00' as datetime2))
,('Sensor1',    10.3    ,cast('20170518 16:30:00' as datetime2))
) a(Sensor,Reading,TS)
)
,TimePeriods as (
select 
dateadd(hour,h,dt) dt
from
(select distinct cast(cast(ts as date) as datetime2) dt from sensors) days
cross join (values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9),(10),(11),(12)
        ,(13),(14),(15),(16),(17),(18),(19),(20),(21),(22),(23)) h(h)
)
,sensors2 as
(
select Sensor
    ,Reading
    ,TS
    ,isnull(lag(reading) over (partition by sensor order by ts), Reading) prevReading
    ,isnull(lag(ts) over (partition by sensor order by ts),dateadd(day,-1,ts)) prevts
from sensors
union all
select s.Sensor
      ,s.Reading
      ,dateadd(day,1,s.ts)
      ,s.Reading
      ,s.TS
from sensors s
inner join (select sensor,max(ts) ts from sensors group by Sensor) ms
on ms.Sensor=s.Sensor and ms.ts=s.ts
)
,artificial as
(
select 
    s.Sensor
    ,p.dt [TimeStamp]
    ,(cast(s.Reading-s.prevReading as float)*datediff(second,s.prevts,p.dt))/datediff(second,s.prevts,s.ts)+s.prevReading value_at
from sensors2  s
inner join TimePeriods  p
on p.dt between s.prevts and s.ts
)

,Allvalues as
(
select Sensor,TimeStamp,Reading,value_at 
,isnull(datediff(second,timestamp,lead(TimeStamp) over (partition by sensor order by timestamp)),3600) duration
from
(
select 
Sensor,TimeStamp,value_at Reading,value_at 
from artificial a
union
select
Sensor,TS,Reading,null value_at
from sensors) a
)

select
a.Sensor,p.dt Timestamp,max(value_at) value_at,max(Reading) Value_Max,min(Reading) Value_Min
,sum(duration*reading)/3600 Value_Avg
from TimePeriods p
inner join Allvalues a
on a.TimeStamp >=p.dt
and a.TimeStamp < dateadd(hour,1,p.dt)
group by a.Sensor,p.dt

Upvotes: 1

Related Questions