cpp
cpp

Reputation: 21

SQL query based on time series data

I have 2 tables like below:

create table parent_child
(
    parent_id int not null,
    child_id int not null
    
);

INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 273215);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 117936);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 117873);
INSERT INTO parent_child (parent_id, child_id) VALUES (117722, 123305);

INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 240006);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 240005);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 239415);
INSERT INTO parent_child (parent_id, child_id) VALUES (104151, 239414);

INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118310);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 130627);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 298564);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118311);
INSERT INTO parent_child (parent_id, child_id) VALUES (5316, 118312);
create table child
(
    child_id int not null,
    tstamp datetime not null,
    value float,
);

-- Parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:00:00.000000',  29);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:00:00.000000',  52);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:00:00.000000',  51);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:00:00.000000',  31);

-- Parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:00:00.000000',  37);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:00:00.000000',  88);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:00:00.000000',  29);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:00:00.000000',  19);

-- Parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:00:00.000000',  42);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:00:00.000000',  11);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:00:00.000000',  36);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:00:00.000000',  22);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:00:00.000000',   9);

-- Parent 117722
INSERT INTO child (child_id, tstamp, value) VALUES (273215, '2021-07-14 00:05:00.000000',  72);
INSERT INTO child (child_id, tstamp, value) VALUES (117936, '2021-07-14 00:05:00.000000',  99);
INSERT INTO child (child_id, tstamp, value) VALUES (117873, '2021-07-14 00:05:00.000000',  13);
INSERT INTO child (child_id, tstamp, value) VALUES (123305, '2021-07-14 00:05:00.000000',  24);

-- Parent 104151
INSERT INTO child (child_id, tstamp, value) VALUES (240006, '2021-07-14 00:05:00.000000',  65);
INSERT INTO child (child_id, tstamp, value) VALUES (240005, '2021-07-14 00:05:00.000000',  63);
INSERT INTO child (child_id, tstamp, value) VALUES (239415, '2021-07-14 00:05:00.000000',  23);
INSERT INTO child (child_id, tstamp, value) VALUES (239414, '2021-07-14 00:05:00.000000',  15);

-- Parent 5316
INSERT INTO child (child_id, tstamp, value) VALUES (118310, '2021-07-14 00:05:00.000000',  19);
INSERT INTO child (child_id, tstamp, value) VALUES (130627, '2021-07-14 00:05:00.000000',  22);
INSERT INTO child (child_id, tstamp, value) VALUES (298564, '2021-07-14 00:05:00.000000',  47);
INSERT INTO child (child_id, tstamp, value) VALUES (118311, '2021-07-14 00:05:00.000000',  54);
INSERT INTO child (child_id, tstamp, value) VALUES (118312, '2021-07-14 00:05:00.000000',  12);

The data in child table repeats every 5 minutes for each child. That is, for each child of a parent, there will be 288 data points. And this will further repeat for each day with a different (or same) value at a data point.

Questions:

(1) Find out parent_id, date_when_count_of_value_above_30_more_than_12_times_in_a_day, count_of_values_above_30_each_day where value across all children of a parent exceeded 30 for more than 12 times a day and at least 3 days a week. The data points need not be consecutive. Put in other words, if the MAX(value) across all children of a parent at a data point (e.g. 2021-07-14 00:00:00.000000) is above 30, then that is counted as one occurance for that parent on that date.

(2) Find out parent_id, latest_datetime_in_the_week, max_value_across_all_children_in_a_week where the value across all children of a parent was maximum and the datetime when it was maximum. If more than one datetime has the same max value, then pick the latest datetime.

If these two queries can be combined in a single query, that is what I want. Otherwise, these can be 2 differet queries. If single query, then the output of question (2) will repeat for each row from question (1) and that is acceptable.

Inputs:

A list of parent_ids for example, WHERE parent_id IN (117722, 5316)

Sample output (if 2 queries are combined in one; column names can be anything, I am putting long names for brevity):

parent_id | date_when_count_of_value_above_30_more_than_12_times_in_a_day | count_of_values_above_30_each_day | max_value_across_all_children_in_a_week | latest_datetime_in_the_week
117722    | 2021-07-14                                                    | 13                                | 99                                      | 2021-07-09 16:15:00.000000
117722    | 2021-07-11                                                    | 28                                | 99                                      | 2021-07-09 16:15:00.000000
104151    | 2021-07-14                                                    | 19                                | 65                                      | 2021-07-11 18:30:00.000000
104151    | 2021-07-13                                                    | 27                                | 65                                      | 2021-07-11 18:30:00.000000
104151    | 2021-07-11                                                    | 36                                | 65                                      | 2021-07-11 18:30:00.000000

The above is just a sample output. Of course, a parent_id will not appear on the output if the number of occurances in a week is not more than 3.

Upvotes: 1

Views: 329

Answers (3)

marcothesane
marcothesane

Reputation: 6749

Three selects, all depending on the parent id. And using Vertica's analytic limit clause: LIMIT 1 OVER()

WITH
w_p AS (
  SELECT
    parent_id
  , child.*
  FROM parent_child
  JOIN child USING(child_id)
)
,
o_30_count AS (
  SELECT
    parent_id
  , tstamp::DATE
  , SUM(CASE WHEN val > 30 THEN 1 END) AS over_30_count
  FROM w_p 
  GROUP BY 1,2 
  HAVING over_30_count > 12
)
,
max_per_week AS (
  SELECT
    parent_id
  , WEEK(tstamp)
  , MAX(val) AS max_per_week
  FROM w_p
  GROUP BY 1,2
  LIMIT 1 OVER(PARTITION BY parent_id ORDER BY max_per_week DESC)
)
,
last_max_per_week AS (
  SELECT
    parent_id
  , tstamp AS last_week_ts
  FROM w_p
  LIMIT 1 OVER(PARTITION BY parent_id,WEEK(tstamp) ORDER BY val DESC)
)
SELECT
  o_30_count.parent_id
, over_30_count
, max_per_week
, last_week_ts
FROM o_30_count 
JOIN max_per_week USING(parent_id)
JOIN last_max_per_week USING(parent_id);

Upvotes: 1

Varun Vishnoi
Varun Vishnoi

Reputation: 990

I tested this in Sybase. You can leverage the logic and convert Vertica specific. Also, For week I am taking year of week from timestamp. You can modify that as per your requirement. Like today -7 days or window based.

(1)

Select * from
(Select
p.parent_id                     as 'parent_id',
convert(varchar, c.tstamp, 101) as 'date_when_count_of_value_above_30_more_than_12_times_in_a_day',
count(*)                        as 'count_of_values_above_30_each_day '
from #parent_child p LEFT OUTER JOIN #child c
on p.child_id = c.child_id
where 
c.value > 30 and
p.parent_id in (5316, 117722, 104151)
group by 
convert(varchar, c.tstamp, 101), p.parent_id
having count(*) > 12) tb1
group by datepart(cwk, tb1.date_when_count_of_value_above_30_more_than_12_times_in_a_day), parent_id
having count(*) >=3

(2)

Select
parent_id      as 'parent_id',
MAX(tstamp)    as 'latest_datetime_in_the_week',
value          as 'max_value_across_all_children_in_a_week' 
from 
(Select
p.parent_id,c.tstamp,c.value from #parent_child p LEFT OUTER JOIN #child c
on p.child_id = c.child_id
JOIN 
(Select
MAX(c.value) as 'max_value'
from #parent_child p LEFT OUTER JOIN #child c
on p.child_id = c.child_id
where
datepart(cwk, tstamp) = 28
group by 
datepart(cwk, tstamp), p.parent_id) tbl1
ON c.value = tbl1.max_value) tbl2
group by parent_id, value

Upvotes: 1

Neal Burns
Neal Burns

Reputation: 849

For (1):

Note: I moved parentId into the child table for readability

WITH BYDAY AS
(
    SELECT COUNT(*) as CountHiValsOnDay, parent_id, CAST(tstamp as date) as tsDate, DATEPART(week, CAST(tstamp as date)) as TheWeek, YEAR(CAST(tstamp as date)) as TheYear
    FROM child
    WHERE value > 30
    GROUP BY parent_id, CAST(tstamp as date)
    HAVING COUNT(*) > 12
)
SELECT
    bd1.parent_id,
    bd1.tsDate,
    bd1.CountHiValsOnDay,
    bd2.CountThisWeek
FROM BYDAY bd1
    JOIN (
        SELECT COUNT(*) as CountThisWeek, jj.parent_id, jj.TheWeek, jj.TheYear
        FROM BYDAY jj
        GROUP BY jj.parent_id, jj.TheWeek, jj.TheYear
    ) bd2
      ON bd2.parent_id = bd1.parent_id AND
          bd2.TheWeek = bd1.TheWeek AND
          bd2.TheYear = bd1.TheYear
WHERE bd2.CountThisWeek >= 3

Upvotes: 0

Related Questions