Reputation: 33
I have a table that looks like this: (There is not a record for every day and not for every field_name)
date | field_name | field_value |
---|---|---|
2021-01-01 | coffee_available | 1 |
2021-01-02 | coffee_available | 1 |
2021-01-03 | tea_available | 0 |
2021-01-03 | coffee_available | 1 |
2021-01-04 | tea_available | 0 |
2021-01-06 | coffee_available | 0 |
2021-01-07 | coffee_available | 0 |
2021-01-08 | coffee_available | 1 |
2021-01-08 | tea_available | 1 |
I want to query the database for specific conditions, eg. the longest streak of days with coffee available or the longest streak of days with no tea available. So the result should be a single number or - even better - the longest streaks with start or end date.
I looked at the other streak questions, but I couldn't figure out how to change it so it fits my problem. Could you please help me out?
Upvotes: 0
Views: 659
Reputation: 10035
You may use the following approach which uses ROW_NUMBER
to determine groups of sequential dates for specific conditions and DATEDIFF
to determine the streak of days and that days within each streak differ only by 1. COUNT
with an adjustment of 1 may also be used to determine the streak as denoted below. I included an overall query and additional queries specific to the scenario questions shared in the question.
Query #1
SELECT
MIN(`DATE`),
MAX(`DATE`),
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) AS day_streak_method_1,
COUNT(1) - 1 AS day_streak_method_2,
t1.field_name,
t1.field_value
FROM
(
SELECT
DATEDIFF(`DATE`, LAG(`DATE`, 1, `DATE`) OVER (PARTITION BY gnum
ORDER BY
`DATE`)) AS day_diff,
t.*
FROM
(
SELECT
*,
(
ROW_NUMBER() OVER (PARTITION BY `field_name`
ORDER BY
`DATE`, `field_name`) - ROW_NUMBER() OVER (PARTITION BY `field_name`, `field_value`
ORDER BY
`DATE`, `field_name`)
)
AS gnum
FROM
conditions
)
t
)
t1
WHERE
day_diff < 2
GROUP BY
gnum,
field_name,
field_value;
MIN(DATE ) |
MAX(DATE ) |
day_streak_method_1 | day_streak_method_2 | field_name | field_value |
---|---|---|---|---|---|
2021-01-01 00:00:00 | 2021-01-03 00:00:00 | 2 | 2 | coffee_available | 1 |
2021-01-03 00:00:00 | 2021-01-04 00:00:00 | 1 | 1 | tea_available | 0 |
2021-01-08 00:00:00 | 2021-01-08 00:00:00 | 0 | 0 | coffee_available | 1 |
2021-01-08 00:00:00 | 2021-01-08 00:00:00 | 0 | 0 | tea_available | 1 |
2021-01-06 00:00:00 | 2021-01-07 00:00:00 | 1 | 1 | coffee_available | 0 |
Query #2
SELECT 'The longest streak of days with coffee available' as `Question 1`;
Question 1 |
---|
The longest streak of days with coffee available |
Query #3
SELECT
MIN(`DATE`),
MAX(`DATE`),
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) AS day_streak_method_1
FROM
(
SELECT
DATEDIFF(`DATE`, LAG(`DATE`, 1, `DATE`) OVER (PARTITION BY gnum
ORDER BY
`DATE`)) AS day_diff,
t.*
FROM
(
SELECT
*,
(
ROW_NUMBER() OVER (PARTITION BY `field_name`
ORDER BY
`DATE`, `field_name`) - ROW_NUMBER() OVER (PARTITION BY `field_name`, `field_value`
ORDER BY
`DATE`, `field_name`)
)
AS gnum
FROM
conditions
WHERE
field_name='coffee_available' AND
field_value=1
)
t
)
t1
WHERE
day_diff < 2
GROUP BY
gnum,
field_name,
field_value
ORDER BY
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) DESC
LIMIT 1;
MIN(DATE ) |
MAX(DATE ) |
day_streak_method_1 |
---|---|---|
2021-01-01 00:00:00 | 2021-01-03 00:00:00 | 2 |
Query #4
SELECT 'The longest streak of days with no tea available' as `Question 2`;
Question 2 |
---|
The longest streak of days with no tea available |
Query #5
SELECT
MIN(`DATE`),
MAX(`DATE`),
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) AS day_streak_method_1
FROM
(
SELECT
DATEDIFF(`DATE`, LAG(`DATE`, 1, `DATE`) OVER (PARTITION BY gnum
ORDER BY
`DATE`)) AS day_diff,
t.*
FROM
(
SELECT
*,
(
ROW_NUMBER() OVER (PARTITION BY `field_name`
ORDER BY
`DATE`, `field_name`) - ROW_NUMBER() OVER (PARTITION BY `field_name`, `field_value`
ORDER BY
`DATE`, `field_name`)
)
AS gnum
FROM
conditions
WHERE
field_name='tea_available' AND
field_value=0
)
t
)
t1
WHERE
day_diff < 2
GROUP BY
gnum,
field_name,
field_value
ORDER BY
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) DESC
LIMIT 1;
MIN(DATE ) |
MAX(DATE ) |
day_streak_method_1 |
---|---|---|
2021-01-03 00:00:00 | 2021-01-04 00:00:00 | 1 |
Edit 1
Used less subqueries with the assistance of a case expression. Apply additional filters in where clause where needed.
SELECT
MIN(`DATE`),
MAX(`DATE`),
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) AS day_streak_method_1,
field_name,
field_value
FROM (
SELECT
*,
CASE
WHEN DATEDIFF(
`date`,
LAG(`date`,1,`date`) OVER (
PARTITION BY `field_name`, `field_value`
ORDER BY `date`
)
) < 2 THEN (
ROW_NUMBER() OVER (
PARTITION BY `field_name`
ORDER BY `date`, `field_name`
) -
ROW_NUMBER() OVER (
PARTITION BY `field_name`, `field_value`
ORDER BY `date`, `field_name`
)
)
ELSE NULL
END as gnum
FROM
conditions
) t
WHERE
gnum IS NOT NULL
GROUP BY
gnum, field_name, field_value
Using older versions of mysql
SELECT
MIN(`DATE`),
MAX(`DATE`),
DATEDIFF(MAX(`DATE`), MIN(`DATE`)) AS day_streak,
field_name,
field_value
FROM (
SELECT
*,
@seq1:=IF(@seq1_prev = `field_name`,@seq1+1,1),
@seq2:=IF(@seq2_prev = CONCAT(`field_name`, `field_value`),@seq2+1,1),
CASE
WHEN DATEDIFF(
`date`,
IF(
@seq2_prev=CONCAT(`field_name`, `field_value`),
@date_prev,
`date`
)
) < 2 THEN @seq1 - @seq2
ELSE NULL
END as gnum,
@seq1_prev:=`field_name`,
@seq2_prev:=CONCAT(`field_name`, `field_value`),
@date_prev:=`date`
FROM
conditions
CROSS JOIN (
SELECT
@seq1:=0,
@seq1_prev:=NULL,
@seq2:=0,
@seq2_prev:=NULL,
@date_prev:=NULL
) as vars
ORDER BY `date`, `field_name`
) t
GROUP BY
gnum, field_name, field_value
View working demo on db fiddle
Let me know if this works for you
Upvotes: 2