Ashutosh SIngh
Ashutosh SIngh

Reputation: 1019

BigQuery: How to calculate the running count of distinct visitors for each last 2 days

I want to calculate unique user count in last 2 days for each Date.

First Query:- I tried with CASE statement give me user count for that day which is not expected result, even I tried with window function.

I know one alternate solution through self join (already mention as second query) which give me correct answer what I' expecting, but I want to do it in a single query.

Reason to do in a single query want to reduce processed data size, if I make self join it will read complete table twice, and the original table size is multi TB.

SELECT
(CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN 
CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) END) AS Date_range,
COUNT(DISTINCT (CASE WHEN dt BETWEEN DATE_SUB(dt, INTERVAL 1 DAY) AND dt THEN Visitor_Name END)) AS Visitor_Count
FROM
(SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
UNION ALL
SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
UNION ALL
SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
UNION ALL
SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
UNION ALL
SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name)
GROUP BY Date_range
ORDER BY Date_range;

Solution:

SELECT
    (CASE WHEN a.dt BETWEEN DATE_SUB(b.dt, INTERVAL 1 DAY) AND b.dt THEN 
    CONCAT(CAST(DATE_SUB(b.dt, INTERVAL 1 DAY) AS STRING), '::', CAST(b.dt AS STRING)) END) AS Date_range,
    COUNT(DISTINCT (CASE WHEN a.dt BETWEEN DATE_SUB(b.dt, INTERVAL 1 DAY) AND b.dt THEN a.Visitor_Name END)) AS Visitor_Count
FROM


    (SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name) AS a


    INNER JOIN


    (SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
    UNION ALL
    SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name) AS b

    ON (a.dt <= b.dt)
GROUP BY Date_range
ORDER BY Date_range;

Upvotes: 0

Views: 85

Answers (2)

Mikhail Berlyant
Mikhail Berlyant

Reputation: 173190

Below is for BigQuery Standard SQL

#standardSQL
SELECT CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) Date_range, 
  ANY_VALUE((SELECT COUNT(DISTINCT visitor) FROM UNNEST(arr_visitors) visitor)) AS Visitor_Count
FROM (
  SELECT dt, 
    ARRAY_AGG(visitor_name) OVER(ORDER BY UNIX_DATE(dt) RANGE BETWEEN 1 PRECEDING AND CURRENT ROW) AS arr_visitors
  FROM `project.dataset.your_table`
)
GROUP BY Date_range 

You can test / play with it using dummy data from your question as below

#standardSQL
WITH `project.dataset.your_table` AS (
  SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name UNION ALL
  SELECT '2018-01-01', 'B' UNION ALL
  SELECT '2018-01-01', 'C' UNION ALL
  SELECT '2018-01-01', 'D' UNION ALL
  SELECT '2018-01-02', 'B' UNION ALL
  SELECT '2018-01-02', 'C' UNION ALL
  SELECT '2018-01-02', 'E' UNION ALL
  SELECT '2018-01-03', 'A' UNION ALL
  SELECT '2018-01-03', 'P' UNION ALL
  SELECT '2018-01-04', 'A' UNION ALL
  SELECT '2018-01-04', 'C' UNION ALL
  SELECT '2018-01-05', 'D' UNION ALL
  SELECT '2018-01-05', 'B' UNION ALL
  SELECT '2018-01-05', 'B' UNION ALL
  SELECT '2018-01-06', 'P' 
)
SELECT CONCAT(CAST(DATE_SUB(dt, INTERVAL 1 DAY) AS STRING), '::', CAST(dt AS STRING)) Date_range, 
  ANY_VALUE((SELECT COUNT(DISTINCT visitor) FROM UNNEST(arr_visitors) visitor)) AS Visitor_Count
FROM (
  SELECT dt, 
    ARRAY_AGG(visitor_name) OVER(ORDER BY UNIX_DATE(dt) RANGE BETWEEN 1 PRECEDING AND CURRENT ROW) AS arr_visitors
  FROM `project.dataset.your_table`
)
GROUP BY Date_range 
ORDER BY Date_range   

with result

Row Date_range              Visitor_Count    
1   2017-12-31::2018-01-01  4    
2   2018-01-01::2018-01-02  5    
3   2018-01-02::2018-01-03  5    
4   2018-01-03::2018-01-04  3    
5   2018-01-04::2018-01-05  4    
6   2018-01-05::2018-01-06  3      

Upvotes: 0

Gordon Linoff
Gordon Linoff

Reputation: 1271003

You can accomplish this by "multiplying" the records before aggregating. That is, give each user a record for each date that the user should count.

Here is an example:

with t as (
      SELECT DATE('2018-01-01') AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-01' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-02' AS dt, 'E' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-03' AS dt, 'P' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'A' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-04' AS dt, 'C' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'D' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-05' AS dt, 'B' AS Visitor_Name
      UNION ALL
      SELECT '2018-01-06' AS dt, 'P' AS Visitor_Name
     )
select dt, count(distinct visitor_name) as num_visitors
from (select distinct date_add(dt, interval inc day) as dt, visitor_name
      from t CROSS JOIN
           (select 0 as inc UNION ALL
            SELECT 1
           ) x
     ) t
group by t.dt
order by t.dt;

Upvotes: 2

Related Questions