Reputation: 77
Using BigQuery, I want to group pages depending on their title with one query and calculate different metrics on the groups. As the rules on titles are not mutually exclusive, I've done it this way:
SELECT SUM(views) views, ..., title_group
FROM `fh-bigquery.wikipedia_v3.pageviews_2019`,
UNNEST([
CASE WHEN (title LIKE '%game%')
THEN 'games_group' END,
CASE WHEN (title LIKE '%sport%')
THEN 'sports_group' END,
CASE WHEN (title LIKE '%rarerare%')
THEN 'rare_group' END
]) AS title_group
WHERE DATE(datehour) BETWEEN '2019-01-01' AND '2019-01-10'AND wiki='en'
GROUP BY title_group
And I get:
views ... title_group
3414469869 ...
4355264 ... games_group
1361074 ... sports_group
What I would like to obtain is a bit different:
views ... title_group
4355264 ... games_group
1361074 ... sports_group
0 ... rare_group
Indeed, I need to have the results even if a group didn't show up.
How could I modify my query to get this result ?
Upvotes: 1
Views: 89
Reputation: 1269873
Why not just put the values in separate columns for each hour?
SELECT SUM(CASE WHEN title LIKE '%game%' THEN views ELSE 0 END) as game_views,
SUM(CASE WHEN title LIKE '%sport%' THEN views ELSE 0 END) as sport_views,
SUM(CASE WHEN title LIKE '%rarerare%' THEN views ELSE 0 END) as rare_views
FROM `fh-bigquery.wikipedia_v3.pageviews_2019`
WHERE DATE(datehour) BETWEEN '2019-01-01' AND '2019-01-10' AND
wiki = 'en';
You could then unpivot this if you like:
SELECT el.title_group, el.views
FROM (SELECT SUM(CASE WHEN title LIKE '%game%' THEN views ELSE 0 END) as game_views,
SUM(CASE WHEN title LIKE '%sport%' THEN views ELSE 0 END) as sport_views,
SUM(CASE WHEN title LIKE '%rarerare%' THEN views ELSE 0 END) as rare_views
FROM `fh-bigquery.wikipedia_v3.pageviews_2019`
WHERE DATE(datehour) BETWEEN '2019-01-01' AND '2019-01-10' AND
wiki = 'en'
) v CROSS JOIN
UNNEST([STRUCT('games_groups' as title_group, v.game_views as views),
STRUCT('sports_groups' as title_group, v.sports_views as views),
STRUCT('rare_groups' as title_group, v.rare_views as views)
]
) el
Upvotes: 1
Reputation: 172993
The simplest way is to wrap your original (no any changes) query with additional left join as in example below
#standardSQL
SELECT IFNULL(views, 0) views, title_group
FROM (SELECT title_group FROM UNNEST(['games_group', 'sports_group', 'rare_group']) AS title_group)
LEFT JOIN (
SELECT SUM(views) views, title_group
FROM `fh-bigquery.wikipedia_v3.pageviews_2019`
CROSS JOIN UNNEST([
CASE WHEN (title LIKE '%game%') THEN 'games_group' END,
CASE WHEN (title LIKE '%sport%') THEN 'sports_group' END,
CASE WHEN (title LIKE '%rarerare%') THEN 'rare_group' END
]) AS title_group
WHERE DATE(datehour) BETWEEN '2019-01-01' AND '2019-01-10'AND wiki='en'
GROUP BY title_group
)
USING(title_group)
-- ORDER BY views DESC
with output
Row views title_group
1 4355264 games_group
2 1361074 sports_group
3 0 rare_group
If you are willing to have above optimized / refactored - consider below version (with same output obviously)
#standardSQL
WITH title_groups AS (
SELECT title_group, pattern
FROM UNNEST([STRUCT<title_group STRING, pattern STRING>
('games_group', '%game%'), ('sports_group', '%sport%'), ('rare_group', '%rarerare%')
])
)
SELECT IFNULL(views, 0) views, title_group FROM title_groups
LEFT JOIN (
SELECT SUM(CASE WHEN title LIKE pattern THEN views END) views, title_group
FROM `fh-bigquery.wikipedia_v3.pageviews_2019`
CROSS JOIN title_groups
WHERE DATE(datehour) BETWEEN '2019-01-01' AND '2019-01-10'AND wiki='en'
GROUP BY title_group
)
USING(title_group)
Upvotes: 1