Reputation: 868
Suppose I have a table called my.table
and a split
function that is already defined and returns an array of strings.
SELECT split(lang) as langs
FROM my.table
which returns:
+-----------------------------+
| langs |
+-----------------------------+
| [French, English, English] |
+-----------------------------+
| [Dutch, French, English] |
+-----------------------------+
| [English] |
+-----------------------------+
| [French, Dutch] |
+-----------------------------+
Now I am trying to apply unnest
to turn the above into a table with occurrence of each language, for example:
+--------------------------+
| English | French | Dutch |
+--------------------------+
| 2 | 1 | 0 | # corresponds to [French, English, English] (0 Dutch)
+--------------------------+
| 1 | 1 | 1 |
+--------------------------+
| 1 | 0 | 0 |
+--------------------------+
| 0 | 1 | 1 |
+--------------------------+
I can count the total number of say 'English' in a naive way such as:
WITH x AS (SELECT split(lang) as langs
FROM my.table)
SELECT count(arr_item) as English
FROM x, UNNEST(arr) as arr_item where arr_item = 'English'
Edited:
Each row may contain repeated elements like [English, English, French]
. See first table:row1.
So the output for that one is shown in the second table.
Upvotes: 0
Views: 4215
Reputation: 173106
Below is for BigQuery Standard SQL
Most likely number of languages in your data is not known in advance - so I recommend below approach which first collect all languages in data and puts it in alphabetical order and then for each row produces vector of 0s and 1s representing existence of respective language based on their position in that base languages list
#standardSQL
WITH `project.dataset.table` AS (
SELECT 'French,English' langs UNION ALL
SELECT 'Dutch,French,English' UNION ALL
SELECT 'English' UNION ALL
SELECT 'French,Dutch'
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(SPLIT(langs)) lang
)
)
SELECT langs, all_langs,
(SELECT STRING_AGG(IF(lang IS NULL, '0', '1') ORDER BY pos)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(SPLIT(langs)) lang
ON base_lang = lang
) AS value
FROM `project.dataset.table` t
CROSS JOIN base b
result is
Row langs all_langs value
1 French,English Dutch,English,French 0,1,1
2 Dutch,French,English Dutch,English,French 1,1,1
3 English Dutch,English,French 0,1,0
4 French,Dutch Dutch,English,French 1,0,1
Hope, this will give you good starting point for your specific use case
Note: BigQuery does not support native PIVOT'ing, so above approach is most likely the most optimal for you
... my rows are array of strings already ... I have [‘French’, ‘English’] instead of ‘French, English’ ... So does that still work?
Absolutelly - Yes! The only change you need to do is to replace UNNEST(SPLIT(langs))
with UNNEST(langs)
as in example below
#standardSQL
WITH `project.dataset.table` AS (
SELECT ['French','English'] langs UNION ALL
SELECT ['Dutch','French','English'] UNION ALL
SELECT ['English'] UNION ALL
SELECT ['French','Dutch']
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(langs) lang
)
)
SELECT langs, all_langs,
(SELECT STRING_AGG(IF(lang IS NULL, '0', '1') ORDER BY pos)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(langs) lang
ON base_lang = lang
) AS value
FROM `project.dataset.table` t
CROSS JOIN base b
with result
if a row is [French, English, English]. desired is 0,1,2
see example below
#standardSQL
WITH `project.dataset.table` AS (
SELECT ['French','English','English'] langs UNION ALL
SELECT ['Dutch','French','English'] UNION ALL
SELECT ['English','English'] UNION ALL
SELECT ['French','Dutch']
), base AS (
SELECT STRING_AGG(lang ORDER BY lang) all_langs
FROM (
SELECT DISTINCT lang
FROM `project.dataset.table`,
UNNEST(langs) lang
)
)
SELECT langs, all_langs,
ARRAY_TO_STRING(ARRAY(SELECT CAST(SUM(IF(lang IS NULL, 0, 1)) AS STRING)
FROM UNNEST(SPLIT(all_langs)) base_lang WITH OFFSET pos
LEFT JOIN UNNEST(langs) lang
ON base_lang = lang
GROUP BY base_lang
ORDER BY MIN(pos)
), ',') AS value
FROM `project.dataset.table` t
CROSS JOIN base b
with result
Upvotes: 3