Reputation: 155
In big query I have a table with one column with is an array of strings. The data will look like this :
['a','b']
['b','c']
['c', 'd']
['e']
Now my desired output is :
['a','b','c','d']
['e']
Basically I want to merge all the array that have at least one value in common.
Any way I can do that?
thanks
Upvotes: 2
Views: 2811
Reputation: 196
Bigquery now supports Recursive CTEs.
https://cloud.google.com/bigquery/docs/recursive-ctes
So, borrowing some of the strategy from Mikhail, I put this together. (I also expanded the sample data to ensure it handled other test cases correctly.)
WITH RECURSIVE input AS (
SELECT ['a', 'b'] arr UNION ALL
SELECT ['b', 'c'] UNION ALL
SELECT ['c', 'd'] UNION ALL
SELECT ['x', 'y'] UNION ALL
SELECT ['y', 'a'] UNION ALL
SELECT ['e'] UNION ALL
SELECT ['m', 'n'] UNION ALL
SELECT ['n', 'm']
), preprocessed AS (
-- drops dupes and sorts each array; this is necessary for the ['e', 'f'], ['f', 'e'] case
SELECT ARRAY(SELECT DISTINCT val FROM UNNEST(arr) val ORDER BY val) arr
FROM input
), accumulated AS (
-- recursively accumulate intersecting arrays
SELECT arr, 1 AS run FROM preprocessed
UNION ALL
SELECT
ARRAY(SELECT DISTINCT val FROM UNNEST(ARRAY_CONCAT( t1.arr, t2.arr)) val ORDER BY val) arr,
t2.run + 1 AS run
FROM preprocessed t1, accumulated t2
-- where at least one common element, but not every element in the first
WHERE (SELECT COUNT(1) FROM UNNEST(t1.arr) val JOIN UNNEST(t2.arr) val USING(val))
BETWEEN 1 AND ARRAY_LENGTH(t1.arr) -1
), max_run_by_item AS (
-- gets the final run index for each item in any array
SELECT DISTINCT item, run
FROM accumulated, UNNEST(arr) item
QUALIFY MAX(run) OVER(PARTITION BY item) = run
), finally AS (
-- reduce down to the distinct final run arrays
SELECT ANY_VALUE(a.arr) arr
FROM max_run_by_item mr
JOIN accumulated a USING(run)
WHERE a.arr[0] = mr.item -- more efficient than WHERE EXISTS(SELECT * FROM UNNEST(a.arr) AS x WHERE x = mr.item)
GROUP BY FORMAT('%t', a.arr) -- this, along with the ANY_VALUE serves the purpose of SELECT DISTINCT but in the case of an array
)
SELECT FORMAT('%t', arr) AS arr -- nice format of array for inspection
FROM finally
Note that the final selection formats it as a string for ease of viewing, so skip that step if you need the array.
Upvotes: 0
Reputation: 172993
Usually this type of logic is implemented using so called recursive CTEs, but BigQuery does not support such!
Luckily, recently introduced scripting functionality allows to implement this in BigQuery
So, below is for BigQuery Standard SQL
DECLARE rows_count, run_away_stop INT64 DEFAULT 0;
CREATE TEMP TABLE ttt AS WITH input AS (
SELECT ['a', 'b'] arr UNION ALL
SELECT ['b', 'c'] UNION ALL
SELECT ['c', 'd'] UNION ALL
SELECT ['x', 'y'] UNION ALL
SELECT ['y', 'a'] UNION ALL
SELECT ['e']
)
SELECT ARRAY(SELECT val FROM UNNEST(arr) val ORDER BY val ) arr FROM input;
LOOP
SET rows_count = (SELECT COUNT(1) FROM ttt);
SET run_away_stop = run_away_stop + 1;
CREATE OR REPLACE TEMP TABLE ttt AS
SELECT ANY_VALUE(arr) arr FROM (
SELECT ARRAY(SELECT DISTINCT val FROM UNNEST(arr) val ORDER BY val) arr
FROM (
SELECT ANY_VALUE(arr1) arr1, ARRAY_CONCAT_AGG(arr) arr
FROM (
SELECT t1.arr arr1, t2.arr arr2, ARRAY(SELECT DISTINCT val FROM UNNEST(ARRAY_CONCAT( t1.arr, t2.arr)) val ORDER BY val) arr
FROM ttt t1, ttt t2
WHERE (SELECT COUNT(1) FROM UNNEST(t1.arr) val JOIN UNNEST(t2.arr) val USING(val)) > 0
) GROUP BY FORMAT('%t', arr1)
)
) GROUP BY FORMAT('%t', arr);
IF (rows_count = (SELECT COUNT(1) FROM ttt) AND run_away_stop > 1) OR run_away_stop > 10 THEN BREAK; END IF;
END LOOP;
SELECT ARRAY_TO_STRING(arr, ',') arr FROM ttt;
with final output
Row arr
1 a,b,c,d,x,y
2 e
Above took 3 iterations. In real life example it obviously will take more - so yo need to adjust max allowed iterations - currently it is 10 (see last statement within the LOOP)
Note: most likely above can be optimized - leaving this up to you
Upvotes: 5