Reputation: 172
I have a recursion/hierarchical problem that I'm trying to figure out in BigQuery.
I have a list of employees and each employee has a manager ID. I need to be able to enter a single Employee_ID and return an array of every person beneath them.
CREATE TABLE p_RLS.testHeirarchy
(
Employee_ID INT64,
Employee_Name STRING,
Position STRING,
Line_Manager_ID INT64
);
INSERT INTO p_RLS.testHeirarchy (Employee_ID, Employee_Name, Position, Line_Manager_ID)
VALUES(1,'Joe','Worker',11),
(2,'James','Worker',11),
(3,'Jack','Worker',11),
(4,'Jill','Worker',12),
(5,'Jan','Worker',12),
(6,'Jacquie','Worker',13),
(7,'Joaquin','Worker',14),
(8,'Jeremy','Worker',14),
(9,'Jade','Worker',15),
(10,'Jocelyn','Worker',15),
(11, 'Bob', 'Store Manager',16),
(12, 'Bill', 'Store Manager',16),
(13, 'Barb', 'Store Manager',16),
(14, 'Ben', 'Store Manager',17),
(15, 'Burt', 'Store Manager',17),
(16, 'Sally','Group Manager',18),
(17, 'Sam','Group Manager',19),
(18, 'Anna', 'Ops Manager',20),
(19, 'Amy', 'Ops Manager',20),
(20, 'Zoe', 'State Manager', NULL);
My desired output would resemble:
SELECT 20 as Employee_ID, [19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] as Reports;
SELECT 11 as Employee_ID, [3,2,1] as Reports;
SELECT 1 as Employee_ID, [] as Reports;
I have got the following working but it seems very ugly/inconvenient and doesn't support unlimited levels:
WITH test as (
SELECT L0.Employee_ID, L0.Employee_Name, L0.Position, L0.Line_Manager_ID,
ARRAY_AGG(DISTINCT L1.Employee_ID IGNORE NULLS) as Lvl1,
ARRAY_AGG(DISTINCT L2.Employee_ID IGNORE NULLS) as Lvl2,
ARRAY_AGG(DISTINCT L3.Employee_ID IGNORE NULLS) as Lvl3,
ARRAY_AGG(DISTINCT L4.Employee_ID IGNORE NULLS) as Lvl4,
ARRAY_AGG(DISTINCT L5.Employee_ID IGNORE NULLS) as Lvl5,
ARRAY_AGG(DISTINCT L6.Employee_ID IGNORE NULLS) as Lvl6,
ARRAY_AGG(DISTINCT L7.Employee_ID IGNORE NULLS) as Lvl7
FROM p_RLS.testHeirarchy as L0
LEFT OUTER JOIN p_RLS.testHeirarchy L1 ON L0.Employee_ID = L1.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L2 ON L1.Employee_ID = L2.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L3 ON L2.Employee_ID = L3.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L4 ON L3.Employee_ID = L4.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L5 ON L4.Employee_ID = L5.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L6 ON L5.Employee_ID = L6.Line_Manager_ID
LEFT OUTER JOIN p_RLS.testHeirarchy L7 ON L6.Employee_ID = L7.Line_Manager_ID
WHERE L0.Employee_ID = 16
GROUP BY 1,2,3,4)
SELECT
Employee_ID, ARRAY_CONCAT(
IFNULL(Lvl1,[]),
IFNULL(Lvl2,[]),
IFNULL(Lvl3,[]),
IFNULL(Lvl4,[]),
IFNULL(Lvl5,[]),
IFNULL(Lvl6,[]),
IFNULL(Lvl7,[])) as All_reports
FROM test
Is there a better way to do this? Is a recursive approach possible in BigQuery?
Upvotes: 10
Views: 14502
Reputation: 172993
Recursive CTE was recently introduced !
This makes things so much easier
with recursive iterations as (
select line_manager_id, employee_id, 1 pos from your_table
union all
select b.line_manager_id, a.employee_id, pos + 1
from your_table a join iterations b
on b.employee_id = a.line_manager_id
)
select line_manager_id, string_agg('' || employee_id order by pos, employee_id desc) as reports_as_list
from iterations
where not line_manager_id is null
group by line_manager_id
order by line_manager_id desc
If applied to sample data in question - output is
Upvotes: 15
Reputation: 172993
Below is for BigQuery Standard SQL
DECLARE rows_count, run_away_stop INT64 DEFAULT 0;
CREATE TEMP TABLE initialData AS WITH input AS (
SELECT 1 Employee_ID,'Joe' Employee_Name,'Worker' Position,11 Line_Manager_ID UNION ALL
SELECT 2,'James','Worker',11 UNION ALL
SELECT 3,'Jack','Worker',11 UNION ALL
SELECT 4,'Jill','Worker',12 UNION ALL
SELECT 5,'Jan','Worker',12 UNION ALL
SELECT 6,'Jacquie','Worker',13 UNION ALL
SELECT 7,'Joaquin','Worker',14 UNION ALL
SELECT 8,'Jeremy','Worker',14 UNION ALL
SELECT 9,'Jade','Worker',15 UNION ALL
SELECT 10,'Jocelyn','Worker',15 UNION ALL
SELECT 11, 'Bob', 'Store Manager',16 UNION ALL
SELECT 12, 'Bill', 'Store Manager',16 UNION ALL
SELECT 13, 'Barb', 'Store Manager',16 UNION ALL
SELECT 14, 'Ben', 'Store Manager',17 UNION ALL
SELECT 15, 'Burt', 'Store Manager',17 UNION ALL
SELECT 16, 'Sally','Group Manager',18 UNION ALL
SELECT 17, 'Sam','Group Manager',19 UNION ALL
SELECT 18, 'Anna', 'Ops Manager',20 UNION ALL
SELECT 19, 'Amy', 'Ops Manager',20 UNION ALL
SELECT 20, 'Zoe', 'State Manager', NULL
)
SELECT * FROM input;
CREATE TEMP TABLE ttt AS
SELECT Line_Manager_ID, ARRAY_AGG(Employee_ID) Reports FROM initialData WHERE NOT Line_Manager_ID IS NULL GROUP BY Line_Manager_ID;
LOOP
SET (run_away_stop, rows_count) = (SELECT AS STRUCT run_away_stop + 1, COUNT(1) FROM ttt);
CREATE OR REPLACE TEMP TABLE ttt1 AS
SELECT Line_Manager_ID, ARRAY(SELECT DISTINCT Employee_ID FROM UNNEST(Reports) Employee_ID ORDER BY Employee_ID DESC) Reports
FROM (
SELECT Line_Manager_ID, ARRAY_CONCAT_AGG(Reports) Reports
FROM (
SELECT t2.Line_Manager_ID, ARRAY_CONCAT(t1.Reports, t2.Reports) Reports
FROM ttt t1, ttt t2
WHERE (SELECT COUNTIF(t1.Line_Manager_ID = Employee_ID) FROM UNNEST(t2.Reports) Employee_ID) > 0
) GROUP BY Line_Manager_ID
);
CREATE OR REPLACE TEMP TABLE ttt AS
SELECT * FROM ttt1 UNION ALL
SELECT * FROM ttt WHERE NOT Line_Manager_ID IN (SELECT Line_Manager_ID FROM ttt1);
IF (rows_count = (SELECT COUNT(1) FROM ttt) AND run_away_stop > 1) OR run_away_stop > 10 THEN BREAK; END IF;
END LOOP;
SELECT Employee_ID,
(
SELECT STRING_AGG(CAST(Employee_ID AS STRING), ',' ORDER BY Employee_ID DESC)
FROM ttt.Reports Employee_ID
) Reports_as_list
FROM (SELECT DISTINCT Employee_ID FROM initialData) d
LEFT JOIN ttt ON Employee_ID = Line_Manager_ID
ORDER BY Employee_ID DESC;
with result
Row Employee_ID Reports_as_list
1 20 19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
2 19 17,15,14,10,9,8,7
3 18 16,13,12,11,6,5,4,3,2,1
4 17 15,14,10,9,8,7
5 16 13,12,11,6,5,4,3,2,1
6 15 10,9
7 14 8,7
8 13 6
9 12 5,4
10 11 3,2,1
11 10 null
12 9 null
13 8 null
14 7 null
15 6 null
16 5 null
17 4 null
18 3 null
19 2 null
20 1 null
In case if you need Reports as array - replace last statement in above script with below
SELECT Employee_ID, Reports Reports_as_array
FROM (SELECT DISTINCT Employee_ID FROM initialData) d
LEFT JOIN ttt ON Employee_ID = Line_Manager_ID
ORDER BY Employee_ID DESC;
Note: depends on level of nesting in your hierarchy - you might need to adjust 10
in OR run_away_stop > 10
Upvotes: 7
Reputation: 59175
To the question: "Is a recursive approach possible in BigQuery?"
Yes!
Now that BigQuery supports scripting and loops I solved some recursive problems from the Advent of Code with BigQuery:
CREATE TEMP TABLE planets AS SELECT 'YOU' planet;
LOOP
SET steps = steps+1
;
CREATE OR REPLACE TEMP TABLE planets AS
SELECT DISTINCT planet
FROM (
SELECT origin planet FROM t1 WHERE dest IN (SELECT planet FROM planets)
UNION ALL
SELECT dest planet FROM t1 WHERE origin IN (SELECT planet FROM planets)
)
;
IF 'SAN' IN (SELECT * FROM planets )
THEN LEAVE;
END IF;
END LOOP
;
SELECT steps-2
I would use a similar approach to navigate the graph and annotate all parent relationships.
Soon: I'll write a blog post on the specifics of tree traversal to get everyone under x. But this code will help you in the meantime.
Upvotes: 2