Reputation: 2902
I'd like to do the following in SQL (Snowflake dialect):
Given a group of edges containing cycles
FROM_NODE | TO_NODE
--------- | -------
A | B
B | C
C | D
D | C
E | F
G | H
H | G
Identify all unique groups of related nodes
FROM_NODE | TO_NODE | GROUP_NAME
--------- | ------- | ----------
A | B | 1
B | C | 1
C | D | 1
D | B | 1
E | F | 2
G | H | 3
H | G | 3
The code below runs forever (because of the cycles?) (taken from here: How to identify groups/clusters in set of arcs/edges in SQL?):
WITH edges as (
select * from (
values
('A', 'B'),
('B', 'C'),
('C', 'D'),
('D', 'B'),
('E', 'F'),
('G', 'H'),
('H', 'G')
) as e (FROM_NODE, TO_NODE)
),
groups (FROM_NODE, TO_NODE, GROUP_NAME) AS (
SELECT
e1.FROM_NODE,
e1.TO_NODE,
Rank() Over (ORDER BY e1.FROM_NODE) as GROUP_NAME
FROM edges AS e1
UNION ALL SELECT
e2.FROM_NODE,
e2.TO_NODE,
g.GROUP_NAME
FROM edges as e2, groups as g
WHERE g.FROM_NODE = e2.TO_NODE
)
select * from groups order by GROUP_NAME
Upvotes: 0
Views: 139
Reputation: 2746
I have included row id in data-set itself - same can be generated in one of the CTEs as a pseudo-column.
Query -
with data (id,from_node,to_node) as (
select * from values
(1,'A','B'),
(2,'B','C'),
(3,'C','D'),
(4,'D','C'),
(5,'E','F'),
(6,'G','H'),
(7,'H','G')
), cte_1 (id,arr) as
(select id,
array_construct(
coalesce(from_node, id::string),
coalesce(to_node, id::string))
from data
), cte_2 as (
select c1.id id1,c2.id id2,
case
when arrays_overlap(c1.arr,c2.arr) then
least(c1.id,c2.id)
else
null
end chk,
case
when (id1 = chk) then
null
else
chk
end child
from cte_1 c1 left join cte_1 c2
where c1.id != c2.id
), grp_cte as (
select id1,
min(child) chk
from cte_2
group by id1
order by id1
), relation_cte as (
select id1, connect_by_root id1 as parent from grp_cte
start with chk is null
connect by chk = prior id1
order by id1
)
select from_node, to_node,
dense_rank() over (order by parent) group_name
from relation_cte, data
where data.id = relation_cte.id1;
Will give following as output -
FROM_NODE | TO_NODE | GROUP_NAME |
---|---|---|
A | B | 1 |
B | C | 1 |
C | D | 1 |
D | C | 1 |
E | F | 2 |
G | H | 3 |
H | G | 3 |
Here is another approach -
with data (id,from_node,to_node) as (
select * from values
(1,'A','B'),
... truncated for bravity
(7,'H','G')
), cte_1 as (
select id, from_node as node from data
union all
select id, to_node from data
), cte_2 as (
select d.id ,
(select min(id) from cte_1 c
where d.from_node = c.node
or d.to_node = c.node
) id1,
case when id <> id1 then id1 end chk
from data d
), relation_cte as (
select id, connect_by_root id as parent from cte_2
start with chk is null
connect by chk = prior id
order by id
)
select from_node, to_node,
dense_rank() over (order by parent) group_name
from relation_cte, data
where data.id = relation_cte.id;
Adding another approach using CONDITIONAL_CHANGE_EVENT
-
with data (from_node,to_node) as (
select * from values
('A','B'),
..... truncated
('H','G')
), cte_1 as (
select *,
array_construct (from_node, to_node) arr1,
array_construct (lead (from_node) over (order by from_node),
lead (to_node) over (order by to_node)) arr2,
arrays_overlap(arr1,arr2) ao
from data
)
select from_node, to_node,
conditional_change_event(ao)
over (order by from_node,to_node) + 1 grp
from cte_1
order by from_node,to_node
Upvotes: 1