Jay
Jay

Reputation: 2902

Identify related groups of edges in cyclic graph in SQL

I'd like to do the following in SQL (Snowflake dialect):

Given a group of edges containing cycles

FROM_NODE | TO_NODE
--------- | -------
A         | B
B         | C
C         | D
D         | C
E         | F
G         | H
H         | G

Identify all unique groups of related nodes

FROM_NODE | TO_NODE | GROUP_NAME
--------- | ------- | ----------
A         | B       | 1
B         | C       | 1
C         | D       | 1
D         | B       | 1
E         | F       | 2
G         | H       | 3
H         | G       | 3

The code below runs forever (because of the cycles?) (taken from here: How to identify groups/clusters in set of arcs/edges in SQL?):

WITH edges as (
    select * from (
        values
            ('A', 'B'),
            ('B', 'C'),
            ('C', 'D'),
            ('D', 'B'),
            ('E', 'F'),
            ('G', 'H'),
            ('H', 'G')
    ) as e (FROM_NODE, TO_NODE)
),

groups (FROM_NODE, TO_NODE, GROUP_NAME) AS (
    SELECT
        e1.FROM_NODE,
        e1.TO_NODE,
        Rank() Over (ORDER BY e1.FROM_NODE) as GROUP_NAME
    FROM edges AS e1
    UNION ALL SELECT
        e2.FROM_NODE,
        e2.TO_NODE,
        g.GROUP_NAME
       FROM edges as e2, groups as g
       WHERE g.FROM_NODE = e2.TO_NODE
)

select * from groups order by GROUP_NAME

Upvotes: 0

Views: 139

Answers (1)

Pankaj
Pankaj

Reputation: 2746

I have included row id in data-set itself - same can be generated in one of the CTEs as a pseudo-column.

Query -

with data (id,from_node,to_node) as (
        select * from values
        (1,'A','B'),
        (2,'B','C'),
        (3,'C','D'),
        (4,'D','C'),
        (5,'E','F'),
        (6,'G','H'),
        (7,'H','G')
), cte_1 (id,arr) as
        (select id,
        array_construct(
            coalesce(from_node, id::string),
            coalesce(to_node, id::string))
        from data
), cte_2 as (
        select c1.id id1,c2.id id2,
        case 
            when arrays_overlap(c1.arr,c2.arr) then 
                least(c1.id,c2.id)
            else 
                null 
            end chk,
        case 
            when (id1 = chk) then 
                null 
            else 
                chk 
            end child
        from cte_1 c1 left join cte_1 c2
        where c1.id != c2.id 
    ), grp_cte as (
            select id1, 
            min(child) chk
            from cte_2
            group by id1
            order by id1
    ), relation_cte as (
            select id1, connect_by_root id1 as parent from grp_cte 
            start with chk is null 
            connect by chk = prior id1 
            order by id1
    )
select from_node, to_node, 
dense_rank() over (order by parent) group_name 
from relation_cte, data
where data.id = relation_cte.id1;

Will give following as output -

FROM_NODE TO_NODE GROUP_NAME
A B 1
B C 1
C D 1
D C 1
E F 2
G H 3
H G 3

Here is another approach -

with data (id,from_node,to_node) as (
        select * from values
        (1,'A','B'),
        ... truncated for bravity
        (7,'H','G')
), cte_1 as (
        select id, from_node as node from data 
        union all
        select id, to_node from data
), cte_2 as (
        select d.id , 
        (select min(id) from cte_1 c
            where d.from_node = c.node
            or d.to_node = c.node
        ) id1,
        case when id <> id1 then id1 end chk
        from data d
), relation_cte as (
        select id, connect_by_root id as parent from cte_2 
        start with chk is null 
        connect by chk = prior id
        order by id
)
select from_node, to_node, 
dense_rank() over (order by parent) group_name 
from relation_cte, data
where data.id = relation_cte.id;

Adding another approach using CONDITIONAL_CHANGE_EVENT -

with data (from_node,to_node) as (
        select * from values
        ('A','B'),
        ..... truncated
        ('H','G')
), cte_1 as (
        select *,
        array_construct (from_node, to_node) arr1,
        array_construct (lead (from_node) over (order by from_node), 
                        lead (to_node) over (order by to_node)) arr2,
        arrays_overlap(arr1,arr2) ao
        from data 
)
select from_node, to_node,
conditional_change_event(ao) 
    over (order by from_node,to_node) + 1  grp
from cte_1
order by from_node,to_node

Upvotes: 1

Related Questions