MAK
MAK

Reputation: 7260

Performance tuning on Recursive CTE

I have the following table with sample data:

Table: tbl_nodes

create table tbl_nodes
(
    nod1 varchar(50),
    nod2 varchar(50)
);

Sample data:

insert into tbl_nodes values('Node1','Node2'); 
insert into tbl_nodes values('Node2','Node4'); 
insert into tbl_nodes values('Node2','Node3'); 
insert into tbl_nodes values('Node2','Node5'); 
insert into tbl_nodes values('Node3','Node5'); 
insert into tbl_nodes values('Node3','Node6'); 
insert into tbl_nodes values('Node6','Node7'); 
insert into tbl_nodes values('Node10','Node11');
insert into tbl_nodes values('Node6','Node8');
insert into tbl_nodes values('Node18','Node19');
insert into tbl_nodes values('Node9','Node10');
insert into tbl_nodes values('Node12','Node13');
insert into tbl_nodes values('Node15','Node16');

NOTE: I am having more than 5000 records in the above table.

Expected Result:

------------------------------------
Connectivity        
------------------------------------
Node1->Node2->Node3->Node5
Node1->Node2->Node3->Node6->Node7
Node1->Node2->Node3->Node6->Node8
Node1->Node2->Node4
Node1->Node2->Node5
Node9->Node10->Node11

Explaination About expected result: I want to find the connectivity between nodes which are having more than 2 nodes, for an example Node1 has connectivity with Node2 and Node2 with 3,4,5 and so on as shown in the expected result set. And want display each connectivity till the end node found, for an example end nodes are Node4,Node5,Node7,Node8 and Node11.

I tried the following query:

My try:

;WITH CTE AS
(
    SELECT  nod1,nod2,
            CAST(nod1 AS VARCHAR(MAX))+'->' AS conn, 
            1 as lvl 
    from tbl_nodes T1
    where EXISTS (select 1 from tbl_nodes T2 where T1.nod2 =T2.nod1) OR 
    EXISTS (select 1 from tbl_nodes T3 WHERE T1.nod1 =T3.nod2)
    UNION ALL
    SELECT C1.nod1,C1.nod2,
           C.conn+CAST(C1.nod1 AS VARCHAR(MAX))+'->',
           c.lvl+1 
    FROM CTE C INNER JOIN tbl_nodes C1 ON  C.nod2 = C1.nod1
    WHERE CHARINDEX(','+C.nod2+',',C.conn)=0 
),cte2 as 
(
    select * , ROW_NUMBER() over (partition by nod1,nod2 order by lvl)as rn From CTE
),cte3 as
(
    select nod1,nod2 ,MAX(LEN(conn)) conn,MAX(rn) rn
    from cte2
    group by nod1,nod2
)
SELECT DISTINCT c2.conn+c3.nod2 AS Connectivity
from cte3 c3
inner join cte2 c2 on c3.rn = c2.rn and c3.nod1 = c2.nod1
where c3.nod2 not in (select nod1 from cte2)

Above query works fine but unable to get the result for records more than 5000, query keeps running no result.

Edit: I can't attach running data as it has sensitive information, But will explain! I have table with columns Name1 and Name2 which I have referred as Nod1 and Nod2. I want to find out the relationship between the names like we are finding the link between the nodes here in the given example. The person one (Name1) may have done some transaction to second person (Name2) and Name2 may have to do any other person. So I need to find out the link of transactions between the persons. Its just same as the given example. I tried with you given query by partitioning data, for 100 records it comes within seconds, for 500 records it took 1 min and for 5000 records it keeps running because of more permutation and combinations are there. The problem is with last data set (5000) we have to find out the links.

Upvotes: 0

Views: 1204

Answers (2)

There are 2 questions that need to be solved about this problem:

  1. Remove paths that haven't end yet.
  2. Detect loop (which is causing the recursive cte loop infinitely).

So, here is my own version of the answer:

IF  OBJECT_ID('tempdb..#tbl_nodes') IS NOT NULL
    DROP TABLE  #tbl_nodes;
CREATE TABLE    #tbl_nodes (
                    nod1    VARCHAR(50)
                ,   nod2    VARCHAR(50)
                );

CREATE NONCLUSTERED INDEX #IX_tbl_nodes_1 ON #tbl_nodes (nod1, nod2);
CREATE NONCLUSTERED INDEX #IX_tbl_nodes_2 ON #tbl_nodes (nod2, nod1);

INSERT INTO #tbl_nodes (nod1, nod2)
VALUES  ('Node1','Node2')
    ,   ('Node2','Node4')
    ,   ('Node2','Node3')
    ,   ('Node2','Node5')
    ,   ('Node3','Node5')
    ,   ('Node3','Node6')
    ,   ('Node6','Node7')
    ,   ('Node10','Node11')
    ,   ('Node6','Node8')
    ,   ('Node18','Node19')
    ,   ('Node9','Node10')
    ,   ('Node12','Node13')
    ,   ('Node15','Node16')
    ,   ('Node8', 'Node3')
    ;


WITH cte AS (
    SELECT  parent.nod1, parent.nod2
        ,   [link] = CAST('[' + parent.nod1 + '] -> [' + parent.nod2 + ']' AS VARCHAR(MAX))
        ,   [flag] = f.flag
        ,   [loop] = 0
        ,   [stop] = 0
        ,   [nodes] = 2
    FROM        #tbl_nodes AS parent
    LEFT JOIN   #tbl_nodes AS child
            ON  parent.nod1 = child.nod2
    CROSS APPLY (
        SELECT  _f.flag, [rn] = ROW_NUMBER() OVER(ORDER BY _f.flag ASC)
        FROM    (
            SELECT  [flag] = CAST(1 AS BIT)
            UNION ALL
            SELECT  [flag] = CAST(0 AS BIT)
            FROM    #tbl_nodes AS __f
            WHERE   parent.nod2 = __f.nod1
        ) AS _f
    ) AS f
    WHERE   child.nod2 IS NULL
        AND f.rn = 1
    UNION ALL
    SELECT  parent.nod1, child.nod2
        ,   [link] = CAST(parent.link + ' -> [' + child.nod2 + ']' AS VARCHAR(MAX))
        ,   [flag] = f.flag
        ,   [loop] = l.loop
        ,   [stop] = l.stop
        ,   [nodes] = parent.nodes + 1
    FROM        cte AS parent
    CROSS APPLY (
        SELECT  _child.nod1, _child.nod2, [rn] = ROW_NUMBER() OVER(PARTITION BY _child.nod2 ORDER BY _child.nod2)
        FROM    #tbl_nodes AS _child
        WHERE   parent.nod2 = _child.nod1
    ) AS child
    CROSS APPLY (
        SELECT  _f.flag, [rn] = ROW_NUMBER() OVER(ORDER BY _f.flag ASC)
        FROM    (
            SELECT  [flag] = CAST(1 AS BIT)
            UNION ALL
            SELECT  [flag] = CAST(0 AS BIT)
            FROM    #tbl_nodes AS __f
            WHERE   child.nod2 = __f.nod1
        ) AS _f
    ) AS f
    CROSS APPLY (
        SELECT  [loop] = CASE WHEN (LEN(parent.link + ' -> [' + child.nod2 + ']') - LEN(REPLACE(parent.link + ' -> [' + child.nod2 + ']', '[' + child.nod2 + ']', ''))) / (LEN(child.nod2) + 2) > 1 THEN 1 ELSE 0 END
            ,   [stop] = CASE WHEN (LEN(parent.link) - LEN(REPLACE(parent.link, '[' + parent.nod2 + ']', ''))) / (LEN(parent.nod2) + 2) > 1 THEN 1 ELSE 0 END
    ) AS l
    WHERE   child.rn = 1
        AND f.rn = 1
        AND l.stop = 0
)
SELECT  cte.link, cte.loop
FROM    cte
WHERE   (cte.flag = 1 OR cte.loop = 1)
    AND cte.nodes > 2
ORDER BY cte.nod1
OPTION (MAXRECURSION 0);

Cheers.

Updated: As @MAK requested, I update my answer to get paths that have more than 2 nodes.

Upvotes: 1

Salman Arshad
Salman Arshad

Reputation: 272256

Here is a simplified version of your recursive query that uses EXISTS operator:

WITH cte_nodes AS (
    SELECT CAST(nod1 + '->' + nod2 AS VARCHAR(4000)) AS path, nod2
    FROM tbl_nodes AS root
    WHERE NOT EXISTS (
        -- no parent exists thus represents a root node
        SELECT 1
        FROM tbl_nodes
        WHERE nod2 = root.nod1
    ) AND EXISTS (
        -- at least one child exists thus connected with at least one node
        SELECT 1
        FROM tbl_nodes
        WHERE nod1 = root.nod2
    )
    UNION ALL
    SELECT CAST(prnt.path + '->' + chld.nod2 AS VARCHAR(4000)), chld.nod2
    FROM cte_nodes AS prnt
    JOIN tbl_nodes AS chld ON prnt.nod2 = chld.nod1
)
SELECT path
FROM cte_nodes
WHERE NOT EXISTS (
    -- no child exists thus represents a leaf node
    SELECT 1
    FROM tbl_nodes
    WHERE nod1 = cte_nodes.nod2
)
ORDER BY path
OPTION (MAXRECURSION 100) -- increase this value just enough to get the results

Upvotes: 1

Related Questions