Reputation: 597
I have two tables each with around 200,000 rows. I have run the query below and it still hasn't completed after running for more than an hour. What could be the explanation for this?
SELECT
dbo.[new].[colom1],
dbo.[new].[colom2],
dbo.[new].[colom3],
dbo.[new].[colom4],
dbo.[new].[Value] as 'nieuwe Value',
dbo.[old].[Value] as 'oude Value'
FROM dbo.[new]
JOIN dbo.[old]
ON dbo.[new].[colom1] = dbo.[old].[colom1]
and dbo.[new].[colom2] = dbo.[old].[colom2]
and dbo.[new].[colom3] = dbo.[old].[colom3]
and dbo.[new].[colom4] = dbo.[old].[colom4]
where dbo.[new].[Value] <> dbo.[old].[Value]
from comment;
Upvotes: 12
Views: 2817
Reputation: 44941
It seems that for an equality join on a single column, the rows with NULL value in the join key are being filtered out, but this is not the case for joins on multiple columns.
As a result, the hash join complexity is changed from O(N) to O(N^2).
======================================================================
In that context I would like to recommend a great article written by Paul White on similar issues - Hash Joins on Nullable Columns
======================================================================
I have generated a small simulation of this use-case and I encourage you to test your solutions.
create table mytab1 (c1 int null,c2 int null)
create table mytab2 (c1 int null,c2 int null)
;with t(n) as (select 1 union all select n+1 from t where n < 10)
insert into mytab1 select null,null from t t0,t t1,t t2,t t3,t t4
insert into mytab2 select null,null from mytab1
insert into mytab1 values (111,222);
insert into mytab2 values (111,222);
select * from mytab1 t1 join mytab2 t2 on t1.c1 = t2.c1 and t1.c2 = t2.c2
For the OP query we should remove rows with NULL values in any of the join key columns.
SELECT
dbo.[new].[colom1],
dbo.[new].[colom2],
dbo.[new].[colom3],
dbo.[new].[colom4],
dbo.[new].[Value] as 'nieuwe Value',
dbo.[old].[Value] as 'oude Value'
FROM dbo.[new]
JOIN dbo.[old]
ON dbo.[new].[colom1] = dbo.[old].[colom1]
and dbo.[new].[colom2] = dbo.[old].[colom2]
and dbo.[new].[colom3] = dbo.[old].[colom3]
and dbo.[new].[colom4] = dbo.[old].[colom4]
where dbo.[new].[Value] <> dbo.[old].[Value]
and dbo.[new].[colom1] is not null
and dbo.[new].[colom2] is not null
and dbo.[new].[colom3] is not null
and dbo.[new].[colom4] is not null
and dbo.[old].[colom1] is not null
and dbo.[old].[colom2] is not null
and dbo.[old].[colom3] is not null
and dbo.[old].[colom4] is not null
Upvotes: 7
Reputation: 2504
Using EXCEPT join, you only have to make the larger HASH join on those values that have changed, so much faster:
/*
create table [new] ( colom1 int, colom2 int, colom3 int, colom4 int, [value] int)
create table [old] ( colom1 int, colom2 int, colom3 int, colom4 int, [value] int)
insert old values (1,2,3,4,10)
insert old values (1,2,3,5,10)
insert old values (1,2,3,6,10)
insert old values (1,2,3,7,10)
insert old values (1,2,3,8,10)
insert old values (1,2,3,9,10)
insert new values (1,2,3,4,11)
insert new values (1,2,3,5,10)
insert new values (1,2,3,6,11)
insert new values (1,2,3,7,10)
insert new values (1,2,3,8,10)
insert new values (1,2,3,9,11)
*/
select n.colom1, n.colom2 , n.colom3, n.colom4, n.[value] as newvalue, o.value as oldvalue
from new n
inner join [old] o on n.colom1=o.colom1 and n.colom2=o.colom2 and n.colom3=o.colom3 and n.colom4=o.colom4
inner join
(
select colom1, colom2 , colom3, colom4, [value] from new
except
select colom1, colom2 , colom3, colom4, [value] from old
) i on n.colom1=i.colom1 and n.colom2=i.colom2 and n.colom3=i.colom3 and n.colom4=i.colom4
Upvotes: -1