Reputation: 2434
I have a dataset that has three fields: an address, the numbers stripped from the address, and the letters stripped from the address.
IF OBJECT_ID ('tempdb..#addresses') IS NOT NULL
DROP TABLE #addresses
create table #addresses (
address_numbers varchar(50),
address_all varchar(100),
address_letters varchar(100)
)
insert into #addresses
values ('12345678','123 Something Rd, Somewhere NY 45678', 'SOMETHINGRDSOMEWHERENY'),
('12345678','123 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY'),
('23445678','234 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY')
I would like to find groups of addresses by similarity within the same stripped numbers. I know how to find the similarity between two text strings...
select *
from #addresses a
left outer join #addresses b on a.address_numbers = b.address_numbers and MDS_DB.MDQ.SIMILARITY(a.address_letters ,b.address_letters , 2, 0, .90) >= .90
...but I'm not sure how to assign an exemplar/grouping code to each address in the original data. The desired results looks like this:
IF OBJECT_ID ('tempdb..#addresses_desired_result') IS NOT NULL
DROP TABLE #addresses_desired_result
create table #addresses_desired_result (
address_numbers varchar(50),
address_all varchar(100),
address_letters varchar(100),
address_group varchar(100)
)
insert into #addresses_desired_result
values ('12345678','123 Something Rd, Somewhere NY 45678', 'SOMETHINGRDSOMEWHERENY', '123 Something Rd, Somewhere NY 45678'),
('12345678','123 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY', '123 Something Rd, Somewhere NY 45678'),
('23445678','234 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY', '234 Something Road, Somewhere NY 45678')
select *
from #addresses_desired_result
address_group
could be one of the addresses within the group, or it could just be an integer. The goal is to join the distinct list of addresses and exemplars back to a much larger transaction table and group records by the exemplar/group number.
How do I assign an exemplar address/group number to each group of similar addresses within the same stripped numbers?
Upvotes: 1
Views: 105
Reputation: 1034
To get clarification:
IF OBJECT_ID ('tempdb..#addresses') IS NOT NULL
DROP TABLE #addresses
create table #addresses (
id int identity(1,1),
address_numbers varchar(50),
address_all varchar(100),
address_letters varchar(100)
)
insert into #addresses
values ('12345678','123 Something Rd, Somewhere NY 45678', 'SOMETHINGRDSOMEWHERENY'),
('12345678','123 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY'),
('23445678','234 Something Road, Somewhere NY 45678', 'SOMETHINGROADSOMEWHERENY')
select A.address_numbers, A.address_all, A.address_letters,
isnull(B.address_all, A.address_all) as address_group
from #addresses A
left join
(
select A.id, B.address_all,
row_number() over(order by case when B.address_all + ' ' like '% rd %' then 1 when B.address_all + ' ' like '% road %' then 2 end,
case when B.address_all + ' ' like '% st %' then 1 when B.address_all + ' ' like '% street %' then 2 end) AS RowNr
from #addresses A
cross join #addresses B
where left(A.address_all, 5) = left(b.address_all, 5) --place similarity function here
and A.id <> B.id
) B on A.id = B.id and B.RowNr = 1
results:
address_numbers address_all address_letters address_group
12345678 123 Something Rd, Somewhere NY 45678 SOMETHINGRDSOMEWHERENY 123 Something Rd, Somewhere NY 45678
12345678 123 Something Road, Somewhere NY 45678 SOMETHINGROADSOMEWHERENY 123 Something Rd, Somewhere NY 45678
23445678 234 Something Road, Somewhere NY 45678 SOMETHINGROADSOMEWHERENY 234 Something Road, Somewhere NY 45678
I used left(address_all, 5) in place of similarity function, but you can perform any calculation you like.
Upvotes: 1