Reputation: 4173
I have a Database with many columns, one of them containing Names.
My vb.net software acts as telegram server and waits for the user to send its full name.
The database could have its name spelled differently, for example "Marco Dell'Orso" could be spelled "Marco Dellorso" or "Marco Dell Orso" od "Dell Orso Marco" or whatever. The user could also misspell his name and invert two letters.. for esample "MaCRo Dell'Orso"
I would need a way to return the 5 rows that are the most similar to the words used in the query. What would be the best way? I was thinking of splitting the name on whitechars and then use LIKE in the query with the single words, but that does not work with mistyped words.
EDIT:
My current plan is to that if the database contains more than one or less then one rows with the exact name, then split the input into the single words and return all strings that contain ANY of the input words. this should reduce the rows to analyze from 42000 to a few hundred. Once I have these few hundred lines, i could run a Levenshtein function on the rows and return the 5 most matching..
Is this a good idea?
Upvotes: 0
Views: 52
Reputation: 4173
Solved it this way by combining my custom function with a premade Levenshtein function from this link: How to calculate distance similarity measure of given 2 strings? . I assign a score for each single word that appears in the other wordcomplex. then I add a score based on the Levenshtein comparison of each word to another. works great:
Public Class Form1
Private Sub TextBox1_KeyUp(sender As Object, e As KeyEventArgs) Handles TextBox1.KeyUp
calc()
End Sub
Private Sub TextBox2_KeyUp(sender As Object, e As KeyEventArgs) Handles TextBox2.KeyUp
calc()
End Sub
Sub calc()
Label1.Text = compare(TextBox1.Text, TextBox2.Text)
End Sub
Public Function compare(source As String, target As String) As Integer
Dim score As Double
Dim sourcewords As String() = source.Split(New Char() {" "c, "'"c, "`"c, "´"c})
Dim targetwords As String() = target.Split(New Char() {" "c, "'"c, "`"c, "´"c})
For Each s In sourcewords
If target.Contains(s) Then score = score + 1
For Each t In targetwords
score = score + 1 / (DamerauLevenshteinDistance(s, t, 100) + 1)
Next
Next
For Each s In targetwords
If source.Contains(s) Then score = score + 1
For Each t In sourcewords
score = score + 1 / (DamerauLevenshteinDistance(s, t, 100) + 1)
Next
Next
Return score
End Function
''' <summary>
''' Computes the Damerau-Levenshtein Distance between two strings, represented as arrays of
''' integers, where each integer represents the code point of a character in the source string.
''' Includes an optional threshhold which can be used to indicate the maximum allowable distance.
''' </summary>
''' <param name="source">An array of the code points of the first string</param>
''' <param name="target">An array of the code points of the second string</param>
''' <param name="threshold">Maximum allowable distance</param>
''' <returns>Int.MaxValue if threshhold exceeded; otherwise the Damerau-Leveshteim distance between the strings</returns>
Public Shared Function DamerauLevenshteinDistance(source As String, target As String, threshold As Integer) As Integer
Dim length1 As Integer = source.Length
Dim length2 As Integer = target.Length
' Return trivial case - difference in string lengths exceeds threshhold
If Math.Abs(length1 - length2) > threshold Then
Return Integer.MaxValue
End If
' Ensure arrays [i] / length1 use shorter length
If length1 > length2 Then
Swap(target, source)
Swap(length1, length2)
End If
Dim maxi As Integer = length1
Dim maxj As Integer = length2
Dim dCurrent As Integer() = New Integer(maxi) {}
Dim dMinus1 As Integer() = New Integer(maxi) {}
Dim dMinus2 As Integer() = New Integer(maxi) {}
Dim dSwap As Integer()
For i As Integer = 0 To maxi
dCurrent(i) = i
Next
Dim jm1 As Integer = 0, im1 As Integer = 0, im2 As Integer = -1
For j As Integer = 1 To maxj
' Rotate
dSwap = dMinus2
dMinus2 = dMinus1
dMinus1 = dCurrent
dCurrent = dSwap
' Initialize
Dim minDistance As Integer = Integer.MaxValue
dCurrent(0) = j
im1 = 0
im2 = -1
For i As Integer = 1 To maxi
Dim cost As Integer = If(source(im1) = target(jm1), 0, 1)
Dim del As Integer = dCurrent(im1) + 1
Dim ins As Integer = dMinus1(i) + 1
Dim [sub] As Integer = dMinus1(im1) + cost
'Fastest execution for min value of 3 integers
Dim min As Integer = If((del > ins), (If(ins > [sub], [sub], ins)), (If(del > [sub], [sub], del)))
If i > 1 AndAlso j > 1 AndAlso source(im2) = target(jm1) AndAlso source(im1) = target(j - 2) Then
min = Math.Min(min, dMinus2(im2) + cost)
End If
dCurrent(i) = min
If min < minDistance Then
minDistance = min
End If
im1 += 1
im2 += 1
Next
jm1 += 1
If minDistance > threshold Then
Return Integer.MaxValue - 1
End If
Next
Dim result As Integer = dCurrent(maxi)
Return If((result > threshold), Integer.MaxValue, result)
End Function
Private Shared Sub Swap(Of T)(ByRef arg1 As T, ByRef arg2 As T)
Dim temp As T = arg1
arg1 = arg2
arg2 = temp
End Sub
End Class
Upvotes: 1