jonv
jonv

Reputation: 53

Define acronyms when running vb code calculating similarity

I am utilizing the following vb code in excel to calculate the degree similarity between column A and column B. It runs great.

The next step for me is to define acronyms so the calculated degree of similarity is not impacted. IE: If I have in column A, "ABC LLC" and in column B, "ABC limited liability company", the current vb code will return that the two columns are not very similar. However, I want them to return as 100% similar by defining that "LLC" and "Limited Liability Company" are really the same thing. What can I do and where can I put it in the code to accomplish this? Thanks!

Disclaimer - yes I know there are add-ins to do this. However, my data set is too large to utilize them.

Public Function Similarity(ByVal String1 As String, _
                           ByVal String2 As String, _
                           Optional ByRef RetMatch As String, _
                           Optional min_match = 1) As Single

'Returns percentile of similarity between 2 strings (ignores case)

'"RetMatch"  returns the characters that match(in order)
'"min_match" specifies minimum number af char's in a row to match


Dim b1() As Byte, b2() As Byte
Dim lngLen1 As Long, lngLen2 As Long
Dim lngResult As Long

  If UCase(String1) = UCase(String2) Then       '..Exactly the same
    Similarity = 1

  Else                                          '..one string is empty
    lngLen1 = Len(String1)
    lngLen2 = Len(String2)
    If (lngLen1 = 0) Or (lngLen2 = 0) Then
      Similarity = 0

    Else                                        '..otherwise find similarity
      b1() = StrConv(UCase(String1), vbFromUnicode)
      b2() = StrConv(UCase(String2), vbFromUnicode)
      lngResult = Similarity_sub(0, lngLen1 - 1, _
                                 0, lngLen2 - 1, _
                                 b1, b2, _
                                 String1, _
                                 RetMatch, _
                                 min_match)
      Erase b1
      Erase b2
      If lngLen1 >= lngLen2 Then
        Similarity = lngResult / lngLen1
      Else
        Similarity = lngResult / lngLen2
      End If
    End If
  End If

End Function

Private Function Similarity_sub(ByVal start1 As Long, ByVal end1 As Long, _
                                ByVal start2 As Long, ByVal end2 As Long, _
                                ByRef b1() As Byte, ByRef b2() As Byte, _
                                ByVal FirstString As String, _
                                ByRef RetMatch As String, _
                                ByVal min_match As Long, _
                                Optional recur_level As Integer = 0) As Long
'* CALLED BY: Similarity *  (RECURSIVE)

Dim lngCurr1 As Long, lngCurr2 As Long
Dim lngMatchAt1 As Long, lngMatchAt2 As Long
Dim i As Long
Dim lngLongestMatch As Long, lngLocalLongestMatch As Long
Dim strRetMatch1 As String, strRetMatch2 As String

  If (start1 > end1) Or (start1 < 0) Or (end1 - start1 + 1 < min_match) _
  Or (start2 > end2) Or (start2 < 0) Or (end2 - start2 + 1 < min_match) Then
    Exit Function     '(exit if start/end is out of string, or length is too short)
  End If

  For lngCurr1 = start1 To end1        '(for each char of first string)
    For lngCurr2 = start2 To end2        '(for each char of second string)
      i = 0
      Do Until b1(lngCurr1 + i) <> b2(lngCurr2 + i)   'as long as chars DO match..
        i = i + 1
        If i > lngLongestMatch Then     '..if longer than previous best, store starts & length
          lngMatchAt1 = lngCurr1
          lngMatchAt2 = lngCurr2
          lngLongestMatch = i
        End If
        If (lngCurr1 + i) > end1 Or (lngCurr2 + i) > end2 Then Exit Do
      Loop
    Next lngCurr2
  Next lngCurr1

  If lngLongestMatch < min_match Then Exit Function 'no matches at all, so no point checking for sub-matches!

  lngLocalLongestMatch = lngLongestMatch                   'call again for BEFORE + AFTER
  RetMatch = ""
                              'Find longest match BEFORE the current position
  lngLongestMatch = lngLongestMatch _
                  + Similarity_sub(start1, lngMatchAt1 - 1, _
                                   start2, lngMatchAt2 - 1, _
                                   b1, b2, _
                                   FirstString, _
                                   strRetMatch1, _
                                   min_match, _
                                   recur_level + 1)
  If strRetMatch1 <> "" Then
    RetMatch = RetMatch & strRetMatch1 & "*"
  Else
    RetMatch = RetMatch & IIf(recur_level = 0 _
                              And lngLocalLongestMatch > 0 _
                              And (lngMatchAt1 > 1 Or lngMatchAt2 > 1) _
                              , "*", "")
  End If

                              'add local longest
  RetMatch = RetMatch & Mid$(FirstString, lngMatchAt1 + 1, lngLocalLongestMatch)

                              'Find longest match AFTER the current position
  lngLongestMatch = lngLongestMatch _
                  + Similarity_sub(lngMatchAt1 + lngLocalLongestMatch, end1, _
                                   lngMatchAt2 + lngLocalLongestMatch, end2, _
                                   b1, b2, _
                                   FirstString, _
                                   strRetMatch2, _
                                   min_match, _
                                   recur_level + 1)

  If strRetMatch2 <> "" Then
    RetMatch = RetMatch & "*" & strRetMatch2
  Else
    RetMatch = RetMatch & IIf(recur_level = 0 _
                              And lngLocalLongestMatch > 0 _
                              And ((lngMatchAt1 + lngLocalLongestMatch < end1) _
                                   Or (lngMatchAt2 + lngLocalLongestMatch < end2)) _
                              , "*", "")
  End If
                             'Return result
  Similarity_sub = lngLongestMatch

End Function

Upvotes: 3

Views: 326

Answers (2)

Slai
Slai

Reputation: 22876

It might be easier to check if strings are Like each other. For example

If "ABC limited liability company" Like "ABC L*L*C*" Then

is True as * matches any 0 or more characters.

Option Compare Text    ' makes string comparisons case insensitive

Function areLike(str1 As String, str2 As String) As Single

    If str1 = str2 Then areLike = 1: Exit Function

    Dim pattern As String, temp As String

    If LenB(str1) < LenB(str2) Then 
        pattern = str1
        temp = str2
    Else
        pattern = str2
        temp = str1
    End If

    pattern = StrConv(pattern, vbUnicode)       ' "ABC LLC" to "A␀B␀C␀ ␀L␀L␀C␀"   
    pattern = Replace(pattern, vbNullChar, "*") ' "A*B*C* *L*L*C*"
    pattern = Replace(pattern, " *", " ")       ' "A*B*C* L*L*C*"

    If temp Like pattern Then areLike = 1: Exit Function

    ' else areLike = some other similarity function

End Function

Upvotes: 0

A.S.H
A.S.H

Reputation: 29332

Without much involvement into your solution, that is your own responsibility, I can suggest some way to incorporate those Abbreviations. However. Please be aware that this method is not guaranteed 100% success, but you are in the fuzzy world already.

Suppose that we have a Dictionary where:

  • The keys are the long phrases
  • The values are the abbreviations

Before comparing two strings, we minimize both of them, by replacing each occurring long phrase by its abbreviation. Then we can compare them with the rest of your method Similarity (or by any other method).

' Fills an abbreviation dictionary
Sub InitializeDict(ByRef abbrev As Scripting.Dictionary)
    abbrev("limited liability company") = "LLC"
    abbrev("United Kingdom") = "U.K."
    '... Add all abbreviations into dict

    ' Instead of harcoding, you can better load the key/value
    ' pairs from a dedicated worksheet... 

End Sub

' Minimizes s by putting abbreviations
Sub Abbreviate(ByRef s As String)
    Static abbrev As Scripting.Dictionary ' <-- static, inititlized only once
    If abbrev Is Nothing Then
        Set abbrev = CreateObject("Scripting.Dictionary")
        abbrev.CompareMode = vbTextCompare
        InitializeDict abbrev
    End If

    Dim phrase
    For Each phrase In abbrev.Keys
        s = Replace(s, phrase, abbrev(phrase), vbTextCompare)
    Next
End Sub

' A small amendment to this function: abbreviate strings before comparing
Public Function Similarity(ByVal String1 As String, _
                       ByVal String2 As String, _
                       Optional ByRef RetMatch As String, _
                       Optional min_match = 1) As Single

    Abbreviate String1
    Abbreviate String2
    ' ... Rest of the routine
End Function

Upvotes: 4

Related Questions