Reputation: 53

Fuzzy matching of rows of two datasets without using a for-loop

I have two datasets A and B with 8 coloumns each. Dataset A has 942 rows and Dataset B has 5079 rows. I have to compare Dataset A and Dataset B and do fuzzy matching. If there is any row is matched in Dataset B I have to mark "Matched" in dataset A in additional column.

I'm relatively new to R and not sure how to optimize r code with lapply, mapply or sapply instead of forloop.

Following is my code

##############################
# Install Necessary Packages #
##############################


#install.packages("openxlsx")
#install.packages("stringdist")
#install.packages("XLConnect")


##############################
#        Load Packages       #
##############################


library(openxlsx)
library(stringdist)
library(XLConnect)


cmd_newleads <- read.xlsx("Src/CMD - New Leads to Load.xlsx", sheet = "Top Leads Full Data", startRow = 1, colNames = TRUE)
cmd_newleads[c("Lead_Match","Opportunity_Match")] <- ""
c4c_leads <- read.xlsx("Src/C4C - Leads.xlsx", sheet = "Leads", startRow = 1, colNames = TRUE)
#c4c_opportunities <- read.xlsx("Src/C4C - Opportunities Data 6-24-16.xlsx", sheet = "Export 06-24-2016 04.55.46 PM", startRow = 1, colNames = TRUE)


cmd_newleads_selcols <- cmd_newleads[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]
cmd_newleads_selcols[is.na(cmd_newleads_selcols)] <- ""
#rownames(cmd_newleads_selcols)

c4cleads_selcols <- c4c_leads[,c("Lead","Address1.(Lead)","City.(Lead)","Region.(Lead)","Postal.Code.(Lead)","Country.(Lead)","Sector.(Lead)","Type.(Lead)")]
c4cleads_selcols[is.na(c4cleads_selcols)] <- ""
#cmd_c4copportunities_selcols <- c4c_opportunities[,c("project_name","project_address","project_city","project_state_province_region_code","project_postalcode","project_country","project_sector","project_type")]

rcount_cmdnewleads <- nrow(cmd_newleads)
rcount_c4cleads <- nrow(c4c_leads)
#rcount_c4copportunities <- nrow(c4c_opportunities)




for(i in 1:rcount_cmdnewleads)
{

    cmd_project_name <- cmd_newleads_selcols[i,1]
    cmd_project_address <- cmd_newleads_selcols[i,2]
    cmd_project_city <- cmd_newleads_selcols[i,3]
    cmd_project_region_code <- cmd_newleads_selcols[i,4]
    cmd_project_postalcode <- cmd_newleads_selcols[i,5]
    cmd_project_country <- cmd_newleads_selcols[i,6]
    cmd_project_sector <- cmd_newleads_selcols[i,7]
    cmd_project_type <- cmd_newleads_selcols[i,8]


    for(j in 1:rcount_c4cleads)
    {

      c4cleads_project_name <- c4cleads_selcols[j,1]
      c4cleads_project_address <- c4cleads_selcols[j,2]
      c4cleads_project_city <- c4cleads_selcols[j,3]
      c4cleads_project_region_code <- c4cleads_selcols[j,4]
      c4cleads_project_postalcode <- c4cleads_selcols[j,5]
      c4cleads_project_country <- c4cleads_selcols[j,6]
      c4cleads_project_sector <- c4cleads_selcols[j,7]
      c4cleads_project_type <- c4cleads_selcols[j,8]

      project_percent <- stringsim(cmd_project_name,c4cleads_project_name, method="dl", p=0.1)
      address_percent <- stringsim(cmd_project_address,c4cleads_project_address, method="dl", p=0.1)
      city_percent <- stringsim(cmd_project_city,c4cleads_project_city, method="dl", p=0.1)
      region_percent <- stringsim(cmd_project_region_code,c4cleads_project_region_code, method="dl", p=0.1)
      postalcode_percent <- stringsim(cmd_project_postalcode,c4cleads_project_postalcode, method="dl", p=0.1)
      country_percent <- stringsim(cmd_project_country,c4cleads_project_country, method="dl", p=0.1)
      sector_percent <- stringsim(cmd_project_sector,c4cleads_project_sector, method="dl", p=0.1)
      type_percent <- stringsim(cmd_project_type,c4cleads_project_type, method="dl", p=0.1)

      if(project_percent > 0.833 && address_percent > 0.833 && city_percent > 0.833 && region_percent > 0.833 && postalcode_percent > 0.833 && country_percent > 0.833 && sector_percent > 0.833 && type_percent > 0.833)
      {
        cmd_newleads[i,51] <- c4cleads[j,c4cleads$Lead.ID]
      }
      else 
      {
        cmd_newleads[i,51] <- "New Lead"
      }

    }

}

Sample data for cmd_newleads_selcols and c4cleads_selcols respectively

 project_name project_address project_city
1     Wynn Mystic Casino & Hotel  22 Chemical Ln      Everett
2 Northpoint Complex Development     East Street    Cambridge
3 Northpoint Complex Development     East Street    Cambridge
4 Northpoint Complex Development     East Street    Cambridge
5 Northpoint Complex Development     East Street    Cambridge
6 Northpoint Complex Development     East Street    Cambridge
  project_state_province_region_code project_postalcode
1                                 MA              02149
2                                 MA              02138
3                                 MA              02138
4                                 MA              02138
5                                 MA              02138
6                                 MA              02138
           project_country project_sector project_type
1 United States of America    Hospitality New Building
2 United States of America     Apartments New Building
3 United States of America     Apartments New Building
4 United States of America     Apartments New Building
5 United States of America     Apartments New Building
6 United States of America     Apartments New Building







Lead           Address1.(Lead) City.(Lead) Region.(Lead) Postal.Code.(Lead) Country.(Lead)
1 1 Hotel Brooklyn Bridge Park Old Fulton St & Furman St    Brooklyn      New York              11201  United States
2      10 Trinity Square Hotel         10 Trinity Square      London             #               EC3P United Kingdom
3                  100 Stewart           1900 1st Avenue     Seattle    Washington              98101  United States
4                1136 S Wabash                         #           #             #                  #   Not assigned
5          115-129 37th Street       115-129 37th Street  Union CIty    New Jersey                  #  United States
6               1418 W Addison            1418 w Addison     Chicago             #              60613   Not assigned
          Sector.(Lead)      Type.(Lead)
1           Hospitality     New Building
2           Hospitality Brand Conversion
3           Hospitality     New Building
4 High Rise Residential     New Building
5             Developer     New Building
6 High Rise Residential     New Building

Upvotes: 1

Answers (2)

Joel Becker

Reputation: 31

If you are experiencing efficiency problems, it's not because you are using a for loop. The main issue is that you are doing a lot of work for every possible combination of rows in your two data sets. Using more efficient language features might speed things up a bit, but it wouldn't change the fact that you're doing a lot of unnecessary computation.

One of the best ways to increase efficiency in data matching problems is to rule out obvious non-matches to cut down on unnecessary computations. For example, you could change your inner loop to first check some key condition; if the score is low (i.e. it's obviously a non-match) you don't need to compute similarity scores for the rest of the attributes.

For example:

for(i in 1:rcount_cmdnewleads)
{

    cmd_project_name <- cmd_newleads_selcols[i,1]
    ...

for(j in 1:rcount_c4cleads)
  {

  c4cleads_project_name <- c4cleads_selcols[j,1]
  project_percent <- stringsim(cmd_project_name,c4cleads_project_name, method="dl", p=0.1)
  if (project_percent < .83) {

      # you already know that this is a non-match, so go to the next one
      next

    } else {

      # check the rest of the values!
      ...

    }
  }
}

I'm not familiar with the R RecordLinkage package, but the Python recordlinkage package has tools for ruling out obvious non-matches early in the process to increase efficiency. Consider checking out this tutorial to learn more about speeding up record linkage by ruling out obvious non matches.

Upvotes: 1

InspectorSands

Reputation: 2949

You might want to look at the package RecordLinkage, which allows you to perform phonetic matching, probabilistic record linkage and machine learning approaches.

Upvotes: 0

Fuzzy matching of rows of two datasets without using a for-loop

Answers (2)

Related Questions