Neal Barsch

Reputation: 2940

How can I determine the longest common substring in two columns of a LARGE data.table in R?

I modified the answer to this question: Find length of overlap in strings, but I have issues applying it to big data because the iteration is slow.

How can I improve the function below, which finds the longest common substring occurring anywhere in two strings (disregarding case)?

Here is the slow function that works, but that I'd like to replace with a faster one:

strlcs <- function(str1, str2,type="lcs") {
  
  
  if(nchar(str1) < nchar(str2)) {
    x <- str2
    str2 <- str1
    str1 <- x
  }
  
  x <- strsplit(str2, "")[[1L]]
  n <- length(x)
  s <- sequence(seq_len(n))
  s <- split(s, cumsum(s == 1L))
  s <- rep(list(s), n)
  
  for(i in seq_along(s)) {
    s[[i]] <- lapply(s[[i]], function(x) {
      x <- x + (i-1L)
      x[x <= n]
    })
    s[[i]] <- unique(s[[i]])
  }
  
  s <- unlist(s, recursive = FALSE)
  s <- unique(s[order(-lengths(s))])
  
  i <- 1L
  len_s <- length(s)
  while(i < len_s) {
    lcs <- paste(x[s[[i]]], collapse = "")
    check <- grepl(lcs, str1, fixed = TRUE)
    if(check) {
      if(type=="nchar"){
        return(nchar(lcs))
      }else{
        return(lcs)
      }
      break
    } else {
      i <- i + 1L 
    }
  }
}
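
For reference, on one lowercased pair from the sample data below it returns the shared substring (expected output shown as a comment):

strlcs(tolower("GRANT MANAGEMENT LLC"), tolower("miguel grant"))
# [1] "grant"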

Sample data:

library(data.table)
sampdata <- data.frame(
  str1=c("Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH", "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRARAS", "miguel martinez","albert marks porter"),
  str2=c("doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC", "marzardo", "amin nyguen llc", "gerardo contraras", "miggy martinez","albert"),
  stringsAsFactors = F
)

### Create sample big data from the previous sampdata and apply on a huge DT
samplist <- lapply(c(1:10000),FUN=function(x){sampdata})
bigsampdata <- rbindlist(samplist)

The above function is NOT optimized for big data.

How do I make the following happen in less than the currently brutal 20+ seconds?

DESIRED FUNCTION APPLIED ON BIG DATA: 
system.time(
  bigsampdata$desired_LCSnchar <- sapply(seq_len(nrow(bigsampdata)), FUN = function(x) {
    strlcs(tolower(bigsampdata$str1[x]), tolower(bigsampdata$str2[x]), type = "lcs")
  })
)
   user  system elapsed 
 24.290   0.008  24.313 

Upvotes: 1

Views: 559

Answers (2)

Paul

Reputation: 9107

I implemented the pseudocode from Wikipedia's longest common substring article in C++ using Rcpp.

library(Rcpp)


cppFunction('
// Classic dynamic-programming longest common substring:
// L[i][j] is the length of the common suffix of S[0..i] and T[0..j].
String largeset_common_substring(String str1, String str2)
{
    std::string S = str1;
    std::string T = str2;
    int r = S.length();
    int n = T.length();
    std::vector<std::vector<int> > L(r, std::vector<int>(n));
    int z = 0;           // length of the longest match found so far
    std::string ret;     // the longest common substring itself

    for (int i = 0; i < r; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            if (S[i] == T[j])
            {
                if (i == 0 || j == 0)
                    L[i][j] = 1;
                else
                    L[i][j] = L[i - 1][j - 1] + 1;

                if (L[i][j] > z)
                {
                    z = L[i][j];
                    // the match ends at S[i], so it starts at i - z + 1
                    ret = S.substr(i - z + 1, z);
                }
            }
            else
            {
                L[i][j] = 0;
            }
        }
    }
    return ret;
}
')
largeset_common_substring(tolower("GRANT MANAGEMENT LLC"), "miguel grant")
#> [1] "grant"

Here is the timing for your big dataset.

library(data.table)
sampdata <- data.frame(
  str1=c("Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH", "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRARAS", "miguel martinez","albert marks porter"),
  str2=c("doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC", "marzardo", "amin nyguen llc", "gerardo contraras", "miggy martinez","albert"),
  stringsAsFactors = F
)

### Create sample big data from the previous sampdata and apply on a huge DT
samplist <- lapply(c(1:10000),FUN=function(x){sampdata})
bigsampdata <- rbindlist(samplist)


system.time(
  bigsampdata[, desired_LCSnchar := purrr::map2_chr(
      tolower(str1),
      tolower(str2),
      largeset_common_substring
  )]
)
#> user  system elapsed 
#> 0.78    0.07    1.28 
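
If you actually want the character count rather than the substring itself (as the column name desired_LCSnchar suggests), you can convert the result column afterwards; a one-liner, assuming the column already holds the matched strings:

# Replace the matched substrings with their lengths (optional).
bigsampdata[, desired_LCSnchar := nchar(desired_LCSnchar)]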

Upvotes: 1

Jon Nagra

Reputation: 1660

I have found a faster solution using the LCS function in the qualV package:

library(data.table)
library(qualV)

strlcs_op <- function(str1, str2) {
    # Split both strings into character vectors and paste back the
    # characters of str1 that qualV::LCS marks as shared with str2.
    v1 <- unlist(strsplit(str1, ""))
    v2 <- unlist(strsplit(str2, ""))
    paste(v1[LCS(v1, v2)$va], collapse = "")
}

# same as yours but with data.table syntax
system.time(bigsampdata[, desired_LCSnchar := mapply(strlcs,
                                                     tolower(str1),
                                                     tolower(str2))])
#>    user  system elapsed 
#>   41.64    0.04   42.20
# optimised function
system.time(bigsampdata[, desired_LCSnchar := mapply(strlcs_op,
                                                     tolower(str1),
                                                     tolower(str2))])
#>    user  system elapsed 
#>    4.58    0.00    4.75

You can speed it up further by parallelising the mapply with parallel::mcmapply.
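
For example, a minimal sketch (mcmapply forks the R process, so it only parallelises on Unix-alikes; on Windows mc.cores must stay at 1, and the speed-up will depend on your machine):

library(parallel)

# Same call as above, spread over all but one of the available cores.
system.time(bigsampdata[, desired_LCSnchar := mcmapply(strlcs_op,
                                                       tolower(str1),
                                                       tolower(str2),
                                                       mc.cores = detectCores() - 1)])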

Upvotes: 2
