Reputation: 2940
I modified this question's answer: Find length of overlap in strings but have issues applying to big data as the iteration is slow.
How can I improve the function below which finds the longest common overlap between two strings anywhere in the two strings (disregarding case)?
Slow function that works, but I'd like to replace with a better one:
strlcs <- function(str1, str2,type="lcs") {
if(nchar(str1) < nchar(str2)) {
x <- str2
str2 <- str1
str1 <- x
}
x <- strsplit(str2, "")[[1L]]
n <- length(x)
s <- sequence(seq_len(n))
s <- split(s, cumsum(s == 1L))
s <- rep(list(s), n)
for(i in seq_along(s)) {
s[[i]] <- lapply(s[[i]], function(x) {
x <- x + (i-1L)
x[x <= n]
})
s[[i]] <- unique(s[[i]])
}
s <- unlist(s, recursive = FALSE)
s <- unique(s[order(-lengths(s))])
i <- 1L
len_s <- length(s)
while(i < len_s) {
lcs <- paste(x[s[[i]]], collapse = "")
check <- grepl(lcs, str1, fixed = TRUE)
if(check) {
if(type=="nchar"){
return(nchar(lcs))
}else{
return(lcs)
}
break
} else {
i <- i + 1L
}
}
}
Sample data:
library(data.table)
sampdata <- data.frame(
str1=c("Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH", "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRARAS", "miguel martinez","albert marks porter"),
str2=c("doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC", "marzardo", "amin nyguen llc", "gerardo contraras", "miggy martinez","albert"),
stringsAsFactors = F
)
###Create sample big data from previous sampledata and apply on huge DT
samplist <- lapply(c(1:10000),FUN=function(x){sampdata})
bigsampdata <- rbindlist(samplist)
The above function is NOT optimized for big data.
How do I make the following happen in less than the currently brutal 20+ seconds?
DESIRED FUNCTION APPLIED ON BIG DATA:
system.time(bigsampdata$desired_LCSnchar <- sapply(c(1:nrow(bigsampdata)),FUN=function(x){strlcs(tolower(bigsampdata$str1[x]),tolower(bigsampdata$str2[x]),type="lcs")}))
user system elapsed
24.290 0.008 24.313
Upvotes: 1
Views: 559
Reputation: 9107
I implemented Wikipedia's solution pseudocode in c++
using Rcpp
.
library(Rcpp)
cppFunction('
String largeset_common_substring(String str1, String str2)
{
std::string S = str1;
std::string T = str2;
int r = S.length();
int n = T.length();
std::vector<std::vector<int> > L(r , std::vector<int>(n));
int z = 0;
std::string ret;
for (int i = 0; i < r; ++i)
{
for (int j = 0; j < n; ++j)
{
if (S[i] == T[j])
{
if (i == 0 || j == 0)
L[i][j] = 1;
else
L[i][j] = L[i - 1][j - 1] + 1;
if (L[i][j] > z)
{
z = L[i][j];
ret = S.substr(i - z + 1, z);
}
}
else
{
L[i][j] = 0;
}
}
}
return ret;
}
')
largeset_common_substring(tolower("GRANT MANAGEMENT LLC"), "miguel grant")
#> [1] "grant"
Here is the timing for your big dataset.
library(data.table)
sampdata <- data.frame(
str1=c("Doug Olivas", "GRANT MANAGEMENT LLC", "LUNA VAN DERESH", "wendy t marzardo", "AMIN NYGUEN COMPANY LLC", "GERARDO CONTRARAS", "miguel martinez","albert marks porter"),
str2=c("doug olivas", "miguel grant", "LUNA VAN DERESH MANAGEMENT LLC", "marzardo", "amin nyguen llc", "gerardo contraras", "miggy martinez","albert"),
stringsAsFactors = F
)
###Create sample big data from previous sampledata and apply on huge DT
samplist <- lapply(c(1:10000),FUN=function(x){sampdata})
bigsampdata <- rbindlist(samplist)
system.time(
bigsampdata[, desired_LCSnchar := purrr::map2_chr(
tolower(bigsampdata$str1),
tolower(bigsampdata$str2),
largeset_common_substring
)]
)
#> user system elapsed
#> 0.78 0.07 1.28
Upvotes: 1
Reputation: 1660
I have found a faster solution using the LCS
function in the qualV
package:
library(data.table)
library(qualV)
strlcs_op <- function(str1, str2) {
v1 <- unlist(strsplit(str1, ""))
v2 <- unlist(strsplit(str2, ""))
return(paste(v1[LCS(v1, v2)$va], collapse = ""))
}
# same as yours but with data.table syntax
system.time(bigsampdata[, desired_LCSnchar := mapply(strlcs,
tolower(str1),
tolower(str2))])
#> user system elapsed
#> 41.64 0.04 42.20
# optimised function
system.time(bigsampdata[, desired_LCSnchar := mapply(strlcs_op,
tolower(str1),
tolower(str2))])
#> user system elapsed
#> 4.58 0.00 4.75
You can speed it up further by parallelising the mapply
with mcmapply
Upvotes: 2