Adam_G
Adam_G

Reputation: 7879

Use mapply() within attr()

I am running the function below, but adist() is not vectorized, so I need to run it using rowwise(). Obviously this is very slow with a large amount of data.

In my scenario, I only have current_text and previous_text, and change is generated from adist() and the "trafos" attribute is extracted.

df <- tibble(current_text = c("A","AB","ABC"),
             previous_text = c("","A","AB"),
             change = c("II","MI","MMI"))

df <- df %>% 
  rowwise() %>% 
  mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))

How can I run this as a vectorized function, or the very least as a faster function?

Upvotes: 2

Views: 65

Answers (2)

one
one

Reputation: 3902

Using Vectorize:

my_function <- function(previous_text,current_text){
  attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}

vectorized_function <- Vectorize(my_function)

df <- df %>%
  mutate(change_vectorized=vectorized_function(previous_text,current_text))

df
  current_text previous_text change change_vectorized
  <chr>        <chr>         <chr>  <chr>            
1 A            ""            II     II               
2 AB           A             MI     MI               
3 ABC          AB            MMI    MMI  

Here is the benchmark (Note: standalone adist fails with n=100000):

library(microbenchmark)
library(dplyr)

df = data.frame(current_text = c("A","AB","ABC"),
                previous_text = c("","A","AB"))

my_function <- function(previous_text,current_text){
  attr(adist(previous_text, current_text, counts=TRUE),"trafos")
}

vectorized_function <- Vectorize(my_function)

rowwise_func<- function(df){
  df %>% 
    rowwise() %>% 
    mutate(change = attr(adist(previous_text, current_text, counts=TRUE),"trafos"))
}

vectorized_func<-function(df){
  df %>%
    mutate(change_vectorized=vectorized_function(previous_text,current_text))
}

apply_func<-function(df){
  df$change =apply(df, 1, function(x) {
    attr(adist(x[2], x[1], counts=TRUE), "trafos")
  })
}  

adist_func <- function(df){
  df %>% 
    mutate(change = diag(attr(adist(previous_text, current_text, counts=TRUE),"trafos")))
}

#n=3

microbenchmark(
  rowwise_func(df),
  vectorized_func(df),
  apply_func(df),
  adist_func(df)
)

Unit: microseconds
                expr    min      lq     mean  median      uq     max neval cld
    rowwise_func(df) 3179.2 3296.35 4242.982 3396.35 3629.15 13649.8   100   c
 vectorized_func(df) 1480.2 1541.00 2105.509 1590.45 1739.95  6113.3   100  b 
      apply_func(df)  110.4  146.50  232.154  174.70  189.95  4628.8   100 a  
      adist_func(df) 1499.7 1558.30 2303.539 1593.45 1665.15 44858.4   100  b

#n=1000
df_test <- df[sample(1:3,1000,replace=T),]

microbenchmark(
  rowwise_func(df_test),
  vectorized_func(df_test),
  apply_func(df_test),
  adist_func(df_test)
)

Unit: milliseconds
                     expr      min        lq      mean    median        uq      max neval cld
    rowwise_func(df_test)  30.0989  31.65590  38.99190  32.62200  41.28515 175.3273   100  b 
 vectorized_func(df_test)  14.0995  14.73965  19.01849  15.33360  21.55875  48.4703   100 a  
      apply_func(df_test)  13.9025  14.45740  18.17469  14.89315  18.94695  45.1587   100 a  
      adist_func(df_test) 174.6990 186.81355 209.26667 208.16060 221.72470 295.1197   100   c

#n=100000
df_test <- df[sample(1:3,100000,replace=T),]

  microbenchmark(
    rowwise_func(df_test),
    vectorized_func(df_test),
    apply_func(df_test),
    adist_func(df_test)
  )


  
Unit: seconds
                     expr      min       lq     mean   median       uq      max neval cld
    rowwise_func(df_test) 3.505702 3.991654 4.264263 4.144656 4.541689 5.356368   100   c
 vectorized_func(df_test) 1.438014 1.781691 1.981270 1.934037 2.207320 2.605152   100 a  
      apply_func(df_test) 1.682728 2.027163 2.252507 2.238286 2.486441 3.211242   100  b 

Error in `mutate()`:
ℹ In argument: `change = diag(...)`.
Caused by error:
! cannot allocate vector of size 74.5 Gb

Upvotes: 1

BigFinger
BigFinger

Reputation: 1043

A possible solution:

df = data.frame(current_text = c("A","AB","ABC"),
                previous_text = c("","A","AB"))

df$change = apply(df, 1, function(x) {
    attr(adist(x[2], x[1], counts=TRUE), "trafos")
})

Upvotes: 1

Related Questions