paulbouu
paulbouu

Reputation: 134

Optimizing R Code for Faster Execution Time

Here is a piece of R code that currently works fine:

library(data.table)

# Example data: the built-in iris data set converted to a data.table.
DT <- as.data.table(iris)


# Labels written into the Start_End column and compared against further down.
Start_Contextual <- "This is Start"
End_Contextual <- "This is End"

# Add a row-number column (1..nrow) to DT by reference.
DT[,Row_num:=seq.int(nrow(DT))]

# Even-numbered rows receive the start label, odd-numbered rows the end label.
DT[,Start_End:=ifelse(Row_num %% 2==0,Start_Contextual, End_Contextual)]

# Column names held as strings: Num is the column to sum, Cluster the grouping column.
Num <- "Petal.Width"
Cluster <- "Species"

# Per group, sum the Num column over the start-labelled rows (NumStart) and over
# the end-labelled rows (NumEnd); ifelse() contributes 0 for rows carrying the
# other label. get(Num) looks the column up from its name, by=eval(Cluster)
# passes the grouping column name as a string, and both Denom columns are the
# constant 1.
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                               DenomStart  = 1,
                               DenomEnd    = 1)
            ,by=eval(Cluster)]

This code runs inside a loop, so I'm forced to use get() to refer to Num, because its value changes on every iteration.

eval() for Cluster also seems unavoidable: Cluster is a variable containing the name of a column of DT. The same is true for Start_End.

However, I want to make it faster by changing the last statement. Here’s what I tried, using the microbenchmark library (which runs each piece of code 100 times and reports the min, max, mean and median durations, among others):

library(microbenchmark)

# Time four variants of the grouped aggregation; each named argument is one
# benchmarked expression and the timings are stored in mbm.
# Variant 1: the baseline, using get(Num) inside ifelse() and by=eval(Cluster).
mbm <- microbenchmark("original method" = {
  ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
                                                         NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                                                         DenomStart  = 1,
                                                         DenomEnd    = 1)
                                                      ,by=eval(Cluster)]
  },
  # Variant 2.
  # NOTE(review): despite its label, this body contains no which() call and is
  # textually identical to "original method" apart from indentation --
  # presumably a copy/paste slip; confirm the intended code with the author.
  "using which" = {
    ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
                                        NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                                        DenomStart  = 1,
                                        DenomEnd    = 1)
                                     ,by=eval(Cluster)]
    },
  # Variant 3: copy the grouping column into a column with the fixed name
  # by_column (added to DT by reference), then group by that literal name
  # instead of by=eval(Cluster).
  # NOTE(review): by_column stays in DT after this expression has run, and the
  # "using set names" variant renames the Cluster column to the same name --
  # verify the two variants do not interfere with each other.
  "creating new column" = {
    DT[,by_column:=get(Cluster)]
    ClusterName <- DT[,.(NumStart    = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
                         NumEnd      = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                         DenomStart  = 1,
                         DenomEnd    = 1),
                      by = "by_column"]
  },
  # Variant 4: rename the Cluster column to by_column, aggregate grouped by
  # that literal name, then rename the column back to its original name.
  "using set names" = {
    setnames(DT,Cluster,"by_column")
    ClusterName <- DT[,.(NumStart    = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
                         NumEnd      = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                         DenomStart  = 1,
                         DenomEnd    = 1),
                      by = "by_column"]
    setnames(DT,"by_column",Cluster)
  }

)

But it’s not much better. Here are the performance results:

Microbenchmark results

Is there anything else I could try?

Upvotes: 1

Views: 196

Answers (1)

Roland
Roland

Reputation: 132969

I would compute on the language. This ensures that data.table optimizations are used (set verbose = TRUE within [.data.table to see the effect). I would also avoid ifelse. Use data.table::fifelse if you must, but I don't see the need here.

# Compare the question's baseline with a version built by computing on the
# language; check = "equal" makes microbenchmark compare the results of the
# two expressions.
microbenchmark(
  # Baseline: column looked up with get(Num), grouping passed via eval(Cluster).
  "original method" = {
  ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
                       NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
                       DenomStart  = 1,
                       DenomEnd    = 1)
                    ,by=eval(Cluster)]
  },
  # substitute() swaps the placeholders x and clust for the symbols named by
  # Num and Cluster, so the DT[...] call is written with the actual column
  # names before eval() runs it. The ifelse() is replaced by multiplying the
  # column by the logical comparison, so rows with the other label add 0.
  "roland" = {
  ClusterName <- eval(substitute(DT[,.(NumStart = sum((Start_End==Start_Contextual) * x, na.rm = TRUE),
                                       NumEnd = sum((Start_End==End_Contextual) * x, na.rm = TRUE),
                                       DenomStart  = 1,
                                       DenomEnd    = 1),
                                    by=clust], 
                                 list(x = as.symbol(Num), clust = as.symbol(Cluster))))
  }, 
  check = "equal")

#Unit: microseconds
#            expr     min      lq    mean  median       uq      max neval
# original method 489.501 504.201 523.362 513.401 529.4010  732.701   100
#          roland 345.801 357.201 432.717 363.751 374.0515 6311.001   100

Note that your example data is too small for meaningful performance benchmarks.

Upvotes: 2

Related Questions