Reputation: 134
Here is a piece of R code that currently works fine:
library(data.table)
# Example data: the built-in iris data set as a data.table.
DT <- as.data.table(iris)

# The two labels that end up in the Start_End column.
Start_Contextual <- "This is Start"
End_Contextual <- "This is End"

# Row index 1..nrow(DT).
DT[, Row_num := seq_len(nrow(DT))]

# Label every row as End first, then overwrite the even rows with Start.
DT[, Start_End := End_Contextual]
DT[Row_num %% 2 == 0, Start_End := Start_Contextual]

# Column names are held in variables because they change on each loop pass.
Num <- "Petal.Width"
Cluster <- "Species"
# Per-group totals of the column named by Num, split by the Start_End label.
# get(Num) looks the column up from the name stored in Num; rows whose label
# does not match contribute 0 to the corresponding sum.
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
# Both denominators are fixed at 1 in this example.
DenomStart = 1,
DenomEnd = 1)
# eval(Cluster) yields the grouping column name stored in Cluster.
,by=eval(Cluster)]
This code runs inside a loop, so I'm forced to use get() to refer to Num, because its value changes on every iteration.
eval() for Cluster also seems unavoidable: Cluster is a variable containing the name of a column of DT. The same goes for Start_End.
However, I want to make it faster by changing the last statement. Here is what I tried, using the microbenchmark library (it runs each piece of code 100 times and reports the min, max, mean, median, etc. of the durations):
library(microbenchmark)
# Time four variants of the grouped aggregation; each named expression is
# evaluated repeatedly (100 times here) and the timings are stored in mbm.
mbm <- microbenchmark("original method" = {
# Baseline: get(Num) inside ifelse(), grouping via by = eval(Cluster).
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
DenomStart = 1,
DenomEnd = 1)
,by=eval(Cluster)]
},
"using which" = {
# NOTE(review): this expression is identical to "original method" and never
# calls which(); confirm which variant was actually intended here.
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
DenomStart = 1,
DenomEnd = 1)
,by=eval(Cluster)]
},
"creating new column" = {
# Copy the grouping column into a fixed-name column and group by that name.
# := adds by_column to DT by reference, so the column remains in DT afterwards,
# and the result's grouping column is named by_column.
DT[,by_column:=get(Cluster)]
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
DenomStart = 1,
DenomEnd = 1),
by = "by_column"]
},
"using set names" = {
# Temporarily rename the grouping column to by_column, aggregate, then
# restore the original name.
# NOTE(review): "creating new column" also leaves a by_column column in DT;
# confirm the two variants do not clash when run in the same benchmark.
setnames(DT,Cluster,"by_column")
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
DenomStart = 1,
DenomEnd = 1),
by = "by_column"]
setnames(DT,"by_column",Cluster)
}
)
But it's not much better. Here are the performance results:
Is there anything else I could try?
Upvotes: 1
Views: 196
Reputation: 132969
I would compute on the language. This ensures that data.table optimizations are used (set `verbose = TRUE` within `[.data.table` to see the effect). I would also avoid `ifelse`. Use `data.table::fifelse` if you must, but I don't see the need here.
# Compare the question's baseline against a version built by computing on the
# language.
microbenchmark(
"original method" = {
# Baseline from the question: get(Num) inside ifelse(), by = eval(Cluster).
ClusterName <- DT[,.(NumStart = sum(ifelse(Start_End==Start_Contextual,get(Num),0),na.rm=TRUE),
NumEnd = sum(ifelse(Start_End==End_Contextual,get(Num),0),na.rm=TRUE),
DenomStart = 1,
DenomEnd = 1)
,by=eval(Cluster)]
},
"roland" = {
# substitute() replaces the placeholders x and clust with the symbols named by
# Num and Cluster, so the call that eval() runs refers to the columns directly.
# Multiplying by the logical comparison zeroes out the non-matching rows,
# which takes the place of ifelse().
ClusterName <- eval(substitute(DT[,.(NumStart = sum((Start_End==Start_Contextual) * x, na.rm = TRUE),
NumEnd = sum((Start_End==End_Contextual) * x, na.rm = TRUE),
DenomStart = 1,
DenomEnd = 1),
by=clust],
list(x = as.symbol(Num), clust = as.symbol(Cluster))))
},
# check = "equal" makes microbenchmark verify that both expressions return
# equal results.
check = "equal")
#Unit: microseconds
# expr min lq mean median uq max neval
# original method 489.501 504.201 523.362 513.401 529.4010 732.701 100
# roland 345.801 357.201 432.717 363.751 374.0515 6311.001 100
Note that your example data is too small for meaningful performance benchmarks.
Upvotes: 2