R most rapid way conditional selections

Question

Is there a faster way to do conditional selections? Would it be better to convert the data.frame into another type? This test version has ~700k rows, but the real data could have millions.

I'm puzzled by the benchmarks below, because everything is in memory. An alternative might be a database, at the cost of some extra work (DDL, indexing).

> str(df.test)
'data.frame':   694118 obs. of  4 variables:
 $ uid  : chr  "ZyVOZrPOXwkuGSPv" "qBwuxhbrszRcISSRmIlYaQXHRUZE" "azCESULsUinrAeFkGIjEZpOLhrJcnB" "yLXPfpGlnLrtKmCRERj" ...
 $ g1   : chr  "group_70" "group_85" "group_150" "group_32" ...
 $ g2   : chr  "D" "A" "A" "C" ...
 $ value: num  0.7756 0.1389 0.8924 0.2278 0.0709 ...
> df.test[200,]
              uid      g1 g2 value
200 appoBThmLxqFTyjFWyAqzsyJh group_2  E 0.604
> 
> benchmark(replications = 100,df.test[(df.test$uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                            (df.test$g1 == 'group_2') & 
+                                            (df.test$g2 == 'E'),'value'])
                                                          test replications elapsed relative user.self sys.self user.child sys.child
1 df.test[(df.test$uid == "appoBThmLxqFTyjFWyAqzsyJh") & (df.test$g1 == "group_2") & (df.test$g2 == "E"), "value"]          100   10.72        1    10.713    0.007          0         0
> 
> benchmark(replications = 100,subset(df.test,uid=='appoBThmLxqFTyjFWyAqzsyJh' & g1 == 'group_2' & g2== 'E' ))
                                           test replications elapsed relative user.self sys.self user.child sys.child
1 subset(df.test, uid == "appoBThmLxqFTyjFWyAqzsyJh" & g1 == "group_2" & g2 == "E")          100  18.987        1    18.993        0          0         0
> 
> library(data.table)          
> dt.test <- data.table(df.test)
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                       (g1 == 'group_2') & 
+                                       (g2 == 'E'),value])
                                            test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value]          100  10.376        1    10.374    0.002          0         0
> setkey(dt.test,uid,g1,g2)
> #rm(dt.test)                     
> benchmark(replications = 100,dt.test[(uid=='appoBThmLxqFTyjFWyAqzsyJh') & 
+                                       (g1 == 'group_2') & 
+                                       (g2 == 'E'),value])
                                            test replications elapsed relative user.self sys.self user.child sys.child
1 dt.test[(uid == "appoBThmLxqFTyjFWyAqzsyJh") & (g1 == "group_2") & (g2 == "E"), value]          100  13.244        1    13.261        0          0         0

A5C1D2H2I1M1N2O1R2T1 · Accepted Answer

You aren't really making use of "data.table" efficiently in your attempts. For instance, after setting the key, you should consider using J in "data.table", which looks rows up through the key instead of comparing every row with `==`.

Here, I've recreated some sample data to play with (sharing some sample data makes it much easier for others to answer such questions) and I've created some functions to benchmark with.

Here's the sample data. Change "n" if you want to experiment with different sizes of datasets:

library(stringi) # provides stri_rand_strings() for the random ids

# Fix the RNG seed so the sample data is reproducible.
set.seed(1)

# Pools of candidate values for the three lookup columns.
uid <- stri_rand_strings(n = 10000, length = 5)
g1 <- paste0("g", seq_len(1000))
g2 <- c(letters, LETTERS)

# Number of rows to generate.
n <- 1000000

# Every lookup column is drawn from its pool with replacement;
# `value` is standard-normal noise.
df.test <- data.frame(
  uid = sample(uid, size = n, replace = TRUE),
  g1 = sample(g1, size = n, replace = TRUE),
  g2 = sample(g2, size = n, replace = TRUE),
  value = rnorm(n)
)

df.test[200, ] ## The 200th row

Here are the attempts you had made:

# Attempt 1: base-R logical indexing on the data.frame.
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the rows of
# df.test where all three columns equal the requested values.
f1 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  is_match <- df.test$uid == a1 &
    df.test$g1 == a2 &
    df.test$g2 == a3
  df.test[is_match, ]
}
# Attempt 2: the same three-column match, expressed through subset().
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the matching
# rows of df.test.
f2 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  subset(
    df.test,
    subset = (uid == a1) & (g1 == a2) & (g2 == a3)
  )
}

library(data.table)

# Unkeyed data.table built from the sample data.frame.
dt.test <- data.table(df.test)

# Separate copy keyed on the three lookup columns. copy() is needed because
# the key is set by reference, and dt.test itself must stay unkeyed.
dt.test.keyed <- copy(dt.test)
setkeyv(dt.test.keyed, c("uid", "g1", "g2"))

# Attempt 3: the three-column match written as `==` comparisons in the `i`
# argument of the unkeyed data.table.
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the matching
# rows of dt.test.
f3 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dt.test[uid == a1 & g1 == a2 & g2 == a3]
}

# Attempt 4: the same `==` comparisons as f3, run on the keyed copy.
# The key (uid, g1, g2) is set on dt.test.keyed, but the query is still
# written as column comparisons rather than as a lookup by key.
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the matching
# rows of dt.test.keyed.
f4 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dt.test.keyed[uid == a1 & g1 == a2 & g2 == a3]
}

Here is one more with "data.table" and one with "dplyr":

# Keyed lookup: J(a1, a2, a3) is matched against the key columns of
# dt.test.keyed (uid, g1, g2, in that order), so the argument order must
# follow the key order.
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the rows of
# dt.test.keyed for that key combination.
f5 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  dt.test.keyed[J(a1, a2, a3)]
}

library(dplyr)
# dplyr version of the three-column match on the plain data.frame.
# Conditions passed to filter() as separate arguments are combined with AND.
# a1, a2 and a3 are the uid, g1 and g2 values to match; returns the matching
# rows of df.test.
f6 <- function(a1 = "wzd3u", a2 = "g215", a3 = "x") {
  filter(df.test, uid == a1, g1 == a2, g2 == a3)
}

And, here are the results:

library(microbenchmark)

# Time all six candidates, 100 evaluations each (the microbenchmark default,
# spelled out here).
out <- microbenchmark(
  f1(), f2(), f3(), f4(), f5(), f6(),
  times = 100L
)
out
# Unit: milliseconds
#  expr        min         lq       mean     median         uq       max neval
#  f1() 315.560939 327.623885 340.639557 335.504160 342.442239 403.29851   100
#  f2() 333.233436 350.439403 362.876115 356.168562 366.324454 440.86664   100
#  f3() 227.923877 237.390578 249.932411 241.037701 246.196354 329.29018   100
#  f4() 222.598481 232.748170 242.396059 237.787355 243.125148 302.71212   100
#  f5()   1.606372   1.931555   2.602466   2.083269   2.367882  12.00145   100
#  f6() 233.259460 243.932592 255.202134 249.279015 257.420772 329.48901   100

# Distribution of the timings per candidate.
boxplot(out) ## That's a log scale there....

enter image description here

R most rapid way conditional selections

Answers (1)

Related Questions