Reputation: 3189
I'm looking for the fastest way to compute this kind of thing (the real data set has 109,964,694 rows).
library(data.table)
DT <- data.table(id = c(1, 2, 1, 4, 2, 1, 7, 8, 8, 10),
                 effect = c("A", "A", "B", "B", "B", "C", "C", "C", "A", "D"),
                 value = 1)
I would like the ids that have both effect == "A" and effect == "C":
intersect(DT[effect=="A",(id)], DT[effect=="C",(id)])
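On the toy data above this already gives the expected answer; a self-contained check (re-creating the sample `DT` from the question):

```r
library(data.table)

DT <- data.table(id = c(1, 2, 1, 4, 2, 1, 7, 8, 8, 10),
                 effect = c("A", "A", "B", "B", "B", "C", "C", "C", "A", "D"),
                 value = 1)

# ids seen with effect "A" (1, 2, 8) intersected with those seen with "C" (1, 7, 8)
res <- intersect(DT[effect == "A", id], DT[effect == "C", id])
res  # 1 8
```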
Is there a faster way to do this?
I would also like the ids that have effect == "A" and effect == "C", but not effect == "B":
setdiff(
intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),
DT[effect=="B",(id)]
)
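On the sample data this drops id 1 (which also has effect "B") and keeps only id 8; a self-contained check:

```r
library(data.table)

DT <- data.table(id = c(1, 2, 1, 4, 2, 1, 7, 8, 8, 10),
                 effect = c("A", "A", "B", "B", "B", "C", "C", "C", "A", "D"),
                 value = 1)

res <- setdiff(
  intersect(DT[effect == "A", id], DT[effect == "C", id]),  # c(1, 8)
  DT[effect == "B", id]                                     # id 1 also has "B"
)
res  # 8
```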
Any better (faster) ideas?
Regards
PS: I tried this kind of thing:
DT[,effect=="diag_998" | effect=="diag_1900",by=id][,sum(V1)==2,by=id][V1==TRUE,(id)]
but it's very, very slow...
EDIT: thanks all for your proposals.
Nothing seems to be faster than V1 and A1:
system.time(V1<-intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]))# 0.014 sec
system.time(V2<-DT[effect=="A" | effect=="C", unique(id[duplicated(id)])]) #17sec
system.time(V3<-DT[,list(cond=all(c("A","C") %in% effect)),by=id][cond==TRUE,id] ) #more than 1 min
system.time(V4<-DT[effect=="A" | effect=="C", .N[.N > 1], by = id]$id) # 17 sec
system.time(V5<-DT[, .GRP[sum(c("A", "C") %chin% effect)==2], id]$id) # 103 sec
system.time(V6<-DT[, .GRP[sum(c("A", "C") %in% effect)==2], id]$id)#more than 1 min
setkey(DT, effect)
system.time(V7<-DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)#0.19 sec
system.time(A1<-setdiff(intersect(DT[effect=="A",(id)],DT[effect=="C",(id)]),DT[effect=="D",(id)])) # 0.014 sec
system.time(A2<-DT[,list(cond=all(c("A","C") %in% effect) & (!"D" %in% effect)),by=id][cond==TRUE,id])#more than 1 min
system.time(A3<-DT[,list(cond=all(c("A","C") %chin% effect) & (!"D" %chin% effect)),by=id][cond==TRUE,id])
system.time(A4<-DT[.(c("A", "C", "D")), if (.N == 2 & !("D" %in% effect)) TRUE, by = id]$id)# 1sec
EDIT: another benchmark, thanks to @MichaelChirico:
microbenchmark(times=50L,
  bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
  bakal.keyed=intersect(DT["A", id], DT["C", id]),
  rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
  rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: milliseconds
expr min lq mean median uq max neval
bakal 10.963171 11.003494 11.072844 11.019909 11.060331 12.641751 50
bakal.keyed 10.738537 10.794715 10.878960 10.838630 10.892020 12.416713 50
rscr1 9.504886 9.572026 9.662024 9.598426 9.645478 11.127047 50
rscr1.keyed 9.013076 9.037370 9.065215 9.065669 9.083492 9.206366 50
Upvotes: 4
Views: 461
Reputation: 34703
It seems you were on the right path; I can only suggest keying to speed it up.
Here's a small-ish sample data set to weed out the worst attempts:
set.seed(10239)
NN<-1e6
DT<-data.table(id=sample(8e5,NN,T),
effect=sample(LETTERS[1:4],NN,T),
val=rnorm(NN),key="effect")
My original suggestion was to use keying, but not in the best way; basically:
setkey(DT, effect)
DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id
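A minimal sketch of that keyed lookup on the question's toy data (the result should match the `intersect()` answer):

```r
library(data.table)

DT <- data.table(id = c(1, 2, 1, 4, 2, 1, 7, 8, 8, 10),
                 effect = c("A", "A", "B", "B", "B", "C", "C", "C", "A", "D"),
                 value = 1)

setkey(DT, effect)  # sorts by effect so DT[.(...)] is a binary-search join

# join to the "A" and "C" rows, then keep each id that appears more than once
res <- DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id
res  # 1 8
```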
Here is a benchmark of this approach, your original approach, and the others mentioned in the comments (with a little optimization by me, e.g. replacing cond==TRUE with (cond) in @nicola's approach, and effect=="A" | effect=="C" with effect %in% c("A", "C") in @RichardScriven's):
library(microbenchmark)
microbenchmark(times=50L,
bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
nicol=DT[,.(cond=all(c("A","C") %in% effect)), by=id][(cond), id],
rscr2=DT[effect %in% c("A","C"), .N[.N > 1], by = id]$id,
akrun=DT[, .GRP[sum(c("A", "C") %chin% effect) == 2], id]$id,
mikec=DT[.(c("A", "C")), if (.N > 1) TRUE, by = id]$id)
With results on my machine:
Unit: milliseconds
expr min lq mean median uq max neval cld
bakal 14.82926 15.18540 17.86200 15.56453 16.70924 64.99443 50 a
rscr1 13.41102 13.98252 20.11127 14.93054 18.02248 66.14476 50 a
nicol 1329.82013 1377.03884 1436.45650 1404.48956 1483.47944 1758.00831 50 e
rscr2 260.54888 269.86605 294.05219 276.66802 310.76356 479.50419 50 c
akrun 997.43300 1075.17322 1103.06220 1095.08246 1118.16848 1360.80793 50 d
mikec 154.39418 158.90884 180.01096 163.32763 206.59246 235.45523 50 b
Let's see how these perform on a data.table
of the magnitude you mention:
set.seed(12039)
NN<-1e8
DT<-data.table(id=sample(8e5,NN,T),
effect=sample(LETTERS[1:4],NN,T),
val=rnorm(NN),key="effect")
microbenchmark(times=50L,
bakal=intersect(DT[effect=="A", id], DT[effect=="C", id]),
bakal.keyed=intersect(DT["A", id], DT["C", id]),
rscr1=DT[effect %in% c("A","C"), unique(id[duplicated(id)])],
rscr1.keyed=DT[.(c("A","C")), unique(id[duplicated(id)])])
Unit: seconds
expr min lq mean median uq max neval cld
bakal 3.772309 3.991414 4.395669 4.408319 4.681609 5.170224 50 a
bakal.keyed 3.708500 3.807447 4.289518 4.384870 4.653427 5.085429 50 a
rscr1 4.962940 5.210845 5.721636 5.707369 6.162103 6.779900 50 b
rscr1.keyed 4.904702 5.117411 5.727848 5.807186 6.194990 6.975508 50 b
Upvotes: 2