Reputation: 35
As far as I understand, data.table is generally more efficient and faster than dplyr, but today at work I ran into the opposite situation. I created a small simulation to reproduce it.
library(data.table)
library(dplyr)
library(microbenchmark)

# simulated data
dt = data.table(A = sample(1:4247, 10000, replace = TRUE),
                B = sample(1:119,  10000, replace = TRUE),
                C = sample(1:6,    10000, replace = TRUE),
                D = sample(1:30,   10000, replace = TRUE))
dt[, ID := paste(A, ":::", D, ":::", C)]
# execution time
microbenchmark(
    DATA_TABLE = dt[, .(count = uniqueN(ID)), by = c("A", "B", "C")],
    DPLYR = dt %>%
        group_by(A, B, C) %>%
        summarise(count = n_distinct(ID)),
    times = 10
)
Unit: milliseconds
expr min lq mean median uq max neval
DATA_TABLE 14241.57361 14305.67026 15585.80472 14651.16402 16244.22477 21367.56866 10
DPLYR 35.95123 37.63894 47.62637 48.56598 53.59919 62.63978 10
You can see the big difference! Does anyone know the reason? Do you have any advice about when to use dplyr and when to use data.table?
My full code is written in data.table syntax, and now I don't know whether I should translate some chunks of it to dplyr because of this.
Thanks in advance.
Upvotes: 2
Views: 1050
Reputation: 25225
uniqueN() is known to be slow when used inside by (see the data.table issues linked in the timing code below). Here is another option that avoids it:
cols <- c("A", "B", "C")
dt[order(A, B, C), {
    # run-length id over the ordered (A, B, C, ID) rows:
    # increments each time the combination changes
    uniqn <- rleidv(c(.SD, .(ID)))
    # last row of each (A, B, C) group: rowidv resets to 1 at a group change,
    # so diff(...) < 1 marks the boundaries
    lastidx <- c(which(diff(rowidv(.SD)) < 1L), .N)
    # per-group distinct-ID count = difference of run ids at the group ends
    c(.SD[lastidx], .(count = c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
}, .SDcols = cols]
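A tiny toy example (made-up data, just to show what rleidv and rowidv return) may help read the block above:

x <- data.table(g = c(1, 1, 2, 2, 2), id = c("a", "a", "a", "b", "b"))
rleidv(x)    # 1 1 2 3 3 -> new run id whenever (g, id) changes
rowidv(x$g)  # 1 2 1 2 3 -> row number within each g; a reset marks a group end
# so group g=1 ends at row 2 with run id 1 (1 distinct id),
# and g=2 ends at row 5 with run id 3 (3 - 1 = 2 distinct ids)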
timing code:
cols <- c("A","B","C")
microbenchmark(times=1L,
DATA_TABLE = a00 <- dt[, .(count=uniqueN(ID)), cols],
DATA_TABLE1 = a01 <- dt[, .(count=length(unique(ID))), cols],
DPLYR = a_dplyr <- dt %>%
group_by(A,B,C) %>%
summarise(count = n_distinct(ID)),
#https://github.com/Rdatatable/data.table/issues/1120#issuecomment-463584656
mtd0 = a10 <- unique(dt, by=c(cols, "ID"))[, .(count=.N), cols],
#https://github.com/Rdatatable/data.table/issues/1120#issuecomment-463597107
mtd1 = a11 <- dt[, .N, c(cols, "ID")][, .(count=.N), cols],
mtd2 = a2 <- dt[order(A, B, C), {
uniqn <- rleidv(c(.SD, .(ID)))
lastidx <- c(which(diff(rowidv(.SD))<1L), .N)
c(.SD[lastidx], .(count=c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
}, .SDcols=cols]
)
checks that all approaches return the same result:
> fsetequal(a00, a01)
[1] TRUE
> fsetequal(a00, setDT(a_dplyr))
[1] TRUE
> fsetequal(a00, a10)
[1] TRUE
> fsetequal(a00, a11)
[1] TRUE
> fsetequal(a00, a2)
[1] TRUE
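The checks use fsetequal because the methods emit the groups in different row orders; fsetequal compares the tables as sets of rows, for example:

x <- data.table(a = 1:2)
y <- data.table(a = 2:1)
fsetequal(x, y)  # TRUE: same rows, order ignored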
timings for the particular dataset below:
Unit: milliseconds
expr min lq mean median uq max neval
DATA_TABLE 147478.1089 147478.1089 147478.1089 147478.1089 147478.1089 147478.1089 1
DATA_TABLE1 4998.8236 4998.8236 4998.8236 4998.8236 4998.8236 4998.8236 1
DPLYR 244081.6925 244081.6925 244081.6925 244081.6925 244081.6925 244081.6925 1
mtd0 4519.4046 4519.4046 4519.4046 4519.4046 4519.4046 4519.4046 1
mtd1 2866.5808 2866.5808 2866.5808 2866.5808 2866.5808 2866.5808 1
mtd2 809.7442 809.7442 809.7442 809.7442 809.7442 809.7442 1
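So on this dataset, simply swapping uniqueN(ID) for length(unique(ID)) (DATA_TABLE1) is already roughly 30x faster, the rleid-based mtd2 is fastest overall, and mtd1 is a reasonable middle ground if you prefer a one-liner.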
data with 1 million rows:
# R-3.6.1 64bit Win10
library(data.table)     # data.table_1.12.8, getDTthreads() == 4
library(dplyr)          # dplyr_1.0.0
library(microbenchmark)

# simulated data
set.seed(0L)
nr <- 1e6
dt = data.table(A = sample(1:424700, nr, replace = TRUE),
                B = sample(1:11900,  nr, replace = TRUE),
                C = sample(1:600,    nr, replace = TRUE),
                D = sample(1:3000,   nr, replace = TRUE))
dt[, ID := paste(A, ":::", D, ":::", C)]
Upvotes: 3