Reputation: 101
I need to estimate some statistics within different groups. My actual data set has more than 10 million rows.
It is very easy to do this with a for-loop. I attempted to do the same thing using data.table, but it was much slower than the for-loop.
library("data.table")
set.seed(1234) # reproducibility
dt1 <- data.table(id=rep(letters[1:5], rep(10,5)), group=1:10, value=rnorm(100))
# Method 1: using for loops
loopFUN <- function(DT){
  Out1 <- list()
  idx <- 1
  for(i in unique(DT$id)){
    for(g in unique(DT$group)){
      sub.g <- subset(DT, group==g)
      sub.i.g <- subset(sub.g, id==i)
      vt <- var.test(x=sub.i.g$value, y=sub.g$value, alternative="greater", ratio=1)
      Out1[[idx]] <- c(i, g, vt$statistic, sqrt(vt$estimate), vt$p.value)
      idx <- idx + 1
    }
  }
  Out1 <- data.table(do.call(rbind, Out1))
  colnames(Out1) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out1)
}
# Method 2: using a function inside data.table
dtFUN <- function(DT){
  test <- function(x, y){
    VT <- var.test(x, y, alternative="greater", ratio=1)
    res <- c(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
    return(res)
  }
  setkey(DT, id, group)
  Out2 <- DT[, {x=value; DT[, list(g2=group, vt=test(x, value)), by=group]}, by=.(id,group)]
  Out2 <- subset(Out2, group==g2)[, .(id, group, g2=rep(1:3,50), vt)]
  Out2 <- dcast(Out2, id + group ~ g2, value.var = "vt")
  colnames(Out2) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out2)
}
LP <- loopFUN(dt1)
DT <- dtFUN(dt1)
all(LP==DT)
library("microbenchmark")
compare <- microbenchmark(LP <- loopFUN(dt1),
                          DT <- dtFUN(dt1),
                          times = 100)
# Results
               expr       min        lq      mean   median        uq       max neval cld
 LP <- loopFUN(dt1)  42.80655  45.33526  47.35974  46.2085  47.90351  88.42227   100   a
   DT <- dtFUN(dt1) 140.41182 145.26643 152.61285 149.7490 154.57031 276.77088   100   b
I get the correct results, but I am struggling to improve the performance. I wonder if anyone could give me some advice on how to make it faster. Thank you.
Upvotes: 1
Views: 228
Reputation: 25225
Another approach:
dt1[, {
    gv <- value
    .SD[, {
        VT <- var.test(value, gv, alternative="greater", ratio=1)
        .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
    },
    by=.(id)]
}, by=.(group)]
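A quick sanity check (a sketch, assuming the small dt1 and loopFUN from the question) that this gives the same numbers as the loop once the rows are put in the same order:
res <- dt1[, {
    gv <- value
    .SD[, {
        VT <- var.test(value, gv, alternative="greater", ratio=1)
        .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
    },
    by=.(id)]
}, by=.(group)]
lp <- loopFUN(dt1)        # loopFUN stores its columns as character
setorder(res, id, group)  # the loop output is ordered by id, then group
all.equal(as.numeric(lp$F.Stat), unname(res$F.Stat))  # should be TRUE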
Timing code (the OP's approaches took too long to time, so I took them out) and another approach that strips away the checks inside var.test (use with caution):
mtd2 <- function() {
  dt1[, {
      gv <- value
      leng <- .N
      .SD[, {
          VT <- var.test(value, gv, alternative="greater", ratio=1)
          .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
      },
      by=.(id)]
  }, by=.(group)]
}

mtd3 <- function() {
  dt1[, {
      gv <- value
      leng <- .N
      .SD[, {
          # see stats:::var.test.default
          ESTIMATE <- var(value) / var(gv)
          .(F.Stat=ESTIMATE, P.value=1 - pf(ESTIMATE, .N - 1L, leng - 1L))
      },
      by=.(id)]
  }, by=.(group)][, SD.Ratio:=sqrt(F.Stat)][]
}
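Before timing, a minimal check (a sketch, with made-up x and y) that the stripped-down formula in mtd3 reproduces var.test's statistic and one-sided p-value:
# sketch: with ratio=1 and alternative="greater", var.test reduces to these two lines
x <- rnorm(20); y <- rnorm(30)
vt <- var.test(x, y, alternative="greater", ratio=1)
f <- var(x) / var(y)
p <- 1 - pf(f, length(x) - 1L, length(y) - 1L)
all.equal(unname(vt$statistic), f)  # TRUE
all.equal(vt$p.value, p)            # TRUE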
library(bench)
bench::mark(mtd2(), mtd3(), check=FALSE)
timings:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <list> <list>
1 mtd2() 2.75s 2.75s 2.75s 2.75s 0.364 1.9GB 54 1 2.75s <data.table [10,000 x 5]> <Rprofmem [82,295 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd3() 1.29s 1.29s 1.29s 1.29s 0.774 13.6MB 4 1 1.29s <data.table [10,000 x 5]> <Rprofmem [2,432 x 3]> <bch:tm> <tibble [1 x 3]>
data:
library(data.table)
set.seed(1234) # reproducibility
nr <- 1e6
ng <- 100
dt1 <- data.table(id=sample(ng, nr, TRUE),
                  group=sample(ng, nr, TRUE),
                  value=rnorm(nr))
Upvotes: 1