Fernando Brito Lopes
Fernando Brito Lopes

Reputation: 101

Perform for-loop inside data.table

I need to estimate some statistics within different groups. My actual data set has more than 10 millions of rows.

It is very easy to do it using a for-loop. However, I attempted to do the same using data.table. But, it was very slow compared with the for-loop.

library("data.table")
set.seed(1234) # reproducibility
dt1 <- data.table(id=rep(letters[1:5], rep(10,5)), group=1:10, value=rnorm(100))

# Method 1: using for loops
loopFUN <- function(DT){
  Out1 <- list()
  idx <- 1
  for(i in unique(DT$id)){
    for(g in unique(DT$group)){
      sub.g <- subset(DT, group==g)
      sub.i.g <- subset(sub.g, id==i)
      vt <- var.test(x=sub.i.g$value, y=sub.g$value, alternative="greater", ratio=1)
      Out1[[idx]] <- c(i, g, vt$statistic, sqrt(vt$estimate),vt$p.value)
      idx <- idx + 1
    }
  }
  Out1 <- data.table(do.call(rbind, Out1))
  colnames(Out1) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out1)
}

# Method 2: using a function inside tada.table
dtFUN <- function(DT){
  test <- function(x, y){
    VT <- var.test(x, y, alternative="greater", ratio=1)
    res <- c(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
    return(res)
  }
  setkey(DT, id, group)
  Out2 <- DT[, {x=value; DT[, list(g2=group, vt=test(x, value)), by=group]}, by=.(id,group)]
  Out2 <- subset(Out2, group==g2)[, .(id, group, g2=rep(1:3,50), vt)]
  Out2 <- dcast(Out2, id + group ~ g2, value.var = "vt")
  colnames(Out2) <- c("id","group","F.Stat","SD.Ratio","P.value")
  return(Out2)
}


LP <- loopFUN(dt1)
DT <- dtFUN(dt1)
all(LP==DT)

library("microbenchmark")
compare <- microbenchmark(LP <- loopFUN(dt1),
                          DT <- dtFUN(dt1),
                          times = 100)

# Results
              expr       min        lq      mean   median        uq       max neval cld
LP <- loopFUN(dt1)  42.80655  45.33526  47.35974  46.2085  47.90351  88.42227   100  a 
DT <- dtFUN(dt1)   140.41182 145.26643 152.61285 149.7490 154.57031 276.77088   100   b

I can get the correct results. But I am struggling to speed up its performance. I wonder if anyone could give me some advice about how to make it faster. Thank you.

Upvotes: 1

Views: 228

Answers (1)

chinsoon12
chinsoon12

Reputation: 25225

Another approach:

dt1[, { 
        gv <- value
        .SD[, {
                VT <- var.test(value, gv, alternative="greater", ratio=1)
                .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
            }, 
            by=.(id)]
    }, by=.(group)]

timing code (took too long to time OP's approaches so I took them out) and another approach that strips away the checks in var.test (use with caution):

mtd2 <- function() {
    dt1[, { 
            gv <- value
            leng <- .N
            .SD[, {
                    VT <- var.test(value, gv, alternative="greater", ratio=1)
                    .(F.Stat=VT$statistic, SD.Ratio=sqrt(VT$estimate), P.value=VT$p.value)
                }, 
                by=.(id)]
        }, by=.(group)]
}

mtd3 <- function() {
    dt1[, { 
        gv <- value
        leng <- .N
        .SD[, {
            #see stats:::var.test.default
            ESTIMATE <- var(value) / var(gv)
            .(F.Stat=ESTIMATE, P.value=1 - pf(ESTIMATE, .N - 1L, leng - 1L))
        }, 
            by=.(id)]
    }, by=.(group)][, SD.Ratio:=sqrt(F.Stat)][]
}

library(bench)
bench::mark(mtd2(), mtd3(), check=FALSE)

timings:

# A tibble: 2 x 14
  expression      min     mean   median      max `itr/sec` mem_alloc  n_gc n_itr total_time result                    memory                  time     gc              
  <chr>      <bch:tm> <bch:tm> <bch:tm> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list>                    <list>                  <list>   <list>          
1 mtd2()        2.75s    2.75s    2.75s    2.75s     0.364     1.9GB    54     1      2.75s <data.table [10,000 x 5]> <Rprofmem [82,295 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd3()        1.29s    1.29s    1.29s    1.29s     0.774    13.6MB     4     1      1.29s <data.table [10,000 x 5]> <Rprofmem [2,432 x 3]>  <bch:tm> <tibble [1 x 3]>

data:

library(data.table)
set.seed(1234) # reproducibility
nr <- 1e6
ng <- 100
dt1 <- data.table(id=sample(ng, nr, TRUE), 
    group=sample(ng, nr, TRUE), 
    value=rnorm(nr))

Upvotes: 1

Related Questions