Code Optimization - data.table, current for loop with multiple reference to be optimized into data.table

Question

I have a requirement where I have a huge database of around 2 million records, in which I need to create new variables with codes based on information from another data frame for some specific variables. The situation is:

Have a reference database which contains cutoffs for variables by IBD (inter1)
Have a vector that contains the list of variables for which codes need to be created based on the cutoffs (v0int)
Main database on which the new variables with codes based on the cutoffs need to be created (smpl)

So, for example, for IBD 5 and variable var1a, suppose the inter1 file contains the cutoffs 11, 18, 30 and 63.

Based on above information I want to create a new variable in smpl data frame such that -

if smpl$var1a <= 11 then var1a_INT = 1
if smpl$var1a > 11 & smpl$var1a <= 18 then var1a_INT = 2
if smpl$var1a > 18 & smpl$var1a <= 30 then var1a_INT = 3
if smpl$var1a > 30 & smpl$var1a <= 63 then var1a_INT = 4
if smpl$var1a > 63 then var1a_INT = 5

Since this needs to be done for multiple variables and by IBD, I have written my code using for loop. My sample code is as below -

    set.seed(1200)

    # Reference table of cutoffs: 10 IBD groups with 4 sorted cutoffs per
    # variable in each group.
    n_ibd <- 10L
    n_cut <- 4L

    IBD <- rep(seq_len(n_ibd), each = n_cut)

    # Preallocate the cutoff vectors instead of growing them with c().
    var1a <- integer(n_ibd * n_cut)
    var2a <- integer(n_ibd * n_cut)
    var3a <- integer(n_ibd * n_cut)
    var4a <- integer(n_ibd * n_cut)
    var5a <- integer(n_ibd * n_cut)

    j <- 10
    for (i in seq_len(n_ibd)) {
      # Seed each group differently. The offset has to be inside the call:
      # `set.seed(1200) + (j * i)` would reseed with 1200 on every iteration
      # and discard the addition.
      set.seed(1200 + j * i)

      # Positions of group i inside the preallocated vectors.
      rows <- (i - 1L) * n_cut + seq_len(n_cut)

      var1a[rows] <- sort(sample(1:(10 * i), n_cut))
      var2a[rows] <- sort(sample(11:(15 * i), n_cut))
      var3a[rows] <- sort(sample(10:(17 * i), n_cut))
      var4a[rows] <- sort(sample(11:(19 * i), n_cut))
      var5a[rows] <- sort(sample(10:(16 * i), n_cut))
    }

    inter1 <- data.frame(IBD, var1a, var2a, var3a, var4a, var5a)

    # Number of records in the simulated main table.
    sm <- 5000

    # CELL values are drawn without replacement while the pool is large enough
    # (as for sm = 5000); for bigger samples fall back to sampling with
    # replacement, because sample() errors when asked for more values than
    # the pool holds.
    cell_pool <- 1001:9999

    ID <- seq_len(sm)
    IBD <- sample(1:10, sm, replace = TRUE)
    CELL <- sample(cell_pool, sm, replace = sm > length(cell_pool))
    var1a <- sample(1:150, sm, replace = TRUE)
    var2a <- sample(1:200, sm, replace = TRUE)
    var3a <- sample(1:200, sm, replace = TRUE)
    var4a <- sample(1:350, sm, replace = TRUE)
    var5a <- sample(1:250, sm, replace = TRUE)
    var6a <- sample(1:150, sm, replace = TRUE)
    var7a <- sample(1:250, sm, replace = TRUE)
    var8a <- sample(1:350, sm, replace = TRUE)
    var9a <- sample(1:450, sm, replace = TRUE)
    loc <- sample(1:20, sm, replace = TRUE)
    bill <- sample(1:2, sm, replace = TRUE)

    # Main table on which the coded variables are created.
    smpl <- data.frame(
      ID, IBD, CELL,
      var1a, var2a, var3a, var4a, var5a, var6a, var7a, var8a, var9a,
      loc, bill
    )



    # Variables for which an interval-code column (<name>_INT) is created.
    v0int <- c("var1a", "var2a", "var3a", "var4a", "var5a")

    # One coded piece per IBD value; the pieces are bound together once after
    # the loop instead of growing df_smpl with rbind() on every iteration.
    ibd_levels <- unique(inter1$IBD)
    pieces <- vector("list", length(ibd_levels))

    start_time <- Sys.time()

    # Row-by-row baseline: this is the implementation whose run time is
    # measured and that the question asks to speed up.
    for (p in seq_along(ibd_levels)) {
      l <- ibd_levels[p]
      df1 <- subset(smpl, IBD == l)
      for (k in seq_along(v0int)) {
        q0 <- v0int[k]
        q0_int <- paste0(q0, "_INT")
        # Sorted cutoffs of this variable for the current IBD.
        q1 <- sort(inter1[inter1$IBD == l, q0])

        # Create the column up front so that it exists (as NA) even when
        # there are no cutoffs or the group has no rows.
        df1[[q0_int]] <- rep(NA_real_, nrow(df1))

        # seq_len() skips the loop for an empty group; 1:nrow() would run
        # it for m = 1 and m = 0.
        for (m in seq_len(nrow(df1))) {
          value <- df1[m, q0]
          # A missing value keeps its NA code instead of aborting the `if`
          # (is.null() is never TRUE for a single cell).
          if (length(q1) > 0 && !is.na(value)) {
            # Code = 1 + number of cutoffs strictly below the value:
            #   value <= q1[1]         -> 1
            #   q1[1] < value <= q1[2] -> 2, ...
            #   value > last cutoff    -> length(q1) + 1
            # This works for any number of cutoffs, not only up to four.
            df1[m, q0_int] <- sum(value > q1) + 1
          }
        }
      }
      pieces[[p]] <- df1
    }
    df_smpl <- do.call(rbind, pieces)


    time_taken <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))

For sample data of 5000 records this takes 5.859623 seconds on my machine, which has 16GB RAM, an SSD and 2 cores.

When tried on data with 500000 records, this takes 752.7261 seconds.

My actual data has 2 million records and I need to run this iteratively multiple times, so the time needed would increase considerably.

On doing some search I understand data.table is much faster and saves huge amount of time. I do not know data.table very well and want to seek your help on this.

It would be a huge help and huge time saving if we can optimize this code.

minem · Accepted Answer

For your example data I got the same results using this loop:

# Same coding as the row-by-row loop, but each variable is coded for a whole
# IBD group at once with cut(). df_smpl is rebuilt from scratch here, so the
# snippet does not append to a df_smpl left over from an earlier run.
ibd_levels <- unique(inter1$IBD)
pieces <- vector("list", length(ibd_levels))
for (p in seq_along(ibd_levels)) {
  l <- ibd_levels[p]
  df1 <- subset(smpl, IBD == l)
  for (q0 in v0int) {
    # Sorted cutoffs of this variable for the current IBD.
    q1 <- sort(inter1[inter1$IBD == l, q0])
    # Breaks start at -Inf (not 0) so that every value <= q1[1] gets code 1,
    # including zero or negative values; intervals are right-closed, i.e.
    # (q1[i], q1[i + 1]] -> code i + 1.
    df1[[paste0(q0, "_INT")]] <- as.integer(cut(df1[[q0]], c(-Inf, q1, Inf)))
  }
  pieces[[p]] <- df1
}
# Bind all groups once instead of calling rbind() inside the loop.
df_smpl <- do.call(rbind, pieces)

0.42 sec vs 10 sec

Using data.table we can easily add the results straight to the original data table, which will be much faster than using rbind.

library(data.table)

setDT(smpl) # convert smpl to data.table (by reference)
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation; sorts smpl by IBD

start_time <- Sys.time()

for (l in unique(inter1$IBD)) {
  for (q0 in v0int) {
    # Sorted cutoffs of this variable for the current IBD.
    q1 <- sort(inter1[inter1$IBD == l, q0])
    # Add/update the <name>_INT column in place for the rows of this IBD.
    # Breaks start at -Inf (not 0) so that every value <= q1[1] gets code 1,
    # including zero or negative values.
    smpl[IBD == l, paste0(q0, "_INT") := as.integer(cut(get(q0), c(-Inf, q1, Inf)))]
  }
}
smpl # end result data.table

The major difference is that the end result will have a different row order than your original result.

Using this line it should be faster:

# labels = FALSE makes cut() return the integer codes directly (no factor);
# breaks start at -Inf so that values <= q1[1] (including <= 0) get code 1.
smpl[IBD == l, paste0(q0, "_INT") := cut(get(q0), c(-Inf, q1, Inf), labels = FALSE)]

Code Optimization - data.table, current for loop with multiple reference to be optimized into data.table

Answers (2)

Non-equi join

Rolling join

Benchmark

Related Questions