Erk
Erk

Reputation: 13

how to dynamically calculate row values in data.table, avoiding FOR loop

Think of following cases:

It's quite easy to program this as FOR loop. But this becomes very slow when running big data sets (e.g. think 100.000 consumers with daily purchase records for 2 years, that is with more 1 MM row entries).

How can this be programmed with e.g. Data.Table? I know how to use shift function in data.table for given data set. But dynamic calculation in this use case poses some challenge.

I programmed this with FOR loop with data.frame. But performance with huge number of rows is very, very slow

Concept behind the example: x = think off as e.g ad contact or purchase volume by period etc.

y = think off as dynamic variable such as ad stock which depends on x and decay of y

decay = from any type of function, which calculates y based on previous y values and time generally time period, dynamically changing depending on x - vales | events; here simplified as random function

n <- 100
DF <- data.frame(x = c(1,rep(0,n-1)), y = c(1, rep(0,n-1)), decay = c(1, rep(0,n-1)), index=rep(0,n))

set.seed(10)
for(i in 2:n){
  DF$x[i] <- sample(x=c(0:2), replace = T, size = 1, prob = c(0.8, 0.15, 0.05))
  if(DF$x[i] > 0){ DF$index[i] <- 0} else { DF$index[i] <- (DF$index[i-1] + 1) }
  DF$decay[i] <- round((DF$index[i] + 1)^-0.1, 2)
  DF$y[i] <- round((DF$x[i] + DF$y[i-1]) * DF$decay[i],2)}

plot(DF$y, type="o")

Upvotes: 1

Views: 278

Answers (1)

Cole
Cole

Reputation: 11255

Vectors are your friend. Your current loop extracts a vector from the data.frame each pass and that becomes expensive. Instead you should:

  1. Create your x as a vector all at once
  2. Use vectors for the other variables y, index, and decay.
  3. Declare it as a function - the compiler will improve performance
base_loop <- function(x) {
  y <- vector('numeric', n); y[1] <- 1
  decay <- vector('numeric', n); decay[1] <- 1
  index <- vector('integer', n)
  for(i in 2:n){
    if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
    decay[i] <- (index[i] + 1)^-0.1
    y[i] <- (x[i] + y[i-1]) * decay[i]
  }
  data.frame(x, y, decay, index)
}

set.seed(10)
n = 1E2
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))

DF <- base_loop(x)

This can easily be translated into an loop as well:

// [[Rcpp::export]]
DataFrame decay_func(NumericVector x) {
  IntegerVector ind = x.size();
  NumericVector decay = x.size(); decay[1] = 1;
  NumericVector y = x.size(); y[1] = 1;



  for (int i = 0; i < x.size(); i++){
    if (x[i] > 0) {
      ind[i] = 0;
    } else {
      ind[i] = ind[i-1] + 1;
    }
    decay[i] = pow(ind[i] + 1,-0.1);
    y[i] = (x[i] + y[i-1]) * decay[i];
  }

  return DataFrame::create(Named("x") = x,
                           Named("y") = y,
                           Named("decay") = decay,
                           Named("index") = ind);
}

Performance

# n = 100 
# A tibble: 4 x 13
  expression          min  median `itr/sec` mem_alloc
  <bch:expr>      <bch:t> <bch:t>     <dbl> <bch:byt>
1 OP_loop          21.5ms  21.7ms      43.9  702.62KB
2 vector_loop      11.4ms  11.8ms      82.4  100.33KB
3 compiled_vector 473.2us 483.2us    2028.     9.61KB
4 rcpp_func       412.6us 423.3us    2321.    11.27KB

# n = 10,000
# A tibble: 4 x 13
  expression          min  median `itr/sec` mem_alloc
  <bch:expr>      <bch:t> <bch:t>     <dbl> <bch:byt>
1 OP_loop           1.23s   1.23s     0.816    3.01GB
2 vector_loop     16.73ms 17.11ms    56.2    525.72KB
3 compiled_vector   5.8ms  5.88ms   167.        435KB
4 rcpp_func        1.52ms  1.55ms   606.     359.32KB

# n= 1,000,000
# A tibble: 3 x 13
  expression        min median `itr/sec` mem_alloc
  <bch:expr>      <bch> <bch:>     <dbl> <bch:byt>
1 vector_loop     563ms  563ms      1.78    42.1MB
2 compiled_vector 556ms  556ms      1.80      42MB
3 rcpp_func       115ms  120ms      7.56    34.3MB

Code for reference:

library(Rcpp)

set.seed(10)
n = 1E6
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))

bench::mark(
  # OP_loop = {
  #   DF <- data.frame(x = c(1,rep(0,n-1)), y = c(1, rep(0,n-1)), decay = c(1, rep(0,n-1)), index=rep(0,n))
  #   
  #   set.seed(10)
  #   for(i in 2:n){
  #     DF$x[i] <- sample(x=c(0:2), replace = T, size = 1, prob = c(0.8, 0.15, 0.05))
  #     if(DF$x[i] > 0){ DF$index[i] <- 0} else { DF$index[i] <- (DF$index[i-1] + 1) }
  #     DF$decay[i] <- (DF$index[i] + 1)^-0.1
  #     DF$y[i] <- (DF$x[i] + DF$y[i-1]) * DF$decay[i]
  #   }
  #   
  #   DF
  # }
  # ,
  vector_loop = {
    set.seed(10)
    x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))

    y <- vector('numeric', n); y[1] <- 1
    decay <- vector('numeric', n); decay[1] <- 1
    index <- vector('integer', n)
    for(i in 2:n){
      if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
      decay[i] <- (index[i] + 1)^-0.1
      y[i] <- (x[i] + y[i-1]) * decay[i]
    }
    data.frame(x, y, decay, index)
  }
  ,
  compiled_vector = {
    set.seed(10)
    x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))

    base_loop(x)
  }
  ,
  rcpp_func = {
    set.seed(10)
    x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))

    decay_func(x)
  }
)

decay_base <- function(x) {


    rle_x <- rle(x > 0)

    index <- sequence(rle_x$lengths)
    index[x != 0] <- 0

    decay <- (index + 1)^(-0.1)

    # initialize y vector and other information
    cum_rle_len <- cumsum(rle_x$lengths)
    y <- vector('numeric', n)
    y[1] <- 1

    # loops through the elements of rle
    for (i in seq_len(length(rle_x$values))[-1]){
      prev_ind <- cum_rle_len[i-1]
      ind_rng <- (prev_ind + 1):(prev_ind + rle_x$lengths[i])

      if (rle_x$values[i]) {
        y[ind_rng] <- y[prev_ind] + cumsum(x[ind_rng])
      } else {
        y[ind_rng] <- cumprod(c(y[prev_ind], decay[ind_rng]))[-1]
      }
    }
    data.frame(x, y, decay, index)

}

base_loop <- function(x) {
  y <- vector('numeric', n); y[1] <- 1
  decay <- vector('numeric', n); decay[1] <- 1
  index <- vector('integer', n)
  for(i in 2:n){
    if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
    decay[i] <- (index[i] + 1)^-0.1
    y[i] <- (x[i] + y[i-1]) * decay[i]
  }
  data.frame(x, y, decay, index)
}

Upvotes: 1

Related Questions