Reputation: 13
Think of following cases:
It's quite easy to program this as FOR loop. But this becomes very slow when running big data sets (e.g. think 100.000 consumers with daily purchase records for 2 years, that is with more 1 MM row entries).
How can this be programmed with e.g. Data.Table? I know how to use shift function in data.table for given data set. But dynamic calculation in this use case poses some challenge.
I programmed this with FOR loop with data.frame. But performance with huge number of rows is very, very slow
Concept behind the example: x = think off as e.g ad contact or purchase volume by period etc.
y = think off as dynamic variable such as ad stock which depends on x and decay of y
decay = from any type of function, which calculates y based on previous y values and time generally time period, dynamically changing depending on x - vales | events; here simplified as random function
n <- 100
DF <- data.frame(x = c(1,rep(0,n-1)), y = c(1, rep(0,n-1)), decay = c(1, rep(0,n-1)), index=rep(0,n))
set.seed(10)
for(i in 2:n){
DF$x[i] <- sample(x=c(0:2), replace = T, size = 1, prob = c(0.8, 0.15, 0.05))
if(DF$x[i] > 0){ DF$index[i] <- 0} else { DF$index[i] <- (DF$index[i-1] + 1) }
DF$decay[i] <- round((DF$index[i] + 1)^-0.1, 2)
DF$y[i] <- round((DF$x[i] + DF$y[i-1]) * DF$decay[i],2)}
plot(DF$y, type="o")
Upvotes: 1
Views: 278
Reputation: 11255
Vectors are your friend. Your current loop extracts a vector from the data.frame
each pass and that becomes expensive. Instead you should:
x
as a vector all at oncey, index, and decay
.base_loop <- function(x) {
y <- vector('numeric', n); y[1] <- 1
decay <- vector('numeric', n); decay[1] <- 1
index <- vector('integer', n)
for(i in 2:n){
if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
decay[i] <- (index[i] + 1)^-0.1
y[i] <- (x[i] + y[i-1]) * decay[i]
}
data.frame(x, y, decay, index)
}
set.seed(10)
n = 1E2
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))
DF <- base_loop(x)
This can easily be translated into an rcpp loop as well:
// [[Rcpp::export]]
DataFrame decay_func(NumericVector x) {
IntegerVector ind = x.size();
NumericVector decay = x.size(); decay[1] = 1;
NumericVector y = x.size(); y[1] = 1;
for (int i = 0; i < x.size(); i++){
if (x[i] > 0) {
ind[i] = 0;
} else {
ind[i] = ind[i-1] + 1;
}
decay[i] = pow(ind[i] + 1,-0.1);
y[i] = (x[i] + y[i-1]) * decay[i];
}
return DataFrame::create(Named("x") = x,
Named("y") = y,
Named("decay") = decay,
Named("index") = ind);
}
Performance
# n = 100
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch:t> <bch:t> <dbl> <bch:byt>
1 OP_loop 21.5ms 21.7ms 43.9 702.62KB
2 vector_loop 11.4ms 11.8ms 82.4 100.33KB
3 compiled_vector 473.2us 483.2us 2028. 9.61KB
4 rcpp_func 412.6us 423.3us 2321. 11.27KB
# n = 10,000
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch:t> <bch:t> <dbl> <bch:byt>
1 OP_loop 1.23s 1.23s 0.816 3.01GB
2 vector_loop 16.73ms 17.11ms 56.2 525.72KB
3 compiled_vector 5.8ms 5.88ms 167. 435KB
4 rcpp_func 1.52ms 1.55ms 606. 359.32KB
# n= 1,000,000
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc
<bch:expr> <bch> <bch:> <dbl> <bch:byt>
1 vector_loop 563ms 563ms 1.78 42.1MB
2 compiled_vector 556ms 556ms 1.80 42MB
3 rcpp_func 115ms 120ms 7.56 34.3MB
Code for reference:
library(Rcpp)
set.seed(10)
n = 1E6
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))
bench::mark(
# OP_loop = {
# DF <- data.frame(x = c(1,rep(0,n-1)), y = c(1, rep(0,n-1)), decay = c(1, rep(0,n-1)), index=rep(0,n))
#
# set.seed(10)
# for(i in 2:n){
# DF$x[i] <- sample(x=c(0:2), replace = T, size = 1, prob = c(0.8, 0.15, 0.05))
# if(DF$x[i] > 0){ DF$index[i] <- 0} else { DF$index[i] <- (DF$index[i-1] + 1) }
# DF$decay[i] <- (DF$index[i] + 1)^-0.1
# DF$y[i] <- (DF$x[i] + DF$y[i-1]) * DF$decay[i]
# }
#
# DF
# }
# ,
vector_loop = {
set.seed(10)
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))
y <- vector('numeric', n); y[1] <- 1
decay <- vector('numeric', n); decay[1] <- 1
index <- vector('integer', n)
for(i in 2:n){
if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
decay[i] <- (index[i] + 1)^-0.1
y[i] <- (x[i] + y[i-1]) * decay[i]
}
data.frame(x, y, decay, index)
}
,
compiled_vector = {
set.seed(10)
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))
base_loop(x)
}
,
rcpp_func = {
set.seed(10)
x <- c(1,sample(c(0:2), replace = T, size = n-1, prob = c(0.8, 0.15, 0.05)))
decay_func(x)
}
)
decay_base <- function(x) {
rle_x <- rle(x > 0)
index <- sequence(rle_x$lengths)
index[x != 0] <- 0
decay <- (index + 1)^(-0.1)
# initialize y vector and other information
cum_rle_len <- cumsum(rle_x$lengths)
y <- vector('numeric', n)
y[1] <- 1
# loops through the elements of rle
for (i in seq_len(length(rle_x$values))[-1]){
prev_ind <- cum_rle_len[i-1]
ind_rng <- (prev_ind + 1):(prev_ind + rle_x$lengths[i])
if (rle_x$values[i]) {
y[ind_rng] <- y[prev_ind] + cumsum(x[ind_rng])
} else {
y[ind_rng] <- cumprod(c(y[prev_ind], decay[ind_rng]))[-1]
}
}
data.frame(x, y, decay, index)
}
base_loop <- function(x) {
y <- vector('numeric', n); y[1] <- 1
decay <- vector('numeric', n); decay[1] <- 1
index <- vector('integer', n)
for(i in 2:n){
if(x[i] > 0){index[i] <- 0} else {index[i] <- (index[i-1] + 1) }
decay[i] <- (index[i] + 1)^-0.1
y[i] <- (x[i] + y[i-1]) * decay[i]
}
data.frame(x, y, decay, index)
}
Upvotes: 1