Reputation: 1801
Suppose I have a data.table
# Example input: a single numeric column A.
data.table(A = c(1, 2, 3, 4, 5, 6, 4, 2))
How can I calculate the sum of each sequence of n
consecutive elements?
Suppose n=3. The result of the sequence sum of A should be the column seq_sum:
# Expected result: seq_sum[i] adds A[i] and the next two values; the last two
# rows have fewer than three values left, so they sum whatever remains.
data.table(
  A = c(1, 2, 3, 4, 5, 6, 4, 2),
  seq_sum = c(
    1 + 2 + 3, 2 + 3 + 4, 3 + 4 + 5, 4 + 5 + 6,
    5 + 6 + 4, 6 + 4 + 2, 4 + 2, 2
  )
)
How can I do this efficiently?
Upvotes: 2
Views: 2095
Reputation: 25223
Here is another method using RcppRoll::roll_suml,
along with some timings for your reference. @Jaap's solution using data.table's
built-in functions is the fastest.
library(data.table)
library(microbenchmark)
# Benchmark input size: number of rows in the test table.
N <- 1e5
# Fix the RNG seed before drawing so the generated column is reproducible.
set.seed(0L)
# Single-column table of N standard-normal draws.
dt <- data.table(A=rnorm(N))
# Window length: how many consecutive elements go into each sum.
n <- 3
# Separate copy of the input table for this benchmark to write `seq_sum` into.
dt_cumsum <- copy(dt)

# Left-aligned rolling sum of width `n` via a cumulative sum.
#
# A is padded with n - 1 trailing zeros so the last, shorter windows are still
# produced; each window sum is then the running total at the window's end
# minus the running total just before its start (a lag-n difference).
# Reads the globals `dt_cumsum` and `n`; assigns column `seq_sum` by reference.
fun_cumsum <- function() {
  dt_cumsum[, seq_sum := {
    padded <- c(A, rep_len(0, n - 1))
    running <- cumsum(padded)
    diff(c(0, running), lag = n)
  }]
}
# Separate copy of the input table for this benchmark to write `seq_sum` into.
dt_Reduce <- copy(dt)

# Left-aligned rolling sum of width `n` built from shifted copies of A.
#
# shift() produces A led by 0, 1, ..., n - 1 positions (gaps past the end are
# filled with 0) and Reduce() adds those vectors element-wise.
# Reads the globals `dt_Reduce` and `n`; assigns column `seq_sum` by reference.
fun_Reduce <- function() {
  lead_steps <- seq_len(n) - 1
  dt_Reduce[, seq_sum := Reduce(
    `+`,
    shift(A, n = lead_steps, fill = 0, type = "lead")
  )]
}
library(zoo)
# Separate copy of the input table for this benchmark to write `seq_sum` into.
dt_zoo <- copy(dt)

# Left-aligned rolling sum of width `n` using zoo::rollapply().
#
# `partial = TRUE` keeps the trailing windows that hold fewer than `n` values.
# Reads the globals `dt_zoo` and `n`; assigns column `seq_sum` by reference.
fun_zoo <- function() {
  dt_zoo[, seq_sum := rollapply(
    A,
    width = n,
    FUN = "sum",
    align = "left",
    partial = TRUE
  )]
}
# Left-aligned rolling sum of width `n` in base R.
#
# For each position i, sums A[i] through A[i + n - 1], truncating the window at
# the end of the vector. Reads the globals `dt` (column `A`) and `n`.
#
# Returns a numeric vector with one sum per element of `dt$A`; an empty column
# yields numeric(0).
fun_base <- function() {
  # Hoist the column and its length: they do not change between iterations.
  values <- dt$A
  last <- length(values)
  # seq_along() is empty for empty input, whereas 1:length(x) would be c(1, 0);
  # vapply() guarantees a numeric result regardless of input length.
  vapply(
    seq_along(values),
    function(i) sum(values[i:min(i + n - 1, last)]),
    numeric(1)
  )
}
library(RcppRoll)
# Separate copy of the input table for this benchmark to write `seq_sum` into.
dt_RcppRoll <- copy(dt)

# Left-aligned rolling sum of width `n` using RcppRoll::roll_suml().
#
# A is padded with n - 1 trailing zeros so the last, shorter windows are still
# summed. Reads the globals `dt_RcppRoll` and `n`; assigns column `seq_sum` by
# reference.
fun_RcppRoll <- function() {
  dt_RcppRoll[, seq_sum := {
    padded <- c(A, rep_len(0, n - 1))
    # Keep exactly one sum per row by position. head(x, -(n - 1)) would turn
    # into head(x, 0), i.e. an empty vector, when n == 1.
    roll_suml(padded, n)[seq_along(A)]
  }]
}
# Run every implementation 5 times and capture the printed timing summary as
# a character vector (one element per output line).
ans <- capture.output(microbenchmark(
fun_cumsum(),
fun_Reduce(),
fun_zoo(),
fun_base(),
fun_RcppRoll(),
times=5L))
# Print each captured line prefixed with "# ".
writeLines(paste("#", ans))
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun_cumsum() 2.5983 2.6427 2.67526 2.6462 2.7311 2.7580 5
# fun_Reduce() 1.3903 1.4274 2.84070 1.6620 1.7047 8.0191 5
# fun_zoo() 1225.1620 1242.9112 1289.76416 1258.1143 1355.1070 1367.5263 5
# fun_base() 2731.6609 2849.1003 2909.27308 2922.9430 2971.9956 3070.6656 5
# fun_RcppRoll() 1.7890 1.8430 3.49892 1.9663 2.0774 9.8189 5
Upvotes: 1
Reputation: 13132
To avoid repeating summations, a cumulative sum can be stored:
# Window length.
n <- 3
# Pad A with n - 1 zeros so the trailing, shorter windows are still summed.
A2 <- c(A, rep_len(0, n - 1))
# Running total of the padded vector.
cs <- cumsum(A2)
And subtract the respective differences:
# Window sum = running total at the window's end minus the running total just
# before its start. Positive positions are used on purpose: a negative index
# built from seq_len(n - 1) is zero-length when n == 1 and would select nothing.
cs[seq_along(A) + n - 1] - c(0, cs)[seq_along(A)]
#[1] 6 9 12 15 15 12 6 2
Or, equivalently:
# Lag-n difference of the running total, with a leading 0 for the first window.
diff(c(0, cs), lag = n)
#[1] 6 9 12 15 15 12 6 2
Upvotes: 3
Reputation: 83275
Another option is to use Reduce and shift:
# Positional arguments to shift() here are: x = A, n = 0:2, fill = 0,
# type = 'lead'. Reduce() then adds the three shifted vectors element-wise.
dt[, seq_sum := Reduce(`+`, shift(A, 0:2, 0, 'lead'))]
which gives:
> dt
A seq_sum
1: 1 6
2: 2 9
3: 3 12
4: 4 15
5: 5 15
6: 6 12
7: 4 6
8: 2 2
Full notation with parameter names:
# Adds A to its 1- and 2-step leads; positions past the end are filled with 0,
# so the last rows sum only the values that remain.
dt[, seq_sum := Reduce(`+`, shift(A, n = 0:2, fill = 0, type = 'lead'))]
Upvotes: 7
Reputation: 1475
Updated based on comments:
You can also use rollapply from the zoo package:
library(data.table)
library(zoo)
dt <- data.table(A = c(1, 2, 3, 4, 5, 6, 4, 2))
# Left-aligned window of 3; partial = TRUE keeps the shorter trailing windows.
dt[, seq_sum := rollapply(
  A,
  width = 3,
  FUN = "sum",
  align = "left",
  partial = TRUE
)]
# > dt
# A seq_sum
# 1: 1 6
# 2: 2 9
# 3: 3 12
# 4: 4 15
# 5: 5 15
# 6: 6 12
# 7: 4 6
# 8: 2 2
Upvotes: 3
Reputation: 67
library(zoo)
dtab <- data.table(A = c(1, 2, 3, 4, 5, 6, 4, 2))
# Left-aligned rolling sum of width 3; partial = TRUE keeps the shorter
# trailing windows. rollapply() is called directly because rollapplyr() is the
# right-aligned wrapper, and overriding it with align = "left" is misleading.
dtab[, seq_sum := rollapply(A, 3, sum, partial = TRUE, align = "left")]
Upvotes: 2
Reputation: 4378
library(data.table)
dt <- data.table(A = c(1, 2, 3, 4, 5, 6, 4, 2))
# Window length.
n <- 3
# For each position i, sum A[i] .. A[i + n - 1], truncated at the end of A.
# seq_along() stays empty for an empty column (1:length(x) would give c(1, 0)),
# and vapply() always returns a numeric vector.
vapply(
  seq_along(dt$A),
  function(i) sum(dt$A[i:min(i + n - 1, length(dt$A))]),
  numeric(1)
)
# [1] 6 9 12 15 15 12 6 2
Upvotes: 2