Reputation: 397
The dataset I am using is structured as follows:
library("data.table")
dt <- data.table(Id = c(1,2,3,4,5,6,7,8), "01.01.2005" = c(10,0,0,0,5,7,7,0),
"02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0),
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2),
"06.01.2005" = c(45,15,45,54,5,76,67,10), "07.01.2005" = c(0,0,0,0,466,21,832,54),
"08.01.2005" = c(2,3,734,2,3,4,5,6))
>dt
Id 01.01.2005 02.01.2005 03.01.2005 04.01.2005 05.01.2005 06.01.2005 07.01.2005 08.01.2005
1: 1 10 4 2 0 0 45 0 2
2: 2 0 5 3 0 1 15 0 3
3: 3 0 7 3 0 1 45 0 734
4: 4 0 0 6 0 1 54 0 2
5: 5 5 0 7 0 1 5 466 3
6: 6 7 0 77 0 0 76 21 4
7: 7 7 9 45 0 0 67 832 5
8: 8 0 9 0 1 2 10 54 6
I need to sum the an amount of columns by a a certain step. I.e. if the step = 2 then column (2,3) are aggregated, (4,5), (6,7), and (8,9) as well. The output needs to look as follows:
>output
Id 01.01.2005-02.01.2005 03.01.2005-04.01.2005 05.01.2005-06.01.2005 07.01.2005-08.01.2005
1: 1 14 2 45 2
2: 2 5 3 16 3
3: 3 7 3 46 734
4: 4 0 6 55 2
5: 5 5 7 6 469
6: 6 7 77 76 25
7: 7 16 45 67 837
8: 8 9 1 12 60
To achieve this I am using a loop:
output <- dt[, list(Id)]
step = 2
for(i in seq(nrow(dt), 2, by = -step)){
output <- cbind(output, temp.col = rowSums(dt[, i:(i-step+1), with = F],
na.rm = FALSE, dims = 1))
setnames(output, "temp.col", "new.name...")
}
but for large datasets this approach is very slow. Is there a function that exists that does what I need without a loop?
Furthermore: "step" needs to be a variable input.
thanks in advance
Upvotes: 2
Views: 116
Reputation: 887691
You can try
f1 <- function(DT, step){
m1 <- as.matrix(DT[,-1])
lst <- lapply(seq(step), function(i) seq(i, ncol(m1), by = step))
nm1 <- do.call(paste, c(lapply(lst, function(x) colnames(m1)[x]), sep="_"))
res <- cbind(Id= DT[["Id"]], as.data.table(Reduce(`+`,
lapply(lst, function(i) m1[,i]))))
setnames(res, 2:ncol(res), nm1)
res}
f1(dt, 2)
# Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1: 1 14 2 45
#2: 2 5 3 16
#3: 3 7 3 46
#4: 4 0 6 55
#5: 5 5 7 6
#6: 6 7 77 76
#7: 7 16 45 67
#8: 8 9 1 12
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
Testing with other "step"
dt1 <- data.table(Id = c(1,2,3,4,5,6,7,8),
"01.01.2005" = c(10,0,0,0,5,7,7,0),
"02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0),
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2),
"06.01.2005" = c(45,15,45,54,5,76,67,10),
"07.01.2005" = c(0,0,0,0,466,21,832,54),
"08.01.2005" = c(2,3,734,2,3,4,5,6),
"09.01.2005"= c(4, 3, 3, 5, 7, 9, 10, 11))
f1(dt1, 3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1: 1 16 45
#2: 2 8 16
#3: 3 10 46
#4: 4 6 55
#5: 5 12 6
#6: 6 84 76
#7: 7 61 67
#8: 8 9 13
# 07.01.2005_08.01.2005_09.01.2005
#1: 6
#2: 6
#3: 737
#4: 7
#5: 476
#6: 34
#7: 847
#8: 71
Another option would be using set
from data.table
. The new function f2
will not depend on the number of columns in the dataset. f1
will fail on 'dt' if the step <- 3
. With f2
, if the number of columns to be used for computation is '8' with step as '3', it gets the rowSums
of 1st 3, rowSums
of 4:6, and rowSums
of 7:8. In the example, it was not clear how the OP want to sum
in those cases. Similarly for step <- 5
, we get the the rowSums
of 1st 5 columns, then 6:8
f2 <- function(DT, step){
indx <- 2:ncol(DT)
indx1 <- (seq_along(indx)-1)%/%step +1L
lst <- split(indx, indx1)
nm1 <- sapply(lst, function(i) paste(names(DT)[i], collapse="_"))
res <- as.data.table(matrix(0, ncol=length(lst),
nrow=nrow(dt), dimnames=list(NULL, nm1)))
for(j in seq_along(res)){
set(res, i=NULL, j=j, value=DT[, lst[[j]], with=FALSE][, Reduce(`+`, .SD)])
}
cbind(Id=DT[['Id']], res)
}
f2(dt,2)
# Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1: 1 14 2 45
#2: 2 5 3 16
#3: 3 7 3 46
#4: 4 0 6 55
#5: 5 5 7 6
#6: 6 7 77 76
#7: 7 16 45 67
#8: 8 9 1 12
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
f2(dt,3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1: 1 16 45
#2: 2 8 16
#3: 3 10 46
#4: 4 6 55
#5: 5 12 6
#6: 6 84 76
#7: 7 61 67
#8: 8 9 13
# 07.01.2005_08.01.2005
#1: 2
#2: 3
#3: 734
#4: 2
#5: 469
#6: 25
#7: 837
#8: 60
f2(dt,5)
# Id 01.01.2005_02.01.2005_03.01.2005_04.01.2005_05.01.2005
#1: 1 16
#2: 2 9
#3: 3 11
#4: 4 7
#5: 5 13
#6: 6 84
#7: 7 61
#8: 8 12
# 06.01.2005_07.01.2005_08.01.2005
#1: 47
#2: 18
#3: 779
#4: 56
#5: 474
#6: 101
#7: 904
#8: 70
Using a slightly bigger dataset
set.seed(24)
dt <- as.data.table(matrix(sample(0:9, 5000*5000, replace=TRUE),
ncol=5000))
dt <- cbind(Id=1:5000, dt)
dim(dt)
#[1] 5000 5001
system.time(f2(dt, 4))
# user system elapsed
# 2.269 0.022 2.291
Upvotes: 3
Reputation: 1282
Basically the same as David Arenburg's answer, but maybe a little more readable is to first take the group number by using the optimized .GRP
operator from data.table
and then taking the modulo of the step to obtain the index.
step <- 2
temp <- melt(dt, "Id")
temp[, group := .GRP + step-1L, variable]
temp[, indx := group %/% step]
dcast(temp, Id ~ indx, sum, value.var = "value")
Upvotes: 1
Reputation: 92300
Here's another possible approach
step <- 2
temp <- melt(dt, "Id")[, indx := rep(seq_len((ncol(dt)-1L)/step), each = nrow(dt)*step)]
dcast(temp, Id ~ indx, sum, value.var = "value")
# Id 1 2 3 4
# 1: 1 14 2 45 2
# 2: 2 5 3 16 3
# 3: 3 7 3 46 734
# 4: 4 0 6 55 2
# 5: 5 5 7 6 469
# 6: 6 7 77 76 25
# 7: 7 16 45 67 837
# 8: 8 9 1 12 60
Upvotes: 7