greyBag
greyBag

Reputation: 397

How to efficiently aggregate multiple data.table columns by groups, N-at-a-time, where N is variable

The dataset I am using is structured as follows:

library("data.table")

   dt <- data.table(Id = c(1,2,3,4,5,6,7,8), "01.01.2005" = c(10,0,0,0,5,7,7,0),
 "02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0), 
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2), 
"06.01.2005" = c(45,15,45,54,5,76,67,10), "07.01.2005" = c(0,0,0,0,466,21,832,54), 
"08.01.2005" = c(2,3,734,2,3,4,5,6))

>dt
       Id 01.01.2005 02.01.2005 03.01.2005 04.01.2005 05.01.2005 06.01.2005 07.01.2005 08.01.2005
    1:  1         10          4          2          0          0         45          0          2
    2:  2          0          5          3          0          1         15          0          3
    3:  3          0          7          3          0          1         45          0        734
    4:  4          0          0          6          0          1         54          0          2
    5:  5          5          0          7          0          1          5        466          3
    6:  6          7          0         77          0          0         76         21          4
    7:  7          7          9         45          0          0         67        832          5
    8:  8          0          9          0          1          2         10         54          6

I need to sum the an amount of columns by a a certain step. I.e. if the step = 2 then column (2,3) are aggregated, (4,5), (6,7), and (8,9) as well. The output needs to look as follows:

>output
       Id 01.01.2005-02.01.2005 03.01.2005-04.01.2005 05.01.2005-06.01.2005 07.01.2005-08.01.2005
    1:  1                    14                     2                    45                     2
    2:  2                     5                     3                    16                     3
    3:  3                     7                     3                    46                   734
    4:  4                     0                     6                    55                     2
    5:  5                     5                     7                     6                   469
    6:  6                     7                    77                    76                    25
    7:  7                    16                    45                    67                   837
    8:  8                     9                     1                    12                    60

To achieve this I am using a loop:

output <- dt[, list(Id)]
step = 2
for(i in seq(nrow(dt), 2, by = -step)){
     output <- cbind(output, temp.col = rowSums(dt[, i:(i-step+1), with = F], 
                      na.rm = FALSE, dims = 1))
     setnames(output, "temp.col", "new.name...")
}

but for large datasets this approach is very slow. Is there a function that exists that does what I need without a loop?

Furthermore: "step" needs to be a variable input.

thanks in advance

Upvotes: 2

Views: 116

Answers (3)

akrun
akrun

Reputation: 887691

You can try

f1 <- function(DT, step){
  m1 <- as.matrix(DT[,-1])
  lst <- lapply(seq(step), function(i) seq(i, ncol(m1), by = step))
  nm1 <- do.call(paste, c(lapply(lst, function(x) colnames(m1)[x]), sep="_"))
  res <- cbind(Id= DT[["Id"]], as.data.table(Reduce(`+`,
                                 lapply(lst, function(i) m1[,i]))))
  setnames(res, 2:ncol(res), nm1)
  res}

f1(dt, 2)  
#   Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1:  1                    14                     2                    45
#2:  2                     5                     3                    16
#3:  3                     7                     3                    46
#4:  4                     0                     6                    55
#5:  5                     5                     7                     6
#6:  6                     7                    77                    76
#7:  7                    16                    45                    67
#8:  8                     9                     1                    12
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

Testing with other "step"

dt1 <- data.table(Id = c(1,2,3,4,5,6,7,8),
"01.01.2005" =   c(10,0,0,0,5,7,7,0),
"02.01.2005" = c(4,5,7,0,0,0,9,9), "03.01.2005" = c(2,3,3,6,7,77,45,0), 
"04.01.2005" = c(0,0,0,0,0,0,0,1), "05.01.2005" = c(0,1,1,1,1,0,0,2), 
"06.01.2005" = c(45,15,45,54,5,76,67,10),
"07.01.2005" = c(0,0,0,0,466,21,832,54), 
"08.01.2005" = c(2,3,734,2,3,4,5,6),
"09.01.2005"= c(4, 3, 3, 5, 7, 9, 10, 11))

f1(dt1, 3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1:  1                               16                               45
#2:  2                                8                               16
#3:  3                               10                               46
#4:  4                                6                               55
#5:  5                               12                                6
#6:  6                               84                               76
#7:  7                               61                               67
#8:  8                                9                               13
#   07.01.2005_08.01.2005_09.01.2005
#1:                                6
#2:                                6
#3:                              737
#4:                                7
#5:                              476
#6:                               34
#7:                              847
#8:                               71

Update

Another option would be using set from data.table. The new function f2 will not depend on the number of columns in the dataset. f1 will fail on 'dt' if the step <- 3. With f2, if the number of columns to be used for computation is '8' with step as '3', it gets the rowSums of 1st 3, rowSums of 4:6, and rowSums of 7:8. In the example, it was not clear how the OP want to sum in those cases. Similarly for step <- 5, we get the the rowSums of 1st 5 columns, then 6:8

f2 <- function(DT, step){
  indx <- 2:ncol(DT)
  indx1 <-  (seq_along(indx)-1)%/%step +1L
  lst <- split(indx, indx1)
  nm1 <- sapply(lst, function(i) paste(names(DT)[i], collapse="_")) 
  res <- as.data.table(matrix(0, ncol=length(lst),
               nrow=nrow(dt), dimnames=list(NULL, nm1)))

  for(j in seq_along(res)){
   set(res, i=NULL, j=j, value=DT[, lst[[j]], with=FALSE][, Reduce(`+`, .SD)])
   }
  cbind(Id=DT[['Id']], res)
  } 

f2(dt,2)
#   Id 01.01.2005_02.01.2005 03.01.2005_04.01.2005 05.01.2005_06.01.2005
#1:  1                    14                     2                    45
#2:  2                     5                     3                    16
#3:  3                     7                     3                    46
#4:  4                     0                     6                    55
#5:  5                     5                     7                     6
#6:  6                     7                    77                    76
#7:  7                    16                    45                    67
#8:  8                     9                     1                    12
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

f2(dt,3)
# Id 01.01.2005_02.01.2005_03.01.2005 04.01.2005_05.01.2005_06.01.2005
#1:  1                               16                               45
#2:  2                                8                               16
#3:  3                               10                               46
#4:  4                                6                               55
#5:  5                               12                                6
#6:  6                               84                               76
#7:  7                               61                               67
#8:  8                                9                               13
#   07.01.2005_08.01.2005
#1:                     2
#2:                     3
#3:                   734
#4:                     2
#5:                   469
#6:                    25
#7:                   837
#8:                    60

f2(dt,5)
#  Id 01.01.2005_02.01.2005_03.01.2005_04.01.2005_05.01.2005
#1:  1                                                     16
#2:  2                                                      9
#3:  3                                                     11
#4:  4                                                      7
#5:  5                                                     13
#6:  6                                                     84
#7:  7                                                     61
#8:  8                                                     12
#   06.01.2005_07.01.2005_08.01.2005
#1:                               47
#2:                               18
#3:                              779
#4:                               56
#5:                              474
#6:                              101
#7:                              904
#8:                               70

Using a slightly bigger dataset

set.seed(24)
dt <- as.data.table(matrix(sample(0:9, 5000*5000, replace=TRUE), 
     ncol=5000))
dt <- cbind(Id=1:5000, dt)
dim(dt)
#[1] 5000 5001
system.time(f2(dt, 4))
#   user  system elapsed 
#  2.269   0.022   2.291 

Upvotes: 3

Dirk
Dirk

Reputation: 1282

Basically the same as David Arenburg's answer, but maybe a little more readable is to first take the group number by using the optimized .GRP operator from data.table and then taking the modulo of the step to obtain the index.

step <- 2
temp <- melt(dt, "Id")
temp[, group := .GRP + step-1L, variable]
temp[, indx := group %/% step]
dcast(temp, Id ~ indx, sum, value.var = "value")

Upvotes: 1

David Arenburg
David Arenburg

Reputation: 92300

Here's another possible approach

step <- 2
temp <- melt(dt, "Id")[, indx := rep(seq_len((ncol(dt)-1L)/step), each = nrow(dt)*step)]
dcast(temp, Id ~ indx, sum, value.var = "value")
#    Id  1  2  3   4
# 1:  1 14  2 45   2
# 2:  2  5  3 16   3
# 3:  3  7  3 46 734
# 4:  4  0  6 55   2
# 5:  5  5  7  6 469
# 6:  6  7 77 76  25
# 7:  7 16 45 67 837
# 8:  8  9  1 12  60

Upvotes: 7

Related Questions