Abraham Mathew

Reputation: 81

Get a count of cumulative values over time

Let's say that I have the following data.

library(data.table)

dt = data.table(
  date = c("2020-10-01", "2020-10-02", "2020-10-03", "2020-10-01", "2020-10-01",
           "2020-10-03", "2020-10-04", "2020-10-04", "2020-10-05", "2020-10-05"),
  client = sample(LETTERS[1:3], 10, replace = TRUE),
  vals = rnorm(10))
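
Note that sample() and rnorm() run without a fixed seed, so the values below reflect one particular draw (which is also why the outputs shown in the answers can differ). For reproducibility, a seed could be set before building dt, e.g.:

set.seed(42)  # hypothetical seed; any fixed value makes sample()/rnorm() repeatable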

dt[order(date)]

dt2 = dt[order(date), .(sum_vals = sum(vals)), by = .(date, client)]

         date client    sum_vals
1: 2020-10-01      B  2.53737527
2: 2020-10-01      C  0.64366866
3: 2020-10-02      A  1.01776243
4: 2020-10-03      C -0.06303562
5: 2020-10-03      A  0.63702089
6: 2020-10-04      B  0.12681052
7: 2020-10-04      A  0.82889616
8: 2020-10-05      B -1.45734539
9: 2020-10-05      C  0.02594185

What I would like to do is get a cumulative count of distinct clients by date.

In this case, it looks like this.

         date acts
1: 2020-10-01    2   # B and C were active on 10/01 or before
2: 2020-10-02    3   # A, B and C were active on 10/02 or before
3: 2020-10-03    3   # A, B and C were active on 10/03 or before
4: 2020-10-04    3   # A, B and C were active on 10/04 or before
5: 2020-10-05    3   # A, B and C were active on 10/05 or before

Any ideas on how to achieve this with data.table or dplyr?
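
For reference, a dplyr version of the same idea might look like this (a minimal, untested sketch assuming dt2 is the per-date summary shown above; only standard dplyr verbs are used):

library(dplyr)

dt2 %>%
  arrange(date) %>%
  mutate(first_seen = !duplicated(client)) %>%   # TRUE the first time a client appears
  group_by(date) %>%
  summarise(new_clients = sum(first_seen), .groups = "drop") %>%
  mutate(acts = cumsum(new_clients)) %>%         # running total of first appearances
  select(date, acts)
# acts: 2, 3, 3, 3, 3 for the five dates above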

Upvotes: 2

Views: 121

Answers (3)

chinsoon12

Reputation: 25223

Here is another option:

dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date]

output:

         date acts
1: 2020-10-01    2
2: 2020-10-02    3
3: 2020-10-03    3
4: 2020-10-04    3
5: 2020-10-05    3
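
To see why this works: the inner call tags every row with a running count of first-time clients (cumsum(!duplicated(client)) only increments when a client appears for the first time), and the outer call keeps the per-date maximum. Note this relies on dt2 being sorted by date, as it is here:

dt2[, .(date, v=cumsum(!duplicated(client)))]
#          date v
# 1: 2020-10-01 1
# 2: 2020-10-01 2
# 3: 2020-10-02 3
# 4: 2020-10-03 3
# 5: 2020-10-03 3
# 6: 2020-10-04 3
# 7: 2020-10-04 3
# 8: 2020-10-05 3
# 9: 2020-10-05 3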

data:

library(data.table)
dt2 = fread("date client    sum_vals
2020-10-01      B  2.53737527
2020-10-01      C  0.64366866
2020-10-02      A  1.01776243
2020-10-03      C -0.06303562
2020-10-03      A  0.63702089
2020-10-04      B  0.12681052
2020-10-04      A  0.82889616
2020-10-05      B -1.45734539
2020-10-05      C  0.02594185")

Edit: sample timing for a larger dataset.

Data (~1 million rows and 10k clients over 5 years) and timing code:

library(data.table)
set.seed(0L)
nd <- 5L * 365L
nr <- 1e6L
nc <- 1e4L
dt2 <- data.table(date=sample(nd, nr, TRUE), client=sample(nc, nr, TRUE), vals=1L)[,
    .(vals=sum(vals)), keyby=.(date, client)]

microbenchmark::microbenchmark(times=1L,
    m0={a0 <- dt2[,  .(date = unique(date), acts = unlist(lapply(unique(date),  
         function(x) uniqueN(client[date <= x]))))]},
    m1={a1 <- dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date]},
    m2={a2 <- dt2[, .(clients = list(unique(client))), by = date
        ][, acts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))]}
) #in chronological order of post
fsetequal(a0, a1)
#[1] TRUE
fsetequal(a0, a2[, clients:=NULL])
#[1] TRUE

timings (m0 rescans the full table for every unique date, which explains its much larger cost):

Unit: milliseconds
 expr        min         lq       mean     median         uq        max neval
   m0 14293.6303 14293.6303 14293.6303 14293.6303 14293.6303 14293.6303     1
   m1    40.2999    40.2999    40.2999    40.2999    40.2999    40.2999     1
   m2  1088.7080  1088.7080  1088.7080  1088.7080  1088.7080  1088.7080     1

Upvotes: 3

r2evans

Reputation: 160952

An alternative:

dt2[, .(clients = list(unique(client))), by = date
  ][, accts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))]
#          date clients accts
#        <IDat>  <list> <int>
# 1: 2020-10-01     B,C     2
# 2: 2020-10-02       A     3
# 3: 2020-10-03     C,A     3
# 4: 2020-10-04     B,A     3
# 5: 2020-10-05     B,C     3
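
The Reduce(..., accumulate = TRUE) step builds the running union of the per-date client sets, so lengths() of the result is the cumulative distinct-client count. On the first three dates above it behaves like:

Reduce(function(p, n) unique(c(p, n)),
       list(c("B", "C"), "A", c("C", "A")),
       accumulate = TRUE)
# [[1]] "B" "C"        -> 2 clients so far
# [[2]] "B" "C" "A"    -> 3
# [[3]] "B" "C" "A"    -> 3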

Data

dt2 <- setDT(structure(
  list(
    date = structure(c(18536L, 18536L, 18537L, 18538L, 18538L,
                       18539L, 18539L, 18540L, 18540L),
                     class = c("IDate", "Date")),
    client = c("B", "C", "A", "C", "A", "B", "A", "B", "C"),
    sum_vals = c(2.53737527, 0.64366866, 1.01776243, -0.06303562, 0.63702089,
                 0.12681052, 0.82889616, -1.45734539, 0.02594185)),
  row.names = c(NA, -9L),
  class = c("data.table", "data.frame")))

Benchmark:

bench::mark(
  chinsoon12 = dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date], 
  akrun = dt2[, .(date = unique(date), acts = unlist(lapply(unique(date), function(x) uniqueN(client[date <= x]))))], 
  r2evans = dt2[, .(clients = list(unique(client))), by = date][, accts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))], 
  check = FALSE)
# # A tibble: 3 x 13
#   expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory        time     gc        
#   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>        <list>   <list>    
# 1 chinsoon12   2.17ms   2.61ms      359.    97.7KB     0      180     0      502ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 2 akrun       810.3us  923.5us     1010.    48.7KB     0      505     0      500ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 3 r2evans     843.7us  956.5us      913.    81.1KB     2.45   373     1      409ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~

Upvotes: 2

akrun

Reputation: 887891

We could do

# for each distinct date, count the distinct clients seen on or before it
dt2[,  .(date = unique(date), acts = unlist(lapply(unique(date),
         function(x) uniqueN(client[date <= x]))))]

output:

         date acts
1: 2020-10-01    2
2: 2020-10-02    3
3: 2020-10-03    3
4: 2020-10-04    3
5: 2020-10-05    3

Upvotes: 3
