Reputation: 81
Let's say that I have the following data.
dt = data.table(
date = c("2020-10-01", "2020-10-02", "2020-10-03", "2020-10-01", "2020-10-01",
"2020-10-03", "2020-10-04", "2020-10-04", "2020-10-05", "2020-10-05"),
client = sample(LETTERS[1:3], 10, replace = TRUE),
vals = rnorm(10))
dt[order(date)]
dt2 = dt[order(date), .(sum_vals = sum(vals)), by = .(date, client)]
date client sum_vals
1: 2020-10-01 B 2.53737527
2: 2020-10-01 C 0.64366866
3: 2020-10-02 A 1.01776243
4: 2020-10-03 C -0.06303562
5: 2020-10-03 A 0.63702089
6: 2020-10-04 B 0.12681052
7: 2020-10-04 A 0.82889616
8: 2020-10-05 B -1.45734539
9: 2020-10-05 C 0.02594185
What I would like to do is get a count of cumulative clients by date.
So in this case, it looks like this.
date. acts
1: 2020-10-01 2 # b and c we active on 10/01 or before
2: 2020-10-02 3 # a, b and c we active on 10/02 or before
3: 2020-10-03 3 # a, b and c we active on 10/03 or before
4: 2020-10-04 3 # a, b and c we active on 10/04 or before
5: 2020-10-05 3 # a, b and c we active on 10/05 or before
Any ideas on how to achieve this with data.table or dplyr?
Upvotes: 2
Views: 121
Reputation: 25223
Here is another option:
dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date]
output:
date acts
1: 2020-10-01 2
2: 2020-10-02 3
3: 2020-10-03 3
4: 2020-10-04 3
5: 2020-10-05 3
data:
library(data.table)
dt2 = fread("date client sum_vals
2020-10-01 B 2.53737527
2020-10-01 C 0.64366866
2020-10-02 A 1.01776243
2020-10-03 C -0.06303562
2020-10-03 A 0.63702089
2020-10-04 B 0.12681052
2020-10-04 A 0.82889616
2020-10-05 B -1.45734539
2020-10-05 C 0.02594185")
edit: sample timing for a larger dataset
data (~1mio rows, and 10k clients in 5y) and timing code:
library(data.table)
set.seed(0L)
nd <- 5L * 365L
nr <- 1e6L
nc <- 1e4L
dt2 <- data.table(date=sample(nd, nr, TRUE), client=sample(nc, nr, TRUE), vals=1L)[,
.(vals=sum(vals)), keyby=.(date, client)]
microbenchmark::microbenchmark(times=1L,
m0={a0 <- dt2[, .(date = unique(date), acts = unlist(lapply(unique(date),
function(x) uniqueN(client[date <= x]))))]},
m1={a1 <- dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date]},
m2={a2 <- dt2[, .(clients = list(unique(client))), by = date
][, acts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))]}
) #in chronological order of post
fsetequal(a0, a1)
#[1] TRUE
fsetequal(a0, a2[, clients:=NULL])
#[1] TRUE
timings:
Unit: milliseconds
expr min lq mean median uq max neval
m0 14293.6303 14293.6303 14293.6303 14293.6303 14293.6303 14293.6303 1
m1 40.2999 40.2999 40.2999 40.2999 40.2999 40.2999 1
m2 1088.7080 1088.7080 1088.7080 1088.7080 1088.7080 1088.7080 1
Upvotes: 3
Reputation: 160952
An alternative:
dt2[, .(clients = list(unique(client))), by = date
][, accts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))]
# date clients accts
# <IDat> <list> <int>
# 1: 2020-10-01 B,C 2
# 2: 2020-10-02 A 3
# 3: 2020-10-03 C,A 3
# 4: 2020-10-04 B,A 3
# 5: 2020-10-05 B,C 3
Data
dt2 <- setDT(structure(list(date = structure(c(18536L, 18536L, 18537L, 18538L, 18538L, 18539L, 18539L, 18540L, 18540L), class = c("IDate", "Date")), client = c("B", "C", "A", "C", "A", "B", "A", "B", "C"), sum_vals = c(2.53737527, 0.64366866, 1.01776243, -0.06303562, 0.63702089, 0.12681052, 0.82889616, -1.45734539, 0.02594185)), row.names = c(NA, -9L), class = c("data.table", "data.frame")))
Benchmark,
bench::mark(
chinsoon12 = dt2[, .(date, v=cumsum(!duplicated(client)))][, .(acts=max(v)), date],
akrun = dt2[, .(date = unique(date), acts = unlist(lapply(unique(date), function(x) uniqueN(client[date <= x]))))],
r2evans = dt2[, .(clients = list(unique(client))), by = date][, accts := lengths(Reduce(function(p,n) unique(c(p,n)), clients, accumulate = TRUE))],
check = FALSE)
# # A tibble: 3 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 chinsoon12 2.17ms 2.61ms 359. 97.7KB 0 180 0 502ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 2 akrun 810.3us 923.5us 1010. 48.7KB 0 505 0 500ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 3 r2evans 843.7us 956.5us 913. 81.1KB 2.45 373 1 409ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
Upvotes: 2
Reputation: 887891
We could do
dt2[, .(date = unique(date), acts = unlist(lapply(unique(date),
function(x) uniqueN(client[date <= x]))))]
-output
date acts
1: 2020-10-01 2
2: 2020-10-02 2
3: 2020-10-03 3
4: 2020-10-04 3
5: 2020-10-05 3
Upvotes: 3