Wilson Souza
Wilson Souza

Reputation: 860

Generate table with count of all combinations by group in a efficient way

I have the following dataset example:

df <- tibble(group = c(rep(1, 6), rep(2, 6)),
             class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
             class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))
df

I would like to do a table of all combinations between class1 and class2, by group in a fast way.

I try the code below, but it is painfully slow for my data (that is huge > 10 million rows). It takes more than 30 minutes.

output <- df %>% table() %>% as.data.table()

output desired:

output <- tibble(group = c(rep(1, 9), rep(1, 9)),
                 class1 = c(rep("A", 3), rep("B", 3), rep("C", 3),
                            rep("A", 3), rep("B", 3), rep("C", 3)),
                 class2 = rep(c("A", "B", "C"), 6),
                 N = c(1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1))
output

Thanks for any help

Upvotes: 2

Views: 216

Answers (3)

jblood94
jblood94

Reputation: 16981

This can be a bit faster than table:

library(data.table)

df <- data.table(group = c(rep(1, 6), rep(2, 6)),
                 class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
                 class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))

u <- lapply(df, function(x) sort(unique(x)))
m <- rev(cumprod(c(1, rev(lengths(u)))))
do.call(CJ, u)[
  , N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])
][]
#>     group class1 class2 N
#>  1:     1      A      A 1
#>  2:     1      A      B 1
#>  3:     1      A      C 0
#>  4:     1      B      A 0
#>  5:     1      B      B 2
#>  6:     1      B      C 1
#>  7:     1      C      A 0
#>  8:     1      C      B 1
#>  9:     1      C      C 0
#> 10:     2      A      A 0
#> 11:     2      A      B 0
#> 12:     2      A      C 0
#> 13:     2      B      A 1
#> 14:     2      B      B 2
#> 15:     2      B      C 0
#> 16:     2      C      A 1
#> 17:     2      C      B 1
#> 18:     2      C      C 1

Timing a much larger data set:

library(stringi)

df <- data.table(
  group = sample(20, 2e7, TRUE),
  class1 = stri_rand_strings(2e7, 2, "[A-Za-z]"),
  class2 = stri_rand_strings(2e7, 2, "[A-Za-z]")
)

system.time({
  u <- lapply(df, function(x) sort(unique(x)))
  m <- rev(cumprod(c(1, rev(lengths(u)))))
  output <- do.call(CJ, u)[
    , N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])
  ]
})
#>    user  system elapsed 
#>    3.98    0.68    4.41

Compared to table:

system.time({output <- setorder(as.data.table(table(df)))})
#>    user  system elapsed 
#>   28.40    3.64   13.77

Even with 20M rows, table is finishing within seconds. My guess is the > 30 minute timing experienced by the OP is due to a large number of combinations of group, class1, and class2.

Upvotes: 2

Waldi
Waldi

Reputation: 41210

With data.table:

setDT(df)[CJ(group=unique(group),class1=unique(class1),class2=unique(class2))
          ,.(group,x.group,class1,class2),on=.(group,class1,class2)][
          ,.(N=sum(!is.na(x.group))),by=.(group,class1,class2)]

    group class1 class2     N
    <num> <char> <char> <int>
 1:     1      A      A     1
 2:     1      A      B     1
 3:     1      A      C     0
 4:     1      B      A     0
 5:     1      B      B     2
 6:     1      B      C     1
 7:     1      C      A     0
 8:     1      C      B     1
 9:     1      C      C     0
10:     2      A      A     0
11:     2      A      B     0
12:     2      A      C     0
13:     2      B      A     1
14:     2      B      B     2
15:     2      B      C     0
16:     2      C      A     1
17:     2      C      B     1
18:     2      C      C     1

However, this is much slower than your initial solution:

microbenchmark::microbenchmark(table = {df %>% table() %>% as.data.table()},
                               data.table = setDT(df)[CJ(group=unique(group),class1=unique(class1),class2=unique(class2)),.(group,x.group,class1,class2),on=.(group,class1,class2)][
                                     ,.(N=sum(!is.na(x.group))),by=.(group,class1,class2)] )

Unit: microseconds
       expr      min        lq     mean    median       uq       max neval
      table  546.501  615.9015  737.100  697.6505  775.152  1619.901   100
 data.table 4242.001 4495.0010 5038.249 4766.6005 5192.601 14618.100   100

Upvotes: 0

Karthik S
Karthik S

Reputation: 11584

Does this work:

library(dplyr)
library(tidyr)

df %>% mutate(N = 1) %>% complete( group, class1, class2) %>% 
                distinct() %>% mutate(N = replace_na(N, 0))
# A tibble: 18 × 4
   group class1 class2     N
   <dbl> <chr>  <chr>  <dbl>
 1     1 A      A          1
 2     1 A      B          1
 3     1 A      C          0
 4     1 B      A          0
 5     1 B      B          1
 6     1 B      C          1
 7     1 C      A          0
 8     1 C      B          1
 9     1 C      C          0
10     2 A      A          0
11     2 A      B          0
12     2 A      C          0
13     2 B      A          1
14     2 B      B          1
15     2 B      C          0
16     2 C      A          1
17     2 C      B          1
18     2 C      C          1

Upvotes: 3

Related Questions