adrian1121
adrian1121

Reputation: 914

Count all possible combinations no matter the column order

I have a dataset which looks like

df <- data.frame(rbind(c("A","B","D",NA,NA,NA,3),
c("B","A","D","C",NA,NA,1),
c("B","C","E","A","D",NA,5),
c("A","B",NA,NA,NA,NA,4),
c("A","B","C","D","E","F",2)))


    X1      X2       X3        X4       X5        X6    N
    A        B        D        NA       NA        NA    3 
    B        A        D        C        NA        NA    1 
    B        C        E        A         D        NA    5 
    A        B        NA       NA       NA        NA    4 
    A        B        C        D         E         F    2 

Where the column N is the number of times that combination happens.

and I want like to have a cumulative sum by unordered columns (no matter on which column the letter is located) such that:

     X1      X2       X3        X4       X5        X6     N
    A       NA        NA       NA        NA        NA     15
    B       NA        NA       NA        NA        NA     15
    C       NA        NA       NA        NA        NA     8
    D       NA        NA       NA        NA        NA     11
    E       NA        NA       NA        NA        NA     7
    F       NA        NA       NA        NA        NA     2
    A        B        NA       NA        NA        NA     15 
    A        B        C        NA        NA        NA     8 
    A        B        C        D         NA        NA     8 
    A        B        C        D         E         NA     7 
    A        B        C        D         E         F      1 
    B        C        NA       NA        NA        NA     8
    ....

So the idea is to have all possible combinations and the frequency but taking into account that the order of appearance in Column is not relevant.

Upvotes: 1

Views: 521

Answers (4)

chinsoon12
chinsoon12

Reputation: 25223

An option using RcppAlgos::comboGeneral to generate combinations and data.table::cube to (quoting from ?cube) Calculate aggregates at various levels of groupings producing multiple (sub-)totals.:

library(data.table)
library(RcppAlgos)

v <- unique(unlist(df[-ncol(df)]))
v <- sort(v[!is.na(v)])
nc <- length(v)

DT <- melt(setDT(df)[, rn:=.I], id.vars=c("rn", "X7"), na.rm=TRUE, variable.factor=FALSE)
combi <- DT[, as.data.table(do.call(rbind, lapply(1L:.N, function(m) {
    rcom <- comboGeneral(value, m)
    M <- matrix("", nrow=nrow(rcom), ncol=nc)

    M[cbind(rep(1L:nrow(rcom), ncol(rcom)), match(rcom, v))] <- rcom
    M
}))), .(rn, COUNT=as.integer(X7))]
ans <- cube(combi, .(COUNT=sum(COUNT)), by=paste0("V", 1:6))
setorderv(ans[complete.cases(ans)], paste0("V", 6:1))[]

output:

    V1 V2 V3 V4 V5 V6 COUNT
 1:  A                   15
 2:     B                15
 3:  A  B                15
 4:        C              8
 5:  A     C              8
 6:     B  C              8
 7:  A  B  C              8
 8:           D          11
 9:  A        D          11
10:     B     D          11
11:  A  B     D          11
12:        C  D           8
13:  A     C  D           8
14:     B  C  D           8
15:  A  B  C  D           8
16:              E        7
17:  A           E        7
18:     B        E        7
19:  A  B        E        7
20:        C     E        7
21:  A     C     E        7
22:     B  C     E        7
23:  A  B  C     E        7
24:           D  E        7
25:  A        D  E        7
26:     B     D  E        7
27:  A  B     D  E        7
28:        C  D  E        7
29:  A     C  D  E        7
30:     B  C  D  E        7
31:  A  B  C  D  E        7
32:                 F     2
33:  A              F     2
34:     B           F     2
35:  A  B           F     2
36:        C        F     2
37:  A     C        F     2
38:     B  C        F     2
39:  A  B  C        F     2
40:           D     F     2
41:  A        D     F     2
42:     B     D     F     2
43:  A  B     D     F     2
44:        C  D     F     2
45:  A     C  D     F     2
46:     B  C  D     F     2
47:  A  B  C  D     F     2
48:              E  F     2
49:  A           E  F     2
50:     B        E  F     2
51:  A  B        E  F     2
52:        C     E  F     2
53:  A     C     E  F     2
54:     B  C     E  F     2
55:  A  B  C     E  F     2
56:           D  E  F     2
57:  A        D  E  F     2
58:     B     D  E  F     2
59:  A  B     D  E  F     2
60:        C  D  E  F     2
61:  A     C  D  E  F     2
62:     B  C  D  E  F     2
63:  A  B  C  D  E  F     2
    V1 V2 V3 V4 V5 V6 COUNT

Upvotes: 1

ThomasIsCoding
ThomasIsCoding

Reputation: 102700

Here is a base R solution

l <- Map(function(x) c(na.omit(x)),data.frame(t(df[1:6]),stringsAsFactors = FALSE))
lout <- Map(function(x) c(na.omit(x)),data.frame(t(dfout),stringsAsFactors = FALSE))

dfout$N <- sapply(lout, function(x) sum(as.numeric(df$X7)[sapply(l, function(v) all(x %in% v))]))

such that

> dfout
   X1   X2   X3   X4   X5   X6  N
1   A <NA> <NA> <NA> <NA> <NA> 15
2   B <NA> <NA> <NA> <NA> <NA> 15
3   C <NA> <NA> <NA> <NA> <NA>  8
4   D <NA> <NA> <NA> <NA> <NA> 11
5   E <NA> <NA> <NA> <NA> <NA>  7
6   F <NA> <NA> <NA> <NA> <NA>  2
7   A    B <NA> <NA> <NA> <NA> 15
8   A    B    C <NA> <NA> <NA>  8
9   A    B    C    D <NA> <NA>  8
10  A    B    C    D    E <NA>  7
11  A    B    C    D    E    F  2
12  B    C <NA> <NA> <NA> <NA>  8

DATA

df <- structure(list(X1 = structure(c(1L, 2L, 2L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), X2 = structure(c(2L, 1L, 3L, 2L, 2L), .Label = c("A", 
"B", "C"), class = "factor"), X3 = structure(c(2L, 2L, 3L, NA, 
1L), .Label = c("C", "D", "E"), class = "factor"), X4 = structure(c(NA, 
2L, 1L, NA, 3L), .Label = c("A", "C", "D"), class = "factor"), 
    X5 = structure(c(NA, NA, 1L, NA, 2L), .Label = c("D", "E"
    ), class = "factor"), X6 = structure(c(NA, NA, NA, NA, 1L
    ), .Label = "F", class = "factor"), X7 = structure(c(3L, 
    1L, 5L, 4L, 2L), .Label = c("1", "2", "3", "4", "5"), class = "factor")), class = "data.frame", row.names = c(NA, 
-5L))

dfout <- structure(list(X1 = c("A", "B", "C", "D", "E", "F", "A", "A", 
"A", "A", "A", "B"), X2 = c(NA, NA, NA, NA, NA, NA, "B", "B", 
"B", "B", "B", "C"), X3 = c(NA, NA, NA, NA, NA, NA, NA, "C", 
"C", "C", "C", NA), X4 = c(NA, NA, NA, NA, NA, NA, NA, NA, "D", 
"D", "D", NA), X5 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "E", 
"E", NA), X6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "F", 
NA)), row.names = c(NA, -12L), class = "data.frame")

Upvotes: 1

Iroha
Iroha

Reputation: 34726

Here's one approach that generates a list of the combinations of the values by row adds it to the original dataframe, unnests and tallies N by group.

library(dplyr)
library(tidyr)

df %>%
  mutate(comblist = apply(.[1:6], 1, function(x) {
    x <- sort(na.omit(x))
    unlist(sapply(seq_along(x), function(y)
      list(combn(x, y,
        FUN = function(l)
          list(toString(l))
      ))))
  })) %>%
  select(comblist, N) %>%
  unnest(comblist) %>%
  group_by(comblist) %>%
  summarise(x = sum(N))

# A tibble: 63 x 2
   comblist             N
   <chr>            <dbl>
 1 A                   15
 2 A, B                15
 3 A, B, C              8
 4 A, B, C, D           8
 5 A, B, C, D, E        7
 6 A, B, C, D, E, F     2
 7 A, B, C, D, F        2
 8 A, B, C, E           7
 9 A, B, C, E, F        2
10 A, B, C, F           2
# ... with 53 more rows

Upvotes: 1

dario
dario

Reputation: 6483

Not sure what you want to do and it would be helpful to add a minimal reproducible example as well as an expected example. That way you can help others to help you! But if you want the counts for the letters in any and all columns:

library(dplyr)
library(tidyr)
df %>% 
pivot_longer(cols =everything()) %>%
group_by(value) %>%
summarise(N = n())

Upvotes: 0

Related Questions