Reputation: 914
I have a dataset which looks like
df <- data.frame(rbind(c("A","B","D",NA,NA,NA,3),
c("B","A","D","C",NA,NA,1),
c("B","C","E","A","D",NA,5),
c("A","B",NA,NA,NA,NA,4),
c("A","B","C","D","E","F",2)))
X1 X2 X3 X4 X5 X6 N
A B D NA NA NA 3
B A D C NA NA 1
B C E A D NA 5
A B NA NA NA NA 4
A B C D E F 2
Where the column N is the number of times that combination happens.
and I want like to have a cumulative sum by unordered columns (no matter on which column the letter is located) such that:
X1 X2 X3 X4 X5 X6 N
A NA NA NA NA NA 15
B NA NA NA NA NA 15
C NA NA NA NA NA 8
D NA NA NA NA NA 11
E NA NA NA NA NA 7
F NA NA NA NA NA 2
A B NA NA NA NA 15
A B C NA NA NA 8
A B C D NA NA 8
A B C D E NA 7
A B C D E F 1
B C NA NA NA NA 8
....
So the idea is to have all possible combinations and the frequency but taking into account that the order of appearance in Column is not relevant.
Upvotes: 1
Views: 521
Reputation: 25223
An option using RcppAlgos::comboGeneral
to generate combinations and data.table::cube
to (quoting from ?cube
) Calculate aggregates at various levels of groupings producing multiple (sub-)totals.:
library(data.table)
library(RcppAlgos)
v <- unique(unlist(df[-ncol(df)]))
v <- sort(v[!is.na(v)])
nc <- length(v)
DT <- melt(setDT(df)[, rn:=.I], id.vars=c("rn", "X7"), na.rm=TRUE, variable.factor=FALSE)
combi <- DT[, as.data.table(do.call(rbind, lapply(1L:.N, function(m) {
rcom <- comboGeneral(value, m)
M <- matrix("", nrow=nrow(rcom), ncol=nc)
M[cbind(rep(1L:nrow(rcom), ncol(rcom)), match(rcom, v))] <- rcom
M
}))), .(rn, COUNT=as.integer(X7))]
ans <- cube(combi, .(COUNT=sum(COUNT)), by=paste0("V", 1:6))
setorderv(ans[complete.cases(ans)], paste0("V", 6:1))[]
output:
V1 V2 V3 V4 V5 V6 COUNT
1: A 15
2: B 15
3: A B 15
4: C 8
5: A C 8
6: B C 8
7: A B C 8
8: D 11
9: A D 11
10: B D 11
11: A B D 11
12: C D 8
13: A C D 8
14: B C D 8
15: A B C D 8
16: E 7
17: A E 7
18: B E 7
19: A B E 7
20: C E 7
21: A C E 7
22: B C E 7
23: A B C E 7
24: D E 7
25: A D E 7
26: B D E 7
27: A B D E 7
28: C D E 7
29: A C D E 7
30: B C D E 7
31: A B C D E 7
32: F 2
33: A F 2
34: B F 2
35: A B F 2
36: C F 2
37: A C F 2
38: B C F 2
39: A B C F 2
40: D F 2
41: A D F 2
42: B D F 2
43: A B D F 2
44: C D F 2
45: A C D F 2
46: B C D F 2
47: A B C D F 2
48: E F 2
49: A E F 2
50: B E F 2
51: A B E F 2
52: C E F 2
53: A C E F 2
54: B C E F 2
55: A B C E F 2
56: D E F 2
57: A D E F 2
58: B D E F 2
59: A B D E F 2
60: C D E F 2
61: A C D E F 2
62: B C D E F 2
63: A B C D E F 2
V1 V2 V3 V4 V5 V6 COUNT
Upvotes: 1
Reputation: 102700
Here is a base R solution
l <- Map(function(x) c(na.omit(x)),data.frame(t(df[1:6]),stringsAsFactors = FALSE))
lout <- Map(function(x) c(na.omit(x)),data.frame(t(dfout),stringsAsFactors = FALSE))
dfout$N <- sapply(lout, function(x) sum(as.numeric(df$X7)[sapply(l, function(v) all(x %in% v))]))
such that
> dfout
X1 X2 X3 X4 X5 X6 N
1 A <NA> <NA> <NA> <NA> <NA> 15
2 B <NA> <NA> <NA> <NA> <NA> 15
3 C <NA> <NA> <NA> <NA> <NA> 8
4 D <NA> <NA> <NA> <NA> <NA> 11
5 E <NA> <NA> <NA> <NA> <NA> 7
6 F <NA> <NA> <NA> <NA> <NA> 2
7 A B <NA> <NA> <NA> <NA> 15
8 A B C <NA> <NA> <NA> 8
9 A B C D <NA> <NA> 8
10 A B C D E <NA> 7
11 A B C D E F 2
12 B C <NA> <NA> <NA> <NA> 8
DATA
df <- structure(list(X1 = structure(c(1L, 2L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), X2 = structure(c(2L, 1L, 3L, 2L, 2L), .Label = c("A",
"B", "C"), class = "factor"), X3 = structure(c(2L, 2L, 3L, NA,
1L), .Label = c("C", "D", "E"), class = "factor"), X4 = structure(c(NA,
2L, 1L, NA, 3L), .Label = c("A", "C", "D"), class = "factor"),
X5 = structure(c(NA, NA, 1L, NA, 2L), .Label = c("D", "E"
), class = "factor"), X6 = structure(c(NA, NA, NA, NA, 1L
), .Label = "F", class = "factor"), X7 = structure(c(3L,
1L, 5L, 4L, 2L), .Label = c("1", "2", "3", "4", "5"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
dfout <- structure(list(X1 = c("A", "B", "C", "D", "E", "F", "A", "A",
"A", "A", "A", "B"), X2 = c(NA, NA, NA, NA, NA, NA, "B", "B",
"B", "B", "B", "C"), X3 = c(NA, NA, NA, NA, NA, NA, NA, "C",
"C", "C", "C", NA), X4 = c(NA, NA, NA, NA, NA, NA, NA, NA, "D",
"D", "D", NA), X5 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "E",
"E", NA), X6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "F",
NA)), row.names = c(NA, -12L), class = "data.frame")
Upvotes: 1
Reputation: 34726
Here's one approach that generates a list of the combinations of the values by row adds it to the original dataframe, unnests and tallies N by group.
library(dplyr)
library(tidyr)
df %>%
mutate(comblist = apply(.[1:6], 1, function(x) {
x <- sort(na.omit(x))
unlist(sapply(seq_along(x), function(y)
list(combn(x, y,
FUN = function(l)
list(toString(l))
))))
})) %>%
select(comblist, N) %>%
unnest(comblist) %>%
group_by(comblist) %>%
summarise(x = sum(N))
# A tibble: 63 x 2
comblist N
<chr> <dbl>
1 A 15
2 A, B 15
3 A, B, C 8
4 A, B, C, D 8
5 A, B, C, D, E 7
6 A, B, C, D, E, F 2
7 A, B, C, D, F 2
8 A, B, C, E 7
9 A, B, C, E, F 2
10 A, B, C, F 2
# ... with 53 more rows
Upvotes: 1
Reputation: 6483
Not sure what you want to do and it would be helpful to add a minimal reproducible example as well as an expected example. That way you can help others to help you! But if you want the counts for the letters in any and all columns:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols =everything()) %>%
group_by(value) %>%
summarise(N = n())
Upvotes: 0