Reputation: 51
I need to get all possible combinations of rows, where values in the first 2 columns aren't repeated in both of these columns. Let's say I have a dataset:
A | B | PRICE |
---|---|---|
1 | 3 | 8 |
2 | 3 | 7 |
1 | 4 | 6 |
2 | 4 | 5 |
1 | 5 | 4 |
2 | 5 | 3 |
3 | 5 | 2 |
And I need to get all combinations of 2 rows, then the outcome would be:
(1, 3); (2, 4) = 13
(2, 3); (1, 4) = 13
(1, 3); (2, 5) = 11
(2, 3); (1, 5) = 11
(1, 4); (2, 5) = 9
(2, 4); (1, 5) = 9
(1, 4); (3, 5) = 8
(2, 4); (3, 5) = 7
In the real dataset I need to get combinations, that contain more than 2 rows. I tried using for and while loops, but got nowhere.
Upvotes: 4
Views: 135
Reputation: 16981
Benchmarking on four different solutions. The sqldf
solution is the simplest, but slowest. Somewhat surprisingly, the solution that involves 7 separate data.table
non-equi self-joins is the fastest. If (A, B) is equivalent to (B, A), it can be simplified to 3 separate joins.
library(igraph)
library(data.table)
library(sqldf)
# sqldf non-equi self-join
f1 <- function(df) {
df$r <- 1:nrow(df)
sqldf("
select t1.A as A1, t1.B as B1, t2.A as A2, t2.B as B2, t1.PRICE + t2.PRICE as PRICE
from df t1 inner join df t2
on t1.A <> t2.A and t1.A <> t2.B and t1.B <> t2.A and t1.B <> t2.B and t1.r < t2.r
")
}
# igraph solution
f2 <- function(df) {
u <- unique(unlist(df[,1:2]))
g <- graph_from_data_frame(
rbind(
data.frame(from = 0, to = seq_along(u)),
data.frame(from = match(unlist(df[,1:2]), u), to = rep(1:nrow(df), 2) + length(u))
),
FALSE
)
pairlist <- ego(g, 4, V(g)[-(1:(length(u) + 1L))], mindist = 4)
with(
data.table(
row1 = rep.int(1:nrow(df), lengths(pairlist)),
row2 = unlist(pairlist, TRUE, TRUE) - length(u) - 1L
)[row2 > row1],
with(
df,
data.table(
A1 = A[row1],
B1 = B[row1],
A2 = A[row2],
B2 = B[row2],
PRICE = PRICE[row1] + PRICE[row2]
)
)
)
}
# brute-force data.table solution
f3 <- function(df) {
n <- nrow(setDT(df))
m <- t(df[,c(1,2,1,2)])
rbindlist(
lapply(
1:(n - 1L),
function(i) {
idx <- which(colSums(unlist(df[c(i, i), 1:2]) == m[,(i + 1L):n, drop = FALSE]) == 0L)
if (length(idx)) {
list(
A1 = df$A[i],
B1 = df$B[i],
A2 = df$A[idx <- idx + i],
B2 = df$B[idx],
PRICE = df$PRICE[i] + df$PRICE[idx]
)
} else NULL
}
)
)
}
# multiple data.table non-equi self-joins
f4 <- function(df) {
setDT(df)
rbindlist(
list(
df[df, on = .(A > A, B > B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B > B, A > B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B > B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B < B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B < B, A > B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B < B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B < B, A < B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0]
)
)
}
# multiple data.table non-equi self-joins if (A,B) and (B,A) are equivalent
f5 <- function(df) {
setDT(df)[A > B, `:=`(A = B, B = A)]
rbindlist(
list(
df[df, on = .(A > A, B > B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B > B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
df[df, on = .(A > A, B < B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0]
)
)
}
df <- data.frame(
A = sequence(c(2,2,3)),
B = rep.int(3:5, c(2,2,3)),
PRICE = 8:2
)
f1(df)
#> A1 B1 A2 B2 PRICE
#> 1 1 3 2 4 13
#> 2 1 3 2 5 11
#> 3 2 3 1 4 13
#> 4 2 3 1 5 11
#> 5 1 4 2 5 9
#> 6 1 4 3 5 8
#> 7 2 4 1 5 9
#> 8 2 4 3 5 7
The pair ordering will not be the same for all solutions, but the the solutions all give the same answers if pair ordering is not considered:
lst <- lapply(list(f1(df)$PRICE, f2(df)$PRICE, f3(df)$PRICE, f4(df)$PRICE, f5(df)$PRICE), sort)
identical(lst[-5], lst[-1])
#> [1] TRUE
Benchmarking on a larger data set:
df <- as.data.frame(t(combn(100, 2)[,sample(choose(100, 2), 2e3)]))
setnames(df, c("A", "B"))
df$PRICE <- sample(100, nrow(df), TRUE)
lst <- lapply(list(f1(df)$PRICE, f2(df)$PRICE, f3(df)$PRICE, f4(df)$PRICE, f5(df)$PRICE), sort)
identical(lst[-5], lst[-1])
#> [1] TRUE
microbenchmark::microbenchmark(f1(df),
f2(df),
f3(df),
f4(df),
f5(df),
times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1(df) 1291.3883 1302.9996 1315.5939 1310.2130 1318.2701 1365.7842 10
#> f2(df) 403.7088 412.5443 440.7370 429.5795 472.2435 500.7543 10
#> f3(df) 327.0640 331.5406 343.8076 333.9697 337.0370 440.1181 10
#> f4(df) 123.0421 131.0727 145.9585 133.6451 175.0750 182.6041 10
#> f5(df) 100.9089 103.3349 112.6655 106.7373 112.7453 156.7748 10
A generalized Rcpp
solution that will find all non-overlapping sets of n
rows:
Rcpp::cppFunction(
"
List rowsetdiff(const IntegerMatrix& mat1, const IntegerMatrix& mat2) {
const int n1 = mat1.nrow();
const int n2 = mat2.nrow();
const int m1 = mat1.ncol();
const int m2 = mat2.ncol();
List out(n1);
for (int i1 = 0; i1 < n1; i1++) {
IntegerVector idx(n2);
int row = -1;
for (int i2 = i1 + 1; i2 < n2; i2++) {
int c = 0;
for (int j1 = 0; j1 < m1; j1++) {
for (int j2 = 0; j2 < m2; j2++) {
c += mat1(i1, j1) == mat2(i2, j2);
}
}
if (c == 0) idx(++row) = i2;
}
if (row >= 0) out[i1] = idx[Range(0, row)] + 1;
}
return out;
}
"
)
f0 <- function(df, n = 2L) {
f <- function(df1, df2) {
rows <- rowsetdiff(as.matrix(df1[,1:(ncol(df1) - 1L)]), as.matrix(df2[,1:(ncol(df2) - 1L)]))
row1 <- rep.int(seq_along(rows), lengths(rows))
row2 <- unlist(rows, TRUE, FALSE)
cbind(
df1[row1, 1:(ncol(df1) - 1L)],
df2[row2, 1:(ncol(df2) - 1L)],
df1[row1,][[ncol(df1)]] + df2[row2,][[ncol(df2)]]
)
}
dfComb <- f(df, df)
for (i in seq_len(n - 2L)) dfComb <- f(df, dfComb)
setNames(dfComb, c(paste0(rep(colnames(df)[-ncol(df)], n), rep(1:n, each = ncol(df) - 1L)), colnames(df)[ncol(df)]))
}
Upvotes: 2
Reputation: 101343
You can try the code below with combn
Filter(
length,
combn(
1:nrow(df),
2,
function(k) {
d <- df[k, ]
if (!any(duplicated(unlist(d[c("A", "B")])))) {
d
}
},
simplify = FALSE
)
)
which gives
[[1]]
A B PRICE
1 1 3 8
4 2 4 5
[[2]]
A B PRICE
1 1 3 8
6 2 5 3
[[3]]
A B PRICE
2 2 3 7
3 1 4 6
[[4]]
A B PRICE
2 2 3 7
5 1 5 4
[[5]]
A B PRICE
3 1 4 6
6 2 5 3
[[6]]
A B PRICE
3 1 4 6
7 3 5 2
[[7]]
A B PRICE
4 2 4 5
5 1 5 4
[[8]]
A B PRICE
4 2 4 5
7 3 5 2
or
do.call(
rbind,
combn(
1:nrow(df),
2,
function(k) {
d <- df[k, ]
if (!any(duplicated(unlist(d[c("A", "B")])))) {
cbind(d[1, c("A", "B")], d[2, c("A", "B")], totPrice = sum(d$PRICE))
}
},
simplify = FALSE
)
)
gives
A B A B totPrice
1 1 3 2 4 13
2 1 3 2 5 11
21 2 3 1 4 13
22 2 3 1 5 11
3 1 4 2 5 9
31 1 4 3 5 8
4 2 4 1 5 9
41 2 4 3 5 7
Upvotes: 2
Reputation: 719
Possibly there will be a more optimal method, but try this:
dt <- data.table(
A = c(1L, 2L, 1L, 2L, 1L, 2L, 3L),
B = c(3L, 3L, 4L, 4L, 5L, 5L, 5L),
PRICE = c(8L, 7L, 6L, 5L, 4L, 3L, 2L)
)
library(data.table)
x <- list()
for (i in 1:nrow(dt)){
# get i row
a1 <- dt$A[i]
b1 <- dt$B[i]
d <- c(a1, b1)
p1 <- dt$PRICE[i]
# get rows with A,B not in i row
x[[i]] <- dt %>%
filter(!A %in% d, !B %in% d) %>%
rename(a2=A, b2=B) %>%
mutate(a1=a1, b1=b1, price=PRICE+p1) %>%
select(a1, b1, a2, b2, price,-PRICE) %>%
# create dummy cols and order to detect duplicated data
mutate(a1b1=pmin(paste0(a1,b1), paste0(a2,b2)),
a2b2=pmax(paste0(a1,b1), paste0(a2,b2)))
}
# bind the list of data frames and remove duplicated data
x <- rbindlist(x) %>%
distinct(a1b1, a2b2, .keep_all = T) %>%
select(-a1b1, -a2b2)
> x
a1 b1 a2 b2 price
1: 1 3 2 4 13
2: 1 3 2 5 11
3: 2 3 1 4 13
4: 2 3 1 5 11
5: 1 4 2 5 9
6: 1 4 3 5 8
7: 2 4 1 5 9
8: 2 4 3 5 7
Upvotes: 2