briturr
briturr

Reputation: 51

Combinations of rows, where values in columns aren't repeated

I need to get all possible combinations of rows, where values in the first 2 columns aren't repeated in both of these columns. Let's say I have a dataset:

A B PRICE
1 3 8
2 3 7
1 4 6
2 4 5
1 5 4
2 5 3
3 5 2

And I need to get all combinations of 2 rows, then the outcome would be:

(1, 3); (2, 4) = 13
(2, 3); (1, 4) = 13
(1, 3); (2, 5) = 11
(2, 3); (1, 5) = 11
(1, 4); (2, 5) = 9
(2, 4); (1, 5) = 9
(1, 4); (3, 5) = 8
(2, 4); (3, 5) = 7

In the real dataset I need to get combinations, that contain more than 2 rows. I tried using for and while loops, but got nowhere.

Upvotes: 4

Views: 135

Answers (3)

jblood94
jblood94

Reputation: 16981

Benchmarking on four different solutions. The sqldf solution is the simplest, but slowest. Somewhat surprisingly, the solution that involves 7 separate data.table non-equi self-joins is the fastest. If (A, B) is equivalent to (B, A), it can be simplified to 3 separate joins.

library(igraph)
library(data.table)
library(sqldf)

# sqldf non-equi self-join
f1 <- function(df) {
  df$r <- 1:nrow(df)
  sqldf("
    select t1.A as A1, t1.B as B1, t2.A as A2, t2.B as B2, t1.PRICE + t2.PRICE as PRICE
    from df t1 inner join df t2
    on t1.A <> t2.A and t1.A <> t2.B and t1.B <> t2.A and t1.B <> t2.B and t1.r < t2.r
  ")
}

# igraph solution
f2 <- function(df) {
  u <- unique(unlist(df[,1:2]))
  g <- graph_from_data_frame(
    rbind(
      data.frame(from = 0, to = seq_along(u)),
      data.frame(from = match(unlist(df[,1:2]), u), to = rep(1:nrow(df), 2) + length(u))
    ),
    FALSE
  )
  pairlist <- ego(g, 4, V(g)[-(1:(length(u) + 1L))], mindist = 4)
  with(
    data.table(
      row1 = rep.int(1:nrow(df), lengths(pairlist)),
      row2 = unlist(pairlist, TRUE, TRUE) - length(u) - 1L
    )[row2 > row1],
    with(
      df,
      data.table(
        A1 = A[row1],
        B1 = B[row1],
        A2 = A[row2],
        B2 = B[row2],
        PRICE = PRICE[row1] + PRICE[row2]
      )
    )
  )
}

# brute-force data.table solution
f3 <- function(df) {
  n <- nrow(setDT(df))
  m <- t(df[,c(1,2,1,2)])
  rbindlist(
    lapply(
      1:(n - 1L),
      function(i) {
        idx <- which(colSums(unlist(df[c(i, i), 1:2]) == m[,(i + 1L):n, drop = FALSE]) == 0L)
        if (length(idx)) {
          list(
            A1 = df$A[i],
            B1 = df$B[i],
            A2 = df$A[idx <- idx + i],
            B2 = df$B[idx],
            PRICE = df$PRICE[i] + df$PRICE[idx]
          )
        } else NULL
      }
    )
  )
}

# multiple data.table non-equi self-joins
f4 <- function(df) {
  setDT(df)
  rbindlist(
    list(
      df[df, on = .(A > A, B > B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B > B, A > B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B > B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B < B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B < B, A > B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B < B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B < B, A < B, B < A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0]
    )
  )
}

# multiple data.table non-equi self-joins if (A,B) and (B,A) are equivalent
f5 <- function(df) {
  setDT(df)[A > B, `:=`(A = B, B = A)]
  rbindlist(
    list(
      df[df, on = .(A > A, B > B, A > B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B > B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0],
      df[df, on = .(A > A, B < B, A < B, B > A), .(A1 = i.A, B1 = i.B, A2 = x.A, B2 = x.B, PRICE = i.PRICE + x.PRICE), nomatch = 0]
    )
  )
}

df <- data.frame(
  A = sequence(c(2,2,3)),
  B = rep.int(3:5, c(2,2,3)),
  PRICE = 8:2
)
f1(df)
#>   A1 B1 A2 B2 PRICE
#> 1  1  3  2  4    13
#> 2  1  3  2  5    11
#> 3  2  3  1  4    13
#> 4  2  3  1  5    11
#> 5  1  4  2  5     9
#> 6  1  4  3  5     8
#> 7  2  4  1  5     9
#> 8  2  4  3  5     7

The pair ordering will not be the same for all solutions, but the the solutions all give the same answers if pair ordering is not considered:

lst <- lapply(list(f1(df)$PRICE, f2(df)$PRICE, f3(df)$PRICE, f4(df)$PRICE, f5(df)$PRICE), sort)
identical(lst[-5], lst[-1])
#> [1] TRUE

Benchmarking on a larger data set:

df <- as.data.frame(t(combn(100, 2)[,sample(choose(100, 2), 2e3)]))
setnames(df, c("A", "B"))
df$PRICE <- sample(100, nrow(df), TRUE)
lst <- lapply(list(f1(df)$PRICE, f2(df)$PRICE, f3(df)$PRICE, f4(df)$PRICE, f5(df)$PRICE), sort)
identical(lst[-5], lst[-1])
#> [1] TRUE

microbenchmark::microbenchmark(f1(df),
                               f2(df),
                               f3(df),
                               f4(df),
                               f5(df),
                               times = 10)
#> Unit: milliseconds
#>    expr       min        lq      mean    median        uq       max neval
#>  f1(df) 1291.3883 1302.9996 1315.5939 1310.2130 1318.2701 1365.7842    10
#>  f2(df)  403.7088  412.5443  440.7370  429.5795  472.2435  500.7543    10
#>  f3(df)  327.0640  331.5406  343.8076  333.9697  337.0370  440.1181    10
#>  f4(df)  123.0421  131.0727  145.9585  133.6451  175.0750  182.6041    10
#>  f5(df)  100.9089  103.3349  112.6655  106.7373  112.7453  156.7748    10

A generalized Rcpp solution that will find all non-overlapping sets of n rows:

Rcpp::cppFunction(
  "
  List rowsetdiff(const IntegerMatrix& mat1, const IntegerMatrix& mat2) {
    const int n1 = mat1.nrow();
    const int n2 = mat2.nrow();
    const int m1 = mat1.ncol();
    const int m2 = mat2.ncol();
    List out(n1);
  
    for (int i1 = 0; i1 < n1; i1++) {
      IntegerVector idx(n2);
      int row = -1;
      for (int i2 = i1 + 1; i2 < n2; i2++) {
        int c = 0;
        for (int j1 = 0; j1 < m1; j1++) {
          for (int j2 = 0; j2 < m2; j2++) {
            c += mat1(i1, j1) == mat2(i2, j2);
          }
        }
        if (c == 0) idx(++row) = i2;
      }
      if (row >= 0) out[i1] = idx[Range(0, row)] + 1;
    }
    return out;
  }
  "
)

f0 <- function(df, n = 2L) {
  f <- function(df1, df2) {
    rows <- rowsetdiff(as.matrix(df1[,1:(ncol(df1) - 1L)]), as.matrix(df2[,1:(ncol(df2) - 1L)]))
    row1 <- rep.int(seq_along(rows), lengths(rows))
    row2 <- unlist(rows, TRUE, FALSE)
    cbind(
      df1[row1, 1:(ncol(df1) - 1L)],
      df2[row2, 1:(ncol(df2) - 1L)],
      df1[row1,][[ncol(df1)]] + df2[row2,][[ncol(df2)]]
    )
  }
  
  dfComb <- f(df, df)
  for (i in seq_len(n - 2L)) dfComb <- f(df, dfComb)
  
  setNames(dfComb, c(paste0(rep(colnames(df)[-ncol(df)], n), rep(1:n, each = ncol(df) - 1L)), colnames(df)[ncol(df)]))
}

Upvotes: 2

ThomasIsCoding
ThomasIsCoding

Reputation: 101343

You can try the code below with combn

Filter(
  length,
  combn(
    1:nrow(df),
    2,
    function(k) {
      d <- df[k, ]
      if (!any(duplicated(unlist(d[c("A", "B")])))) {
        d
      }
    },
    simplify = FALSE
  )
)

which gives

[[1]]
  A B PRICE
1 1 3     8
4 2 4     5

[[2]]
  A B PRICE
1 1 3     8
6 2 5     3

[[3]]
  A B PRICE
2 2 3     7
3 1 4     6

[[4]]
  A B PRICE
2 2 3     7
5 1 5     4

[[5]]
  A B PRICE
3 1 4     6
6 2 5     3

[[6]]
  A B PRICE
3 1 4     6
7 3 5     2

[[7]]
  A B PRICE
4 2 4     5
5 1 5     4

[[8]]
  A B PRICE
4 2 4     5
7 3 5     2

or

do.call(
  rbind,
  combn(
    1:nrow(df),
    2,
    function(k) {
      d <- df[k, ]
      if (!any(duplicated(unlist(d[c("A", "B")])))) {
        cbind(d[1, c("A", "B")], d[2, c("A", "B")], totPrice = sum(d$PRICE))
      }
    },
    simplify = FALSE
  )
)

gives

   A B A B totPrice
1  1 3 2 4       13
2  1 3 2 5       11
21 2 3 1 4       13
22 2 3 1 5       11
3  1 4 2 5        9
31 1 4 3 5        8
4  2 4 1 5        9
41 2 4 3 5        7

Upvotes: 2

lumartor
lumartor

Reputation: 719

Possibly there will be a more optimal method, but try this:

dt <- data.table(
           A = c(1L, 2L, 1L, 2L, 1L, 2L, 3L),
           B = c(3L, 3L, 4L, 4L, 5L, 5L, 5L),
       PRICE = c(8L, 7L, 6L, 5L, 4L, 3L, 2L)
)

library(data.table)
x <- list()
for (i in 1:nrow(dt)){
  # get i row
  a1 <- dt$A[i]
  b1 <- dt$B[i]
  d <- c(a1, b1)
  p1 <- dt$PRICE[i]
  # get rows with A,B not in i row
  x[[i]] <- dt %>% 
    filter(!A %in% d, !B %in% d) %>%
    rename(a2=A, b2=B) %>%
    mutate(a1=a1, b1=b1, price=PRICE+p1) %>%
    select(a1, b1, a2, b2, price,-PRICE) %>% 
    # create dummy cols and order to detect duplicated data
    mutate(a1b1=pmin(paste0(a1,b1), paste0(a2,b2)),
           a2b2=pmax(paste0(a1,b1), paste0(a2,b2)))
}
# bind the list of data frames and remove duplicated data
x <- rbindlist(x) %>% 
  distinct(a1b1, a2b2, .keep_all = T) %>% 
  select(-a1b1, -a2b2)

> x
   a1 b1 a2 b2 price
1:  1  3  2  4    13
2:  1  3  2  5    11
3:  2  3  1  4    13
4:  2  3  1  5    11
5:  1  4  2  5     9
6:  1  4  3  5     8
7:  2  4  1  5     9
8:  2  4  3  5     7

Upvotes: 2

Related Questions