clp
clp

Reputation: 1528

How can I fast outer-join and filter two vectors (or lists), preferably in base R?

## outer join and filter
outer_join <- function(x, y, FUN) {
  if (missing(y)) {y = x}
  cp <- list()
  for (d1 in x) {
    for (d2 in  y) {
      if ( missing(FUN) || FUN(d1, d2) ) {
            cp <- c( cp, list(c(d1, d2)))       # and collect
       }
    }
  }
  return(cp)
}

## benchmark
system.time(t4 <- outer_join(seq(1e4), FUN=function(a,b){return( a == b)}) )
##  user  system elapsed
## 49.48    0.18   49.80

## examples
system.time(outer_join(seq(2^8))                                                   )  # cartesian product
system.time(outer_join(seq(2^8), FUN=function(a,b){return( a == b)} )              )  # all equal pairs
system.time(outer_join(seq(2^8), FUN=function(a,b,n=7){return( ((a-b) %% n) == 0)}))  # difference is 0 modulo 7

However above method is not suitable for larger datasets (> 1000). Clearly, the nested for loop suggests room for improvement. What is the best practice to do this in R?

Note that the ideal solution works when the expand.grid does not fit in memory (before filtering) but the resulting output does. It is outer-join and filter instantaneously.

Upvotes: 1

Views: 371

Answers (4)

clp
clp

Reputation: 1528

The fasted solution I could find. Genuine improvements, preferably in the R base, are welcome.

## cross join and filter.
cjf <- function(x, y = x, FUN, ...) {
  if (is.matrix(x)) return(NULL); if (is.matrix(y)) return(NULL)
  rrr <- c()
  fmissing <- missing(FUN)
  if (!fmissing && (length(FUN(x,x)) != length(x)) ) {
    warning("length(FUN(x,x)) do not match length(x)")
  }
  if (!fmissing) FUN <- match.fun(FUN)
  for (i in seq_along(y) ) {
    if (fmissing) {
      mmm <- rbind(x, y[i])
    } else {
      idx      <- which(FUN(x, y[i]) ) # idx is possible empty
      if (length(idx) > 0) mmm  <- rbind(x[idx], y[i]) else mmm <- c()
    }
    rrr[length(rrr) + seq_along(mmm)] <- mmm
  }
  if (length(rrr) == 0) rrr <- matrix(0, nrow = 0, ncol = 0)
  return(matrix(rrr, ncol=2, byrow=TRUE))
}
## system.time(jjj <- cjf(seq(1e4), FUN = function(a, b) a>b & (a - b) %% 7L == 0) )
## expr      min       lq     mean   median       uq     max neval
##   0) 1.777018 1.797532 1.806776 1.805949 1.813646 1.89837   100

## Examples.
f_gt <- function(a,b){return( a > b) }
x_1  <- cjf(letters[1:5])
x_2  <- cjf(0:1, letters[1:5]) 
x_3a <- cjf(0:1, letters[1:5],   FUN = function(a,b) a == a)  
x_3b <- cjf(0:1, letters[1:5],   FUN = function(a,b) rep(TRUE, length(a)))
x_3c <- cjf(0:1, letters[1:5],   FUN = function(a,b) a != a) 
x_3d <- cjf(0:1, letters[1:5],   FUN = function(a,b) rep(FALSE, length(a)))
x_3e <- cjf(1:2+1i, 0:5,         FUN = function(a,b) Mod(a) > Mod(b))
x_4a <- cjf(letters[1:5],        FUN = f_gt)
x_4b <- cjf(letters[1:5],        FUN = `>`)
x_4c <- cjf(letters[1:5],        FUN = function(a,b) a > b)
x_4d <- cjf(as.double(seq(1E4)), FUN = function(a,b){return( abs(a*a*a - b) <= .Machine$double.eps) } )
x_5  <- cjf(list("a", "b", "c"))

Upvotes: 0

Jeremy R Ansell
Jeremy R Ansell

Reputation: 196

The main reason your solution is so slow is because of the line cp <- c( cp, list(c(d1, d2))). This is a very inefficient way to grow an object because it results in the object being copied with each c() call.

If you instead insert into a list you will see substantially better performance. We can make a couple of other small optimisations:

  • Checking whether FUN is missing outside of the main loop so that we just need to do it once.
  • Allocating a vector of the correct length up-front if FUN is missing, since we know the length of the output for this case.
outer_join <- function(x, y = x, FUN) {
  fmissing <- missing(FUN)
  if (fmissing) {
    cp <- vector("list", length(x) * length(y))
  } else {
    cp <- list()
  }
  i <- 1L
  for (d1 in x) {
    for (d2 in y) {
      if (fmissing || FUN(d1, d2)) {
        cp[[i]] <- c(d1, d2)
        i <- i + 1L
      }
    }
  }
  cp
}

microbenchmark::microbenchmark(
  `Ex. 1` = outer_join(seq(2^8)),
  `Ex. 2` = outer_join(seq(2^8), FUN = `==`),
  `Ex. 3` = outer_join(seq(2^8), FUN = function(a, b) (a - b) %% 7L == 0),
  times = 10,
  unit = "s"
)
#> Unit: seconds
#>   expr        min         lq       mean     median         uq        max neval
#>  Ex. 1 0.02300627 0.02473937 0.02787098 0.02566033 0.03057122 0.03753821    10
#>  Ex. 2 0.01391696 0.01527710 0.01785506 0.01735052 0.01916601 0.02490142    10
#>  Ex. 3 0.05839193 0.06460381 0.07189763 0.07218238 0.08215803 0.08275439    10

Also see this chapter from Hadley Wickham's Advanced R for a discussion of issues with growing objects, as well as Chapter 2 of R Inferno. My experience is that R is not as slow at loops as it is reputed to be, as long as you avoid growing objects inefficiently.

Upvotes: 1

IceCreamToucan
IceCreamToucan

Reputation: 28675

If your FUN is something that can be translated to sql by dbplyr, you could use duckdb + dbplyr and supply FUN as an expression rather than an actual function. This

library(duckdb)
#> Loading required package: DBI
library(dplyr, warn = FALSE)
library(dbplyr, warn = FALSE)

outer_join <- function(x, y, FUN, show_query = FALSE){
  if (missing(y)) y <- x
  con <- dbConnect(duckdb(), dbdir = ':memory:')
  dbWriteTable(con, 'x', tibble(x))
  dbWriteTable(con, 'y', tibble(y))
  x_tbl <- tbl(con, 'x')
  y_tbl <- tbl(con, 'y')
  out <- 
    x_tbl %>% 
      inner_join(y_tbl, sql_on = '1 = 1') %>% 
      filter({{ FUN }}) %>% 
      {if (show_query) show_query(.) else .} %>% 
      collect
  dbDisconnect(con)
  out
}

Example:

outer_join(seq(2^8), FUN = x == y, show_query = TRUE)
#> <SQL>
#> SELECT *
#> FROM (
#>   SELECT "x", "y"
#>   FROM "x" AS "LHS"
#>   INNER JOIN "y" AS "RHS"
#>     ON (1 = 1)
#> ) "q01"
#> WHERE ("x" = "y")
#> # A tibble: 256 × 2
#>        x     y
#>    <int> <int>
#>  1     1     1
#>  2     2     2
#>  3     3     3
#>  4     4     4
#>  5     5     5
#>  6     6     6
#>  7     7     7
#>  8     8     8
#>  9     9     9
#> 10    10    10
#> # … with 246 more rows

Benchmark (note the memory allocation):

expand_oj <- function(x, y, FUN = `==`) {
  if (missing(y)) {y = x}
  subset(expand.grid(x = x, y = y), FUN(x, y))
}

x <- seq(2^8)
bench::mark(
  duck = outer_join(x, FUN = x == y),
  expand = expand_oj(x, FUN = `==`),
  check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 duck        53.31ms  57.79ms      17.3  266.34KB     21.1
#> 2 expand       1.22ms   1.55ms     461.     3.08MB     29.9

x <- seq(2^13)
bench::mark(
  duck = outer_join(x, FUN = x == y),
  expand = expand_oj(x, FUN = `==`),
  check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 duck        50.69ms  53.34ms    18.2       359KB     5.46
#> 2 expand        1.44s    1.44s     0.693       3GB     2.77

Created on 2022-09-05 with reprex v2.0.2

Or use {sqldf}

library(sqldf)
#> Loading required package: gsubfn
#> Loading required package: proto
#> Loading required package: RSQLite

use_sqldf <- function(x, y = x) {
  df_x <- data.frame(x)
  df_y <- data.frame(y)
  
  sqldf('
  select
    *
  from
    df_x join df_y on 1 = 1
  where
    x = y
  ')
}

x <- seq(2^13)
bench::mark(
  duck = outer_join(x, FUN = x == y),
  expand = expand_oj(x, FUN = `==`),
  use_sqldf = use_sqldf(x),
  check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 duck        74.16ms  76.79ms    12.6    381.86KB     3.59
#> 2 expand        1.43s    1.43s     0.702       3GB     2.81
#> 3 use_sqldf   36.01ms  37.93ms    24.0      1.69MB     2.00

Upvotes: 2

Darren Tsai
Darren Tsai

Reputation: 35554

You could achieve it with expand.grid + subset:

outer_join <- function(x, y, FUN = `==`) {
  if (missing(y)) {y = x}
  subset(expand.grid(x = x, y = y), FUN(x, y))
}
Test
system.time(res1 <- outer_join(seq(2^8)))
#   user  system elapsed 
#  0.005   0.001   0.005

system.time(res2 <- outer_join(seq(2^8), FUN = function(a, b){ return( a == b) }))
#   user  system elapsed 
#  0.003   0.000   0.004

system.time(res3 <- outer_join(seq(2^8), FUN = function(a, b, n = 7){ return( ((a-b) %% n) == 0) }))
#   user  system elapsed 
#  0.007   0.001   0.007

all.equal(res1, res2)
# [1] TRUE

res3
#        x  y
# 1      1  1
# 8      8  1
# 15    15  1
# 22    22  1
# 29    29  1
# 36    36  1
# etc.

Upvotes: 4

Related Questions