Reputation: 1528
## outer join and filter
outer_join <- function(x, y, FUN) {
if (missing(y)) {y = x}
cp <- list()
for (d1 in x) {
for (d2 in y) {
if ( missing(FUN) || FUN(d1, d2) ) {
cp <- c( cp, list(c(d1, d2))) # and collect
}
}
}
return(cp)
}
## benchmark
system.time(t4 <- outer_join(seq(1e4), FUN=function(a,b){return( a == b)}) )
## user system elapsed
## 49.48 0.18 49.80
## examples
system.time(outer_join(seq(2^8)) ) # cartesian product
system.time(outer_join(seq(2^8), FUN=function(a,b){return( a == b)} ) ) # all equal pairs
system.time(outer_join(seq(2^8), FUN=function(a,b,n=7){return( ((a-b) %% n) == 0)})) # difference is 0 modulo 7
However above method is not suitable for larger datasets (> 1000). Clearly, the nested for loop suggests room for improvement. What is the best practice to do this in R?
Note that the ideal solution works when the expand.grid does not fit in memory (before filtering) but the resulting output does. It is outer-join and filter instantaneously.
Upvotes: 1
Views: 371
Reputation: 1528
The fasted solution I could find. Genuine improvements, preferably in the R base, are welcome.
## cross join and filter.
cjf <- function(x, y = x, FUN, ...) {
if (is.matrix(x)) return(NULL); if (is.matrix(y)) return(NULL)
rrr <- c()
fmissing <- missing(FUN)
if (!fmissing && (length(FUN(x,x)) != length(x)) ) {
warning("length(FUN(x,x)) do not match length(x)")
}
if (!fmissing) FUN <- match.fun(FUN)
for (i in seq_along(y) ) {
if (fmissing) {
mmm <- rbind(x, y[i])
} else {
idx <- which(FUN(x, y[i]) ) # idx is possible empty
if (length(idx) > 0) mmm <- rbind(x[idx], y[i]) else mmm <- c()
}
rrr[length(rrr) + seq_along(mmm)] <- mmm
}
if (length(rrr) == 0) rrr <- matrix(0, nrow = 0, ncol = 0)
return(matrix(rrr, ncol=2, byrow=TRUE))
}
## system.time(jjj <- cjf(seq(1e4), FUN = function(a, b) a>b & (a - b) %% 7L == 0) )
## expr min lq mean median uq max neval
## 0) 1.777018 1.797532 1.806776 1.805949 1.813646 1.89837 100
## Examples.
f_gt <- function(a,b){return( a > b) }
x_1 <- cjf(letters[1:5])
x_2 <- cjf(0:1, letters[1:5])
x_3a <- cjf(0:1, letters[1:5], FUN = function(a,b) a == a)
x_3b <- cjf(0:1, letters[1:5], FUN = function(a,b) rep(TRUE, length(a)))
x_3c <- cjf(0:1, letters[1:5], FUN = function(a,b) a != a)
x_3d <- cjf(0:1, letters[1:5], FUN = function(a,b) rep(FALSE, length(a)))
x_3e <- cjf(1:2+1i, 0:5, FUN = function(a,b) Mod(a) > Mod(b))
x_4a <- cjf(letters[1:5], FUN = f_gt)
x_4b <- cjf(letters[1:5], FUN = `>`)
x_4c <- cjf(letters[1:5], FUN = function(a,b) a > b)
x_4d <- cjf(as.double(seq(1E4)), FUN = function(a,b){return( abs(a*a*a - b) <= .Machine$double.eps) } )
x_5 <- cjf(list("a", "b", "c"))
Upvotes: 0
Reputation: 196
The main reason your solution is so slow is because of the line cp <- c( cp, list(c(d1, d2)))
. This is a very inefficient way to grow an object because it results in the object being copied with each c()
call.
If you instead insert into a list you will see substantially better performance. We can make a couple of other small optimisations:
FUN
is missing outside of the main loop so that we just need to do it once.FUN
is missing, since we know the length of the output for this case.outer_join <- function(x, y = x, FUN) {
fmissing <- missing(FUN)
if (fmissing) {
cp <- vector("list", length(x) * length(y))
} else {
cp <- list()
}
i <- 1L
for (d1 in x) {
for (d2 in y) {
if (fmissing || FUN(d1, d2)) {
cp[[i]] <- c(d1, d2)
i <- i + 1L
}
}
}
cp
}
microbenchmark::microbenchmark(
`Ex. 1` = outer_join(seq(2^8)),
`Ex. 2` = outer_join(seq(2^8), FUN = `==`),
`Ex. 3` = outer_join(seq(2^8), FUN = function(a, b) (a - b) %% 7L == 0),
times = 10,
unit = "s"
)
#> Unit: seconds
#> expr min lq mean median uq max neval
#> Ex. 1 0.02300627 0.02473937 0.02787098 0.02566033 0.03057122 0.03753821 10
#> Ex. 2 0.01391696 0.01527710 0.01785506 0.01735052 0.01916601 0.02490142 10
#> Ex. 3 0.05839193 0.06460381 0.07189763 0.07218238 0.08215803 0.08275439 10
Also see this chapter from Hadley Wickham's Advanced R for a discussion of issues with growing objects, as well as Chapter 2 of R Inferno. My experience is that R is not as slow at loops as it is reputed to be, as long as you avoid growing objects inefficiently.
Upvotes: 1
Reputation: 28675
If your FUN
is something that can be translated to sql by dbplyr, you could use duckdb + dbplyr and supply FUN
as an expression rather than an actual function. This
library(duckdb)
#> Loading required package: DBI
library(dplyr, warn = FALSE)
library(dbplyr, warn = FALSE)
outer_join <- function(x, y, FUN, show_query = FALSE){
if (missing(y)) y <- x
con <- dbConnect(duckdb(), dbdir = ':memory:')
dbWriteTable(con, 'x', tibble(x))
dbWriteTable(con, 'y', tibble(y))
x_tbl <- tbl(con, 'x')
y_tbl <- tbl(con, 'y')
out <-
x_tbl %>%
inner_join(y_tbl, sql_on = '1 = 1') %>%
filter({{ FUN }}) %>%
{if (show_query) show_query(.) else .} %>%
collect
dbDisconnect(con)
out
}
Example:
outer_join(seq(2^8), FUN = x == y, show_query = TRUE)
#> <SQL>
#> SELECT *
#> FROM (
#> SELECT "x", "y"
#> FROM "x" AS "LHS"
#> INNER JOIN "y" AS "RHS"
#> ON (1 = 1)
#> ) "q01"
#> WHERE ("x" = "y")
#> # A tibble: 256 × 2
#> x y
#> <int> <int>
#> 1 1 1
#> 2 2 2
#> 3 3 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
#> 7 7 7
#> 8 8 8
#> 9 9 9
#> 10 10 10
#> # … with 246 more rows
Benchmark (note the memory allocation):
expand_oj <- function(x, y, FUN = `==`) {
if (missing(y)) {y = x}
subset(expand.grid(x = x, y = y), FUN(x, y))
}
x <- seq(2^8)
bench::mark(
duck = outer_join(x, FUN = x == y),
expand = expand_oj(x, FUN = `==`),
check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 duck 53.31ms 57.79ms 17.3 266.34KB 21.1
#> 2 expand 1.22ms 1.55ms 461. 3.08MB 29.9
x <- seq(2^13)
bench::mark(
duck = outer_join(x, FUN = x == y),
expand = expand_oj(x, FUN = `==`),
check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 duck 50.69ms 53.34ms 18.2 359KB 5.46
#> 2 expand 1.44s 1.44s 0.693 3GB 2.77
Created on 2022-09-05 with reprex v2.0.2
Or use {sqldf}
library(sqldf)
#> Loading required package: gsubfn
#> Loading required package: proto
#> Loading required package: RSQLite
use_sqldf <- function(x, y = x) {
df_x <- data.frame(x)
df_y <- data.frame(y)
sqldf('
select
*
from
df_x join df_y on 1 = 1
where
x = y
')
}
x <- seq(2^13)
bench::mark(
duck = outer_join(x, FUN = x == y),
expand = expand_oj(x, FUN = `==`),
use_sqldf = use_sqldf(x),
check = function(a, b) all(a == b)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 duck 74.16ms 76.79ms 12.6 381.86KB 3.59
#> 2 expand 1.43s 1.43s 0.702 3GB 2.81
#> 3 use_sqldf 36.01ms 37.93ms 24.0 1.69MB 2.00
Upvotes: 2
Reputation: 35554
You could achieve it with expand.grid
+ subset
:
outer_join <- function(x, y, FUN = `==`) {
if (missing(y)) {y = x}
subset(expand.grid(x = x, y = y), FUN(x, y))
}
system.time(res1 <- outer_join(seq(2^8)))
# user system elapsed
# 0.005 0.001 0.005
system.time(res2 <- outer_join(seq(2^8), FUN = function(a, b){ return( a == b) }))
# user system elapsed
# 0.003 0.000 0.004
system.time(res3 <- outer_join(seq(2^8), FUN = function(a, b, n = 7){ return( ((a-b) %% n) == 0) }))
# user system elapsed
# 0.007 0.001 0.007
all.equal(res1, res2)
# [1] TRUE
res3
# x y
# 1 1 1
# 8 8 1
# 15 15 1
# 22 22 1
# 29 29 1
# 36 36 1
# etc.
Upvotes: 4