Reputation: 932
I want to apply a custom function to all pairs of columns in a dataframe to get a p x p matrix/dataframe of the results. Is there a quick way to do that in the tidyverse?
The output should be the results
data frame.
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
result <- tibble(cols = c("x","y","z"),
x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))
result
Upvotes: 3
Views: 558
Reputation: 3535
One idea:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = custom_function(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here it is organized as a function:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here are some benchmarking data for the various options. The tidyverse approach offered in the accepted answer is not a good one if performance is at all a concern. The fastest option here is the sapply
-based one offered in a comment to the question.
library(tidyverse)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
get_data <- function() {
data <- lapply(letters, function(i) rnorm(1000))
names(data) <- letters
as_tibble(data)
}
custom_summ <- function(df, f) {
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
data <- get_data()
system.time(custom_summ(data, custom_function))
#> user system elapsed
#> 0.053 0.007 0.062
custom_summ_2 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
}
system.time(custom_summ_2(data, custom_function))
#> user system elapsed
#> 26.479 0.317 27.365
custom_summ_3 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
}
system.time(custom_summ_3(data, custom_function))
#> user system elapsed
#> 0.048 0.001 0.049
custom_summ_4 <- function(data, f) {
sapply(data, function(y) sapply(data, f, y = y))
}
system.time(custom_summ_4(data, custom_function))
#> user system elapsed
#> 0.003 0.000 0.003
custom_summ_5 <- function(data, f) {
outer(names(data), names(data),
FUN = Vectorize(function(x, y) f (data[x], data[y])))
}
system.time(custom_summ_5(data, custom_function))
#> user system elapsed
#> 0.044 0.001 0.045
Created on 2021-07-11 by the reprex package (v2.0.0)
Upvotes: 3
Reputation: 26238
We could have used outer
directly if the custom_function
was a vectorized one. But it is using sum
which is an scalar function so we can use it by wrapping it around Vectorize()
in FUN =
argument in outer
. Do it like this-
outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))
tidyverse strategy Though a little verbose but you can manage this approach in tidyverse
, if you want.
library(tidyverse)
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
#> x y z
#> x -0.3591433 2.157343 -1.470995
#> y 2.1573430 4.673829 1.045491
#> z -1.4709953 1.045491 -2.582847
Created on 2021-07-10 by the reprex package (v2.0.0)
Upvotes: 2
Reputation: 21938
You can use the following solution:
library(dplyr)
library(tibble)
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = unique("Var1"), values_from = "Res") %>%
column_to_rownames("Var2")
x y z
x -0.3591433 2.157343 -1.470995
y 2.1573430 4.673829 1.045491
z -1.4709953 1.045491 -2.582847
Upvotes: 3