zimia
zimia

Reputation: 932

Apply a custom function to pairs of columns in a dataframe

I want to apply a custom function to all pairs of columns in a dataframe to get a p x p matrix/dataframe of the results. Is there a quick way to do that in the tidyverse?

The output should be the results data frame.

custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

result <- tibble(cols = c("x","y","z"), 
                 x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
                 y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
                 z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))

result

Upvotes: 3

Views: 558

Answers (3)

Ian Gow
Ian Gow

Reputation: 3535

One idea:

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

data_long <-
  data %>%
  mutate(id = 1:nrow(.)) %>%
  tidyr::pivot_longer(cols = -id)

result <-  
  data_long %>%
  inner_join(data_long, by = "id") %>%
  group_by(name.x, name.y) %>%
  summarize(value = custom_function(value.x, value.y),
            .groups = "drop") %>%
  tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
  rename(cols = name.y)

result
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

Created on 2021-07-10 by the reprex package (v2.0.0)

And here it is organized as a function:

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    tidyr::pivot_longer(cols = -id)

  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

Created on 2021-07-10 by the reprex package (v2.0.0)

And here are some benchmarking data for the various options. The tidyverse approach offered in the accepted answer is not a good one if performance is at all a concern. The fastest option here is the sapply-based one offered in a comment to the question.

library(tidyverse)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)

get_data <- function() {
  data <- lapply(letters, function(i) rnorm(1000))
  names(data) <- letters
  as_tibble(data)
}

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    pivot_longer(cols = -id)
  
  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

data <- get_data()

system.time(custom_summ(data, custom_function))
#>    user  system elapsed 
#>   0.053   0.007   0.062

custom_summ_2 <- function(data, f) {
  expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
}

system.time(custom_summ_2(data, custom_function))
#>    user  system elapsed 
#>  26.479   0.317  27.365

custom_summ_3 <- function(data, f) {
  expand.grid(names(data), names(data)) %>%
    rowwise() %>%
    mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
    pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
    column_to_rownames("Var2")
}

system.time(custom_summ_3(data, custom_function))
#>    user  system elapsed 
#>   0.048   0.001   0.049

custom_summ_4 <- function(data, f) {
  sapply(data, function(y) sapply(data, f, y = y))
}

system.time(custom_summ_4(data, custom_function))
#>    user  system elapsed 
#>   0.003   0.000   0.003

custom_summ_5 <- function(data, f) {
  outer(names(data), names(data), 
        FUN = Vectorize(function(x, y) f (data[x], data[y])))
}

system.time(custom_summ_5(data, custom_function))
#>    user  system elapsed 
#>   0.044   0.001   0.045

Created on 2021-07-11 by the reprex package (v2.0.0)

Upvotes: 3

AnilGoyal
AnilGoyal

Reputation: 26238

We could have used outer directly if the custom_function was a vectorized one. But it is using sum which is an scalar function so we can use it by wrapping it around Vectorize() in FUN = argument in outer. Do it like this-

outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))


tidyverse strategy Though a little verbose but you can manage this approach in tidyverse, if you want.

library(tidyverse)
custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
#>            x        y         z
#> x -0.3591433 2.157343 -1.470995
#> y  2.1573430 4.673829  1.045491
#> z -1.4709953 1.045491 -2.582847

Created on 2021-07-10 by the reprex package (v2.0.0)

Upvotes: 2

Anoushiravan R
Anoushiravan R

Reputation: 21938

You can use the following solution:

library(dplyr)
library(tibble)

expand.grid(names(data), names(data)) %>%
  rowwise() %>%
  mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
  pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
  column_to_rownames("Var2")

           x        y         z
x -0.3591433 2.157343 -1.470995
y  2.1573430 4.673829  1.045491
z -1.4709953 1.045491 -2.582847

Upvotes: 3

Related Questions