split or group_split dataframe by multiple columns at once

Question

I'm sure this has been asked but can't find what I'm looking for. What is the best way to split (or group_split) over multiple columns at the same time but separately.

Given:

library(tidyverse)
#example data
set.seed(1)
df <- data.frame( group = c(rep("A",3), rep("B", 6)),
                  test_value = c(0,1,2, 0,1,2,3,4,5),
                  ff = rnorm(9, 0, 1),
                  dd = c(rep("C", 5), rep("D", 4)))
df
#   group test_value         ff dd
# 1     A          0 -0.6264538  C
# 2     A          1  0.1836433  C
# 3     A          2 -0.8356286  C
# 4     B          0  1.5952808  C
# 5     B          1  0.3295078  C
# 6     B          2 -0.8204684  D
# 7     B          3  0.4874291  D
# 8     B          4  0.7383247  D
# 9     B          5  0.5757814  D

I want to, for example summarise by group and then by dd using group_split:

df %>% 
  group_split(group) %>% 
  map(~.x %>% summarise_at(vars(ff), list(~mean(.x, na.rm = TRUE), 
                                   ~median(.x, na.rm = TRUE)))) 
#OR split
#df %>% 
#  split(.$group)) %>% 
#  map(~.x %>% summarise_at(vars(ff), list(~mean(.x, na.rm = TRUE), 
#                                   ~median(.x, na.rm = TRUE)))) 

# [[1]]
# # A tibble: 1 x 2
#     mean median
#      
# 1 -0.426 -0.626

# [[2]]
# # A tibble: 1 x 2
#    mean median
#     
# 1 0.484  0.532

#then by dd
df %>% 
  group_split(dd) %>% 
  map(~.x %>% summarise_at(vars(ff), list(~mean(.x, na.rm = TRUE), 
                                          ~median(.x, na.rm = TRUE)))) 
# [[1]]
# # A tibble: 1 x 2
#    mean median
#     
# 1 0.129  0.184

# [[2]]
# # A tibble: 1 x 2
#    mean median
#     
# 1 0.245  0.532

But I want a flexible way to do this in one call. Adding both variables into split or group_split groups the combination of group and dd which is not what I want.

df %>% 
  split(list(.$group, .$dd)) %>% 
  map(~.x %>% summarise_at(vars(ff), list(~mean(.x, na.rm = TRUE), 
                                          ~median(.x, na.rm = TRUE)))) 
#OR
df %>% 
  group_split(group, dd) %>% 
  map(~.x %>% summarise_at(vars(ff), list(~mean(.x, na.rm = TRUE), 
                                          ~median(.x, na.rm = TRUE))))

Any suggestions? Would be nice to have the grouping variables printed at each output as a bonus!

thanks

akrun · Accepted Answer

If we want to do this separately, then loop over the names

library(dplyr)
library(purrr)
map(c("group", "dd"), ~ df %>%
    split(.[.x]) %>% 
     map(~ .x %>% 
     summarise_at(vars(ff), 
      list(~mean(.x, na.rm = TRUE), 
                                           ~median(.x, na.rm = TRUE)))) )

-output

[[1]]
[[1]]$A
        mean     median
1 -0.4261464 -0.6264538

[[1]]$B
       mean    median
1 0.4843092 0.5316052


[[2]]
[[2]]$C
       mean    median
1 0.1292699 0.1836433

[[2]]$D
       mean    median
1 0.2452667 0.5316052

Or using group_split

out <- map(c("group", "dd"), ~ df %>% 
           group_split(across(all_of(.x))) %>%
           map(~ .x %>% 
           summarise_at(vars(ff), 
                   list(~mean(.x, na.rm = TRUE), 
                        ~median(.x, na.rm = TRUE)))) )
names(out) <- c("group", "dd")

-output

> out
$group
$group[[1]]
# A tibble: 1 x 2
    mean median
     
1 -0.426 -0.626

$group[[2]]
# A tibble: 1 x 2
   mean median
    
1 0.484  0.532


$dd
$dd[[1]]
# A tibble: 1 x 2
   mean median
    
1 0.129  0.184

$dd[[2]]
# A tibble: 1 x 2
   mean median
    
1 0.245  0.532

split or group_split dataframe by multiple columns at once

Answers (1)

Related Questions