James
James

Reputation: 289

How to get quantiles to work with summarise_at and group_by (dplyr)

When using dplyr to create a table of summary statistics that is organized by levels of a variable, I cannot figure out the syntax for calculating quartiles without having to repeat the column name. That is, using calls, such as vars() and list() work with other functions, such as mean() and median() but not with quantile()

Searches have produced antiquated solutions that no longer work because they use deprecated calls, such as do() and/or funs().

data(iris)
library(tidyverse)

#This works: Notice I have not attempted to calculate quartiles yet
summary_stat <- iris %>% 
  group_by(Species) %>% 
  summarise_at(vars(Sepal.Length), 
               list(min=min, median=median, max=max,
               mean=mean, sd=sd)
               )
A tibble: 3 x 6
  Species      min median   max  mean    sd
  <fct>      <dbl>  <dbl> <dbl> <dbl> <dbl>
1 setosa       4.3    5     5.8  5.01 0.352
2 versicolor   4.9    5.9   7    5.94 0.516
3 virginica    4.9    6.5   7.9  6.59 0.636

##########################################################################
#Does NOT work:
five_number_summary <- iris %>% 
  group_by(Species) %>% 
  summarise_at(vars(Sepal.Length),
               list(min=min, Q1=quantile(.,probs = 0.25),
                    median=median, Q3=quantile(., probs = 0.75),
                    max=max))

Error: Must use a vector in `[`, not an object of class matrix.
Call `rlang::last_error()` to see a backtrace

###########################################################################
#This works: Remove the vars() argument, remove the list() argument,
  #replace summarise_at() with summarise()
  #but the code requires repeating the column name (Sepal.Length)

five_number_summary <- iris %>% 
  group_by(Species) %>% 
  summarise(min=min(Sepal.Length), 
            Q1=quantile(Sepal.Length,probs = 0.25),
            median=median(Sepal.Length), 
            Q3=quantile(Sepal.Length, probs = 0.75),
            max=max(Sepal.Length))

# A tibble: 3 x 6
  Species      min    Q1 median    Q3   max
  <fct>      <dbl> <dbl>  <dbl> <dbl> <dbl>
1 setosa       4.3  4.8     5     5.2   5.8
2 versicolor   4.9  5.6     5.9   6.3   7  
3 virginica    4.9  6.22    6.5   6.9   7.9

This last piece of code produces exactly what I am looking for, but I am wondering why there isn't a shorter syntax that doesn't force me to repeat the variable.

Upvotes: 14

Views: 19315

Answers (3)

IceCreamToucan
IceCreamToucan

Reputation: 28695

You can create a data frame / tibble within summarise which is then spliced into separate columns.

library(tidyverse)

iris %>% 
  group_by(Species) %>% 
  summarise(as_tibble_row(quantile(Sepal.Length)))
#> # A tibble: 3 × 6
#>   Species     `0%` `25%` `50%` `75%` `100%`
#>   <fct>      <dbl> <dbl> <dbl> <dbl>  <dbl>
#> 1 setosa       4.3  4.8    5     5.2    5.8
#> 2 versicolor   4.9  5.6    5.9   6.3    7  
#> 3 virginica    4.9  6.22   6.5   6.9    7.9

# with .name_repair
iris %>% 
  group_by(Species) %>% 
  summarise(as_tibble_row(quantile(Sepal.Length), .name_repair = \(x) paste0('q', parse_number(x))))
#> # A tibble: 3 × 6
#>   Species       q0   q25   q50   q75  q100
#>   <fct>      <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 setosa       4.3  4.8    5     5.2   5.8
#> 2 versicolor   4.9  5.6    5.9   6.3   7  
#> 3 virginica    4.9  6.22   6.5   6.9   7.9

Created on 2023-12-30 with reprex v2.0.2

You can create a list column and then use unnest_wider, which requires tidyr 1.0.0

library(tidyverse)

iris %>% 
  group_by(Species) %>% 
  summarise(q = list(quantile(Sepal.Length))) %>% 
  unnest_wider(q)

# # A tibble: 3 x 6
#   Species     `0%` `25%` `50%` `75%` `100%`
#   <fct>      <dbl> <dbl> <dbl> <dbl>  <dbl>
# 1 setosa       4.3  4.8    5     5.2    5.8
# 2 versicolor   4.9  5.6    5.9   6.3    7  
# 3 virginica    4.9  6.22   6.5   6.9    7.9

There's a names_repair argument, but apparently that changes the name of all the columns, and not just the ones being unnested (??)

iris %>% 
  group_by(Species) %>% 
  summarise(q = list(quantile(Sepal.Length))) %>% 
  unnest_wider(q, names_repair = ~paste0('Q_', sub('%', '', .)))

# # A tibble: 3 x 6
#   Q_Species    Q_0  Q_25  Q_50  Q_75 Q_100
#   <fct>      <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 setosa       4.3  4.8    5     5.2   5.8
# 2 versicolor   4.9  5.6    5.9   6.3   7  
# 3 virginica    4.9  6.22   6.5   6.9   7.9

Another option is group_modify

iris %>% 
  group_by(Species) %>% 
  group_modify(~as.data.frame(t(quantile(.$Sepal.Length))))

# # A tibble: 3 x 6
# # Groups:   Species [3]
#   Species     `0%` `25%` `50%` `75%` `100%`
#   <fct>      <dbl> <dbl> <dbl> <dbl>  <dbl>
# 1 setosa       4.3  4.8    5     5.2    5.8
# 2 versicolor   4.9  5.6    5.9   6.3    7  
# 3 virginica    4.9  6.22   6.5   6.9    7.9

Or you could use data.table

library(data.table)
irisdt <- as.data.table(iris)

irisdt[, as.list(quantile(Sepal.Length)), Species]
#       Species  0%   25% 50% 75% 100%
# 1:     setosa 4.3 4.800 5.0 5.2  5.8
# 2: versicolor 4.9 5.600 5.9 6.3  7.0
# 3:  virginica 4.9 6.225 6.5 6.9  7.9

Upvotes: 16

Harlan Nelson
Harlan Nelson

Reputation: 1502

A note about a more up-to-date version of @arienrhod

library(dplyr,quietly = TRUE,verbose = FALSE, warn.conflicts = FALSE)
five_number_summary <- iris %>% 
  group_by(Species) %>% 
  summarise(across(Sepal.Length, list(min=min, Q1=~quantile(., probs = 0.25),
                    median=median, Q3=~quantile(., probs = 0.75),
                    max=max),  .names = "{.fn}"))
five_number_summary
#> # A tibble: 3 x 6
#>   Species      min    Q1 median    Q3   max
#>   <fct>      <dbl> <dbl>  <dbl> <dbl> <dbl>
#> 1 setosa       4.3  4.8     5     5.2   5.8
#> 2 versicolor   4.9  5.6     5.9   6.3   7  
#> 3 virginica    4.9  6.22    6.5   6.9   7.9

Created on 2022-02-21 by the reprex package (v2.0.1)

Upvotes: 5

Arienrhod
Arienrhod

Reputation: 2581

You're missing the ~ in front of the quantile function in the summarise_at call that failed. Try the following:

five_number_summary <- iris %>% 
  group_by(Species) %>% 
  summarise_at(vars(Sepal.Length),
               list(min=min, Q1=~quantile(., probs = 0.25),
                    median=median, Q3=~quantile(., probs = 0.75),
                    max=max))
five_number_summary
# A tibble: 3 x 6
  Species      min    Q1 median    Q3   max
  <fct>      <dbl> <dbl>  <dbl> <dbl> <dbl>
1 setosa       4.3  4.8     5     5.2   5.8
2 versicolor   4.9  5.6     5.9   6.3   7  
3 virginica    4.9  6.22    6.5   6.9   7.9

Upvotes: 12

Related Questions