Reputation: 435
I am trying to generate a table of summary statistics using purrr
/tibble
methods. I am able to calculate group-wise mean (sd) and counts using the following:
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
unnest()
My immediate question is, how do I retain the column names as seen in spread(key = var_group, value = summary)
in the unnest()
-ed output?
edit: Thanks to all for the responses.
https://stackoverflow.com/a/55912326/5745045 has the advantages of being easier to read and not storing a temporary variable. A disadvantage is the change of numeric to character in the n
columns.
The final goal is to replace the column names with formatted text within the context of a grouped kable
table.
Upvotes: 2
Views: 1219
Reputation: 91
Here's another method that doesn't require creating a temporary variable. Instead of nesting the data at the end, I used gather()
and unite()
to restructure the data so that it ends up as one key and value pair.
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
#> Registered S3 method overwritten by 'rvest':
#> method from
#> read_xml.response xml2
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
gather(key, value, -var_group) %>%
unite(var_group_key, var_group, key) %>%
spread(var_group_key, value)
#> # A tibble: 1 x 16
#> `0_0_hp_mean_sd` `0_0_hp_n` `0_0_mpg_mean_s… `0_0_mpg_n` `0_1_hp_mean_sd`
#> <chr> <chr> <chr> <chr> <chr>
#> 1 194.167 (33.36) 12 15.05 (2.774) 12 180.833 (98.816)
#> # … with 11 more variables: `0_1_hp_n` <chr>, `0_1_mpg_mean_sd` <chr>,
#> # `0_1_mpg_n` <chr>, `1_0_hp_mean_sd` <chr>, `1_0_hp_n` <chr>,
#> # `1_0_mpg_mean_sd` <chr>, `1_0_mpg_n` <chr>, `1_1_hp_mean_sd` <chr>,
#> # `1_1_hp_n` <chr>, `1_1_mpg_mean_sd` <chr>, `1_1_mpg_n` <chr>
Created on 2019-04-29 by the reprex package (v0.2.1)
Upvotes: 1
Reputation: 29109
By storing the "nested" tibble
as a temporary variable1 and using its colnames
2, we can achieve what you desire. Look below;
mtcars %>%
gather(variable, value, -vs, -am) %>%
group_by(vs, am, variable) %>%
nest() %>%
filter(variable %in% c("mpg", "hp")) %>%
mutate(
mean = map_dbl(data, ~mean(.$value, na.rm = TRUE)),
sd = map_dbl(data, ~sd(.$value, na.rm = TRUE)),
n = map_dbl(data, ~sum(!is.na(.$value)))
) %>%
select(vs:variable, mean:n) %>%
mutate_at(vars(mean, sd), round, 3) %>%
mutate(mean_sd = paste0(mean, " (", sd, ")"),
var_group = paste(vs, am, variable, sep = "_")) %>%
select(n:var_group) %>%
nest(n, mean_sd, .key = "summary") %>%
spread(key = var_group, value = summary) %>%
#1: storing the temporary nested variable
{. ->> temptibble} %>%
unnest() %>%
#2: renaming the columns of unnested output and removing temporary variable
rename_all(funs(paste0(., "_", rep(colnames(temptibble), each=2)))); rm(temptibble)
# # A tibble: 1 x 16
# n_0_0_hp mean_sd_0_0_hp n1_0_0_mpg mean_sd1_0_0_mpg n2_0_1_hp mean_sd2_0_1_hp n3_0_1_mpg mean_sd3_0_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 12 194.167 (33.36) 12 15.05 (2.774) 6 180.833 (98.816) 6 19.75 (4.009)
# n4_1_0_hp mean_sd4_1_0_hp n5_1_0_mpg mean_sd5_1_0_mpg n6_1_1_hp mean_sd6_1_1_hp n7_1_1_mpg mean_sd7_1_1_mpg
# <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
# 1 7 102.143 (20.932) 7 20.743 (2.471) 7 80.571 (24.144) 7 28.371 (4.758)
Upvotes: 2