How to do retain names in aggregate by variables

Question

Here example and self-explanatory case with the desired solution:

 set.seed(123)
 df <- data.frame(s=rnorm(100), g1=seq(1,100,,100),g2=seq(5,100,,100),g3=seq(10,1,,100))
 agg <- with(df, aggregate(s, data.frame(g1,g2,g3), 
                 FUN = function(x) c("mean" = mean(x), "median" = median(x))))
 head(agg,5)

       g1        g2       g3     x.mean   x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209
2  99  99.04040 1.090909 -0.2357004 -0.2357004
3  98  98.08081 1.181818  1.5326106  1.5326106
4  97  97.12121 1.272727  2.1873330  2.1873330
5  96  96.16162 1.363636 -0.6002596 -0.6002596

Desired results (the -xxx are just the numerical values calculated or each by group):

  g1   g2       g3        g1.x.mean  g1.x.median g2.x.mean  g2.x.median g3.x.mean  g3.x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209   -xxx       -xxxx        -xxxx       -xxxx
2  99  99.04040 1.090909 -0.2357004 -0.2357004   -xxx       -xxxx        -xxxx       -xxxx
3  98  98.08081 1.181818  1.5326106  1.5326106   -xxx       -xxxx        -xxxx       -xxxx
4  97  97.12121 1.272727  2.1873330  2.1873330   -xxx       -xxxx        -xxxx       -xxxx
5  96  96.16162 1.363636 -0.6002596 -0.6002596   -xxx       -xxxx        -xxxx       -xxxx

akrun · Accepted Answer

If the grouping should be separate, we can use lapply to loop over the 'g' columns, apply the aggregate separately and combine them with cbind

nm1 <- paste0('g', 1:3)
out1 <- do.call(cbind, lapply(nm1, function(g)       {
   out <- do.call(data.frame, aggregate(s ~ .,  df[c('s', g)],
        FUN = function(x) c(mean(x), median(x))))
    names(out)[-1] <- paste0(g, c('.x.mean', '.x.median'))
   out}))

 out1 <-  out1[c(nm1, setdiff(names(out1), nm1))]

-output

head(out1)
#  g1       g2       g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median  g3.x.mean g3.x.median
#1  1 5.000000 1.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -1.0264209  -1.0264209
#2  2 5.959596 1.090909 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.2357004  -0.2357004
#3  3 6.919192 1.181818  1.55870831  1.55870831  1.55870831  1.55870831  1.5326106   1.5326106
#4  4 7.878788 1.272727  0.07050839  0.07050839  0.07050839  0.07050839  2.1873330   2.1873330
#5  5 8.838384 1.363636  0.12928774  0.12928774  0.12928774  0.12928774 -0.6002596  -0.6002596
#6  6 9.797980 1.454545  1.71506499  1.71506499  1.71506499  1.71506499  1.3606524   1.3606524

Or as mentioned in the tidyverse, if the number of unique groups are not same for each of them, an approach is with ave to create columns and then apply unique

out1 <- unique(cbind(df[nm1], do.call(cbind, lapply(nm1, function(g) {
                 mean <- with(df, ave(s, df[[g]]))
                 median <- with(df, ave(s, df[[g]], FUN = median))
                 setNames(data.frame(mean, median),
                       paste0(g, c('.x.mean', '.x.median')))
                       
                       }))))

-output

head(out1)
#  g1       g2        g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median   g3.x.mean g3.x.median
#1  1 5.000000 10.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565
#2  2 5.959596  9.909091 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749
#3  3 6.919192  9.818182  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831
#4  4 7.878788  9.727273  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839
#5  5 8.838384  9.636364  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774
#6  6 9.797980  9.545455  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499

Or using tidyerse

library(dplyr) # version >= 1.0
library(purrr)
library(stringr)
map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      summarise(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s), .groups = 'drop')) %>% 
    select(all_of(nm1), everything())

-output

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#                                         
# 1     1  5     1      -0.560      -0.560    -0.560      -0.560     -1.03       -1.03 
# 2     2  5.96  1.09   -0.230      -0.230    -0.230      -0.230     -0.236      -0.236
# 3     3  6.92  1.18    1.56        1.56      1.56        1.56       1.53        1.53 
# 4     4  7.88  1.27    0.0705      0.0705    0.0705      0.0705     2.19        2.19 
# 5     5  8.84  1.36    0.129       0.129     0.129       0.129     -0.600      -0.600
# 6     6  9.80  1.45    1.72        1.72      1.72        1.72       1.36        1.36 
# 7     7 10.8   1.55    0.461       0.461     0.461       0.461     -0.628      -0.628
# 8     8 11.7   1.64   -1.27       -1.27     -1.27       -1.27       0.239       0.239
# 9     9 12.7   1.73   -0.687      -0.687    -0.687      -0.687      0.548       0.548
#10    10 13.6   1.82   -0.446      -0.446    -0.446      -0.446      0.994       0.994
# … with 90 more rows

If the grouping columns have different number of unique elements, an option would be to mutate/transmute new columns and then take the distinct at the end

map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      transmute(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s))) %>% 
    select(all_of(nm1), everything()) %>%
    ungroup %>%
    distinct

-output

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#                                         
# 1     1  5    10      -0.560      -0.560    -0.560      -0.560    -0.560      -0.560 
# 2     2  5.96  9.91   -0.230      -0.230    -0.230      -0.230    -0.230      -0.230 
# 3     3  6.92  9.82    1.56        1.56      1.56        1.56      1.56        1.56  
# 4     4  7.88  9.73    0.0705      0.0705    0.0705      0.0705    0.0705      0.0705
# 5     5  8.84  9.64    0.129       0.129     0.129       0.129     0.129       0.129 
# 6     6  9.80  9.55    1.72        1.72      1.72        1.72      1.72        1.72  
# 7     7 10.8   9.45    0.461       0.461     0.461       0.461     0.461       0.461 
# 8     8 11.7   9.36   -1.27       -1.27     -1.27       -1.27     -1.27       -1.27  
# 9     9 12.7   9.27   -0.687      -0.687    -0.687      -0.687    -0.687      -0.687 
#10    10 13.6   9.18   -0.446      -0.446    -0.446      -0.446    -0.446      -0.446 
# … with 90 more rows

How to do retain names in aggregate by variables

Answers (2)

Related Questions