Maximilian
Maximilian

Reputation: 4229

How to do retain names in aggregate by variables

Here example and self-explanatory case with the desired solution:

 set.seed(123)
 df <- data.frame(s=rnorm(100), g1=seq(1,100,,100),g2=seq(5,100,,100),g3=seq(10,1,,100))
 agg <- with(df, aggregate(s, data.frame(g1,g2,g3), 
                 FUN = function(x) c("mean" = mean(x), "median" = median(x))))
 head(agg,5)

       g1        g2       g3     x.mean   x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209
2  99  99.04040 1.090909 -0.2357004 -0.2357004
3  98  98.08081 1.181818  1.5326106  1.5326106
4  97  97.12121 1.272727  2.1873330  2.1873330
5  96  96.16162 1.363636 -0.6002596 -0.6002596

Desired results (the -xxx are just the numerical values calculated or each by group):

  g1   g2       g3        g1.x.mean  g1.x.median g2.x.mean  g2.x.median g3.x.mean  g3.x.median
1 100 100.00000 1.000000 -1.0264209 -1.0264209   -xxx       -xxxx        -xxxx       -xxxx
2  99  99.04040 1.090909 -0.2357004 -0.2357004   -xxx       -xxxx        -xxxx       -xxxx
3  98  98.08081 1.181818  1.5326106  1.5326106   -xxx       -xxxx        -xxxx       -xxxx
4  97  97.12121 1.272727  2.1873330  2.1873330   -xxx       -xxxx        -xxxx       -xxxx
5  96  96.16162 1.363636 -0.6002596 -0.6002596   -xxx       -xxxx        -xxxx       -xxxx

Upvotes: 1

Views: 54

Answers (2)

akrun
akrun

Reputation: 887651

If the grouping should be separate, we can use lapply to loop over the 'g' columns, apply the aggregate separately and combine them with cbind

nm1 <- paste0('g', 1:3)
out1 <- do.call(cbind, lapply(nm1, function(g)       {
   out <- do.call(data.frame, aggregate(s ~ .,  df[c('s', g)],
        FUN = function(x) c(mean(x), median(x))))
    names(out)[-1] <- paste0(g, c('.x.mean', '.x.median'))
   out}))

 out1 <-  out1[c(nm1, setdiff(names(out1), nm1))]

-output

head(out1)
#  g1       g2       g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median  g3.x.mean g3.x.median
#1  1 5.000000 1.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -1.0264209  -1.0264209
#2  2 5.959596 1.090909 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.2357004  -0.2357004
#3  3 6.919192 1.181818  1.55870831  1.55870831  1.55870831  1.55870831  1.5326106   1.5326106
#4  4 7.878788 1.272727  0.07050839  0.07050839  0.07050839  0.07050839  2.1873330   2.1873330
#5  5 8.838384 1.363636  0.12928774  0.12928774  0.12928774  0.12928774 -0.6002596  -0.6002596
#6  6 9.797980 1.454545  1.71506499  1.71506499  1.71506499  1.71506499  1.3606524   1.3606524

Or as mentioned in the tidyverse, if the number of unique groups are not same for each of them, an approach is with ave to create columns and then apply unique

out1 <- unique(cbind(df[nm1], do.call(cbind, lapply(nm1, function(g) {
                 mean <- with(df, ave(s, df[[g]]))
                 median <- with(df, ave(s, df[[g]], FUN = median))
                 setNames(data.frame(mean, median),
                       paste0(g, c('.x.mean', '.x.median')))
                       
                       }))))

-output

head(out1)
#  g1       g2        g3   g1.x.mean g1.x.median   g2.x.mean g2.x.median   g3.x.mean g3.x.median
#1  1 5.000000 10.000000 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565 -0.56047565
#2  2 5.959596  9.909091 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749 -0.23017749
#3  3 6.919192  9.818182  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831  1.55870831
#4  4 7.878788  9.727273  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839  0.07050839
#5  5 8.838384  9.636364  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774  0.12928774
#6  6 9.797980  9.545455  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499  1.71506499                           

Or using tidyerse

library(dplyr) # version >= 1.0
library(purrr)
library(stringr)
map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      summarise(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s), .groups = 'drop')) %>% 
    select(all_of(nm1), everything())

-output

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#   <dbl> <dbl> <dbl>     <dbl>       <dbl>     <dbl>       <dbl>     <dbl>       <dbl>
# 1     1  5     1      -0.560      -0.560    -0.560      -0.560     -1.03       -1.03 
# 2     2  5.96  1.09   -0.230      -0.230    -0.230      -0.230     -0.236      -0.236
# 3     3  6.92  1.18    1.56        1.56      1.56        1.56       1.53        1.53 
# 4     4  7.88  1.27    0.0705      0.0705    0.0705      0.0705     2.19        2.19 
# 5     5  8.84  1.36    0.129       0.129     0.129       0.129     -0.600      -0.600
# 6     6  9.80  1.45    1.72        1.72      1.72        1.72       1.36        1.36 
# 7     7 10.8   1.55    0.461       0.461     0.461       0.461     -0.628      -0.628
# 8     8 11.7   1.64   -1.27       -1.27     -1.27       -1.27       0.239       0.239
# 9     9 12.7   1.73   -0.687      -0.687    -0.687      -0.687      0.548       0.548
#10    10 13.6   1.82   -0.446      -0.446    -0.446      -0.446      0.994       0.994
# … with 90 more rows

If the grouping columns have different number of unique elements, an option would be to mutate/transmute new columns and then take the distinct at the end

map_dfc(nm1, ~ df %>% 
      group_by(across(all_of(.x))) %>%
      transmute(!! str_c(.x, ".x.mean") := mean(s),
        !! str_c(.x, ".x.median") := median(s))) %>% 
    select(all_of(nm1), everything()) %>%
    ungroup %>%
    distinct

-output

# A tibble: 100 x 9
#      g1    g2    g3 g1.x.mean g1.x.median g2.x.mean g2.x.median g3.x.mean g3.x.median
#   <dbl> <dbl> <dbl>     <dbl>       <dbl>     <dbl>       <dbl>     <dbl>       <dbl>
# 1     1  5    10      -0.560      -0.560    -0.560      -0.560    -0.560      -0.560 
# 2     2  5.96  9.91   -0.230      -0.230    -0.230      -0.230    -0.230      -0.230 
# 3     3  6.92  9.82    1.56        1.56      1.56        1.56      1.56        1.56  
# 4     4  7.88  9.73    0.0705      0.0705    0.0705      0.0705    0.0705      0.0705
# 5     5  8.84  9.64    0.129       0.129     0.129       0.129     0.129       0.129 
# 6     6  9.80  9.55    1.72        1.72      1.72        1.72      1.72        1.72  
# 7     7 10.8   9.45    0.461       0.461     0.461       0.461     0.461       0.461 
# 8     8 11.7   9.36   -1.27       -1.27     -1.27       -1.27     -1.27       -1.27  
# 9     9 12.7   9.27   -0.687      -0.687    -0.687      -0.687    -0.687      -0.687 
#10    10 13.6   9.18   -0.446      -0.446    -0.446      -0.446    -0.446      -0.446 
# … with 90 more rows

Upvotes: 3

Duck
Duck

Reputation: 39613

It can also be done with dplyr:

library(dplyr)
#Code
new <- df %>% group_by(s) %>%
  summarise_all(.funs = list("x.mean"=mean,"x.median"=median))

Output:

# A tibble: 100 x 7
       s g1_x.mean g2_x.mean g3_x.mean g1_x.median g2_x.median g3_x.median
   <dbl>     <dbl>     <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
 1 -2.31        72      73.1      3.55          72        73.1        3.55
 2 -1.97        18      21.3      8.45          18        21.3        8.45
 3 -1.69        26      29.0      7.73          26        29.0        7.73
 4 -1.55        57      58.7      4.91          57        58.7        4.91
 5 -1.27        43      45.3      6.18          43        45.3        6.18
 6 -1.27         8      11.7      9.36           8        11.7        9.36
 7 -1.22        78      78.9      3             78        78.9        3   
 8 -1.14        29      31.9      7.45          29        31.9        7.45
 9 -1.12        46      48.2      5.91          46        48.2        5.91
10 -1.07        65      66.4      4.18          65        66.4        4.18
# ... with 90 more rows

Upvotes: 1

Related Questions