R O'Brien
R O'Brien

Reputation: 33

Means within each group with more than 1 column of group indices

I have a variable and I would like to obtain the means within each group where the group is listed for each observation in a column and I have many such columns. I would then like to associate the group means to the appropriate observation so that if I start with a matrix of m obs x n different groupings I obtain an m x n matrix of means. For example:

> var <- round(runif(10),digits=2)

> var
[1] 0.47 0.21 0.80 0.65 0.32 0.72 0.29 0.93 0.77 0.64
> groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
           sample(c(1,2,3), 10, replace=TRUE),
           sample(c(1,2,3,5), 10, replace=TRUE))
> groupings
      [,1] [,2] [,3]
 [1,]    3    1    5
 [2,]    1    1    5
 [3,]    2    1    5
 [4,]    3    2    3
 [5,]    2    3    1
 [6,]    1    1    1
 [7,]    2    3    1
 [8,]    1    2    1
 [9,]    3    1    5
[10,]    1    3    2

I can obtain the means within each group separately with the following (for example)

> means.1 <- sapply(split(var, groupings[,1]), function(x) mean(x))
> means.2 <- sapply(split(var, groupings[,2]), function(x) mean(x))
> means.3 <- sapply(split(var, groupings[,3]), function(x) mean(x))

> means.1
    1     2     3 
0.625 0.470 0.630 
> means.2
    1         2         3 
0.5940000 0.7900000 0.4166667 
> means.3
    1      2      3      5 
0.5650 0.6400 0.6500 0.5625 

But not only are these separate calls inefficient, they still don't get me what I want, which is the following

       [,1]      [,2]   [,3]
[1,]  0.630 0.5940000 0.5625
[2,]  0.625 0.5940000 0.5625
[3,]  0.470 0.5940000 0.5625
[4,]  0.630 0.7900000 0.6500
[5,]  0.470 0.4166667 0.5650
[6,]  0.625 0.5940000 0.5650
[7,]  0.470 0.4166667 0.5650
[8,]  0.625 0.7900000 0.5650
[9,]  0.630 0.5940000 0.5625
[10,] 0.625 0.4166667 0.6400

Upvotes: 2

Views: 44

Answers (3)

akuiper
akuiper

Reputation: 214957

Another option, you can use apply(because you already have a matrix) to loop through columns( with Margin set to 2) and pass the column to ave function as group variable, you can either explicitly specify FUN parameter to be mean or not specify it as mean is the default function used:

apply(groupings, 2, ave, x = var)  # pass the var as a named parameter since it is the 
                                   # parameter at the first position of ave function, if not
                                   # ave will treat the column as the first position parameter
                                   # which you don't want to

 #      [,1]      [,2]   [,3]
 #[1,] 0.630 0.5940000 0.5625
 #[2,] 0.625 0.5940000 0.5625
 #[3,] 0.470 0.5940000 0.5625
 #[4,] 0.630 0.7900000 0.6500
 #[5,] 0.470 0.4166667 0.5650
 #[6,] 0.625 0.5940000 0.5650
 #[7,] 0.470 0.4166667 0.5650
 #[8,] 0.625 0.7900000 0.5650
 #[9,] 0.630 0.5940000 0.5625
#[10,] 0.625 0.4166667 0.6400

Or with dplyr, you can use the mutate_all() function:

library(dplyr)
mutate_all(as.data.frame(groupings), funs(ave(var, .)))

#      V1        V2     V3
#1  0.630 0.5940000 0.5625
#2  0.625 0.5940000 0.5625
#3  0.470 0.5940000 0.5625
#4  0.630 0.7900000 0.6500
#5  0.470 0.4166667 0.5650
#6  0.625 0.5940000 0.5650
#7  0.470 0.4166667 0.5650
#8  0.625 0.7900000 0.5650
#9  0.630 0.5940000 0.5625
#10 0.625 0.4166667 0.6400

Upvotes: 2

joel.wilson
joel.wilson

Reputation: 8413

library(dplyr)
set.seed(1000)
var <- round(runif(10),digits=2)

groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
                     sample(c(1,2,3), 10, replace=TRUE),
                     sample(c(1,2,3,5), 10, replace=TRUE), var)

df = data.frame(groupings)
df %>% 
  group_by(V1)%>% mutate(x1 =mean(var))%>% ungroup(V1) %>% 
  group_by(V2) %>% mutate(x2=mean(var)) %>% ungroup(V2) %>% 
  group_by(V3) %>% mutate(x3=mean(var)) %>% ungroup(V3)

#      V1    V2    V3   var        x1    x2    x3
#   <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl> <dbl>
#1      2     1     3  0.33 0.4775000 0.322 0.250
#2      3     3     1  0.76 0.6566667 0.470 0.750
#3      1     1     3  0.11 0.1333333 0.322 0.250
#4      3     1     5  0.69 0.6566667 0.322 0.635
#5      3     2     3  0.52 0.6566667 0.630 0.250
#6      1     3     3  0.07 0.1333333 0.470 0.250
#7      2     2     1  0.74 0.4775000 0.630 0.750
#8      2     3     5  0.58 0.4775000 0.470 0.635
#9      1     1     3  0.22 0.1333333 0.322 0.250
#10     2     1     2  0.26 0.4775000 0.322 0.260

# you can simply subset the columns

Upvotes: 3

lmo
lmo

Reputation: 38510

Here is one method using ave along with split and sapply

sapply(split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))),
       function(x) ave(var, x, FUN=mean))
              1         2         3
 [1,] 0.4566667 0.5550000 0.3925000
 [2,] 0.6200000 0.5550000 0.3925000
 [3,] 0.4816667 0.5550000 0.3925000
 [4,] 0.4566667 0.5550000 0.6200000
 [5,] 0.4816667 0.5550000 0.4350000
 [6,] 0.4566667 0.5133333 0.6066667
 [7,] 0.4816667 0.0100000 0.4350000
 [8,] 0.4816667 0.5133333 0.3925000
 [9,] 0.4816667 0.5133333 0.6066667
[10,] 0.4816667 0.5550000 0.6066667

The entry to sapply, split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))) produces a list where each list item is a column of the matrix groupings. Each element of this list is fed to sapply which applies ave using the vector to group the results.

data

set.seed(1234)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
                   sample(c(1,2,3), 10, replace=TRUE),
                   sample(c(1,2,3,5), 10, replace=TRUE))

Upvotes: 1

Related Questions