Reputation: 1999

R: Obtain aggregations of two or more variables by two or more grouping variables

With the following dataframe I need to obtain monthly sums of the following two variables: "CallsHandled" and "Engaged" By the following grouping variables: "Month","ID","Location","LANGUAGE","MemRegion"

structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Week = c(1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L), ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", 
"F1234"), class = "factor"), Location = structure(c(2L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L), .Label = c("Corona", "Denver"), class = "factor"), LANGUAGE = structure(c(1L, 
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"), 
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", 
"SCAL"), class = "factor"), CallsHandled = c(1L, 1L, 8L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 
1L, 1L, 2L), Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 
200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 
1243L, 75L, 45L, 55L), QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), 
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), QueueC = c(0L, 1L, 0L, 0L, 
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 
0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", 
"MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"
), class = "data.frame", row.names = c(NA, -20L))

Additionally, in order to include "Queues A:C" as grouping variables, would I have to combine them into a single column? If so, how?

Upvotes: 0

Answers (3)

user3594490

Reputation: 1999

@chappers solution aggregates correctly but leaves me with a bunch of duplicate rows for some reason that I can't figure out. This works for factors and reduces the number of rows in my actual dataframe (no duplicates):

aggregate(cbind(CallsHandled,Engaged~Month + ID + Location + LANGUAGE + MemRegion, data=df, sum, na.rm=TRUE)

Upvotes: 0

Jota

Reputation: 17611

To make the Queue variables into a single factor variable, you could do this:

queues <- which(dat[ , c("QueueA", "QueueB", "QueueC")]==1, arr.ind=TRUE)
queues<-queues[
            order(queues[,"row"]), "col"]
queues<-factor(queues, labels=c("QueueA", "QueueB", "QueueC"))
dat <- data.frame(dat, queues)

Though, @chappers approach for this is nicer.

Then, you can use aggregate:

aggregate(dat[,c("CallsHandled", "Engaged")], 
    by=list(dat$Month, dat$ID, dat$Location, dat$LANGUAGE, dat$MemRegion, dat$queues),
    sum)

#   Group.1 Group.2 Group.3 Group.4 Group.5 Group.6 CallsHandled Engaged
#1        1   A1234  Corona English    NCAL  QueueA            1     200
#2        1   F1234  Denver English    NCAL  QueueA            1      30
#3        1   A1234  Corona Spanish    NCAL  QueueA            3      55
#4        1   F1234  Corona Spanish    NCAL  QueueA            1      75
#5        1   A1234  Corona English    SCAL  QueueA            1      45
#6        1   F1234  Corona Spanish    SCAL  QueueA            1      45
#7        1   F1234  Corona English    NCAL  QueueB            9    1363
#8        1   F1234  Denver English    NCAL  QueueB            1     120
#9        1   A1234  Corona English    SCAL  QueueB           10    1538
#10       1   A1234  Denver English    SCAL  QueueB            1      30
#11       1   F1234  Denver English    SCAL  QueueB            2      55
#12       1   A1234  Corona Spanish    SCAL  QueueB            1      75
#13       1   A1234  Corona English    NCAL  QueueC           10    2065
#14       1   F1234  Corona English    NCAL  QueueC            1     200
#15       1   F1234  Corona Spanish    NCAL  QueueC            1     230
#16       1   A1234  Corona Spanish    SCAL  QueueC            7    1288
#17       1   F1234  Corona Spanish    SCAL  QueueC            1      30

Upvotes: 0

chappers

Reputation: 2415

So there are 2 parts to this question, firstly how do you group things up and sum, and secondly how could you combine Queue A:C into one column.

For the first question you can use the library dplyr which makes it a lot easier and more intuitive.

df <- structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                     Week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), 
                     ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", "F1234"), class = "factor"), 
                     Location = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Corona", "Denver"), class = "factor"), 
                     LANGUAGE = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"), 
                     MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", "SCAL"), class = "factor"), 
                     CallsHandled = c(1L, 1L, 8L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 1L, 1L, 2L), 
                     Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 1243L, 75L, 45L, 55L), 
                     QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), 
                     QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), 
                     QueueC = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", "MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"),
                class = "data.frame", row.names = c(NA, -20L))

library(dplyr)

df %>% group_by(Month, ID, Location, LANGUAGE) %>%
  mutate(TotalCallsHandled = sum(CallsHandled),
         TotalEngaged = sum(Engaged))

So firstly, we use group_by to group by specifically the variables you have listed, and mutate to sum everything up, and this will do what you think intuitively.

For combining everything into one column, there are probably many ways to do this, but probably the most straight forward way is to create some kind of unique identifier for each column and combine all the columns into one.

df$Queue <- as.factor(df$QueueA + df$QueueB*2 + df$QueueC*3)
levels(df$Queue) <- c("A", "B", "C")

Since everything should be a 0, 1 flag, we can recreate the flags to be 1 -> A, 2 -> B, 3 -> C, and then relevel the factors to be A, B, C again. Then we can simply use group_by function again to get the intended result as above.

df %>% group_by(Month, ID, Location, LANGUAGE, Queue) %>%
  mutate(TotalCallsHandled = sum(CallsHandled),
         TotalEngaged = sum(Engaged)) %>%
  select(-QueueA, -QueueB, -QueueC)

With output:

Source: local data frame [20 x 11]
Groups: Month, ID, Location, LANGUAGE, Queue

   Month Week    ID Location LANGUAGE MemRegion CallsHandled Engaged Queue
1      1    1 F1234   Denver  English      NCAL            1     120     B
2      1    2 F1234   Corona  Spanish      SCAL            1      30     C
3      1    3 F1234   Corona  English      NCAL            8    1243     B
4      1    4 F1234   Corona  Spanish      NCAL            1      75     A
5      1    5 F1234   Corona  Spanish      SCAL            1      45     A
6      1    6 F1234   Denver  English      SCAL            2      55     B
7      1    7 F1234   Corona  English      NCAL            1     200     C
8      1    8 F1234   Corona  English      NCAL            1     120     B
9      1    9 F1234   Denver  English      NCAL            1      30     A
10     1   10 F1234   Corona  Spanish      NCAL            1     230     C
11     1    1 A1234   Corona  English      NCAL           10    2065     C
12     1    2 A1234   Corona  English      SCAL            1      45     A
13     1    3 A1234   Corona  Spanish      NCAL            3      55     A
14     1    4 A1234   Corona  English      NCAL            1     200     A
15     1    5 A1234   Corona  English      SCAL            8    1483     B
16     1    6 A1234   Denver  English      SCAL            1      30     B
17     1    7 A1234   Corona  Spanish      SCAL            6    1243     C
18     1    8 A1234   Corona  Spanish      SCAL            1      75     B
19     1    9 A1234   Corona  Spanish      SCAL            1      45     C
20     1   10 A1234   Corona  English      SCAL            2      55     B
Variables not shown: TotalCallsHandled (int), TotalEngaged (int)

Upvotes: 1

R: Obtain aggregations of two or more variables by two or more grouping variables

Answers (3)

Related Questions