bvowe
bvowe

Reputation: 3394

R Summarize Collapsed Data.Table

I have data such as this

data <- data.table(
 "School" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 
              1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
 "Grade"  = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 
              0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),
 "CAT"    = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 
              0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
 "FOX"    = c(1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 
              1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
 "DOG"    = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 
              0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)

and wish to achieve a new data table such as this:

dataWANT <- data.frame(
  "VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
  "SCHOOL" = c(1, 1, 0, 1, 1, 0, 1, 1, 0),
  "GRADE"  = c(0, 1, 1, 0, 1, 1, 0, 1, 1),
  "MEAN"   = c(NA)
)

dataWANT takes the mean for CAT and FOX and DOG by SCHOOL, GRADE, and SCHOOL X GRADE when they are equal to 1.

I know how to do this one at a time but that is not good for doing this with a big data.

data[, CAT1 := mean(CAT), by = list(SCHOOL)]
data[, FOX1 := mean(FOX), by = list(GRADE)]
data[, DOG1 := mean(DOG), by = list(SCHOOL, GRADE)]
    
data$CAT2 = unique(data[SCHOOL == 1, CAT1])
data$FOX2 = unique(data[GRADE == 1, FOX1])
data$DOG2 = unique(data[SCHOOL == 1 & GRADE == 1, DOG1])

Please only use this:

data <-  data.table(
  "SCHOOL" = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 
               1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0),
  "GRADE"  = c(0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 
               0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),       
  "CAT"    = c(1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
               0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1),
  "FOX"    = c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1,
               1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0),
  "DOG"    = c(0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
               0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1)
)
    
    
data[, CAT1 := mean(CAT), by = list(SCHOOL)]
data[, CAT2 := mean(CAT), by = list(GRADE)]
data[, CAT3 := mean(CAT), by = list(SCHOOL, GRADE)]

data[, FOX1 := mean(FOX), by = list(SCHOOL)]
data[, FOX2 := mean(FOX), by = list(GRADE)]
data[, FOX3 := mean(FOX), by = list(SCHOOL, GRADE)]

data[, DOG1 := mean(DOG), by = list(SCHOOL)]
data[, DOG2 := mean(DOG), by = list(GRADE)]
data[, DOG3 := mean(DOG), by = list(SCHOOL, GRADE)]


dataWANT <- data.frame(
  "VARIABLE" = c('CAT', 'CAT', 'CAT', 'FOX', 'FOX', 'FOX', 'DOG', 'DOG', 'DOG'),
  "TYPE"     = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
  "MEAN"     = c(0.48, 0.44, 0.428, 0.6, 0.611, 0.6428, 0.52, 0.61, 0.6428)
)

where:
TYPE equals to 1 when MEAN in estimated by SCHOOL,
TYPE equals to 2 when MEAN is estimated by GRADE,
TYPE equals to 3 when MEAN is estimated by SCHOOL and GRADE

Upvotes: 1

Views: 134

Answers (3)

rg255
rg255

Reputation: 4169

You could subset and calculate your means first using lapply(.SD,...) then melt that into your output:

melt(data[School != 0 | Grade != 0, lapply(.SD, mean), by = .(School, Grade)], id.vars = c("School", "Grade"))

Adding this after also adds the TYPE variable

...][, TYPE := School + (2*Grade)]

Putting it all together and tidying it up too, it matches your desired output

dataWANT <- melt(data[School != 0 | Grade != 0, lapply(.SD, mean), by = .(School, Grade)], id.vars = c("School", "Grade"))[, TYPE := School + (2*Grade)][order(variable, TYPE), .("VARIABLE" = variable, TYPE, "MEAN" = value)] 

Upvotes: 1

akrun
akrun

Reputation: 887711

We could use rbindlist after creating a list by taking the MEAN after melting the dataset (as in the other post)

library(data.table)
cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('SCHOOL', 'GRADE', c('SCHOOL', 'GRADE'))
lst1 <- lapply(list_cols, function(x)  
       data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])
rbindlist(lapply(lst1, function(x)  {
     nm1 <- setdiff(names(x), c('variable', 'MEAN'))
     x[Reduce(`&`, lapply(mget(nm1), as.logical)),
     .(VARIABLE = variable, MEAN)]}), idcol = 'TYPE')[order(VARIABLE)]
#   TYPE VARIABLE      MEAN
#1:    1      CAT 0.4800000
#2:    2      CAT 0.4444444
#3:    3      CAT 0.4285714
#4:    1      FOX 0.6000000
#5:    2      FOX 0.5555556
#6:    3      FOX 0.6428571
#7:    1      DOG 0.5200000
#8:    2      DOG 0.6111111
#9:    3      DOG 0.6428571

Upvotes: 1

Ronak Shah
Ronak Shah

Reputation: 389205

Do you mean to get something like this?

library(data.table)

melt(data, measure.vars = c('CAT', 'FOX', 'DOG'))[, 
        .(MEAN = mean(value, na.rm = TRUE)), .(School, Grade, variable)]

To group it by different columns, we can do :

cols <- c('CAT', 'FOX', 'DOG')
data1 <- melt(data, measure.vars = cols)
list_cols <- list('School', 'Grade', c('School', 'Grade'))

lapply(list_cols, function(x)  
         data1[, .(MEAN = mean(value, na.rm = TRUE)), c(x, 'variable')])

Upvotes: 1

Related Questions