microbenchmark with datatable, tapply, aggregate, ave and dplyr

Question

I was attempting to compare speed of several approaches for obtaining summary statistics by group. However, I am get an error when running microbenchmark. The error states:

Error in bmerge(i, x, leftcols, rightcols, io, xo, roll, rollends, nomatch,  : 
  x.'TRIAL_INDEX' is a character column being joined to i.'TRIAL_INDEX' which is type 'integer'. Character columns must join to factor or character columns.

I am not sure, but I think data.table changes an attribute of the variable TRIAL_INDEX. From searching Stack Overflow for similar questions, I guess there have been conflicts between some packages.

Is there a work-around, so I can perhaps change the attribute of TRIAL_INDEX back to integer or take other action so the microbenchmark function will work? Or maybe I am making an error I am not seeing.

Here is the code with the five functions I am attempting to compare. From running subsets of these functions I am impressed by how fast the ave function is.

library(microbenchmark)
library(dplyr)
library(data.table)

poo <- read.table(text = '
     TRIAL_INDEX     RIGHT_PUPIL_SIZE
          1                 10
          1                  8
          1                  6
          1                  4
          1                 NA
          2                  1
          2                  2
          2                 NA
          2                  4
          2                  5
', header = TRUE, stringsAsFactors = FALSE, na.strings = "NA")

tapply.function <- function(x) {

     my.summary <- as.data.frame(do.call("rbind", 
                   tapply(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, 
                   function(x) c(index.mean = mean(x, na.rm = TRUE),
                                   index.sd =   sd(x, na.rm = TRUE)))))

     my.summary$TRIAL_INDEX <- rownames(my.summary)

     poo2 <- merge(poo, my.summary, by = 'TRIAL_INDEX')

     return(poo2)

}

str(tapply.function(poo))

aggregate.function <- function(x) {

     my.summary <- with(poo, aggregate(RIGHT_PUPIL_SIZE, by = list(TRIAL_INDEX), 
                        FUN = function(x) {c( index.mean = mean(x, na.rm = TRUE), 
                                              index.sd   =   sd(x, na.rm = TRUE))}))

     my.summary <- do.call(data.frame, my.summary)

     colnames(my.summary) <- c('TRIAL_INDEX', 'index.mean', 'index.sd')

     poo2 <- merge(poo, my.summary, by = 'TRIAL_INDEX')

     return(poo2)

}

str(aggregate.function(poo))

ave.function <- function(x) {

     index.mean <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x) mean(x, na.rm = TRUE))
     index.sd   <- ave(poo$RIGHT_PUPIL_SIZE, poo$TRIAL_INDEX, FUN = function(x)   sd(x, na.rm = TRUE))

     poo2 <- data.frame(poo, index.mean, index.sd)

     return(poo2)

}

str(ave.function(poo))

dplyr.function <- function(x) {

     my.summary <- poo %>%
         group_by(TRIAL_INDEX) %>% 
         summarise(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE),
                     index.sd =   sd(RIGHT_PUPIL_SIZE, na.rm = TRUE))

     poo2 <- merge(poo, as.data.frame(my.summary), by = 'TRIAL_INDEX')

     return(poo2)

}

str(dplyr.function(poo))

data.table.function <- function(x) {

     my.summary <- data.frame(setDT(poo)[, .(index.mean = mean(RIGHT_PUPIL_SIZE, na.rm = TRUE), 
                                               index.sd =   sd(RIGHT_PUPIL_SIZE, na.rm = TRUE)),
                          .(TRIAL_INDEX)])

     poo2 <- merge(poo, my.summary, by = 'TRIAL_INDEX')

     return(poo2)

}

str(data.table.function(poo))

# this does not work
microbenchmark(    tapply.function(poo),
                aggregate.function(poo),
                      ave.function(poo),
                    dplyr.function(poo), 
               data.table.function(poo), times = 1000)

microbenchmark with datatable, tapply, aggregate, ave and dplyr

Answers (1)

Related Questions