Reputation: 6200

Efficient functions over specific data.frame columns in a list of data.frames

I have a list of data.frames. For example

set.seed(1)
my_list <- list()
ids = c("a","b","c","d","e")
for(i in 1:5){
  my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(length(ids)), a = rnorm(length(ids)))
}

What I want is to efficiently compute for each id (in the "id" column) the variance of the "p", "m", "d", and "a" columns over all data frames in the list. Ideally, this would return a data.frame like this (based on the values drawn above):

> result.df
  id     var_p     var_m      var_d     var_a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

Upvotes: 3

Answers (4)

Veerendra Gadekar

Reputation: 4472

Using my_list

library(plyr)
df = do.call(rbind, my_list)
out = ddply(df, .(id), colwise(var, c('p','m','d','a')))

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or base R alternative, using the combination of lapply and apply

df = do.call(rbind, my_list)
df1 = do.call(rbind, 
      lapply(split(df, df$id), 
      function(x) apply(subset(x, select = c(p,m,d,a)), 2, var)))

out = transform(df1, id = row.names(df1))

#> out
#          p         m          d         a id
#a 0.2371569 1.7810729 0.08264279 0.5074250  a
#b 0.1091675 0.2107997 1.15051229 1.1578691  b
#c 0.5385789 0.7650123 0.44215343 0.3137903  c
#d 1.0174542 0.7818498 0.06414317 0.6079849  d
#e 0.7343667 1.2870542 1.41615858 0.7362462  e

Or using doBy

library(doBy)
df = do.call(rbind, my_list)
out = summaryBy( p + m + d + a ~ id , data = df, keep.names=TRUE, FUN = var)

#> out
#  id         p         m          d         a
#1  a 0.2371569 1.7810729 0.08264279 0.5074250
#2  b 0.1091675 0.2107997 1.15051229 1.1578691
#3  c 0.5385789 0.7650123 0.44215343 0.3137903
#4  d 1.0174542 0.7818498 0.06414317 0.6079849
#5  e 0.7343667 1.2870542 1.41615858 0.7362462

Or using sqldf

library(sqldf)
df = do.call(rbind, my_list)
out = sqldf("select id, variance(p), variance(m), 
             variance(d), variance(a) from df group by id")

#> out
#  id variance(p) variance(m) variance(d) variance(a)
#1  a   0.2371569   1.7810729  0.08264279   0.5074250
#2  b   0.1091675   0.2107997  1.15051229   1.1578691
#3  c   0.5385789   0.7650123  0.44215343   0.3137903
#4  d   1.0174542   0.7818498  0.06414317   0.6079849
#5  e   0.7343667   1.2870542  1.41615858   0.7362462

Upvotes: 3

C8H10N4O2

Reputation: 19005

library(data.table)
rbindlist(my_list)[, lapply(.SD, var), by = id, .SDcols = c("p","m","d","a")]
#    id         p         m          d         a
# 1:  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2:  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3:  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4:  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5:  e 0.7343667 1.2870542 1.41615858 0.7362462

Upvotes: 3

Rorschach

Reputation: 32426

Updated to use bind_rows() (more efficient than do.call(rbind,...) per @hadley suggestion)

library(dplyr)
dat <- bind_rows(dat)[,c("id","p","m","d","a")]
dat %>% group_by(id) %>% summarise_each(funs(var))

#   id         p         m          d         a
# 1  a 0.2371569 1.7810729 0.08264279 0.5074250
# 2  b 0.1091675 0.2107997 1.15051229 1.1578691
# 3  c 0.5385789 0.7650123 0.44215343 0.3137903
# 4  d 1.0174542 0.7818498 0.06414317 0.6079849
# 5  e 0.7343667 1.2870542 1.41615858 0.7362462

Upvotes: 2

Frank

Reputation: 66819

Here is a base R approach

dat <- do.call(rbind,my_list)
aggregate( cbind(p,m,d,a) ~ id, var, data=dat)

which gives

  id         p         m          d         a
1  a 0.2371569 1.7810729 0.08264279 0.5074250
2  b 0.1091675 0.2107997 1.15051229 1.1578691
3  c 0.5385789 0.7650123 0.44215343 0.3137903
4  d 1.0174542 0.7818498 0.06414317 0.6079849
5  e 0.7343667 1.2870542 1.41615858 0.7362462

Upvotes: 3

Efficient functions over specific data.frame columns in a list of data.frames

Answers (4)

Related Questions