leftright
leftright

Reputation: 23

Operation on multiple column in a data frame based on group by

I have below dataframe and would like get median prices from 2003 till 2011 by state.

state freq   2003   2004   2005   2006   2007   2008   2009   2010   2011
MS    2     83000  88300  87000  94400  94400  94400  94400  94400  94400
MS    2     97000  98000 110200 115700 115700 115700 115700 115700 115700
LA    2     154300 164600 181300 149200 149200 149200 149200 149200 149200
LA    2     126800 139200 157100 144500 144500 144500 144500 144500 144500

I am still learning so any help would be appreciated. I was thinking i can use sqldf on the data frame.

Upvotes: 0

Views: 111

Answers (2)

akrun
akrun

Reputation: 886928

Other options for big datasets are

library(dplyr)
df1 %>%
    group_by(state) %>% 
    summarise_each(funs(median), -2)
    #there are many options to select the variables
    #e.g. starts_with, end_with, contains, matches, num_range, one_of..
    #summarise_each(funs(median), matches('^\\d+'))

#   state   2003   2004   2005   2006   2007   2008   2009   2010   2011
# 1    MS  90000  93150  98600 105050 105050 105050 105050 105050 105050
# 2    LA 140550 151900 169200 146850 146850 146850 146850 146850 146850

Or

library(data.table)  
setDT(df1)[, lapply(.SD, median), by = state, .SDcols=2:ncol(df1)]
#  state freq  2003   2004   2005   2006   2007   2008   2009   2010   2011
#1:  MS    2  90000  93150  98600 105050 105050 105050 105050 105050 105050
#2:  LA    2 140550 151900 169200 146850 146850 146850 146850 146850 146850

Benchmarks

set.seed(42)
m1 <- matrix(rnorm(9*1e6), ncol=9, dimnames=list(NULL, 2003:2011))
set.seed(29)
d1 <- data.frame(state=sample(state.abb, 1e6, replace=TRUE), m1, 
    stringsAsFactors=FALSE, check.names=FALSE)

 agg <- function() { aggregate(d1[,-1], by=list(d1$state), FUN=median)}
 dply <- function() {d1 %>% group_by(state) %>% summarise_each(funs(median))}
 dtable <- function() {DT <- as.data.table(d1)
      DT[, lapply(.SD, median), by = state] }

 library(microbenchmark)
 microbenchmark(agg(), dply(), dtable(), times=10L, unit='relative')  
 #Unit: relative
 #    expr        min         lq       mean    median        uq        max neval
 #   agg() 20.8518599 23.0428495 23.3284269 24.702038 21.304252 25.9574602    10
 #  dply()  1.0000000  1.0000000  1.0000000  1.000000  1.000000  1.0000000    10
 #dtable()  0.9273991  0.9062682  0.9769268  1.014912  1.012644  0.9540644    10
 # cld
 #  b
 # a 
 # a

Upvotes: 2

nothing
nothing

Reputation: 3290

If I'm understanding your goal correctly, you're looking for the aggregate() function, which applies a function to all columns of a data.frame by a grouping variable.

aggregate(yourDf[ ,-(1:2)], by = list(yourDf$state), FUN = median) 

Upvotes: 2

Related Questions