89_Simple
89_Simple

Reputation: 3805

Converting dplyr to data.table

library(dplyr)
library(data.table)

df <- data.frame(year = rep(2003:2004, each = 4), 
             mg = rep(rep(c("a", "b"), each = 2), times = 2),
             plant_date = c(20, 30, 20, 30, 33, 40, 33, 40),
             stage1 = c(40, 50, 42, 52, 43, 55, 48, 57),
             stage2 = c(55, 65, 57, 66, 58, 68, 59, 65),
             stage3 = c(61, 75, 63, 76, 66, 77, 68, 79))

  set.seed(123)
  dat <- data.frame(year = rep(2003:2004, each = 365), doy = rep(1:365, times = 2), 
              rainfall = sample(0:20, 730, replace = T))

  final.dat <- dat %>% dplyr::left_join(df)

I have used the dplyr to do so some calculation as follows:

  final.dat %>% dplyr::group_by(year, plant_date, mg) %>% 
  dplyr::summarise(
    sum_rain_stage1 = sum(rainfall[doy >= plant_date & doy <= stage1]),
    sum_rain_stage2 = sum(rainfall[doy >= plant_date & doy <= stage2]),
    mean_rain_stage1 = mean(rainfall[doy >= plant_date & doy <= stage1]),
    mean_rain_stage2 = mean(rainfall[doy >= plant_date & doy <= stage2]),
    var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
    var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))

I want to do the same thing using data.table

  final.dat <- data.table(final.dat)

  final.dat[, j = list(sum(rainfall[doy >= plant_date & doy <= stage1]), 
                       sum(rainfall[doy >= plant_date & doy <= stage2]),
                       mean(rainfall[doy >= plant_date & doy <= stage1]),
                       mean(rainfall[doy >= plant_date & doy <= stage2])),
            by = list(year, plant_date, mg)]

I have two questions: 1) How can I keep my column names as in dplyr instead of the default V1, V2, V3 and V4. 2) I do not know how to implement this part of dplyr in data.table

  var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
  var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))

Thanks

Upvotes: 0

Views: 1026

Answers (3)

Frank
Frank

Reputation: 66819

Backing up a step, your approach to conditions like doy >= plant_date & doy <= stage1 can be simplified in two ways...

First, there's doy %between% list(plant_date, stage1) so you don't have to type the var name twice.

Second, since doy and the other columns come from different tables, you can do this inside a non-equi join to update df instead of making new table final.dat:

setDT(df)
setDT(dat)
df[, c("sum_rs1", "mean_rs1") := 
  dat[df, on=.(doy >= plant_date, doy <= stage1), .(sum(rainfall), mean(rainfall)), by=.EACHI][, .(V1, V2)]]
df[, c("sum_rs2", "mean_rs2") := 
  dat[df, on=.(doy >= plant_date, doy <= stage2), .(sum(rainfall), mean(rainfall)), by=.EACHI][, .(V1, V2)]]

# same as @eddi's
df[, `:=`(
  var.x1 = (sum_rs1 > sum_rs2) * (mean_rs1 - mean_rs2),
  var.x2 = (mean_rs1 > mean_rs2) * (sum_rs1 - sum_rs2)
)]

Upvotes: 3

akrun
akrun

Reputation: 887901

For summarise, we can just name as in a list

list(a = 1:2, b = 3:5)

Similarly,

final.dat[,  list(sum_rain_stage1 = sum(rainfall[doy >= plant_date & doy <= stage1]),  
                 sum_rain_stage2 =  sum(rainfall[doy >= plant_date & doy <= stage2]),
                 mean_rain_stage1 =        mean(rainfall[doy >= plant_date & doy <= stage1]),
                 mean_rain_stage2 =        mean(rainfall[doy >= plant_date & doy <= stage2])),
        by = list(year, plant_date, mg)]
# year plant_date mg sum_rain_stage1 sum_rain_stage2 mean_rain_stage1 mean_rain_stage2
#1: 2003         20  a             251             355        11.952381         9.861111
#2: 2003         30  a             176             304         8.380952         8.444444
#3: 2003         20  b             261             361        11.347826         9.500000
#4: 2003         30  b             185             313         8.043478         8.459459
#5: 2004         33  a             109             247         9.909091         9.500000
#6: 2004         40  a             134             279         8.375000         9.620690
#7: 2004         33  b             157             253         9.812500         9.370370
#8: 2004         40  b             158             242         8.777778         9.307692

Upvotes: 0

eddi
eddi

Reputation: 49448

dt = as.data.table(final.dat) # or setDT to convert in place

dt[, .(sum_rain_stage1  = sum(rainfall[doy >= plant_date & doy <= stage1]), 
       sum_rain_stage2  = sum(rainfall[doy >= plant_date & doy <= stage2]),
       mean_rain_stage1 = mean(rainfall[doy >= plant_date & doy <= stage1]),
       mean_rain_stage2 = mean(rainfall[doy >= plant_date & doy <= stage2]))
   , by = .(year, plant_date, mg)][
   , `:=`(var.x1 = (sum_rain_stage1 > sum_rain_stage2) * (mean_rain_stage1 - mean_rain_stage2),
          var.x2 = (mean_rain_stage1 > mean_rain_stage2) * (sum_rain_stage1 - sum_rain_stage2))][]

Upvotes: 3

Related Questions