shiny
shiny

Reputation: 3502

dplyr: sum of daily values for whole year and sum of specific daily values in the same formula

Using df data.frame

date <- rep(as.Date(seq(as.Date("2003-01-01"), 
                        as.Date("2005-12-31"), by = 1), 
                    format="%Y-%m-%d"), 9)
site <- c(rep("Site_1", 3*1096), rep("Site_2", 3*1096), rep("Site_3", 3*1096))
rain <- c(rep(as.numeric(sample(1.1e6:87e6, 1096, replace=T)),3),
               rep(as.numeric(sample(1.3e5:56e6, 1096, replace=T)),3),
               rep(as.numeric(sample(5e5:77e6, 1096, replace=T)),3))
parameter <- rep(c(rep("param_A", 1096), rep("param_B", 1096), rep("param_c", 1096)), 3)
value <- c(runif(1096, 0.005, 2.3)/1e6, 
           runif(1096, 0.5, 3.1)/1e6,
           runif(1096, 0.003, 0.04)/1e6,
           runif(1096, 0.002, 1.7)/1e6, 
           runif(1096, 0.3, 4.5)/1e6,
           runif(1096, 0.001, 0.07)/1e6,
           runif(1096, 0.007, 2.7)/1e6, 
           runif(1096, 0.4, 2.8)/1e6,
           runif(1096, 0.004, 0.09)/1e6)

df <- data.frame( date, site, rain, parameter, value)
df[c(1:4, 8:10, 30:35, 60:65, 90:97, 100:125, 524:645, 
     1000:1100, 1400:1540, 1789:1890, 2100:2250,
     2459:2765, 3942:3987, 4600:4698, 5210:5310, 6081:6154, 7613:7689, 
     8809:8888, 9120:9190, 9600:9650), 5] <- NA

For each site, I want to calculate for each year, for each parameter, a variable let's name it saturation where it equals

(sum(rain*value)/sum(rain)) for days where value is not NA * sum(rain per year)

I wanted to do this using dplyr. I tried the following code

library(dplyr)
df1 <- df %>%
  dplyr::mutate(year = factor(format(date, "%Y"))) %>%
  dplyr::arrange(site, year, parameter)  %>%
  dplyr::group_by(site, year, parameter ) %>%
  dplyr::summarise(sum_rain = sum(rain))

df2 <- df %>%
  dplyr::mutate(year = factor(format(date, "%Y"))) %>%
  dplyr::arrange(site, year, parameter)  %>%
  dplyr::group_by(site, year, parameter ) %>%
  dplyr::filter (!is.na(value)) %>%
  dplyr::summarise(specific_days = sum(rain*value)/sum(rain))

saturation <- df1$sum_rain * df2$specific_days

It worked fine and gave me what I wanted. However, I had to create two data.frames df1 and df2 and multiply df1$sum_rain by df2$specific_days to get the saturation. Is there anyway to do that without creating two data.frames using dplyr.

Upvotes: 3

Views: 332

Answers (2)

alistaire
alistaire

Reputation: 43334

You can use mutate to add columns of repeated values without collapsing the data.frame like summarise would:

df %>% tbl_df() %>%    # for printing
    group_by(site, year = lubridate::year(date), parameter) %>%    # add variables inline
    mutate(sum_rain = sum(rain)) %>%    # add column but don't collapse df
    filter(!is.na(value)) %>% 
    mutate(specific_days = sum(rain*value)/sum(rain), 
           saturation = sum_rain * specific_days) %>%    # add vars dependent on previous
    arrange(site, year, parameter) %>% 
    full_join(df)    # reinsert NA rows

# Source: local data frame [9,864 x 9]
# Groups: site, year, parameter [?]
# 
#          date   site     rain parameter        value  year    sum_rain specific_days saturation
#        (date) (fctr)    (dbl)    (fctr)        (dbl) (int)       (dbl)         (dbl)      (dbl)
# 1  2003-01-05 Site_1 32113070   param_A 1.050169e-07  2003 16033161650  1.225226e-06   19644.24
# 2  2003-01-06 Site_1 37830442   param_A 1.854250e-06  2003 16033161650  1.225226e-06   19644.24
# 3  2003-01-07 Site_1 76670445   param_A 1.386651e-06  2003 16033161650  1.225226e-06   19644.24
# 4  2003-01-11 Site_1 80337620   param_A 4.348852e-07  2003 16033161650  1.225226e-06   19644.24
# 5  2003-01-12 Site_1 77468528   param_A 1.118393e-06  2003 16033161650  1.225226e-06   19644.24
# 6  2003-01-13 Site_1 12609166   param_A 5.386190e-08  2003 16033161650  1.225226e-06   19644.24
# 7  2003-01-14 Site_1 80655681   param_A 1.881504e-06  2003 16033161650  1.225226e-06   19644.24
# 8  2003-01-15 Site_1 73617496   param_A 1.558818e-06  2003 16033161650  1.225226e-06   19644.24
# 9  2003-01-16 Site_1 30367141   param_A 2.068242e-06  2003 16033161650  1.225226e-06   19644.24
# 10 2003-01-17 Site_1 16743355   param_A 1.551760e-06  2003 16033161650  1.225226e-06   19644.24
# ..        ...    ...      ...       ...          ...   ...         ...           ...        ...

or if you just want the grouping variables and saturation,

df %>% tbl_df() %>%
    group_by(site, year = lubridate::year(date), parameter) %>%
    mutate(sum_rain = sum(rain), add = TRUE) %>%
    filter(!is.na(value)) %>% 
    summarise(saturation = unique(sum_rain * sum(rain*value)/sum(rain)))

# Source: local data frame [27 x 4]
# Groups: site, year [?]
# 
#      site  year parameter saturation
#    (fctr) (dbl)    (fctr)      (dbl)
# 1  Site_1  2003   param_A 19644.2422
# 2  Site_1  2003   param_B 28599.3730
# 3  Site_1  2003   param_c   320.6451
# 4  Site_1  2004   param_A 18333.6141
# 5  Site_1  2004   param_B 28856.5608
# 6  Site_1  2004   param_c   357.9545
# 7  Site_1  2005   param_A 17621.4250
# 8  Site_1  2005   param_B 27565.1503
# 9  Site_1  2005   param_c   338.8673
# 10 Site_2  2003   param_A  8584.3319
# ..    ...   ...       ...        ...

Upvotes: 2

akrun
akrun

Reputation: 886948

We can do this in a single chain by subsetting the rain for non-NA 'value' with is.na

res <- df %>%
        mutate(year = factor(format(date, "%Y"))) %>%
        arrange(site, year, parameter)  %>%
        group_by(site, year, parameter ) %>% 
        summarise(sum_rain = sum(rain), 
            specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>% 
        mutate(saturation = sum_rain * specific_days)
res %>%
    as.data.frame()
#      site year parameter    sum_rain    specific_days saturation
#1  Site_1 2003   param_A 15988875602 0.00000123589041 19760.4980
#2  Site_1 2003   param_B 15988875602 0.00000172552158 27589.1499
#3  Site_1 2003   param_c 15988875602 0.00000002161544   345.6067
#4  Site_1 2004   param_A 15180127505 0.00000116507160 17685.9355
#5  Site_1 2004   param_B 15180127505 0.00000181695952 27581.6772
#6  Site_1 2004   param_c 15180127505 0.00000002185010   331.6873
#7  Site_1 2005   param_A 16058234005 0.00000120130563 19290.8469
#8  Site_1 2005   param_B 16058234005 0.00000186185975 29898.1795
#9  Site_1 2005   param_c 16058234005 0.00000002049335   329.0870
#10 Site_2 2003   param_A  9930134442 0.00000079639249  7908.2845
#11 Site_2 2003   param_B  9930134442 0.00000246576645 24485.3923
#12 Site_2 2003   param_c  9930134442 0.00000003348046   332.4655
#13 Site_2 2004   param_A 10926778631 0.00000088141235  9630.9976
#14 Site_2 2004   param_B 10926778631 0.00000244015257 26663.0070
#15 Site_2 2004   param_c 10926778631 0.00000003448817   376.8447
#16 Site_2 2005   param_A  9599581600 0.00000089477811  8589.4955
#17 Site_2 2005   param_B  9599581600 0.00000238522373 22897.1498
#18 Site_2 2005   param_c  9599581600 0.00000003442887   330.5027
#19 Site_3 2003   param_A 13711985538 0.00000142896664 19593.9700
#20 Site_3 2003   param_B 13711985538 0.00000157700917 21623.9270
#21 Site_3 2003   param_c 13711985538 0.00000004665944   639.7935
#22 Site_3 2004   param_A 14371047715 0.00000134324260 19303.8035
#23 Site_3 2004   param_B 14371047715 0.00000156583784 22502.7303
#24 Site_3 2004   param_c 14371047715 0.00000004859102   698.3039
#25 Site_3 2005   param_A 13729491381 0.00000131305086 18027.5205
#26 Site_3 2005   param_B 13729491381 0.00000159005889 21830.6999
#27 Site_3 2005   param_c 13729491381 0.00000004616979   633.8878





identical(df1['sum_rain'], res['sum_rain'])
#[1] TRUE

identical(df2['specific_days'], res['specific_days'])
#[1] TRUE

NO need to do another join. This gives the expected output as in the OP's post and not provide any incorrect output.


Or this can be also done with data.table

library(data.table)
setDT(df)[, .(sum_rain = sum(rain), 
              specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])),
            by =  .(site, year= factor(format(date, "%Y")), parameter)
      ][, saturation := sum_rain * specific_days][]
#      site year parameter    sum_rain    specific_days saturation
# 1: Site_1 2003   param_A 15988875602 0.00000123589041 19760.4980
# 2: Site_1 2004   param_A 15180127505 0.00000116507160 17685.9355
# 3: Site_1 2005   param_A 16058234005 0.00000120130563 19290.8469
# 4: Site_1 2003   param_B 15988875602 0.00000172552158 27589.1499
# 5: Site_1 2004   param_B 15180127505 0.00000181695952 27581.6772
# 6: Site_1 2005   param_B 16058234005 0.00000186185975 29898.1795
# 7: Site_1 2003   param_c 15988875602 0.00000002161544   345.6067
# 8: Site_1 2004   param_c 15180127505 0.00000002185010   331.6873
# 9: Site_1 2005   param_c 16058234005 0.00000002049335   329.0870
#10: Site_2 2003   param_A  9930134442 0.00000079639249  7908.2845
#11: Site_2 2004   param_A 10926778631 0.00000088141235  9630.9976
#12: Site_2 2005   param_A  9599581600 0.00000089477811  8589.4955
#13: Site_2 2003   param_B  9930134442 0.00000246576645 24485.3923
#14: Site_2 2004   param_B 10926778631 0.00000244015257 26663.0070
#15: Site_2 2005   param_B  9599581600 0.00000238522373 22897.1498
#16: Site_2 2003   param_c  9930134442 0.00000003348046   332.4655
#17: Site_2 2004   param_c 10926778631 0.00000003448817   376.8447
#18: Site_2 2005   param_c  9599581600 0.00000003442887   330.5027
#19: Site_3 2003   param_A 13711985538 0.00000142896664 19593.9700
#20: Site_3 2004   param_A 14371047715 0.00000134324260 19303.8035
#21: Site_3 2005   param_A 13729491381 0.00000131305086 18027.5205
#22: Site_3 2003   param_B 13711985538 0.00000157700917 21623.9270
#23: Site_3 2004   param_B 14371047715 0.00000156583784 22502.7303
#24: Site_3 2005   param_B 13729491381 0.00000159005889 21830.6999
#25: Site_3 2003   param_c 13711985538 0.00000004665944   639.7935
#26: Site_3 2004   param_c 14371047715 0.00000004859102   698.3039
#27: Site_3 2005   param_c 13729491381 0.00000004616979   633.8878

Upvotes: 5

Related Questions