Reputation: 3502
Using df
that has daily values of rain
for two year
s (2003 and 2004) at two site
s
set.seed(123)
date <- rep(as.Date(seq(as.Date("2003-01-01"),
as.Date("2004-12-31"), by = 1),
format="%Y-%m-%d"), 2)
site <- c(rep("Site_1", 731), rep("Site_2", 731))
rain <- c(as.numeric(sample(1.1e6:87e6, 731, replace=T)),
as.numeric(sample(4.5e5:56e5, 731, replace=T)))
df <- data.frame(date, site, rain)
and df1
that has monthly measurements of two parameters param_A
and param_B
at each of the two sites. For the two parameters, there are sometimes more than one measurement per day.
set.seed(123)
date <- rep(c(as.Date("2003-01-05"),as.Date("2003-02-12"), as.Date("2003-03-07"), as.Date("2003-04-07"),
as.Date("2003-05-13"),as.Date("2003-06-17"), as.Date("2003-07-16"), as.Date("2003-08-23"),
as.Date("2003-09-24"),as.Date("2003-10-18"), as.Date("2003-11-08"), as.Date("2003-12-28"),
rep(as.Date("2003-03-01"), 3), rep(as.Date("2003-05-14"), 3), rep(as.Date("2003-08-21"), 3),
as.Date("2004-01-14"),as.Date("2004-02-02"), as.Date("2004-03-17"), as.Date("2004-04-03"),
as.Date("2004-05-05"),as.Date("2004-06-17"), as.Date("2004-07-04"), as.Date("2004-08-08"),
as.Date("2004-09-16"),as.Date("2004-10-24"), as.Date("2004-11-09"), as.Date("2004-12-07"),
rep(as.Date("2004-09-07"), 4), rep(as.Date("2004-10-27"), 2), rep(as.Date("2004-12-14"), 4)), 4)
site <- c(rep("Site_1", 86), rep("Site_2", 86))
parameter <- rep(c(rep("param_A", 43), rep("param_B", 43)), 2)
value <- c(runif(43, 0.05, 2.3)/1e6,
runif(43, 0.03, 3.1)/1e6,
runif(43, 0.004, 0.09)/1e6,
runif(43, 0.001, 0.07)/1e6)
df1 <- data.frame(date, site, parameter, value)
I merged the two data.frames (df
and df1
) together into one data.frame df2
df2 <- merge(df, df1, by= c("date", "site"), all=T)
For each site, I want to calculate for each year
, for each parameter
, a variable let's name it saturation
where it equals
(sum(rain*value)/sum(rain)) for days where value is not NA * sum(rain for whole year)
library(dplyr)
df_fin <- df2 %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year, parameter) %>%
group_by(site, year, parameter ) %>%
summarise(sum_rain = sum(rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
At each of the two sites, param_A
and param_B
have, in few days, more than one measurement per day. As a result, when calculating the sum of rain
in the formula, the repeated rain value per day is being added which increase the sum(daily rain values)
per year.
Is there a way to keep one rain value per day and not to add up these repeated daily values?
UPDATE
To be more specific, the sum()
of daily of values of rain
using df
will be as below
library(dplyr)
df_1 <- df %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year) %>%
group_by(site, year) %>%
summarise(sum_rain = sum(rain))
df_1
#Source: local data frame [4 x 3]
#Groups: site [?]
# site year sum_rain
# (fctr) (fctr) (dbl)
#1 Site_1 2003 16039370227
#2 Site_1 2004 16170122189
#3 Site_2 2003 1078275774
#4 Site_2 2004 1090111362
Whereas in df_fin
, sum_rain
is different from above
df_fin <- df2 %>%
mutate(year = factor(format(date, "%Y"))) %>%
arrange(site, year, parameter) %>%
group_by(site, year, parameter ) %>%
summarise(sum_rain = sum(rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
df_fin
site year parameter sum_rain specific_days saturation
(fctr) (fctr) (fctr) (dbl) (dbl) (dbl)
1 Site_1 2003 param_A 916392248 1.342250e-06 1230.027083
2 Site_1 2003 param_B 916392248 1.263414e-06 1157.783068
3 Site_1 2003 NA 15413670047 NaN NaN
4 Site_1 2004 param_A 949808653 1.253945e-06 1191.007549
5 Site_1 2004 param_B 949808653 1.517257e-06 1441.104187
6 Site_1 2004 NA 15623884439 NaN NaN
7 Site_2 2003 param_A 59690558 5.113758e-08 3.052431
8 Site_2 2003 param_B 59690558 3.592397e-08 2.144322
9 Site_2 2003 NA 1038708704 NaN NaN
10 Site_2 2004 param_A 76792434 4.508143e-08 3.461913
11 Site_2 2004 param_B 76792434 2.958886e-08 2.272201
12 Site_2 2004 NA 1035805843 NaN NaN
I want sum_rain
in df_fin
to be the same as sum_rain
in df
Upvotes: 1
Views: 114
Reputation: 887118
May be this helps
library(dplyr)
library(lubridate)
df %>%
group_by(site, Year = year(date)) %>%
mutate(sum_rain = sum(rain)) %>%
full_join(., df1, by = c('date', 'site')) %>%
filter(!is.na(parameter)) %>%
group_by(parameter, add=TRUE) %>%
summarise(sum_rain = first(sum_rain),
specific_days = sum(rain*value, na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
# site Year parameter sum_rain specific_days saturation
# <fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
#1 Site_1 2003 param_A 16039370227 0.00000134224955 21528.83747
#2 Site_1 2003 param_B 16039370227 0.00000126341430 20264.36967
#3 Site_1 2004 param_A 16170122189 0.00000125394472 20276.43941
#4 Site_1 2004 param_B 16170122189 0.00000151725738 24534.23720
#5 Site_2 2003 param_A 1078275774 0.00000005113758 55.14042
#6 Site_2 2003 param_B 1078275774 0.00000003592397 38.73595
#7 Site_2 2004 param_A 1090111362 0.00000004508143 49.14378
#8 Site_2 2004 param_B 1090111362 0.00000002958886 32.25515
If we want the NA rows as well, remove the filter
step
df %>%
group_by(site, Year = year(date)) %>%
mutate(sum_rain = sum(rain)) %>%
full_join(., df1, by = c('date', 'site')) %>%
group_by(parameter, add=TRUE) %>%
summarise(sum_rain = first(sum_rain),
specific_days = sum(rain*value,
na.rm=TRUE)/sum(rain[!is.na(value)])) %>%
mutate(saturation = sum_rain * specific_days)
# site Year parameter sum_rain specific_days saturation
# <fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
#1 Site_1 2003 param_A 16039370227 0.00000134224955 21528.83747
#2 Site_1 2003 param_B 16039370227 0.00000126341430 20264.36967
#3 Site_1 2003 NA 16039370227 NaN NaN
#4 Site_1 2004 param_A 16170122189 0.00000125394472 20276.43941
#5 Site_1 2004 param_B 16170122189 0.00000151725738 24534.23720
#6 Site_1 2004 NA 16170122189 NaN NaN
#7 Site_2 2003 param_A 1078275774 0.00000005113758 55.14042
#8 Site_2 2003 param_B 1078275774 0.00000003592397 38.73595
#9 Site_2 2003 NA 1078275774 NaN NaN
#10 Site_2 2004 param_A 1090111362 0.00000004508143 49.14378
#11 Site_2 2004 param_B 1090111362 0.00000002958886 32.25515
#12 Site_2 2004 NA 1090111362 NaN NaN
Upvotes: 1