Reputation: 37
I have the following dataframe:
agrodata
week temperature humidity radiation evapotranspiration
<date> <dbl> <dbl> <dbl> <dbl>
2012-03-25 15.9 54.1 20.4 0
2012-06-17 25.9 65.6 22.2 0.486
2012-06-24 27.4 61.7 21.3 0
2012-07-08 27.5 62.9 22.4 0
2012-07-15 27.5 50.1 23.1 0
2012-07-22 27.0 56.4 19.0 0
2012-07-29 28.7 61.5 19.7 0
2012-08-05 29.1 56.1 20.0 0.0286
2012-08-12 28.6 56.8 19.4 0
2012-08-19 29.0 63.0 18.1 0
2012-08-26 27.4 62.9 18.0 0.229
2012-09-02 16.9 32.9 16.9 0
2012-09-09 25.0 62.1 16.7 0
2012-09-16 25.4 62.6 14.5 0
2012-09-30 22.3 65.6 15.2 0
2012-10-07 22.4 71.4 13.9 0
2012-10-14 19.3 67.2 11.3 0.257
2012-10-21 18.7 74.5 9.70 3.43
2012-10-28 16.4 75.5 8.35 3.2
2012-11-04 17.8 79.7 7.17 7
As you can see I have the date grouped by weeks. I want to create 4 new columns (for each one of the 4 climate variables) containing the mean of the previous 5 consecutive weeks, and if there aren't 5 previous consecutive weeks, then whatever are... Is there some way to do it with dplyr? Thank you.
Upvotes: 0
Views: 867
Reputation: 269664
1) rollapply
Using agroweek
shown in the Note at the end of this answer we assume that:
Use findInterval
to find the row number of the week 6 weeks back and subtract that from the current row number to get, width
, the vector holding the number of rows to average at each point. Then use rollapplyr
to calculate r
the rolling means, prepend mean_
to the column names and cbind
them to the original data frame. For another example, see the example involving findInterval
in Examples section of ?rollapply
.
library(zoo)
k <- 6 # no of weeks
week <- agroweek$week
width <- seq_along(week) - findInterval(week - 7*k, week)
r <- rollapplyr(agroweek[-1], width, mean)
colnames(r) <- paste("mean", colnames(r), sep = "_")
res1 <- cbind(agroweek, r)
giving:
> head(res1)
week temperature humidity radiation evapotranspiration mean_temperature mean_humidity mean_radiation mean_evapotranspiration
1 2012-03-25 15.9 54.1 20.4 0.000 15.90000 54.100 20.40000 0.0000
2 2012-06-17 25.9 65.6 22.2 0.486 25.90000 65.600 22.20000 0.4860
3 2012-06-24 27.4 61.7 21.3 0.000 26.65000 63.650 21.75000 0.2430
4 2012-07-08 27.5 62.9 22.4 0.000 26.93333 63.400 21.96667 0.1620
5 2012-07-15 27.5 50.1 23.1 0.000 27.07500 60.075 22.25000 0.1215
6 2012-07-22 27.0 56.4 19.0 0.000 27.06000 59.340 21.60000 0.0972
2) SQL
Another approach is a left self join in SQL grouping by week and averaging the joined values to each row. This works even if there are NAs and even if the data is not sorted.
library(sqldf)
k <- 6
fn$sqldf("select a.*,
avg(b.temperature) as mean_temperature,
avg(b.humidity) as mean_humidity,
avg(b.radiation) as mean_radiation,
avg(b.evapotranspiration) as mean_evapotranspiration
from agroweek as a
left join agroweek as b on b.week between a.week - ($k-1)*7 and a.week
group by a.week")
This could also be written as:
k <- 6
means <- toString(sprintf("avg(b.%s) as mean_%s", names(DF)[-1], names(DF)[-1]))
fn$sqldf("select a.*, $means
from agroweek as a
left join agroweek as b on b.week between a.week - ($k-1)*7 and a.week
group by a.week")
The input in reproducible form is assumed to be:
agroweek <- structure(list(week = structure(c(15424, 15508, 15515, 15529,
15536, 15543, 15550, 15557, 15564, 15571, 15578, 15585, 15592,
15599, 15613, 15620, 15627, 15634, 15641, 15648), class = "Date"),
temperature = c(15.9, 25.9, 27.4, 27.5, 27.5, 27, 28.7, 29.1,
28.6, 29, 27.4, 16.9, 25, 25.4, 22.3, 22.4, 19.3, 18.7, 16.4,
17.8), humidity = c(54.1, 65.6, 61.7, 62.9, 50.1, 56.4, 61.5,
56.1, 56.8, 63, 62.9, 32.9, 62.1, 62.6, 65.6, 71.4, 67.2,
74.5, 75.5, 79.7), radiation = c(20.4, 22.2, 21.3, 22.4,
23.1, 19, 19.7, 20, 19.4, 18.1, 18, 16.9, 16.7, 14.5, 15.2,
13.9, 11.3, 9.7, 8.35, 7.17), evapotranspiration = c(0, 0.486,
0, 0, 0, 0, 0, 0.0286, 0, 0, 0.229, 0, 0, 0, 0, 0, 0.257,
3.43, 3.2, 7)), row.names = c(NA, -20L), class = "data.frame")
which can be generated like this:
Lines <- "
week temperature humidity radiation evapotranspiration
2012-03-25 15.9 54.1 20.4 0
2012-06-17 25.9 65.6 22.2 0.486
2012-06-24 27.4 61.7 21.3 0
2012-07-08 27.5 62.9 22.4 0
2012-07-15 27.5 50.1 23.1 0
2012-07-22 27.0 56.4 19.0 0
2012-07-29 28.7 61.5 19.7 0
2012-08-05 29.1 56.1 20.0 0.0286
2012-08-12 28.6 56.8 19.4 0
2012-08-19 29.0 63.0 18.1 0
2012-08-26 27.4 62.9 18.0 0.229
2012-09-02 16.9 32.9 16.9 0
2012-09-09 25.0 62.1 16.7 0
2012-09-16 25.4 62.6 14.5 0
2012-09-30 22.3 65.6 15.2 0
2012-10-07 22.4 71.4 13.9 0
2012-10-14 19.3 67.2 11.3 0.257
2012-10-21 18.7 74.5 9.70 3.43
2012-10-28 16.4 75.5 8.35 3.2
2012-11-04 17.8 79.7 7.17 7"
agroweek <- read.table(text = Lines, header = TRUE)
agroweek$week <- as.Date(agroweek$week)
Upvotes: 1
Reputation: 33488
Using data.table
:
setDT(df)
cols <- names(df)[-1]
df[, paste0(cols, "_5w_avg") :=
lapply(.SD, function(x) fcoalesce(frollmean(x, n=5:1, na.rm=TRUE))),
.SDcols = cols]
df
# week temperature humidity radiation evapotranspiration temperature_5w_avg humidity_5w_avg radiation_5w_avg evapotranspiration_5w_avg
# 1: 25/03/2012 15.9 54.1 20.40 0.0000 15.90000 54.10000 20.400 0.000000e+00
# 2: 17/06/2012 25.9 65.6 22.20 0.4860 20.90000 59.85000 21.300 2.430000e-01
# 3: 24/06/2012 27.4 61.7 21.30 0.0000 23.06667 60.46667 21.300 1.620000e-01
# 4: 08/07/2012 27.5 62.9 22.40 0.0000 24.17500 61.07500 21.575 1.215000e-01
# 5: 15/07/2012 27.5 50.1 23.10 0.0000 24.84000 58.88000 21.880 9.720000e-02
# 6: 22/07/2012 27.0 56.4 19.00 0.0000 27.06000 59.34000 21.600 9.720000e-02
# 7: 29/07/2012 28.7 61.5 19.70 0.0000 27.62000 58.52000 21.100 0.000000e+00
# 8: 05/08/2012 29.1 56.1 20.00 0.0286 27.96000 57.40000 20.840 5.720000e-03
# 9: 12/08/2012 28.6 56.8 19.40 0.0000 28.18000 56.18000 20.240 5.720000e-03
# 10: 19/08/2012 29.0 63.0 18.10 0.0000 28.48000 58.76000 19.240 5.720000e-03
# 11: 26/08/2012 27.4 62.9 18.00 0.2290 28.56000 60.06000 19.040 5.152000e-02
# 12: 02/09/2012 16.9 32.9 16.90 0.0000 26.20000 54.34000 18.480 5.152000e-02
# 13: 09/09/2012 25.0 62.1 16.70 0.0000 25.38000 55.54000 17.820 4.580000e-02
# 14: 16/09/2012 25.4 62.6 14.50 0.0000 24.74000 56.70000 16.840 4.580000e-02
# 15: 30/09/2012 22.3 65.6 15.20 0.0000 23.40000 57.22000 16.260 4.580000e-02
# 16: 07/10/2012 22.4 71.4 13.90 0.0000 22.40000 58.92000 15.440 -5.551115e-18
# 17: 14/10/2012 19.3 67.2 11.30 0.2570 22.88000 65.78000 14.320 5.140000e-02
# 18: 21/10/2012 18.7 74.5 9.70 3.4300 21.62000 68.26000 12.920 7.374000e-01
# 19: 28/10/2012 16.4 75.5 8.35 3.2000 19.82000 70.84000 11.690 1.377400e+00
# 20: 04/11/2012 17.8 79.7 7.17 7.0000 18.92000 73.66000 10.084 2.777400e+00
Upvotes: 0
Reputation: 39605
I would suggest using rollmean()
from zoo
and a tidyverse approach with across()
from dplyr
:
library(tidyverse)
library(zoo)
#Data
df <- structure(list(week = c("25/03/2012", "17/06/2012", "24/06/2012",
"08/07/2012", "15/07/2012", "22/07/2012", "29/07/2012", "05/08/2012",
"12/08/2012", "19/08/2012", "26/08/2012", "02/09/2012", "09/09/2012",
"16/09/2012", "30/09/2012", "07/10/2012", "14/10/2012", "21/10/2012",
"28/10/2012", "04/11/2012"), temperature = c(15.9, 25.9, 27.4,
27.5, 27.5, 27, 28.7, 29.1, 28.6, 29, 27.4, 16.9, 25, 25.4, 22.3,
22.4, 19.3, 18.7, 16.4, 17.8), humidity = c(54.1, 65.6, 61.7,
62.9, 50.1, 56.4, 61.5, 56.1, 56.8, 63, 62.9, 32.9, 62.1, 62.6,
65.6, 71.4, 67.2, 74.5, 75.5, 79.7), radiation = c(20.4, 22.2,
21.3, 22.4, 23.1, 19, 19.7, 20, 19.4, 18.1, 18, 16.9, 16.7, 14.5,
15.2, 13.9, 11.3, 9.7, 8.35, 7.17), evapotranspiration = c(0,
0.486, 0, 0, 0, 0, 0, 0.0286, 0, 0, 0.229, 0, 0, 0, 0, 0, 0.257,
3.43, 3.2, 7)), class = "data.frame", row.names = c(NA, -20L))
The code:
df %>%
mutate(across(c(temperature:evapotranspiration),
.fns = list(avg = ~ rollmean(.,k=5,fill=NA,align = 'right')))) -> df2
The output:
week temperature humidity radiation evapotranspiration temperature_avg humidity_avg radiation_avg
1 25/03/2012 15.9 54.1 20.40 0.0000 NA NA NA
2 17/06/2012 25.9 65.6 22.20 0.4860 NA NA NA
3 24/06/2012 27.4 61.7 21.30 0.0000 NA NA NA
4 08/07/2012 27.5 62.9 22.40 0.0000 NA NA NA
5 15/07/2012 27.5 50.1 23.10 0.0000 24.84 58.88 21.880
6 22/07/2012 27.0 56.4 19.00 0.0000 27.06 59.34 21.600
7 29/07/2012 28.7 61.5 19.70 0.0000 27.62 58.52 21.100
8 05/08/2012 29.1 56.1 20.00 0.0286 27.96 57.40 20.840
9 12/08/2012 28.6 56.8 19.40 0.0000 28.18 56.18 20.240
10 19/08/2012 29.0 63.0 18.10 0.0000 28.48 58.76 19.240
11 26/08/2012 27.4 62.9 18.00 0.2290 28.56 60.06 19.040
12 02/09/2012 16.9 32.9 16.90 0.0000 26.20 54.34 18.480
13 09/09/2012 25.0 62.1 16.70 0.0000 25.38 55.54 17.820
14 16/09/2012 25.4 62.6 14.50 0.0000 24.74 56.70 16.840
15 30/09/2012 22.3 65.6 15.20 0.0000 23.40 57.22 16.260
16 07/10/2012 22.4 71.4 13.90 0.0000 22.40 58.92 15.440
17 14/10/2012 19.3 67.2 11.30 0.2570 22.88 65.78 14.320
18 21/10/2012 18.7 74.5 9.70 3.4300 21.62 68.26 12.920
19 28/10/2012 16.4 75.5 8.35 3.2000 19.82 70.84 11.690
20 04/11/2012 17.8 79.7 7.17 7.0000 18.92 73.66 10.084
evapotranspiration_avg
1 NA
2 NA
3 NA
4 NA
5 0.09720
6 0.09720
7 0.00000
8 0.00572
9 0.00572
10 0.00572
11 0.05152
12 0.05152
13 0.04580
14 0.04580
15 0.04580
16 0.00000
17 0.05140
18 0.73740
19 1.37740
20 2.77740
Upvotes: 3