Reputation: 123
I have two dataframes. df1 lists the locations that belong to each unit (a unit contains multiple locations, and a location can belong to more than one unit). df2 contains daily observations of maximum temperature (df2$tmax) for all locations. For each unit in df1, I would like to compute, for every date, the average daily maximum temperature across all locations within that unit.
The following code generates examples of each dataframe. I will need to scale this up to about 240 units and 8 years of daily data.
These kinds of lookup/matching exercises in R always seem to get me. There must be an obvious way to do this, but I'm stymied at the moment, short of some real brute-force joining etc.
# Example lookup table: one row per (unit, location) pair. A location may be
# listed under more than one unit (e.g. USW00024234 is in 98008 and 98146).
df1 <- data.frame(
  unitID = rep(
    c("98008", "98065", "98146", "98584"),
    times = c(4L, 4L, 4L, 3L)
  ),
  locationID = c(
    # unit 98008
    "USW00094290", "USW00094248", "USW00024234", "USC00454169",
    # unit 98065
    "USC00458508", "USS0021B60S", "USR0000WFTA", "USC00451233",
    # unit 98146
    "USW00024234", "USW00024233", "USW00094248", "USC00454169",
    # unit 98584
    "USW00094227", "USC00451939", "USC00455086"
  ),
  stringsAsFactors = FALSE
)
df1
unitID locationID
1 98008 USW00094290
2 98008 USW00094248
3 98008 USW00024234
4 98008 USC00454169
5 98065 USC00458508
6 98065 USS0021B60S
7 98065 USR0000WFTA
8 98065 USC00451233
9 98146 USW00024234
10 98146 USW00024233
11 98146 USW00094248
12 98146 USC00454169
13 98584 USW00094227
14 98584 USC00451939
15 98584 USC00455086
# Example observations: two consecutive days of maximum temperature (tmax)
# for each of 12 locations, stored in long format (one row per location/day).
df2 <- data.frame(
  id = rep(
    c(
      "USW00094290", "USW00094248", "USW00024234", "USC00454169",
      "USC00458508", "USS0021B60S", "USR0000WFTA", "USC00451233",
      "USW00024233", "USW00094227", "USC00451939", "USC00455086"
    ),
    each = 2L
  ),
  date = rep(as.Date(c("2017-01-01", "2017-01-02")), times = 12L),
  tmax = c(
    28, 28, 28, 28, 33, 28, 33, 28,
    -11, -28, -17, -50, 11, -17, 0, -11,
    28, 11, 44, 33, 50, 39, 39, 28
  ),
  stringsAsFactors = FALSE
)
# Keep the tibble class vector carried by the original object.
class(df2) <- c("tbl_df", "tbl", "data.frame")
df2
# A tibble: 24 x 3
id date tmax
<chr> <date> <dbl>
1 USW00094290 2017-01-01 28
2 USW00094290 2017-01-02 28
3 USW00094248 2017-01-01 28
4 USW00094248 2017-01-02 28
5 USW00024234 2017-01-01 33
6 USW00024234 2017-01-02 28
7 USC00454169 2017-01-01 33
8 USC00454169 2017-01-02 28
9 USC00458508 2017-01-01 -11
10 USC00458508 2017-01-02 -28
# ... with 14 more rows
The output should include the unitID, date, and average max. Temp.
unitID date avg_temp
98008 2017-01-01 30.5
98008 2017-01-02 ...
98008 2017-01-03 ...
Upvotes: 1
Views: 46
Reputation: 887098
We can use a `data.table` join
library(data.table)

# Average daily maximum temperature per unit and date.
#
# setDT() converts df1 and df2 to data.tables by reference (no copy).
# The join attaches every unit a station belongs to onto each of that
# station's daily observations:
#   * nomatch = NULL drops observations from stations that belong to no
#     unit, so they do not form an NA unit group.
#   * allow.cartesian = TRUE is needed because a station can belong to
#     several units (many-to-many); without it the join can abort once the
#     result exceeds nrow(df1) + nrow(df2) rows, which is likely on the
#     full data set.
# Grouping by (unitID, date) -- not (unitID, locationID) -- averages across
# the locations of a unit for each day; keyby also sorts the result.
setDT(df1)[setDT(df2), on = .(locationID = id), nomatch = NULL,
           allow.cartesian = TRUE][
  , .(avg_temp = mean(tmax, na.rm = TRUE)), keyby = .(unitID, date)]
# Result for the example data:
#    unitID       date  avg_temp
# 1:  98008 2017-01-01  30.50000
# 2:  98008 2017-01-02  28.00000
# 3:  98065 2017-01-01  -4.25000
# 4:  98065 2017-01-02 -26.50000
# 5:  98146 2017-01-01  30.50000
# 6:  98146 2017-01-02  23.75000
# 7:  98584 2017-01-01  44.33333
# 8:  98584 2017-01-02  33.33333
Upvotes: 0
Reputation: 388982
We could use `left_join`, `group_by` `unitID` and `locationID`, and take the `mean` of `tmax`.
library(dplyr)

# Average daily maximum temperature per unit and date.
#
# inner_join() pairs each (unit, location) row with that location's daily
# observations. Locations without any observation are dropped, so they
# cannot create a spurious group with an NA date.
# The join is intentionally many-to-many (a location can belong to several
# units and has many dates); on dplyr >= 1.1.0 add
# `relationship = "many-to-many"` to silence the corresponding warning.
# Grouping by (unitID, date) -- not (unitID, locationID) -- averages across
# the locations of a unit for each day.
df1 %>%
  inner_join(df2, by = c("locationID" = "id")) %>%
  group_by(unitID, date) %>%
  summarise(avg_temp = mean(tmax, na.rm = TRUE), .groups = "drop")
# Result for the example data:
#   unitID date       avg_temp
#   <chr>  <date>        <dbl>
# 1 98008  2017-01-01    30.5
# 2 98008  2017-01-02    28
# 3 98065  2017-01-01    -4.25
# 4 98065  2017-01-02   -26.5
# 5 98146  2017-01-01    30.5
# 6 98146  2017-01-02    23.8
# 7 98584  2017-01-01    44.3
# 8 98584  2017-01-02    33.3
In base R, we can use `merge` and `aggregate`
# Average daily maximum temperature per unit and date, in base R.
#
# merge() pairs each (unit, location) row of df1 with that location's daily
# observations in df2 (inner join: locations without observations drop out).
unit_obs <- merge(df1, df2, by.x = "locationID", by.y = "id")

# Group by (unitID, date) -- not (unitID, locationID) -- so the mean is taken
# across the locations of a unit for each day.
avg_by_unit <- aggregate(tmax ~ unitID + date, data = unit_obs,
                         FUN = mean, na.rm = TRUE)
names(avg_by_unit)[names(avg_by_unit) == "tmax"] <- "avg_temp"

# aggregate() varies the first grouping column fastest; sort by unit, then
# date, for readability. For the example data, unit 98008 on 2017-01-01
# yields avg_temp = 30.5.
avg_by_unit[order(avg_by_unit$unitID, avg_by_unit$date), ]
Upvotes: 1