Reputation: 7445
I'm building some weather data and need to check and make sure that there are no outliers, values equal to -9999, and no missing days. If any of these conditions are found, I've written a function nearest()
which will find the 5 closest stations and compute an inverse distance weighted value, then plug that back into where the condition was found. The problem is that the code works, but it will take a very long time to run. I have over 600 stations and each station takes about 1 hour to compute.
Can this code be optimized to improve computation time? What is the best way to deal with nested for()
loops being used this way?
The following code is a very small portion of the data set used as a reproducible example. This obviously runs very fast, but when spread out over the entire data set will take a long time. Notice that in output, row 10 has an NA in the value. When the code is run, that value is replaced.
dput:
db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4,
element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id",
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")
output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900,
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1), date = structure(c(-25567, -25567, -25536, -25536, -25508,
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02,
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
-10L), class = c("tbl_df", "data.frame"))
newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = structure(c(1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"),
year = c("1900", "1900", "1900", "1900", "1900", "1900",
"1900", "1900", "1900", "1900"), month = c("01", "01", "02",
"02", "03", "04", "04", "05", "05", "01"), day = c("01",
"01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567,
-25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447,
-25566), class = "Date"), value = c(30.02, 10.94, 37.94,
10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
10L), class = "data.frame")
stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466",
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481),
y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06,
44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))
station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858,
value = 30.02), row.names = c(NA, -1L), class = c("tbl_df",
"data.frame"), .Names = c("id", "lat", "long", "value"))
nearest()
function:
nearest <- function(id, yr, mnt, dy, ele, out, stack, station){
if (dim(stack)[1] >= 1){
ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3]))
} else {
ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
v <- mean(ret$value)
}
return(v)
}
for()
loops:
library(dplyr)
library(phylin)
library(lubridate)
for (i in unique(db_sid$id)){
# Check for outliers
for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
ele = as.character(output[j,2]), out = output)
}
# Check for NA and replace
for (k in which(is.na(newdat$value))){
newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
}
}
Upvotes: 0
Views: 144
Reputation: 4024
I'm not sure I understand at all what you're trying to do. For example, the i from the outer for loop is never actually used. Here is some code that I think will be useful to you:
library(plyr)
library(dplyr)
output_summary =
output %>%
filter(value %>% between(-80, 134) ) %>%
group_by(date, element, id) %>%
summarize(mean_value = mean(value))
if (nrow(stack) == 1) fill_value = stack$value else
fill_value = idw(
stack$value,
stack %>% select(x, y, value),
station %>% select(lat, long) )
newdat_filled =
newdat %>%
mutate(filled_value =
value %>%
mapvalues(NA, fill_value) )
Upvotes: 1