How to optimize these for loops and function

Question

Problem

I'm building some weather data and need to check and make sure that there are no outliers, values equal to -9999, and no missing days. If any of these conditions are found, I've written a function nearest() which will find the 5 closest stations and compute an inverse distance weighted value, then plug that back into where the condition was found. The problem is that the code works, but it will take a very long time to run. I have over 600 stations and each station takes about 1 hour to compute.

Question

Can this code be optimized to improve computation time? What is the best way to deal with nested for() loops being used this way?

Code

The following code is a very small portion of the data set used as a reproducible example. This obviously runs very fast, but when spread out over the entire data set will take a long time. Notice that in output, row 10 has an NA in the value. When the code is run, that value is replaced.

dput:

db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4, 
    element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id", 
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")

output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX", 
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900, 
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1, 
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
1), date = structure(c(-25567, -25567, -25536, -25536, -25508, 
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02, 
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
-10L), class = c("tbl_df", "data.frame"))

newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = structure(c(1L, 2L, 
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"), 
    year = c("1900", "1900", "1900", "1900", "1900", "1900", 
    "1900", "1900", "1900", "1900"), month = c("01", "01", "02", 
    "02", "03", "04", "04", "05", "05", "01"), day = c("01", 
    "01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567, 
    -25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447, 
    -25566), class = "Date"), value = c(30.02, 10.94, 37.94, 
    10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
10L), class = "data.frame")

stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466", 
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481), 
    y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06, 
    44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))

station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858, 
    value = 30.02), row.names = c(NA, -1L), class = c("tbl_df", 
"data.frame"), .Names = c("id", "lat", "long", "value"))

nearest() function:

nearest <- function(id, yr, mnt, dy, ele, out, stack, station){

  if (dim(stack)[1] >= 1){
    ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3])) 
  } else {
    ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
    v <- mean(ret$value) 
  } 
  return(v)
}

for() loops:

library(dplyr)
library(phylin)
library(lubridate)

for (i in unique(db_sid$id)){

  # Check for outliers
  for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
    output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
                           ele = as.character(output[j,2]), out = output)
  }

  # Check for NA and replace
  for (k in which(is.na(newdat$value))){
   newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
                           ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
  }

}

How to optimize these for loops and function

Problem

Question

Code

Answers (1)

Related Questions