Vedda
Vedda

Reputation: 7445

How to optimize these for loops and function

Problem

I'm building some weather data and need to check and make sure that there are no outliers, values equal to -9999, and no missing days. If any of these conditions are found, I've written a function nearest() which will find the 5 closest stations and compute an inverse distance weighted value, then plug that back into where the condition was found. The problem is that the code works, but it will take a very long time to run. I have over 600 stations and each station takes about 1 hour to compute.

Question

Can this code be optimized to improve computation time? What is the best way to deal with nested for() loops being used this way?

Code

The following code is a very small portion of the data set used as a reproducible example. This obviously runs very fast, but when spread out over the entire data set will take a long time. Notice that in output, row 10 has an NA in the value. When the code is run, that value is replaced.

dput:

db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4, 
    element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id", 
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")

output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX", 
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900, 
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1, 
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
1), date = structure(c(-25567, -25567, -25536, -25536, -25508, 
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02, 
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
-10L), class = c("tbl_df", "data.frame"))

newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = structure(c(1L, 2L, 
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"), 
    year = c("1900", "1900", "1900", "1900", "1900", "1900", 
    "1900", "1900", "1900", "1900"), month = c("01", "01", "02", 
    "02", "03", "04", "04", "05", "05", "01"), day = c("01", 
    "01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567, 
    -25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447, 
    -25566), class = "Date"), value = c(30.02, 10.94, 37.94, 
    10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
10L), class = "data.frame")

stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466", 
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481), 
    y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06, 
    44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))

station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858, 
    value = 30.02), row.names = c(NA, -1L), class = c("tbl_df", 
"data.frame"), .Names = c("id", "lat", "long", "value"))

nearest() function:

nearest <- function(id, yr, mnt, dy, ele, out, stack, station){

  if (dim(stack)[1] >= 1){
    ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3])) 
  } else {
    ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
    v <- mean(ret$value) 
  } 
  return(v)
}

for() loops:

library(dplyr)
library(phylin)
library(lubridate)

for (i in unique(db_sid$id)){

  # Check for outliers
  for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
    output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
                           ele = as.character(output[j,2]), out = output)
  }

  # Check for NA and replace
  for (k in which(is.na(newdat$value))){
   newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
                           ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
  }

}

Upvotes: 0

Views: 144

Answers (1)

bramtayl
bramtayl

Reputation: 4024

I'm not sure I understand at all what you're trying to do. For example, the i from the outer for loop is never actually used. Here is some code that I think will be useful to you:

library(plyr)
library(dplyr)

output_summary = 
  output %>%
  filter(value %>% between(-80, 134) ) %>%
  group_by(date, element, id) %>%
  summarize(mean_value = mean(value))

if (nrow(stack) == 1) fill_value = stack$value else
  fill_value = idw(
    stack$value,
    stack %>% select(x, y, value),
    station %>% select(lat, long) )

newdat_filled = 
  newdat %>%
  mutate(filled_value = 
           value %>% 
           mapvalues(NA, fill_value) )

Upvotes: 1

Related Questions