user1885116
user1885116

Reputation: 1797

Find value in previous and next year

I have a dataframe with timeseries observations. i would like for each observation to add a variable with the value at the closest similar date in the previous year and the closest similar date in the next year (e.g. for a value of 15 May 2014, this might be 13 May 2013 and 21 May 2015). is there a smart way, e.g. using dplyr, to do this? please find example code below (most code focused on creating a random set of dates and value, with thanks to an earlier SO question). Many thanks in advance.

       date    value nearest_val_nextyear nearest_val_prevyear
1  2009-02-14 6.511781                    0                    0
2  2009-12-23 5.389843                    0                    0
3  2011-08-01 4.378759                    0                    0
4  2014-04-07 2.785300                    0                    0
5  2008-08-12 6.124931                    0                    0
6  2014-03-10 4.955066                    0                    0
7  2014-07-23 4.983810                    0                    0
8  2012-04-14 5.943836                    0                    0
9  2012-01-13 5.821221                    0                    0
10 2007-06-30 5.593901                    0                    0
11 2008-08-24 5.918977                    0                    0
12 2008-05-30 5.782136                    0                    0
13 2012-06-30 5.074565                    0                    0
14 2010-01-27 3.010648                    0                    0
15 2013-02-27 5.619826                    0                    0
16 2010-12-25 4.943871                    0                    0
17 2012-09-27 4.844204                    0                    0
18 2014-12-08 3.529248                    0                    0
19 2010-01-15 4.521850                    0                    0
20 2013-03-21 5.417942                    0                    0



# set start and end dates to sample between
day.start <- "2007/01/01"
day.end <- "2014/12/31"

set.seed(1)
# define a random date/time selection function
rand.day.time <- function(day.start,day.end,size) {
  dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day")
  dayselect <- sample(dayseq,size,replace=TRUE)
  as.POSIXlt(paste(dayselect) )
}

dateval=rand.day.time(day.start,day.end,size=20)
value=rnorm(n=20,mean=5,sd=1)
df=data.frame(date=dateval,value=value)
df$nearest_val_nextyear=0
df$nearest_val_prevyear=0
df

Upvotes: 1

Views: 2580

Answers (2)

Andy McKenzie
Andy McKenzie

Reputation: 456

Molx's answer is shorter and better than mine, but since I have already written it out, and just in case you wanted an answer that a) makes heavier use of functions, and b) is written in base R, here is mine.

Load in the data:

dates = read.table("date_data.txt")

This is a function to find the difference in days in terms of months and dates only; which you want so that different years won't be compared in terms of their closeness.

#get differences in terms of the months and dates only 
compare_dates_days <- function(date1, date2, date_format = "%Y-%m-%d"){

    #give them all "blank" years of "00"
    month_day_only1 = paste("00", strsplit(date1,  "-")[[1]][2], strsplit(date1,  "-")[[1]][3], sep = "-")
    month_day_only2 = paste("00", strsplit(date2,  "-")[[1]][2], strsplit(date2,  "-")[[1]][3], sep = "-")

    day_difference = as.numeric(as.Date(as.character(month_day_only1, format = "%m-%d")) -
        as.Date(as.character(month_day_only2, format = "%m-%d")))

    return(day_difference)

}

#testing the above function 
a = "2009-02-14"
b = "2009-02-28"
diff = compare_dates_days(a, b)

A function to find the minimum of the absolute values of non-zero values in a vector.

min_abs_index <- function(v){

  v.na = abs(v)
  v.na[v==0] = NA
  return(c( which.min(v.na) ))

}

Here is the function which compares one date to a vector of dates and spits out the index of that date's closest day of the above and below years; it uses the above functions.

above_below_year_date <- function(date, date_ref_compare, date_format = "%Y-%m-%d"){

    one_year_ahead_diffs = rep(0, length(date_ref_compare))
    one_year_behind_diffs = rep(0, length(date_ref_compare))

    date_diffs = unlist(lapply(seq_along(1:length(date_ref_compare)),
        function(i) compare_dates_days(date_ref_compare[i],date )))

    for(i in 1:length(date_ref_compare)){
        #calendar year ahead
        if(as.numeric(sapply(strsplit(date, "-"),"[[", 1)) - 
            as.numeric(sapply(strsplit(date_ref_compare[i], 
            "-"),"[[", 1)) == 1){
            one_year_ahead_diffs[i] = date_diffs[i]
        }
        #calendar year behind
        if(as.numeric(sapply(strsplit(date, "-"),"[[", 1)) - 
            as.numeric(sapply(strsplit(date_ref_compare[i], 
            "-"),"[[", 1)) == -1){
            one_year_behind_diffs[i] = date_diffs[i]
        }
    }

    res_ahead = min_abs_index(one_year_ahead_diffs)
    print(res_ahead)
    print(one_year_ahead_diffs[res_ahead])

    print(one_year_ahead_diffs)

    res_behind = min_abs_index(one_year_behind_diffs)

    return(c(res_ahead, res_behind))

}

We apply the above function to each of the dates in the vector provided:

vector_of_ahead_indices = rep(0, length(dates$date))
vector_of_behind_indices = rep(0, length(dates$date))

for(i in 1:length(dates$date)){
    res = above_below_year_date(dates$date[i], dates$date)
    vector_of_ahead_indices[i] = res[1]
    vector_of_behind_indices[i] = res[2]
}

dates$nearest_val_nextyear = dates$value[vector_of_behind_indices]
dates$nearest_val_prevyear = dates$value[vector_of_ahead_indices]

Then we order to make it easier to manually check, and reorder the first year which had the NA value in the wrong column.

#order to make it easier to manually check 
dates = dates[order(dates$date), ] 

#reorder the first year 
dates[1, "nearest_val_nextyear"] = dates[1, "nearest_val_prevyear"]
dates[1, "nearest_val_prevyear"] = NA

Finally here is the sorted output along with your original row names:

         date    value nearest_val_nextyear nearest_val_prevyear
10 2007-06-30 5.593901             5.782136                   NA
12 2008-05-30 5.782136             6.511781             5.593901
5  2008-08-12 6.124931             5.389843             5.593901
11 2008-08-24 5.918977             5.389843             5.593901
1  2009-02-14 6.511781             3.010648             5.782136
2  2009-12-23 5.389843             4.943871             5.918977
19 2010-01-15 4.521850             4.378759             6.511781
14 2010-01-27 3.010648             4.378759             6.511781
16 2010-12-25 4.943871             4.378759             5.389843
3  2011-08-01 4.378759             5.074565             4.943871
9  2012-01-13 5.821221             5.619826             4.378759
8  2012-04-14 5.943836             5.417942             4.378759
13 2012-06-30 5.074565             5.417942             4.378759
17 2012-09-27 4.844204             5.417942             4.378759
15 2013-02-27 5.619826             4.955066             5.821221
20 2013-03-21 5.417942             4.955066             5.943836
6  2014-03-10 4.955066                   NA             5.417942
4  2014-04-07 2.785300                   NA             5.417942
7  2014-07-23 4.983810                   NA             5.417942
18 2014-12-08 3.529248                   NA             5.417942

Upvotes: 0

Molx
Molx

Reputation: 6931

This is definitely not a smart way of doing this, but I'm posting with the hopes that someone, maybe you, can make it smart/pretty.

library(dplyr)
library(lubridate)
dat <- data.frame(dateval, value)
dat <- dat %>% mutate(year = year(dateval), nv_next = NA, nv_prev = NA)
#You don't really need dplyr just for this...

shifts <- c(1, -1) #nextyear, prevyear

for (s in 1:2) { #Once for each shift
  for (i in 1:nrow(dat)) {
    otheryear <- dat[dat[,"year"]==dat[i,"year"]+shifts[s],] #Subset the df with only dates of other year
    if (nrow(otheryear) == 0) { #Ends if there's no other year
      dat[i,3+s] <- NA
    } else {
      cands <- otheryear$dateval #Candidates to have their value chosen
      cands_shifted <- cands
      year(cands_shifted) <- dat[i,"year"] #Change the year in cand's copy
      nearest_date <- which.min(abs(difftime(dat[i,"dateval"], cands_shifted))) #After the years are the same, the closest date can be calculated with difftime
      dat[i,3+s] <- dat[dat$dateval == cands[nearest_date],"value"] #We check back on cands what real date that was, and assign it's value
    }   
  }
}

This resulted in

> dat
      dateval    value year  nv_next  nv_prev
1  2009-02-14 6.511781 2009 3.010648 5.782136
2  2009-12-23 5.389843 2009 4.943871 5.918977
3  2011-08-01 4.378759 2011 5.074565 4.943871
4  2014-04-07 2.785300 2014       NA 5.417942
5  2008-08-12 6.124931 2008 5.389843 5.593901
6  2014-03-10 4.955066 2014       NA 5.619826
7  2014-07-23 4.983810 2014       NA 5.417942
8  2012-04-14 5.943836 2012 5.417942 4.378759
9  2012-01-13 5.821221 2012 5.619826 4.378759
10 2007-06-30 5.593901 2007 5.782136       NA
11 2008-08-24 5.918977 2008 5.389843 5.593901
12 2008-05-30 5.782136 2008 6.511781 5.593901
13 2012-06-30 5.074565 2012 5.417942 4.378759
14 2010-01-27 3.010648 2010 4.378759 6.511781
15 2013-02-27 5.619826 2013 4.955066 5.821221
16 2010-12-25 4.943871 2010 4.378759 5.389843
17 2012-09-27 4.844204 2012 5.417942 4.378759
18 2014-12-08 3.529248 2014       NA 5.417942
19 2010-01-15 4.521850 2010 4.378759 6.511781
20 2013-03-21 5.417942 2013 4.955066 5.943836

I nested the for loops instead of using a copy for each shift, but you must be careful with the nv_next and nv_prev since they were selected by index and not name.

Upvotes: 1

Related Questions