Reputation: 1797
I have a dataframe with timeseries observations. i would like for each observation to add a variable with the value at the closest similar date in the previous year and the closest similar date in the next year (e.g. for a value of 15 May 2014, this might be 13 May 2013 and 21 May 2015). is there a smart way, e.g. using dplyr, to do this? please find example code below (most code focused on creating a random set of dates and value, with thanks to an earlier SO question). Many thanks in advance.
date value nearest_val_nextyear nearest_val_prevyear
1 2009-02-14 6.511781 0 0
2 2009-12-23 5.389843 0 0
3 2011-08-01 4.378759 0 0
4 2014-04-07 2.785300 0 0
5 2008-08-12 6.124931 0 0
6 2014-03-10 4.955066 0 0
7 2014-07-23 4.983810 0 0
8 2012-04-14 5.943836 0 0
9 2012-01-13 5.821221 0 0
10 2007-06-30 5.593901 0 0
11 2008-08-24 5.918977 0 0
12 2008-05-30 5.782136 0 0
13 2012-06-30 5.074565 0 0
14 2010-01-27 3.010648 0 0
15 2013-02-27 5.619826 0 0
16 2010-12-25 4.943871 0 0
17 2012-09-27 4.844204 0 0
18 2014-12-08 3.529248 0 0
19 2010-01-15 4.521850 0 0
20 2013-03-21 5.417942 0 0
# set start and end dates to sample between
day.start <- "2007/01/01"
day.end <- "2014/12/31"
set.seed(1)
# define a random date/time selection function
rand.day.time <- function(day.start,day.end,size) {
dayseq <- seq.Date(as.Date(day.start),as.Date(day.end),by="day")
dayselect <- sample(dayseq,size,replace=TRUE)
as.POSIXlt(paste(dayselect) )
}
dateval=rand.day.time(day.start,day.end,size=20)
value=rnorm(n=20,mean=5,sd=1)
df=data.frame(date=dateval,value=value)
df$nearest_val_nextyear=0
df$nearest_val_prevyear=0
df
Upvotes: 1
Views: 2580
Reputation: 456
Molx's answer is shorter and better than mine, but since I have already written it out, and just in case you wanted an answer that a) makes heavier use of functions, and b) is written in base R, here is mine.
Load in the data:
dates = read.table("date_data.txt")
This is a function to find the difference in days in terms of months and dates only; which you want so that different years won't be compared in terms of their closeness.
#get differences in terms of the months and dates only
compare_dates_days <- function(date1, date2, date_format = "%Y-%m-%d"){
#give them all "blank" years of "00"
month_day_only1 = paste("00", strsplit(date1, "-")[[1]][2], strsplit(date1, "-")[[1]][3], sep = "-")
month_day_only2 = paste("00", strsplit(date2, "-")[[1]][2], strsplit(date2, "-")[[1]][3], sep = "-")
day_difference = as.numeric(as.Date(as.character(month_day_only1, format = "%m-%d")) -
as.Date(as.character(month_day_only2, format = "%m-%d")))
return(day_difference)
}
#testing the above function
a = "2009-02-14"
b = "2009-02-28"
diff = compare_dates_days(a, b)
A function to find the minimum of the absolute values of non-zero values in a vector.
min_abs_index <- function(v){
v.na = abs(v)
v.na[v==0] = NA
return(c( which.min(v.na) ))
}
Here is the function which compares one date to a vector of dates and spits out the index of that date's closest day of the above and below years; it uses the above functions.
above_below_year_date <- function(date, date_ref_compare, date_format = "%Y-%m-%d"){
one_year_ahead_diffs = rep(0, length(date_ref_compare))
one_year_behind_diffs = rep(0, length(date_ref_compare))
date_diffs = unlist(lapply(seq_along(1:length(date_ref_compare)),
function(i) compare_dates_days(date_ref_compare[i],date )))
for(i in 1:length(date_ref_compare)){
#calendar year ahead
if(as.numeric(sapply(strsplit(date, "-"),"[[", 1)) -
as.numeric(sapply(strsplit(date_ref_compare[i],
"-"),"[[", 1)) == 1){
one_year_ahead_diffs[i] = date_diffs[i]
}
#calendar year behind
if(as.numeric(sapply(strsplit(date, "-"),"[[", 1)) -
as.numeric(sapply(strsplit(date_ref_compare[i],
"-"),"[[", 1)) == -1){
one_year_behind_diffs[i] = date_diffs[i]
}
}
res_ahead = min_abs_index(one_year_ahead_diffs)
print(res_ahead)
print(one_year_ahead_diffs[res_ahead])
print(one_year_ahead_diffs)
res_behind = min_abs_index(one_year_behind_diffs)
return(c(res_ahead, res_behind))
}
We apply the above function to each of the dates in the vector provided:
vector_of_ahead_indices = rep(0, length(dates$date))
vector_of_behind_indices = rep(0, length(dates$date))
for(i in 1:length(dates$date)){
res = above_below_year_date(dates$date[i], dates$date)
vector_of_ahead_indices[i] = res[1]
vector_of_behind_indices[i] = res[2]
}
dates$nearest_val_nextyear = dates$value[vector_of_behind_indices]
dates$nearest_val_prevyear = dates$value[vector_of_ahead_indices]
Then we order to make it easier to manually check, and reorder the first year which had the NA value in the wrong column.
#order to make it easier to manually check
dates = dates[order(dates$date), ]
#reorder the first year
dates[1, "nearest_val_nextyear"] = dates[1, "nearest_val_prevyear"]
dates[1, "nearest_val_prevyear"] = NA
Finally here is the sorted output along with your original row names:
date value nearest_val_nextyear nearest_val_prevyear
10 2007-06-30 5.593901 5.782136 NA
12 2008-05-30 5.782136 6.511781 5.593901
5 2008-08-12 6.124931 5.389843 5.593901
11 2008-08-24 5.918977 5.389843 5.593901
1 2009-02-14 6.511781 3.010648 5.782136
2 2009-12-23 5.389843 4.943871 5.918977
19 2010-01-15 4.521850 4.378759 6.511781
14 2010-01-27 3.010648 4.378759 6.511781
16 2010-12-25 4.943871 4.378759 5.389843
3 2011-08-01 4.378759 5.074565 4.943871
9 2012-01-13 5.821221 5.619826 4.378759
8 2012-04-14 5.943836 5.417942 4.378759
13 2012-06-30 5.074565 5.417942 4.378759
17 2012-09-27 4.844204 5.417942 4.378759
15 2013-02-27 5.619826 4.955066 5.821221
20 2013-03-21 5.417942 4.955066 5.943836
6 2014-03-10 4.955066 NA 5.417942
4 2014-04-07 2.785300 NA 5.417942
7 2014-07-23 4.983810 NA 5.417942
18 2014-12-08 3.529248 NA 5.417942
Upvotes: 0
Reputation: 6931
This is definitely not a smart way of doing this, but I'm posting with the hopes that someone, maybe you, can make it smart/pretty.
library(dplyr)
library(lubridate)
dat <- data.frame(dateval, value)
dat <- dat %>% mutate(year = year(dateval), nv_next = NA, nv_prev = NA)
#You don't really need dplyr just for this...
shifts <- c(1, -1) #nextyear, prevyear
for (s in 1:2) { #Once for each shift
for (i in 1:nrow(dat)) {
otheryear <- dat[dat[,"year"]==dat[i,"year"]+shifts[s],] #Subset the df with only dates of other year
if (nrow(otheryear) == 0) { #Ends if there's no other year
dat[i,3+s] <- NA
} else {
cands <- otheryear$dateval #Candidates to have their value chosen
cands_shifted <- cands
year(cands_shifted) <- dat[i,"year"] #Change the year in cand's copy
nearest_date <- which.min(abs(difftime(dat[i,"dateval"], cands_shifted))) #After the years are the same, the closest date can be calculated with difftime
dat[i,3+s] <- dat[dat$dateval == cands[nearest_date],"value"] #We check back on cands what real date that was, and assign it's value
}
}
}
This resulted in
> dat
dateval value year nv_next nv_prev
1 2009-02-14 6.511781 2009 3.010648 5.782136
2 2009-12-23 5.389843 2009 4.943871 5.918977
3 2011-08-01 4.378759 2011 5.074565 4.943871
4 2014-04-07 2.785300 2014 NA 5.417942
5 2008-08-12 6.124931 2008 5.389843 5.593901
6 2014-03-10 4.955066 2014 NA 5.619826
7 2014-07-23 4.983810 2014 NA 5.417942
8 2012-04-14 5.943836 2012 5.417942 4.378759
9 2012-01-13 5.821221 2012 5.619826 4.378759
10 2007-06-30 5.593901 2007 5.782136 NA
11 2008-08-24 5.918977 2008 5.389843 5.593901
12 2008-05-30 5.782136 2008 6.511781 5.593901
13 2012-06-30 5.074565 2012 5.417942 4.378759
14 2010-01-27 3.010648 2010 4.378759 6.511781
15 2013-02-27 5.619826 2013 4.955066 5.821221
16 2010-12-25 4.943871 2010 4.378759 5.389843
17 2012-09-27 4.844204 2012 5.417942 4.378759
18 2014-12-08 3.529248 2014 NA 5.417942
19 2010-01-15 4.521850 2010 4.378759 6.511781
20 2013-03-21 5.417942 2013 4.955066 5.943836
I nested the for loops instead of using a copy for each shift, but you must be careful with the nv_next
and nv_prev
since they were selected by index and not name.
Upvotes: 1