Reputation: 304
I have a dataframe with a time series that looks like this:
df<-structure(list(date = structure(c(-6905, -6891, -6853, -6588,
-6588, -6586, -6523, -6515, -5856, -5753), class = "Date"), flow = c(2.22,
2.56, 3.3, 1.38, 4, 1.4, 1.32, 1.26, 6, 35.69)), .Names = c("date",
"flow"), row.names = c(NA, 10L), class = "data.frame")
I want to remove all the lines that are not the maximum within 2 days forward or backward of its date. So in the case above, lines 4 and 6 will be removed. I couldn't find similar answered questions.
I wrote this code that doesn't work and it is ugly, long and doesn't take care of the edges of the dataframe:
idx <- c()
for (j in 3:(length(df$date)-2)){
if (as.Date(df$date[j+2])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-2])<3){
if (df$flow[j]!=max(df$flow[(j-2):(j+2)])){
idx <- c(idx,j)
}
} else if (as.Date(df$date[j+1])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-1])<3){
if (df$flow[j]!=max(df$flow[(j-1):(j+1)])){
idx <- c(idx,j)
}
}
}
Notice that the dates in the dataframe are not consecutive.
Upvotes: 3
Views: 162
Reputation: 8836
Using the zoo
library.
library(zoo)
# convert into a zoo time series
dtf.zoo <- zoo(dt$flow, order.by=dt$date)
# remove duplicate dates by keeping the maximum value
dtf.zoo <- aggregate(dtf.zoo, time(dtf.zoo), max)
# pad with NAs to make the time series regular
dtf.zoo <- merge(
dtf.zoo,
zoo(, seq(min(index(dtf.zoo)), max(index(dtf.zoo)), "day"))
)
# find rows that are less than a value two days prior or hence
rem <- which(dtf.zoo < rollapply(dtf.zoo, 5, max, na.rm=TRUE, partial=TRUE))
# remove those rows
dtf.zoo2 <- dtf.zoo[-rem]
# remove NAs
dt2 <- data.frame(flow=na.omit(dtf.zoo2))
dt2
# flow
# 1951-02-05 2.22
# 1951-02-19 2.56
# 1951-03-29 3.30
# 1951-12-19 4.00
# 1952-02-22 1.32
# 1952-03-01 1.26
# 1953-12-20 6.00
# 1954-04-02 35.69
which(!(dt$flow %in% dt2$flow))
# 4 6
Upvotes: 3
Reputation: 35554
I use lapply()
to check the range : [date - 2 days , date + 2 days] of each date.
rm.list <- lapply(df$date, function(x) {
ind <- which(abs(df$date - x) <= 2)
flow <- df$flow[ind]
if(length(ind) > 1) which(flow < max(flow)) + min(ind) - 1
else NULL
})
rm <- unique(unlist(rm.list)) # [1] 4 6
df[-rm, ]
# date flow
# 1 1951-02-05 2.22
# 2 1951-02-19 2.56
# 3 1951-03-29 3.30
# 5 1951-12-19 4.00
# 7 1952-02-22 1.32
# 8 1952-03-01 1.26
# 9 1953-12-20 6.00
# 10 1954-04-02 35.69
Upvotes: 3
Reputation: 4863
You can also use the tidyverse
approch:
require(tidyverse)
df %>%
#Arrange by date
arrange(date) %>%
#Picking the max for each da
group_by(date) %>%
top_n(1, flow) %>%
ungroup() %>%
#Adding missing dates with NAs
complete(date = seq.Date(min(date), max(date), by="day")) %>%
#Remove Two up/down
mutate(
remove = case_when(
flow < rowMeans(data.frame(lag(flow, 1),
lag(flow, 2)), na.rm = TRUE) ~ "remove",
flow < rowMeans(data.frame(lead(flow, 1),
lead(flow, 2)), na.rm = TRUE) ~ "remove",
TRUE ~ "keep")) %>%
na.omit() %>%
filter(remove == "keep") %>%
select(-remove)
# A tibble: 8 x 2
date flow
<date> <dbl>
1 1951-02-05 2.22
2 1951-02-19 2.56
3 1951-03-29 3.30
4 1951-12-19 4.00
5 1952-02-22 1.32
6 1952-03-01 1.26
7 1953-12-20 6.00
8 1954-04-02 35.7
Upvotes: 2