Reputation: 304

Find the maximum in a certain time frame in a non-continuous time series

I have a dataframe with a time series that looks like this:

df<-structure(list(date = structure(c(-6905, -6891, -6853, -6588, 
-6588, -6586, -6523, -6515, -5856, -5753), class = "Date"), flow = c(2.22, 
2.56, 3.3, 1.38, 4, 1.4, 1.32, 1.26, 6, 35.69)), .Names = c("date", 
"flow"), row.names = c(NA, 10L), class = "data.frame")

I want to remove all the lines that are not the maximum within 2 days forward or backward of its date. So in the case above, lines 4 and 6 will be removed. I couldn't find similar answered questions.

I wrote this code that doesn't work and it is ugly, long and doesn't take care of the edges of the dataframe:

  idx <- c()
  for (j in 3:(length(df$date)-2)){
    if (as.Date(df$date[j+2])-as.Date(df$date[j])<3 |
        as.Date(df$date[j])-as.Date(df$date[j-2])<3){
      if (df$flow[j]!=max(df$flow[(j-2):(j+2)])){
        idx <- c(idx,j)
      }
    } else if (as.Date(df$date[j+1])-as.Date(df$date[j])<3 |
               as.Date(df$date[j])-as.Date(df$date[j-1])<3){
      if (df$flow[j]!=max(df$flow[(j-1):(j+1)])){
        idx <- c(idx,j)
      }
    }
  }

Notice that the dates in the dataframe are not consecutive.

Upvotes: 3

Answers (3)

AkselA

Reputation: 8836

Using the zoo library.

library(zoo)

# convert into a zoo time series
dtf.zoo <- zoo(dt$flow, order.by=dt$date)

# remove duplicate dates by keeping the maximum value
dtf.zoo <- aggregate(dtf.zoo, time(dtf.zoo), max)

# pad with NAs to make the time series regular
dtf.zoo <- merge(
  dtf.zoo, 
  zoo(, seq(min(index(dtf.zoo)), max(index(dtf.zoo)), "day"))
)

# find rows that are less than a value two days prior or hence
rem <- which(dtf.zoo < rollapply(dtf.zoo, 5, max, na.rm=TRUE, partial=TRUE))

# remove those rows
dtf.zoo2 <- dtf.zoo[-rem]

# remove NAs
dt2 <- data.frame(flow=na.omit(dtf.zoo2))
dt2
#             flow
# 1951-02-05  2.22
# 1951-02-19  2.56
# 1951-03-29  3.30
# 1951-12-19  4.00
# 1952-02-22  1.32
# 1952-03-01  1.26
# 1953-12-20  6.00
# 1954-04-02 35.69

which(!(dt$flow %in% dt2$flow))
# 4 6

Upvotes: 3

Darren Tsai

Reputation: 35554

I use lapply() to check the range : [date - 2 days , date + 2 days] of each date.

rm.list <- lapply(df$date, function(x) {
  ind <- which(abs(df$date - x) <= 2)
  flow <- df$flow[ind]
  if(length(ind) > 1) which(flow < max(flow)) + min(ind) - 1
  else NULL
})

rm <- unique(unlist(rm.list)) # [1] 4 6
df[-rm, ]

#          date  flow
# 1  1951-02-05  2.22
# 2  1951-02-19  2.56
# 3  1951-03-29  3.30
# 5  1951-12-19  4.00
# 7  1952-02-22  1.32
# 8  1952-03-01  1.26
# 9  1953-12-20  6.00
# 10 1954-04-02 35.69

Upvotes: 3

DJV

Reputation: 4863

You can also use the tidyverse approch:

require(tidyverse)

df %>% 
  #Arrange by date
  arrange(date) %>%
  #Picking the max for each da
  group_by(date) %>% 
  top_n(1, flow) %>% 
  ungroup() %>%
  #Adding missing dates with NAs
  complete(date = seq.Date(min(date), max(date), by="day")) %>% 
  #Remove Two up/down
  mutate(
    remove = case_when(
      flow < rowMeans(data.frame(lag(flow, 1), 
                                 lag(flow, 2)), na.rm = TRUE) ~ "remove", 
      flow < rowMeans(data.frame(lead(flow, 1),
                                 lead(flow, 2)), na.rm = TRUE) ~ "remove", 
      TRUE ~ "keep")) %>% 
  na.omit() %>%
  filter(remove == "keep") %>% 
  select(-remove)


# A tibble: 8 x 2
  date        flow
  <date>     <dbl>
1 1951-02-05  2.22
2 1951-02-19  2.56
3 1951-03-29  3.30
4 1951-12-19  4.00
5 1952-02-22  1.32
6 1952-03-01  1.26
7 1953-12-20  6.00
8 1954-04-02 35.7

Upvotes: 2

Find the maximum in a certain time frame in a non-continuous time series

Answers (3)

Related Questions