AudileF
AudileF

Reputation: 446

How to filter dataframe for consecutively increasing values

I have a large dataframe with several columns but for this query Im interested in 3 columns.

df <- structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ireland", class = "factor"), 
    parameter = structure(c(2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,2L, 1L, 3L), .Label = c("rainfall", "temp", "wind"), class = "factor"), 
    value = c(10L, 15L, 20L, 9L, 18L, 10L, 12L, 25L, 15L, 10L, 10L, 20L), unit = structure(c(3L, 2L, 1L, 3L, 2L, 1L, 3L, 
    2L, 1L, 3L, 2L, 1L), .Label = c("km/hr", "mm", "oC"), class = "factor")), class = "data.frame", row.names = c(NA, -12L))

country parameter   value   unit
ireland temp          10    oC
ireland rainfall      15    mm
ireland wind          20    km/hr
ireland temp           9    oC
ireland rainfall      18    mm
ireland wind          10    km/hr
ireland temp          12    oC
ireland rainfall      25    mm
ireland wind          15    km/hr
ireland temp          10    oC
ireland rainfall      10    mm
ireland wind          20    km/hr

I want to group by country and parameter to extract rows where the value column consecutively increase 3 or more times.

Example of desire output.

country parameter   value   unit
ireland rainfall      15    mm
ireland rainfall      18    mm
ireland rainfall      25    mm
ireland wind          10    km/hr
ireland wind          15    km/hr
ireland wind          20    km/hr

Upvotes: 4

Views: 371

Answers (4)

ThomasIsCoding
ThomasIsCoding

Reputation: 101317

Here is a base R option using subset + ave + rle

subset(
  df[with(df,order(country, parameter)), ],
  !!ave(value, country, parameter, FUN = function(x) with(rle(cumsum(c(1, diff(x) <= 0))), rep(lengths >= 3, lengths)))
)

which gives

   country parameter value  unit
2  ireland  rainfall    15    mm
5  ireland  rainfall    18    mm
8  ireland  rainfall    25    mm
6  ireland      wind    10 km/hr
9  ireland      wind    15 km/hr
12 ireland      wind    20 km/hr

Upvotes: 1

lroha
lroha

Reputation: 34416

You can group by country and parameter and them create a third grouping variable for values that aren't less than the lagged value, then filter group sizes 3 or larger:

library(dplyr)

df %>%
  group_by(country, parameter) %>%
  group_by(x = cumsum(value <= lag(value, default = FALSE)), .add = TRUE) %>%
  filter(n() >= 3) %>%
  ungroup() %>%
  arrange(country, parameter) %>%
  select(-x)

# A tibble: 6 x 4
  country parameter value unit 
  <fct>   <fct>     <int> <fct>
1 ireland rainfall     15 mm   
2 ireland rainfall     18 mm   
3 ireland rainfall     25 mm   
4 ireland wind         10 km/hr
5 ireland wind         15 km/hr
6 ireland wind         20 km/hr

Upvotes: 2

det
det

Reputation: 5232

df %>% 
  group_by(country, parameter) %>% 
  mutate(
    flag = c(0, diff(value)) > 0,
    flag_lag = lead(flag),
    seq_end = flag == TRUE & flag_lag %in% c(NA, FALSE),
    seq_begin = flag == FALSE & flag_lag == TRUE,
  ) %>%
  slice(if(length(which(seq_begin == TRUE):which(seq_end == TRUE)) >= 3) which(seq_begin == TRUE):which(seq_end == TRUE) else NA) %>%
  select(-contains("flag"), -contains("seq"))

Upvotes: 0

mhovd
mhovd

Reputation: 4067

Here is a solution using dplyr

df %>%
  arrange(value,decreasing = FALSE) %>% # Arrange by value
  filter(lag(value) > 3) # And filter for a difference above 3 in value

df
   country parameter value  unit
1  ireland      temp    10    oC
2  ireland      wind    10 km/hr
3  ireland      temp    10    oC
4  ireland  rainfall    10    mm
5  ireland      temp    12    oC
6  ireland  rainfall    15    mm
7  ireland      wind    15 km/hr
8  ireland  rainfall    18    mm
9  ireland      wind    20 km/hr
10 ireland      wind    20 km/hr
11 ireland  rainfall    25    mm

Upvotes: -1

Related Questions