Aceconhielo
Aceconhielo

Reputation: 3636

R - Count max of consecutive days if a event ocurrs

I have the next data.table

TIME       ZONE TEN
2018-07-12  N1  12
2018-07-13  N1  13
2018-07-14  N2  11
2018-07-15  N2  16
2018-07-16  N2  12

I want to count the consecutive days that in each zone the TEN variable is < 15 and add a new column with the max of days consecutives. The output should be:

TIME       ZONE TEN TEN_<_15
2018-07-12  N1  12  2
2018-07-13  N1  13  2
2018-07-14  N2  11  0
2018-07-15  N2  16  0
2018-07-16  N2  12  0

I have tried the next:

dataset[TEN < 15 ,TEN_<_15:= paste0(ZONE, "_", cumsum(!c(TRUE, diff(TIME) == 1))), by = ZONE]

But it does not return the output that I want.

Any ideas?

Thanks

Upvotes: 1

Views: 308

Answers (3)

chinsoon12
chinsoon12

Reputation: 25223

Not sure if I understand your requirement correctly. Anyway, here is a method using data.table and a dataset with more cases:

#convert string to Date
DT[, TIME := as.Date(TIME, "%Y-%m-%d")][, 
    #group by ZONE and consecutive TEN less than 15, then 
    #if there are at least 1 row and gaps between dates are all ones, 
    #return number of rows
    LESS_THAN_15 := if (.N > 1L && all(diff(TIME) == 1L)) .N else 0L,
    by=.(ZONE, rleid(TEN < 15L))]
DT

output:

          TIME ZONE TEN LESS_THAN_15
 1: 2018-07-12   N1  11            2
 2: 2018-07-13   N1  12            2
 3: 2018-07-14   N1  15            0
 4: 2018-07-20   N1  14            0
 5: 2018-08-11   N2   1            3
 6: 2018-08-12   N2   2            3
 7: 2018-08-13   N2   3            3
 8: 2018-08-17   N2  17            0
 9: 2018-08-19   N2   9            0
10: 2018-08-21   N2  11            0
11: 2018-08-23   N2  13            0
12: 2018-08-25   N2  15            0
13: 2018-09-01   N3   4            0
14: 2018-09-01   N3   5            0
15: 2018-09-03   N3   6            0

data with more cases:

library(data.table)
DT <- fread("TIME       ZONE TEN
2018-07-12  N1  11
2018-07-13  N1  12
2018-07-14  N1  15
2018-07-20  N1  14
2018-08-11  N2  1
2018-08-12  N2  2
2018-08-13  N2  3
2018-08-17  N2  17
2018-08-19  N2  9
2018-08-21  N2  11
2018-08-23  N2  13
2018-08-25  N2  15
2018-09-01  N3  4
2018-09-01  N3  5
2018-09-03  N3  6")

Upvotes: 0

moodymudskipper
moodymudskipper

Reputation: 47340

In base R:

dataset$`TEN_<_15` <-
  ave(dataset$TEN,dataset$ZONE,FUN = function(vec)
  with(rle(vec <15), max(lengths[values])))

dataset$`TEN_<_15`[dataset$`TEN_<_15` ==1] <- 0

#         TIME ZONE TEN TEN_<_15
# 1 2018-07-12   N1  12        2
# 2 2018-07-13   N1  13        2
# 3 2018-07-14   N2  11        0
# 4 2018-07-15   N2  16        0
# 5 2018-07-16   N2  12        0

Translated into tidyverse syntax:

library(dplyr)
dataset %>%
  group_by(ZONE) %>%
  mutate(`TEN_<_15` = with(rle(TEN <15), max(lengths[values]))) %>%
  mutate_at("TEN_<_15",~ifelse(.x==1,0,.x))

# # A tibble: 5 x 4
# # Groups:   ZONE [2]
#          TIME   ZONE   TEN `TEN_<_15`
#       <fctr> <fctr> <int>      <dbl>
# 1 2018-07-12     N1    12          2
# 2 2018-07-13     N1    13          2
# 3 2018-07-14     N2    11          0
# 4 2018-07-15     N2    16          0
# 5 2018-07-16     N2    12          0

Upvotes: 3

Rui Barradas
Rui Barradas

Reputation: 76651

Using base R you can do it like this:

dataset$`TEN_<_15` <- ave(dataset$TEN, dataset$ZONE, FUN = function(x){
  r <- rle(x < 15)
  r$values[r$values & r$lengths < 2] <- FALSE
  r$values*r$lengths
})

dataset
#       TIME ZONE TEN TEN_<_15
#1 2018-07-12   N1  12        2
#2 2018-07-13   N1  13        2
#3 2018-07-14   N2  11        0
#4 2018-07-15   N2  16        0
#5 2018-07-16   N2  12        0

Data.

dataset <- read.table(text = "
TIME       ZONE TEN
2018-07-12  N1  12
2018-07-13  N1  13
2018-07-14  N2  11
2018-07-15  N2  16
2018-07-16  N2  12                      
", header = TRUE)

Upvotes: 1

Related Questions