User LG
User LG

Reputation: 307

Run length sequence by time and ID

This problem does not seem to have been put out here before.

I want to find the number of subjects that score 1 for 6 consecutive hours. The subjects have not been scored every hour so if an hour is missing the hours are not consecutive and the output for that 6-hour period should be NA. The reason for assigning NA would be that we do not know how the subject has scored on the missing hour. This problem can be used to count consecutive hits, but only count it if a subject has participated.

My dataframe looks like this:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)

I have tried to use the rle function (I am sure its possible) but I cannot get it to condition on both hour and ID. The output would be like this:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

Thank you in advance.

I believe the original data set I gave was too small to make the solutions more generalizable.
I just tried the codes with this dataset and found that this will result in a wrong result.

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)

For the new dataset the output should be:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

Upvotes: 8

Views: 348

Answers (3)

missuse
missuse

Reputation: 19726

Here is an approach in tidyverse with the updated data set:

library(tidyverse)

df %>%
  group_by(ID) %>%
  expand(hour = seq(min(hour), max(hour))) %>%
  left_join(df) %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  filter(!is.na(A)) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

check requested output:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

all.equal(df, df_out2)
#output
TRUE

The old answer:

df %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

Lets check if the result is like requested:

ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)

df1 is requested output

all.equal(df1, df_out2)
#output
TRUE   

some benchmarking:

library(microbenchmark)
library(data.table)

akrun <- function(df){
  setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
                               ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
  i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
  }

missuse <- function(df){
  df %>%
    mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}


Mike <- function(df){
  ave(df$A, 
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
      FUN = function(x) {
        if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}

microbenchmark(Mike(df),
               akrun(df),
               missuse(df))

#output
Unit: microseconds
        expr       min         lq       mean     median         uq       max neval
    Mike(df)   491.291   575.7115   704.2213   597.7155   629.0295  9578.684   100
   akrun(df)  6568.313  6725.5175  7867.4059  6843.5790  7279.2240 69790.755   100
 missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935   100

way to go Mike H.!

Upvotes: 7

Mike H.
Mike H.

Reputation: 14370

To get the groupings you can compare the current hour to the lagged hour to see if they are "consecutive" or 1 integer apart and then take the cumsum of that. Once you have the groupings you can use a simple ave to get the output you want.

library(data.table)
df$six <-  ave(df$A, 
               cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
               FUN = function(x) {
                    if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
                    else return(rep(NA, length(x)))}
               )

df
#   ID hour A six
#1   1    1 0  NA
#2   1    2 1  NA
#3   1    3 0  NA
#4   1    7 1   0
#5   1    8 1   0
#6   1    9 1   0
#7   1   10 1   0
#8   1   11 1   0
#9   1   12 1   1
#10  1   17 0  NA
#11  1   18 0  NA
#12  1   19 0  NA
#13  2    1 1   0
#14  2    2 1   0
#15  2    3 1   0
#16  2    4 1   0
#17  2    5 1   0
#18  2    6 1   1
#19  2    8 1  NA
#20  2    9 1  NA
#21  2   15 1  NA

If you only want to select a patient at most once, this will select the last time period:

df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
                                                if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
                                                else return(x)}
  )

Upvotes: 7

akrun
akrun

Reputation: 887541

We can use rleid from data.table. Create a grouping column with run-length-id of 'A' ('grp'). Take the difference of 'hour' with the next value of 'hour', check if it is equal to 1 and multiply with 'A' to create 'Anew'. Grouped by run-length-id of 'Anew' and 'grp', if the sum of 'A' is greater than a particular value, replicate with 0s and 1s or else return NA. In the last phase, assign some spillover NAs to 0 by creating an index ('i1')

library(data.table)
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
     ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
#    ID hour A six sixnew
# 1:  1    1 0  NA     NA
# 2:  1    2 1  NA     NA
# 3:  1    3 0  NA     NA
# 4:  1    7 1   0      0
# 5:  1    8 1   0      0
# 6:  1    9 1   0      0
# 7:  1   10 1   0      0
# 8:  1   11 1   0      0
# 9:  1   12 1   1      1
#10:  1   17 0  NA     NA
#11:  1   18 0  NA     NA
#12:  1   19 0  NA     NA
#13:  2    1 1   0      0
#14:  2    2 1   0      0
#15:  2    3 1   0      0
#16:  2    4 1   0      0
#17:  2    5 1   0      0
#18:  2    6 1   1      1
#19:  2    8 1  NA     NA
#20:  2    9 1  NA     NA
#21:  2   15 1  NA     NA

Or slightly more compact option would be

i1 <- setDT(df)[, if(sum(A)>= 5)  .I[.N] , rleid(c(TRUE,  diff(hour)==1), A)]$V1
df[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
df
#    ID hour A six sixnew
# 1:  1    1 0  NA     NA
# 2:  1    2 1  NA     NA
# 3:  1    3 0  NA     NA
# 4:  1    7 1   0      0
# 5:  1    8 1   0      0
# 6:  1    9 1   0      0
# 7:  1   10 1   0      0
# 8:  1   11 1   0      0
# 9:  1   12 1   1      1
#10:  1   17 0  NA     NA
#11:  1   18 0  NA     NA
#12:  1   19 0  NA     NA
#13:  2    1 1   0      0
#14:  2    2 1   0      0
#15:  2    3 1   0      0
#16:  2    4 1   0      0
#17:  2    5 1   0      0
#18:  2    6 1   1      1
#19:  2    8 1  NA     NA
#20:  2    9 1  NA     NA
#21:  2   15 1  NA     NA

Benchmarks

Created a slightly bigger dataset and tested the solutions

-data

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)

ID <-  rep(1:5000, rep(c(12, 9), 2500))
A <- rep(A, 2500)
hour <- rep(hour, 2500)
dftest <- data.frame(ID, hour, A)

-functions

akrun <- function(df){
  df1 <- copy(df)
  setDT(df1)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
                               ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1))
     else NA_real_,.(rleid(Anew), grp)]
  i1 <- df1[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df1[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
  }

akrun2 <- function(df) {
      df1 <- copy(df)
      i1 <- setDT(df1)[, if(sum(A)>= 5)  .I[.N] , rleid(c(TRUE,  diff(hour)==1), A)]$V1
      df1[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
  }


missuse <- function(df){
  df %>%
    mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}


Mike <- function(df){
  ave(df$A, 
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
      FUN = function(x) {
        if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}

-benchmark

microbenchmark(Mike(dftest),
               akrun(dftest),
               akrun2(dftest),
               missuse(dftest),  
                times = 10L, unit = 'relative')

-output

#Unit: relative
#            expr       min        lq      mean   median        uq       max neval cld
#    Mike(dftest)  1.682794  1.754494  1.673811  1.68806  1.632765  1.640221    10 a  
#   akrun(dftest) 13.159245 12.950117 12.176965 12.33716 11.856271 11.095228    10  b 
#  akrun2(dftest)  1.000000  1.000000  1.000000  1.00000  1.000000  1.000000    10 a  
# missuse(dftest) 37.899905 36.773837 34.726845 34.87672 33.155939 30.665840    10   c

Upvotes: 5

Related Questions