Statistix
Statistix

Reputation: 83

Finding observations occurring more than 3 times in a row in longitudinal data

I have the following data:

# Input data (dput output): 48 rows in long format.
#   ID  - subject identifier; IDs 1, 2 and 3 have 17, 14 and 17 rows.
#   A   - 0/1 indicator recorded on each row.
#   Day - counter running 1, 2, 3, ... within each ID.
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L), A = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 
1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), Day = c(1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
17L)), row.names = c(NA, 48L), class = "data.frame")

I want to find the IDs that have more than 3 consecutive observations with A equal to 1, and create the following data:

# Desired result (dput output): the same ID, A and Day columns plus
#   Censor - 0/1 flag; 1 on rows 22-31 (ID 2, Day 5-14) and rows 45-48
#            (ID 3, Day 14-17), 0 elsewhere.
#   Day_2  - constant within each ID: NA for ID 1, 5 for ID 2, 14 for ID 3.
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L), A = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 
1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), Day = c(1L, 2L, 3L, 4L, 5L, 
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
17L), Censor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 1L, 1L, 1L, 1L), Day_2 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L)), row.names = c(NA, 
48L), class = "data.frame")

Here, Censor is 0 until the first Day on which an ID begins a run of more than 3 consecutive observations with A equal to 1, and 1 from that Day onward; Day_2 is the Day on which that run began (NA if the ID has no such run).

Upvotes: 2

Views: 64

Answers (2)

Ben
Ben

Reputation: 30494

Another approach could use rollapply from zoo. After grouping by ID, you can use rollapply over a window of 4 values and set an indicator to 1 when all of the values of A in that window are 1. Next, cumany keeps the Censor column at 1 on every row after the first 1 within the ID. Finally, which locates the first row in each ID where Censor equals 1, and Day_2 is taken from that position.

library(tidyverse)
library(zoo)

# For each ID: flag the start of the first run of four consecutive A == 1
# values, keep the flag switched on for the rest of that ID's rows, and record
# the Day on which the run began.
df %>%
  group_by(ID) %>%
  mutate(
    # Left-aligned window of 4 rows: the value on a row is "are this row and
    # the next three all A == 1?". Windows that run past the end of the group
    # are filled with 0.
    Censor = rollapply(A == 1, width = 4, all, fill = 0, align = "left"),
    # Once any window has matched, every later row in the group is censored;
    # unary + turns the logical into 0/1.
    Censor = +cumany(Censor == 1),
    # which(...)[1] is the row position of the first censored row (NA when
    # there is none). Index Day with it so Day_2 is the actual day, which is
    # only equal to the position when Day runs 1, 2, 3, ... without gaps.
    Day_2 = Day[which(Censor == 1)[1]]
  ) %>%
  # Drop the grouping so later verbs do not silently operate per ID.
  ungroup()

Output

      ID     A   Day Censor Day_2
   <int> <int> <int>  <int> <int>
 1     1     0     1      0    NA
 2     1     0     2      0    NA
 3     1     0     3      0    NA
 4     1     0     4      0    NA
 5     1     0     5      0    NA
 6     1     1     6      0    NA
 7     1     1     7      0    NA
 8     1     0     8      0    NA
 9     1     0     9      0    NA
10     1     0    10      0    NA
11     1     0    11      0    NA
12     1     1    12      0    NA
13     1     1    13      0    NA
14     1     1    14      0    NA
15     1     0    15      0    NA
16     1     0    16      0    NA
17     1     0    17      0    NA
18     2     0     1      0     5
19     2     0     2      0     5
20     2     0     3      0     5
21     2     0     4      0     5
22     2     1     5      1     5
23     2     1     6      1     5
24     2     1     7      1     5
25     2     1     8      1     5
26     2     1     9      1     5
27     2     1    10      1     5
28     2     1    11      1     5
29     2     1    12      1     5
30     2     1    13      1     5
31     2     1    14      1     5
32     3     0     1      0    14
33     3     0     2      0    14
34     3     0     3      0    14
35     3     0     4      0    14
36     3     0     5      0    14
37     3     1     6      0    14
38     3     1     7      0    14
39     3     1     8      0    14
40     3     0     9      0    14
41     3     1    10      0    14
42     3     0    11      0    14
43     3     0    12      0    14
44     3     0    13      0    14
45     3     1    14      1    14
46     3     1    15      1    14
47     3     1    16      1    14
48     3     1    17      1    14

Upvotes: 2

jay.sf
jay.sf

Reputation: 73592

Wrap an rle approach in a function that flags runs of consecutive 1s that are longer than 3, and then apply it per ID with ave.

# Flag the elements of `x` that belong to a run of more than three
# consecutive values equal to 1.
#
# x: an atomic vector (here the 0/1 indicator of one ID).
# Returns a numeric vector of the same length as `x`: 1 for every element
# inside such a run, 0 for everything else (including runs of NA).
f <- \(x) {
  runs <- rle(x)
  # One logical per run: TRUE only for runs of 1s longer than 3; a run whose
  # value is NA never qualifies.
  long_ones <- !is.na(runs$values) & runs$values == 1 & runs$lengths > 3
  # Expand the per-run flags back to one value per original element.
  rep.int(as.numeric(long_ones), runs$lengths)
}

# Add two columns to dat:
#   censor - per ID, 1 on every row inside a run of more than three
#            consecutive A == 1 values (as flagged by f), otherwise 0.
#            NOTE(review): the flag is not carried forward once a run ends;
#            in the sample data every qualifying run lasts to the ID's final
#            row, so confirm which behaviour is wanted for other data.
#   Day_2  - per ID, the Day of the first row with censor == 1, or NA when
#            the ID has no such row.
res <- within(dat, {
  censor <- ave(A, ID, FUN=f)
  # Blank out Day on unflagged rows, then take the first remaining Day per
  # ID. This records the actual day rather than the row position within the
  # ID, so it stays correct when Day does not run 1, 2, 3, ... without gaps,
  # and it keeps Day_2 an integer like Day.
  Day_2 <- ave(replace(Day, censor != 1, NA), ID, FUN=\(d) d[!is.na(d)][1L])
})
res
#    ID A Day Day_2 censor
# 1   1 0   1    NA      0
# 2   1 0   2    NA      0
# 3   1 0   3    NA      0
# 4   1 0   4    NA      0
# 5   1 0   5    NA      0
# 6   1 1   6    NA      0
# 7   1 1   7    NA      0
# 8   1 0   8    NA      0
# 9   1 0   9    NA      0
# 10  1 0  10    NA      0
# 11  1 0  11    NA      0
# 12  1 1  12    NA      0
# 13  1 1  13    NA      0
# 14  1 1  14    NA      0
# 15  1 0  15    NA      0
# 16  1 0  16    NA      0
# 17  1 0  17    NA      0
# 18  2 0   1     5      0
# 19  2 0   2     5      0
# 20  2 0   3     5      0
# 21  2 0   4     5      0
# 22  2 1   5     5      1
# 23  2 1   6     5      1
# 24  2 1   7     5      1
# 25  2 1   8     5      1
# 26  2 1   9     5      1
# 27  2 1  10     5      1
# 28  2 1  11     5      1
# 29  2 1  12     5      1
# 30  2 1  13     5      1
# 31  2 1  14     5      1
# 32  3 0   1    14      0
# 33  3 0   2    14      0
# 34  3 0   3    14      0
# 35  3 0   4    14      0
# 36  3 0   5    14      0
# 37  3 1   6    14      0
# 38  3 1   7    14      0
# 39  3 1   8    14      0
# 40  3 0   9    14      0
# 41  3 1  10    14      0
# 42  3 0  11    14      0
# 43  3 0  12    14      0
# 44  3 0  13    14      0
# 45  3 1  14    14      1
# 46  3 1  15    14      1
# 47  3 1  16    14      1
# 48  3 1  17    14      1

Data:

# Example data: 48 rows in long format with columns ID, A and Day.
# IDs 1, 2 and 3 contribute 17, 14 and 17 rows; Day counts 1, 2, 3, ...
# within each ID and A is a 0/1 indicator.
dat <- local({
  rows_per_id <- c(17L, 14L, 17L)
  # The A series of each ID, in Day order.
  a_by_id <- list(
    c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L),
    c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
    c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)
  )
  structure(
    list(
      ID = rep(1:3, times = rows_per_id),
      A = unlist(a_by_id),
      Day = sequence(rows_per_id)
    ),
    row.names = c(NA, 48L),
    class = "data.frame"
  )
})

Upvotes: 2

Related Questions