importm
importm

Reputation: 305

looking for patterns in binary columns r

I need to find and count the ID's that appear with a 1 after 3 or more consecutive zeros.

This is a example of what I have:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    0
#   3   0    0   0    0   0   0   0   0    0    1
#   4   1    0   0    1   0   1   0   1    0    1
#   5   0    0   1    0   0   1   1   0    0    1

c1<- c("ID","Jan","Feb", "Mar","Apr", "May","Jun", "Jul", "Aug", "Sept", "Oct")
c2<-  c(1,0,0,0,1,0,0,1,1,1,0)
c3<- c(2,0,0,0,0,0,0,1,0,0,0)
c4<- c(3,0,0,0,0,0,0,0,0,0,1)
c5<- c(4,1,0,0,1,0,1,0,1,0,1)
c6<- c(5,0,0,1,0,0,1,1,0,0,1)
BD<-data.frame(rbind(c2,c3,c4,c5,c6))
colnames(BD)<-c1

The result of what I expect is something like this:

#  ID   Jan  Feb Mar  Apr May Jun Jul Aug Sept Oct
#   1   0    0   0    1   0   0   1   1    1    0
#   2   0    0   0    0   0   0   1   0    0    1
#   3   0    0   0    0   0   0   0   0    0    1

Anyone know how to do it? Thanks!

Upvotes: 3

Views: 222

Answers (5)

chinsoon12
chinsoon12

Reputation: 25225

An option using data.table to melt and filter for rows that match conditions.

library(data.table)
setDT(BD)[ID %in%
    melt(BD, id.vars="ID")[, 
        mth := .GRP, variable][
            value==1L, ID[mth[1L]>3L | any(diff(mth) > 3L)], ID]$V1
]

It should be faster for large datasets with sparse data.

Upvotes: 1

jay.sf
jay.sf

Reputation: 73134

You could collapse to string and use grep() to search for pattern.

k <- 3

grep(sprintf(paste0("%0", k + 1, "d"), 1), apply(d[-1], 1, paste, collapse=""))
# [1] 2 4 5 6 8

If no following 1 is needed you could use the rle().

d
#     id Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# c1   1   1   1   0   1   1   0   0   1   1   1   0   0
# c2   2   0   0   0   1   1   1   0   1   1   0   1   0
# c3   3   1   0   0   1   1   0   1   1   1   0   1   0
# c4   4   0   0   0   0   0   1   1   0   0   1   1   0
# c5   5   0   0   0   1   1   1   1   0   0   1   0   1
# c6   6   1   0   0   0   1   0   1   0   0   0   0   1
# c7   7   0   1   0   0   1   0   1   1   1   0   0   1
# c8   8   0   1   1   1   1   1   1   1   0   0   0   1
# c9   9   0   1   0   0   1   1   0   0   1   1   1   0
# c10 10   1   1   0   1   0   1   1   0   0   1   0   1

k <- 3
d$id[sapply(as.data.frame(t(d[-1])), function(x) any(rle(x)$lengths[rle(x)$values == 0] >= k))]
# [1] 2 4 5 6 8

Data:

set.seed(0)
d <- data.frame(id=1:10, 
                  `dimnames<-`(matrix(sample(0:1, 120, r=1), 10), 
                               list(paste0("c", 1:10), month.abb)))

Upvotes: 2

ThomasIsCoding
ThomasIsCoding

Reputation: 101916

Here is a base R solution that can make it

BDout <- subset(BD,apply(BD[-1], 1, function(x) head(which(x==1),1))>3)

such that

> BDout
  ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
1  1   0   0   0   1   0   0   1   1    1   0
2  2   0   0   0   0   0   0   1   0    0   0
3  3   0   0   0   0   0   0   0   0    0   1

Upvotes: 1

Kent Johnson
Kent Johnson

Reputation: 3388

You can consolidate the rows into strings and use a regular expression to match '0001':

library(tidyverse)
rows = BD %>% 
  purrr::pmap(function(...) paste0(list(...)[-1], collapse='')) %>% 
  stringr::str_detect('0001')
BD[rows,]

Upvotes: 1

IceCreamToucan
IceCreamToucan

Reputation: 28695

If you take the rowid(rleid(x)) of a vector x you get the number of steps into each "run" each element is*. You can check that this is >= 3 and the element is 0. If that is true for the previous element (for the shifted output) and the element is 1, return TRUE. Then check if this is TRUE for any of the elements in the row.

library(data.table)

rows <- 
  apply(BD, 1, function(r) any(shift(rowid(rleid(r)) >= 3 & r == 0) & r == 1))

BD[rows,]
#    ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# c2  1   0   0   0   1   0   0   1   1    1   0
# c3  2   0   0   0   0   0   0   1   0    0   0
# c4  3   0   0   0   0   0   0   0   0    0   1

* Here's an example for a particular row (the first)

rbind(
  rowid_rleid = rowid(rleid(unlist(BD[1,]))),
  original = unlist(BD[1,]))

#             ID Jan Feb Mar Apr May Jun Jul Aug Sept Oct
# rowid_rleid  1   1   2   3   1   1   2   1   2    3   1
# original     1   0   0   0   1   0   0   1   1    1   0

Upvotes: 2

Related Questions