Morpheus

Reputation: 3543

R: How to find the first non-zero element in a dataframe by group

I have the following dataframe

ID     date        Flag
ABC    2018-03-21  N/A
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  N/A
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1

I want to subset this dataset so that, for each group, I keep only the rows from the first record up to (but not including) the first row where Flag is 1; if a group contains no 1, it should not appear in the result at all.

Something like this:

ID     date        Flag
ABC    2018-03-21  N/A
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
DEF    2018-03-24  N/A
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0          

I saw some answers at Dplyr : how to find the first-non missing string by groups?, but that question is about the first non-missing value, whereas here I have both missing and 0 values to deal with.

Upvotes: 2

Views: 2011

Answers (5)

IceCreamToucan

Reputation: 28695

library(data.table)
setDT(df)

# for each ID: keep the group only if it contains a 1, then take the rows
# before the first 1
df[, if(1 %in% Flag) head(.SD, which.max(Flag == 1) - 1)
   , by = ID]

#      ID       date Flag
#  1: ABC 2018-03-21   NA
#  2: ABC 2018-03-17    0
#  3: ABC 2018-03-12    0
#  4: ABC 2018-03-10    0
#  5: DEF 2018-03-24   NA
#  6: DEF 2018-03-21    0
#  7: DEF 2018-03-20    0
#  8: DEF 2018-03-14    0
#  9: DEF 2018-03-13    0
# 10: DEF 2018-03-12    0
# 11: DEF 2018-03-11    0
# 12: DEF 2018-03-10    0
# 13: DEF 2018-03-09    0

Or in dplyr (same result):

library(dplyr)
df %>% 
  group_by(ID) %>% 
  filter(1 %in% Flag) %>% 
  slice(1:(which.max(Flag == 1) - 1))
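
In both expressions, which.max(Flag == 1) gives the position of the first 1 in the group (which.max returns the index of the first TRUE and skips NAs in the comparison), so subtracting 1 leaves the number of leading rows to keep. A quick check on a hypothetical flag vector:

which.max(c(NA, 0, 0, 1, 0) == 1) - 1
# [1] 3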

Data used:

df <- fread("
ID     date        Flag
ABC    2018-03-21  NA
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  NA
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1
")

Benchmark Output:

# Unit: relative
#  expr       min       lq     mean   median       uq       max neval
#   ry0 1.0000000 1.000000 1.000000 1.000000 1.000000 1.0000000   100
#   ry1 0.9039601 1.005675 1.107913 1.007259 1.013925 0.9834608   100
#   ry2 4.1922470 4.119451 3.833156 4.054261 4.064153 2.1996109   100
#   mkr 2.7526006 2.860652 2.734473 2.851795 2.780521 1.4623569   100
#   www 5.8029974 5.601037 5.293515 5.588397 5.372007 1.5343666   100
#   leb 6.8563589 6.548586 6.687608 6.461585 6.991874 2.2607231   100
#   mm1 1.8219038 1.782887 1.464588 1.791532 1.669813 0.2896809   100
#   mm2 6.0007823 5.806987 5.393869 5.679563 5.672251 1.7103423   100
#   mm3 2.1094639 2.372948 2.899198 2.437456 2.270863 1.8811060   100

Benchmark code:

df <- read.table(text="ID     date        Flag
ABC    2018-03-21  NA
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  NA
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1
FOO    1983-01-01  NA
FOO    1983-01-02  NA
FOO    1983-01-02  0
FOO    1983-01-02  0", header=TRUE, stringsAsFactors=FALSE)


df <- setDF(rbindlist(replicate(1e4, df, simplify = F)))


dt <- as.data.table(df)
microbenchmark::microbenchmark(
  ry0 = dt[, if(1 %in% Flag) head(.SD, which.max(Flag == 1) - 1) , by = ID],
  ry1 = dt[, if(1 %in% Flag) .SD[1:(which.max(Flag == 1) - 1)] , by = ID],
  ry2 = df %>% 
          group_by(ID) %>% 
          filter(1 %in% Flag) %>% 
          slice(1:(which.max(Flag == 1) - 1)),
mkr = df %>% group_by(ID) %>%
  filter(cumsum(!is.na(Flag) & Flag == 1) == 0),
www = df %>%
  mutate(Flag2 = ifelse(is.na(Flag), 0, Flag)) %>%
  group_by(ID) %>%
  filter(cumsum(Flag2) < 1) %>%
  ungroup() %>%
  select(-Flag2),
leb = do.call(rbind,lapply(
  split(df, df["ID"]),
  function(.)
    if(!1 %in% .$Flag) NULL
    else .[1:(which.max(.$Flag %in% 1)-1),])),
mm1 = df %>%
  group_by(ID) %>%
  slice(seq_len(match(1,Flag,nomatch=1)-1)),
mm2 = do.call(rbind, by(df, df$ID, function(x) head(x,match(1,x$Flag,nomatch=1)-1))),
mm3 = df[ave(as.logical(df$Flag),df$ID,FUN=function(x){
  y <- match(TRUE,x)-1
  z <- logical(length(x))
  if (is.na(y)) z
  else {z[seq_len(y)] <- TRUE;z}
}),],
unit="relative",
times = 100
)

Upvotes: 4

MKR

Reputation: 20095

A cumsum-based solution using dplyr:

library(dplyr)

df %>% group_by(ID) %>%
  filter(cumsum(!is.na(Flag) & Flag == 1) == 0 & any(Flag == 1))

# # A tibble: 13 x 3
# # Groups: ID [2]
#    ID    date        Flag
#    <chr> <chr>      <int>
#  1 ABC   2018-03-21    NA
#  2 ABC   2018-03-17     0
#  3 ABC   2018-03-12     0
#  4 ABC   2018-03-10     0
#  5 DEF   2018-03-24    NA
#  6 DEF   2018-03-21     0
#  7 DEF   2018-03-20     0
#  8 DEF   2018-03-14     0
#  9 DEF   2018-03-13     0
# 10 DEF   2018-03-12     0
# 11 DEF   2018-03-11     0
# 12 DEF   2018-03-10     0
# 13 DEF   2018-03-09     0
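
The cumulative sum stays at 0 until the first 1 is reached, so the == 0 condition keeps exactly the leading rows, while any(Flag == 1) drops groups that never contain a 1. A small illustration on a hypothetical flag vector:

Flag <- c(NA, 0, 0, 1, 0, 1)
cumsum(!is.na(Flag) & Flag == 1)
# [1] 0 0 0 1 1 2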

Data:

df <- read.table(text ="
ID     date        Flag
ABC    2018-03-21  NA
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  NA
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1",
header = TRUE, stringsAsFactors = FALSE)

Upvotes: 2

moodymudskipper

Reputation: 47330

Using dplyr::slice, then the equivalent base R using by, and finally one aimed purely at performance, with a benchmark. All three are robust to groups containing no Flag == 1.

dplyr

df %>%
  group_by(ID) %>%
  slice(seq_len(match(1,Flag,nomatch=1)-1))

# # A tibble: 13 x 3
# # Groups:   ID [2]
#    ID    date        Flag
#    <chr> <chr>      <int>
#  1 ABC   2018-03-21    NA
#  2 ABC   2018-03-17     0
#  3 ABC   2018-03-12     0
#  4 ABC   2018-03-10     0
#  5 DEF   2018-03-24    NA
#  6 DEF   2018-03-21     0
#  7 DEF   2018-03-20     0
#  8 DEF   2018-03-14     0
#  9 DEF   2018-03-13     0
# 10 DEF   2018-03-12     0
# 11 DEF   2018-03-11     0
# 12 DEF   2018-03-10     0
# 13 DEF   2018-03-09     0
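
The nomatch = 1 argument is what makes this robust to a group with no 1: match() then returns 1, so match(...) - 1 is 0 and slice(seq_len(0)) keeps no rows from that group. Illustrative vectors (not taken from the data):

match(1, c(0, NA, 1, 0), nomatch = 1) - 1
# [1] 2
match(1, c(0, NA, 0), nomatch = 1) - 1
# [1] 0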

base

do.call(rbind, by(df, df$ID, function(x) 
  head(x,match(1,x$Flag,nomatch=1)-1)))

# ID       date Flag
# ABC.1  ABC 2018-03-21   NA
# ABC.2  ABC 2018-03-17    0
# ABC.3  ABC 2018-03-12    0
# ABC.4  ABC 2018-03-10    0
# DEF.8  DEF 2018-03-24   NA
# DEF.9  DEF 2018-03-21    0
# DEF.10 DEF 2018-03-20    0
# DEF.11 DEF 2018-03-14    0
# DEF.12 DEF 2018-03-13    0
# DEF.13 DEF 2018-03-12    0
# DEF.14 DEF 2018-03-11    0
# DEF.15 DEF 2018-03-10    0
# DEF.16 DEF 2018-03-09    0

base fast

df[ave(as.logical(df$Flag), df$ID, FUN = function(x){
  y <- match(TRUE, x) - 1             # rows before the first 1 (NA if none)
  z <- logical(length(x))             # default: keep nothing
  if (is.na(y)) z                     # no 1 in the group: drop it entirely
  else {z[seq_len(y)] <- TRUE; z}     # mark the leading rows to keep
}),]

#     ID       date Flag
# 1  ABC 2018-03-21   NA
# 2  ABC 2018-03-17    0
# 3  ABC 2018-03-12    0
# 4  ABC 2018-03-10    0
# 8  DEF 2018-03-24   NA
# 9  DEF 2018-03-21    0
# 10 DEF 2018-03-20    0
# 11 DEF 2018-03-14    0
# 12 DEF 2018-03-13    0
# 13 DEF 2018-03-12    0
# 14 DEF 2018-03-11    0
# 15 DEF 2018-03-10    0
# 16 DEF 2018-03-09    0

benchmark

I ran the benchmark on @lebatsnok's modified input, which I adjusted again because the NAs were not being recognized as such. MKR's and www's solutions are not robust to this case, but I left them in the benchmark anyway.

# Unit: relative
# expr       min        lq      mean    median        uq       max neval
# ry1  7.843459  5.885757  4.465808  5.515120  4.972157 0.4357556   100
# ry2 10.750648  8.840738  7.170055  8.922515  8.044793 0.7575101   100
# mkr  7.842997  5.892338  4.903737  5.872316  5.295717 0.6153142   100
# www 19.043776 16.816860 12.987223 16.270110 14.358256 2.3291645   100
# leb  2.882267  2.180278  2.132873  2.454936  2.328484 1.0160795   100
# mm1  7.974575  6.519906  5.417112  6.664007  5.958628 0.6423475   100
# mm2  3.677730  3.196962  2.861106  3.347310  3.093514 0.7054546   100
# mm3  1.000000  1.000000  1.000000  1.000000  1.000000 1.0000000   100

data

df <- read.table(text="ID     date        Flag
ABC    2018-03-21  NA
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  NA
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1
FOO    1983-01-01  NA
FOO    1983-01-02  NA
FOO    1983-01-02  0
FOO    1983-01-02  0", header=TRUE, stringsAsFactors=FALSE)

benchmark code

dt <- as.data.table(df)
microbenchmark::microbenchmark(
ry1 = dt[, if(1 %in% Flag) .SD[1:(which.max(Flag == 1) - 1)] , by = ID],
ry2 = df %>% 
  group_by(ID) %>% 
  filter(1 %in% Flag) %>% 
  slice(1:(which.max(Flag == 1) - 1)),
mkr = df %>% group_by(ID) %>%
  filter(cumsum(!is.na(Flag) & Flag == 1) == 0),
www = df %>%
  mutate(Flag2 = ifelse(is.na(Flag), 0, Flag)) %>%
  group_by(ID) %>%
  filter(cumsum(Flag2) < 1) %>%
  ungroup() %>%
  select(-Flag2),
leb = do.call(rbind,lapply(
  split(df, df["ID"]),
  function(.)
    if(!1 %in% .$Flag) NULL
    else .[1:(which.max(.$Flag %in% 1)-1),])),
mm1 = df %>%
  group_by(ID) %>%
  slice(seq_len(match(1,Flag,nomatch=1)-1)),
mm2 = do.call(rbind, by(df, df$ID, function(x) head(x,match(1,x$Flag,nomatch=1)-1))),
mm3 = df[ave(as.logical(df$Flag),df$ID,FUN=function(x){
  y <- match(TRUE,x)-1
  z <- logical(length(x))
  if (is.na(y)) z
  else {z[seq_len(y)] <- TRUE;z}
}),],
unit="relative"
)

Upvotes: 2

lebatsnok

Reputation: 6469

With base R, one could, for instance, do this.

First of all, we need a complete test case with a group having no "1" in the "Flag" column:

df <- read.table(text="ID     date        Flag
ABC    2018-03-21  N/A
ABC    2018-03-17  0
ABC    2018-03-12  0 
ABC    2018-03-10  0 
ABC    2018-03-09  1
ABC    2018-03-08  0
ABC    2018-03-07  1
DEF    2018-03-24  N/A
DEF    2018-03-21  0
DEF    2018-03-20  0
DEF    2018-03-14  0
DEF    2018-03-13  0
DEF    2018-03-12  0
DEF    2018-03-11  0
DEF    2018-03-10  0
DEF    2018-03-09  0       
DEF    2018-03-08  1       
DEF    2018-03-07  0
DEF    2018-03-06  0
DEF    2018-03-05  1
FOO    1983-01-01  N/A
FOO    1983-01-02  N/A
FOO    1983-01-02  0
FOO    1983-01-02  0", header=TRUE, as.is=TRUE)

Now let's define a function that takes a data frame and returns NULL if there is no 1 in $Flag, and otherwise the rows before the first 1 (row 1 up to, but not including, the row where 1 first occurs). This can be done with which.max on a logical vector (TRUE where $Flag is 1, FALSE otherwise):

findit <- function(.) if(!1 %in% .$Flag) NULL else .[1:(which.max(.$Flag %in% 1)-1),]
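
For a hypothetical group with no 1 in Flag, the %in% test is FALSE, so findit() returns NULL and that group is later dropped silently by rbind():

findit(data.frame(ID = "FOO", date = "1983-01-01", Flag = c(NA, 0, 0)))
# NULL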

Now we need to split the data frame by ID, apply the function, and rbind the parts again:

do.call(rbind,lapply(split(df, df["ID"]), findit))

Upvotes: 1

www

Reputation: 39154

A solution using dplyr and cumsum.

library(dplyr)

dat2 <- dat %>%
  mutate(Flag2 = ifelse(is.na(Flag), 0, Flag)) %>%
  group_by(ID) %>%
  filter(cumsum(Flag2) < 1) %>%
  ungroup() %>%
  select(-Flag2)
dat2
# # A tibble: 13 x 3
#    ID    date        Flag
#    <chr> <chr>      <int>
#  1 ABC   2018-03-21    NA
#  2 ABC   2018-03-17     0
#  3 ABC   2018-03-12     0
#  4 ABC   2018-03-10     0
#  5 DEF   2018-03-24    NA
#  6 DEF   2018-03-21     0
#  7 DEF   2018-03-20     0
#  8 DEF   2018-03-14     0
#  9 DEF   2018-03-13     0
# 10 DEF   2018-03-12     0
# 11 DEF   2018-03-11     0
# 12 DEF   2018-03-10     0
# 13 DEF   2018-03-09     0
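
The Flag2 helper is needed because cumsum() propagates NA: once an NA appears, every later cumulative sum is NA, and filter() drops rows whose condition is NA, so the whole group would vanish. Recoding NA to 0 first avoids this (illustrative vectors):

cumsum(c(NA, 0, 0, 1))
# [1] NA NA NA NA
cumsum(c(0, 0, 0, 1))
# [1] 0 0 0 1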

DATA

dat <- read.table(text = "ID     date        Flag
ABC    '2018-03-21'  NA
ABC    '2018-03-17'  0
ABC    '2018-03-12'  0 
ABC    '2018-03-10'  0 
ABC    '2018-03-09'  1
ABC    '2018-03-08'  0
ABC    '2018-03-07'  1
DEF    '2018-03-24'  NA
DEF    '2018-03-21'  0
DEF    '2018-03-20'  0
DEF    '2018-03-14'  0
DEF    '2018-03-13'  0
DEF    '2018-03-12'  0
DEF    '2018-03-11'  0
DEF    '2018-03-10'  0
DEF    '2018-03-09'  0       
DEF    '2018-03-08'  1       
DEF    '2018-03-07'  0
DEF    '2018-03-06'  0
DEF    '2018-03-05'  1",
                  header = TRUE, stringsAsFactors = FALSE)

Upvotes: 1
