Docconcoct
Docconcoct

Reputation: 2050

Using filter() in dplyr to find the first occurrence of a grepl value and return it and all following rows

I'm trying to filter a data frame by group using dplyr so that, within each group, the row containing the first occurrence of the string 'ReadingOnset' and all subsequent rows are passed into a new data frame.

# Example data as a dput()-style structure() literal: 101 rows
# (row.names = c(NA, -101L)) and four character columns. Name, StimulusName
# and StimuliBlock hold one repeated value each; Reading_Onset is "" everywhere
# except a single "ReadingOnset" entry, which is followed by 7 more empty
# strings.
# The object carries class "grouped_df" with one group spanning all rows
# (indices = list(0:100), group_sizes = 101L).
# NOTE(review): the `labels` attribute names the group "Innocent Subject 15"
# while the Name column holds "Sub1" -- presumably left over from anonymising
# the data; confirm.
# NOTE(review): vars/indices/labels look like the older dplyr grouped_df
# attribute layout; a current dplyr may need the data re-grouped -- confirm.
Text_Stimuli <- structure(list(Name = c("Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", 
"Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1", "Sub1"
), StimulusName = c("GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", "GenLie20", 
"GenLie20"), StimuliBlock = c("Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4", "Block_4", "Block_4", "Block_4", "Block_4", 
"Block_4", "Block_4"), Reading_Onset = c("", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "ReadingOnset", "", "", "", "", 
"", "", "")), row.names = c(NA, -101L), vars = c("Name", "StimulusName", 
"StimuliBlock"), drop = TRUE, indices = list(0:100), group_sizes = 101L, biggest_group_size = 101L, labels = structure(list(
    Name = "Innocent Subject 15", StimulusName = "GenLie20", 
    StimuliBlock = "Block_4"), row.names = c(NA, -1L), class = "data.frame", vars = c("Name", 
"StimulusName", "StimuliBlock"), drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

Here is an example of the type of solution I've been trying to get to work, but without success.

# Attempted per-group filter (does not return the desired rows).
# cumsum(grepl(...)) is 0 on every row before the first "ReadingOnset" and
# >= 1 from that row onward. lead() shifts those counts up by one row and pads
# the last row with 0 (default = 0); `!` then keeps only rows whose shifted
# count is 0. Net effect per group: the rows ending two rows before the first
# match, plus the group's last row.
Test <- Text_Stimuli %>% 
  group_by(Name, StimulusName, StimuliBlock)%>%   
  filter(!lead(cumsum(grepl("ReadingOnset", Reading_Onset)), default = 0))

As you can see I'm trying to group by Name, StimulusName, and StimuliBlock. Then I'm trying to find the first occurrence of 'ReadingOnset' in the column Reading_Onset and return all leading rows from that (and including the row with 'ReadingOnset').

I've been trying to adapt this solution to the inverse of the problem: https://stackoverflow.com/a/37922522/2653210

Upvotes: 0

Views: 1378

Answers (2)

AndS.
AndS.

Reputation: 8110

I couldn't tell if you want everything before and including ReadingOnset or if you want everything after and including ReadingOnset, so I'll show both.

Everything before and including:

library(dplyr)

# Keep every row up to and including the first "ReadingOnset" in each group.
# match(TRUE, ...) gives the position of the first match only, so several
# "ReadingOnset" rows in one group cannot turn the comparison into a
# multi-element condition (which() would return all of them, or integer(0)
# when there is none). A group with no match yields NA, and filter() drops
# rows whose condition is NA, so such a group returns no rows.
Text_Stimuli %>%
  group_by(Name, StimulusName, StimuliBlock) %>%
  filter(row_number() <= match(TRUE, grepl("ReadingOnset", Reading_Onset)))

Everything after and including:

# Keep the first "ReadingOnset" row in each group and every row after it.
# match(TRUE, ...) gives the position of the first match only, so several
# "ReadingOnset" rows in one group cannot turn the comparison into a
# multi-element condition (which() would return all of them, or integer(0)
# when there is none). A group with no match yields NA, and filter() drops
# rows whose condition is NA, so such a group returns no rows.
Text_Stimuli %>%
  group_by(Name, StimulusName, StimuliBlock) %>%
  filter(row_number() >= match(TRUE, grepl("ReadingOnset", Reading_Onset)))

What we're doing is just filtering based on the row number that "ReadingOnset" is found on.

Upvotes: 2

Nicolas2
Nicolas2

Reputation: 2210

You could try this with your data set:

library(dplyr)
library(stringr)
library(zoo)
# Keep the first "ReadingOnset" row and everything after it: matching rows
# become TRUE, all others NA, and na.locf() carries each TRUE forward. Rows
# ahead of the first match stay NA (na.rm = FALSE), and filter() drops them.
df %>%
  filter(
    na.locf(
      ifelse(str_detect(Reading_Onset, "ReadingOnset"), TRUE, NA),
      na.rm = FALSE
    )
  )

## A tibble: 8 x 4
## Groups:   Name, StimulusName, StimuliBlock [1]
#  Name                StimulusName StimuliBlock Reading_Onset
#  <chr>               <chr>        <chr>        <chr>        
#1 Innocent Subject 15 GenLie20     Block_4      ReadingOnset 
#2 Innocent Subject 15 GenLie20     Block_4      ""           
#3 Innocent Subject 15 GenLie20     Block_4      ""           
#4 Innocent Subject 15 GenLie20     Block_4      ""           
#5 Innocent Subject 15 GenLie20     Block_4      ""           
#6 Innocent Subject 15 GenLie20     Block_4      ""           
#7 Innocent Subject 15 GenLie20     Block_4      ""           
#8 Innocent Subject 15 GenLie20     Block_4      ""

Upvotes: 2

Related Questions