Reputation: 3384
# Toy longitudinal data set: 6 students, each observed at time points 1..6.
# var1 is the observed value per (StudentID, Time) and contains some NAs.
data1 <- data.frame(
  StudentID = rep(c(1, 2, 3, 4, 5, 6), each = 6),
  Time = rep(c(1, 2, 3, 4, 5, 6), times = 6),
  var1 = c(
    0, 0, 0, NA, 1, 2,    # StudentID 1
    0, 1, 2, 2, 2, 2,     # StudentID 2
    0, 0, NA, 1, 1, 1,    # StudentID 3
    NA, 0, 0, 0, 0, 1,    # StudentID 4
    0, 0, 0, NA, 0, 0,    # StudentID 5
    0, 0, 0, 1, NA, NA    # StudentID 6
  )
)
library(dplyr)
# For each StudentID keep the rows from the top of the group down to (and
# including) the first row where var1 == 1. which() drops NA comparisons, and
# n() is the fallback so a student with no var1 == 1 keeps all of their rows.
# The result is still grouped by StudentID (no ungroup() is applied).
data2 <- data1 %>%
  group_by(StudentID) %>%
  slice(seq_len(min(which(var1 == 1), n())))
After many attempts I was able to obtain 'data2' from 'data1'. The rule is simple: in data1, FOR EACH STUDENTID, find the first row where var1 equals 1, keep that row, and delete every row after it.
Upvotes: 2
Views: 39
Reputation: 25225
Another option is to find the rows where `var1 == 1L` (falling back to the last row of each StudentID when there is no such row), use `unique` to select the top row per StudentID, and then perform a non-equi inner join to filter the rows:
library(data.table)
setDT(data1)
# Candidate cut-off rows: every row with var1 == 1, plus the last row of each
# StudentID run (diff() != 0 marks the row before the id changes; the final
# row of the table is always included).
f <- data1[var1 == 1L | c(diff(StudentID) != 0L, TRUE)]
# Keep only the first candidate per StudentID, i.e. the cut-off Time.
f <- unique(f, by = "StudentID")
# Drop var1 so the join below takes var1 from data1 only.
f[, var1 := NULL]
# Non-equi inner join: keep the data1 rows whose Time is <= the cut-off Time
# of the matching StudentID.
f[data1, on = .(StudentID, Time >= Time), nomatch = 0L]
timing code:
library(data.table)
setDT(data1)

# Build a large test table by stacking 2e5 copies of data1.
DT <- rbindlist(replicate(2e5, data1, simplify = FALSE))

# Renumber StudentID so that every run of identical ids gets its own new id:
# the counter is bumped each time StudentID changes between consecutive rows.
DT[, StudentID := c(1L, 1L + cumsum(diff(StudentID) != 0L))]

# Time each approach once; the results are stored in a1 / a2 for comparison.
microbenchmark::microbenchmark(
  # mtd0: per-group row indices via .I, then subset DT by those indices.
  mtd0 = a1 <- {
    DT[DT[, .I[c(seq_len(min(which(var1 == 1), .N)))], .(StudentID)]$V1]
  },
  # mtd1: cut-off row per StudentID, then a non-equi inner join.
  mtd1 = a2 <- {
    f <- unique(
      DT[var1 == 1L | c(diff(StudentID) != 0L, TRUE)],
      by = "StudentID"
    )[, var1 := NULL]
    f[DT, on = .(StudentID, Time >= Time), nomatch = 0L]
  },
  times = 1L
)

# Check that both methods return the same set of rows.
fsetequal(a1, a2)
#[1] TRUE
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0 2.830089 2.830089 2.830089 2.830089 2.830089 2.830089 1
mtd1 1.153433 1.153433 1.153433 1.153433 1.153433 1.153433 1
Upvotes: 1
Reputation: 887501
If we want a similar option in `data.table`, either use the condition in `.SD`:
library(data.table)
# Per StudentID, keep the leading rows up to and including the first
# var1 == 1; a group with no such row keeps all .N of its rows.
setDT(data1)[, head(.SD, min(which(var1 == 1), .N)), by = .(StudentID)]
or use the row index with `.I`, and extract that column as `$V1` to subset the dataset:
# Collect, per StudentID, the global row numbers (.I) of the rows to keep
# (up to and including the first var1 == 1, or the whole group if there is
# none), then subset data1 with those row numbers (the unnamed column V1).
setDT(data1)[
  data1[, head(.I, min(which(var1 == 1), .N)), by = .(StudentID)]$V1
]
Or with `match`:
# match() gives the position of the first var1 equal to 1 within the group;
# when there is none, nomatch = .N makes the whole group be kept.
setDT(data1)[, .SD[seq_len(match(1, var1, nomatch = .N))], by = .(StudentID)]
Upvotes: 1