bvowe
bvowe

Reputation: 3384

R data.table equivalent of a dplyr solution

# Example data: six students, each observed at six time points.
# var1 deliberately contains NA values that the filtering must tolerate.
data1 <- data.frame(
  StudentID = rep(c(1, 2, 3, 4, 5, 6), each = 6),
  Time = rep(c(1, 2, 3, 4, 5, 6), times = 6),
  var1 = c(
    0, 0, 0, NA, 1, 2,   # student 1
    0, 1, 2, 2, 2, 2,    # student 2
    0, 0, NA, 1, 1, 1,   # student 3
    NA, 0, 0, 0, 0, 1,   # student 4
    0, 0, 0, NA, 0, 0,   # student 5 (never reaches 1)
    0, 0, 0, 1, NA, NA   # student 6
  )
)


library(dplyr)

# Per student, keep rows 1..k where k is the position of the first var1 == 1.
# which() ignores NA, and n() is the fallback when no row equals 1, so such
# students keep all of their rows.
data2 <- data1 %>%
  group_by(StudentID) %>%
  slice(seq_len(min(which(var1 == 1), n())))

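After several attempts I am able to obtain 'data2' from 'data1' with dplyr. The rule is simple: for each StudentID in data1, keep every row up to and including the first row where var1 equals 1, and delete everything after it. A student whose var1 never equals 1 keeps all of their rows. I am looking for a data.table solution that does the same.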

Upvotes: 2

Views: 39

Answers (2)

chinsoon12
chinsoon12

Reputation: 25225

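Another option is to find the rows where var1 == 1L (plus the last row of each student, as a fallback for students that never reach 1), use unique to keep only the first such row per student, and then perform a non-equi inner join to filter the rows: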

library(data.table)
setDT(data1)

# Cut-off candidates: rows with var1 == 1, plus the last row of every student
# (diff() marks where StudentID changes; the final row is always a boundary).
# unique() keeps the first candidate per student, i.e. the cut-off row.
f <- unique(
  data1[var1 == 1L | c(diff(StudentID) != 0L, TRUE)],
  by = "StudentID"
)
f[, var1 := NULL]

# Non-equi inner join: keep the rows of data1 whose Time does not exceed the
# cut-off Time of the same student.
f[data1, on = .(StudentID, Time >= Time), nomatch = 0L]

timing code:

# Benchmark of the row-index (.I) approach against the non-equi join approach.
library(data.table)
setDT(data1)
# Stack 2e5 copies of data1 to get a large test table.
DT <- rbindlist(replicate(2e5, data1, simplify=FALSE))
# Renumber StudentID: the ID increases by one every time the original
# StudentID changes between consecutive rows, so each stacked copy gets its
# own set of IDs.
DT[, StudentID:=c(1L, 1L+cumsum(diff(StudentID)!=0L))]
# times=1L: each method is run exactly once; results are kept in a1 / a2.
microbenchmark::microbenchmark(times=1L,
    # mtd0: collect per-group row numbers with .I, then subset DT by them.
    mtd0 = a1 <- {
        DT[DT[, .I[c(seq_len(min(which(var1 == 1), .N)))],.(StudentID)]$V1]    
    },
    # mtd1: build one cut-off row per student, then non-equi inner join.
    mtd1 = a2 <- {
        f <- unique(DT[var1==1L | c(diff(StudentID) != 0L, TRUE)], by="StudentID")[, var1 := NULL]
        f[DT, on=.(StudentID, Time>=Time), nomatch=0L]
    }
)
# Check that both methods return the same set of rows.
fsetequal(a1, a2)
#[1] TRUE

timings:

Unit: seconds
 expr      min       lq     mean   median       uq      max neval
 mtd0 2.830089 2.830089 2.830089 2.830089 2.830089 2.830089     1
 mtd1 1.153433 1.153433 1.153433 1.153433 1.153433 1.153433     1

Upvotes: 1

akrun
akrun

Reputation: 887501

If we want a similar option in data.table, either use the condition in .SD

library(data.table)
setDT(data1)

# Within each student, index .SD by rows 1..k, where k is the position of the
# first var1 == 1, or .N (all rows) when there is none.
data1[, .SD[seq_len(min(which(var1 == 1), .N))], by = .(StudentID)]

or use row index with .I, and extract the column as $V1 to subset the dataset

setDT(data1)

# The inner query returns the global row numbers (.I) to keep per student in
# column V1; the outer query subsets data1 by those row numbers.
data1[
  data1[, .I[seq_len(min(which(var1 == 1), .N))], by = .(StudentID)]$V1
]

Or with match

setDT(data1)

# match(1, var1) gives the position of the first 1, or NA when there is none;
# na.rm = TRUE lets min() fall back to .N (all rows) in that case.
data1[
  ,
  .SD[seq_len(min(match(1, var1), .N, na.rm = TRUE))],
  by = .(StudentID)
]

Upvotes: 1

Related Questions