marbel
marbel

Reputation: 7714

optimize code to update following rows in group based on condition with data.table

Here is some data to clarify the desired input/output.

dt = structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS", "ZS", "ZS", "ZS", "ZS"), 
                    date = structure(c(18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"), 
                    R = c(-0.0199959672793103, -0.0262434257521769, -0.025610299646878,  0.0372167038069893, 0.0341361186736473, 0.000202504389108293, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384,  0.0488220267994761, 0.0232887982973971, 0.00444619037040206,  -0.0181263375550329, -0.0133102149688453, -0.0468265360104722,  0.0467953860405097, 0.022377580589833, 0.00673862306172723),
                    R_acum = c(-0.0199959672793103, -0.0457146303488496, -0.0701541646142473,  -0.0355483675725332, -0.00262573219299644, -0.00242375952618179, -0.0240363323965936, -0.00969254376327744, -0.066120823320278,  -0.0205269491289479, 0.00228380119052418, 0.00674014577578741, -0.0181263375550329, -0.0311952870744229, -0.0765610558513475, -0.0333483739750707, -0.0117170493114047, -0.00505738302838277
                    )), row.names = c(NA, -18L), class = c("data.table", "data.frame"))

A sample of the data:

enter image description here

Objective: For each ticker I'd like to set to zero the value of the R variable for the following rows given a logical condition is satisfied on R_acum. In this case, if R_acum is lower than -0.03 I'd like to set to zero the values of R for the following rows, excluding the row when the condition is satisfied.

This is the desired output for this example:

enter image description here

This code generates the desired output but I'm sure there is a better/faster way to approach it.

threshold = -0.03
ls_dt = lapply(split(dt, dt$ticker), function(d){
  idx = d[, ifelse(R_acum < threshold, .I, 0)]
  idx = setdiff(idx, 0)
  if(length(idx > 0)){
    min_idx = min(idx)
    d[, idx:=ifelse(.I > min_idx, 0, 1), by=ticker]
    d[, R:=R * idx]
    d[, idx:=NULL]
  }
  d
})
ls_dt

Upvotes: 2

Views: 72

Answers (3)

ThomasIsCoding
ThomasIsCoding

Reputation: 102710

A data.table option using shift + cumsum

dt[, R := R * (cumsum(shift(R_acum < -0.03, fill = FALSE)) == 0), ticker]

gives

> dt
    ticker       date           R       R_acum
 1:   FTNT 2021-01-04 -0.01999597 -0.019995967
 2:   FTNT 2021-01-05 -0.02624343 -0.045714630
 3:   FTNT 2021-01-06  0.00000000 -0.070154165
 4:   FTNT 2021-01-07  0.00000000 -0.035548368
 5:   FTNT 2021-01-08  0.00000000 -0.002625732
 6:   FTNT 2021-01-11  0.00000000 -0.002423760
 7:    MDB 2021-01-04 -0.02403633 -0.024036332
 8:    MDB 2021-01-05  0.01469705 -0.009692544
 9:    MDB 2021-01-06 -0.05698057 -0.066120823
10:    MDB 2021-01-07  0.00000000 -0.020526949
11:    MDB 2021-01-08  0.00000000  0.002283801
12:    MDB 2021-01-11  0.00000000  0.006740146
13:     ZS 2021-01-04 -0.01812634 -0.018126338
14:     ZS 2021-01-05 -0.01331021 -0.031195287
15:     ZS 2021-01-06  0.00000000 -0.076561056
16:     ZS 2021-01-07  0.00000000 -0.033348374
17:     ZS 2021-01-08  0.00000000 -0.011717049
18:     ZS 2021-01-11  0.00000000 -0.005057383

Upvotes: 3

MacOS
MacOS

Reputation: 1159

data.table

I do not know what you mean with better, but I would suggest

> threshold <- -0.03
> row.idxs <- df[, .I[(min(which(R_acum < threshold))+1):.N], by = ticker]$V1
> df[row.idxs, R:= 0]
> df
    ticker       date           R       R_acum
 1:   FTNT 2021-01-04 -0.01999597 -0.019995967
 2:   FTNT 2021-01-05 -0.02624343 -0.045714630
 3:   FTNT 2021-01-06  0.00000000 -0.070154165
 4:   FTNT 2021-01-07  0.00000000 -0.035548368
 5:   FTNT 2021-01-08  0.00000000 -0.002625732
 6:   FTNT 2021-01-11  0.00000000 -0.002423760
 7:    MDB 2021-01-04 -0.02403633 -0.024036332
 8:    MDB 2021-01-05  0.01469705 -0.009692544
 9:    MDB 2021-01-06 -0.05698057 -0.066120823
10:    MDB 2021-01-07  0.00000000 -0.020526949
11:    MDB 2021-01-08  0.00000000  0.002283801
12:    MDB 2021-01-11  0.00000000  0.006740146
13:     ZS 2021-01-04 -0.01812634 -0.018126338
14:     ZS 2021-01-05 -0.01331021 -0.031195287
15:     ZS 2021-01-06  0.00000000 -0.076561056
16:     ZS 2021-01-07  0.00000000 -0.033348374
17:     ZS 2021-01-08  0.00000000 -0.011717049
18:     ZS 2021-01-11  0.00000000 -0.005057383

> dput(df)
structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT", 
"FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS", 
"ZS", "ZS", "ZS", "ZS"), date = structure(c(18631, 18632, 18633, 
18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638, 
18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"), 
    R = c(-0.0199959672793103, -0.0262434257521769, 0, 0, 0, 
    0, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384, 
    0, 0, 0, -0.0181263375550329, -0.0133102149688453, 0, 0, 
    0, 0), R_acum = c(-0.0199959672793103, -0.0457146303488496, 
    -0.0701541646142473, -0.0355483675725332, -0.00262573219299644, 
    -0.00242375952618179, -0.0240363323965936, -0.00969254376327744, 
    -0.066120823320278, -0.0205269491289479, 0.00228380119052418, 
    0.00674014577578741, -0.0181263375550329, -0.0311952870744229, 
    -0.0765610558513475, -0.0333483739750707, -0.0117170493114047, 
    -0.00505738302838277)), row.names = c(NA, -18L), class = c("data.table", 
"data.frame"))

This way your question is related to Subset by group with data.table compared to aggregate a data.table and Conditionally Select Rows within a Group with Data.Table.

Base R

Here is a version that only uses base R.

df <- structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS", "ZS", "ZS", "ZS", "ZS"), 
                    date = structure(c(18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"), 
                    R = c(-0.0199959672793103, -0.0262434257521769, -0.025610299646878,  0.0372167038069893, 0.0341361186736473, 0.000202504389108293, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384,  0.0488220267994761, 0.0232887982973971, 0.00444619037040206,  -0.0181263375550329, -0.0133102149688453, -0.0468265360104722,  0.0467953860405097, 0.022377580589833, 0.00673862306172723),
                    R_acum = c(-0.0199959672793103, -0.0457146303488496, -0.0701541646142473,  -0.0355483675725332, -0.00262573219299644, -0.00242375952618179, -0.0240363323965936, -0.00969254376327744, -0.066120823320278,  -0.0205269491289479, 0.00228380119052418, 0.00674014577578741, -0.0181263375550329, -0.0311952870744229, -0.0765610558513475, -0.0333483739750707, -0.0117170493114047, -0.00505738302838277
                    )), row.names = c(NA, -18L), class = c("data.table", "data.frame"))


threshold <- -0.03

df.list <- split(df, df$ticker)

df.list <- lapply(df.list,
                  FUN = function(df) {
                    idxs <- which(df$R_acum < threshold)
                        
                    # The first match should be ignored
                    min.idx <- min(idxs) + 1
                        
                    idxs <- min.idx:nrow(df)
                        
                    df$R[idxs] <- 0
                        
                    return(df)
                  })

df.list

$FTNT
  ticker       date           R       R_acum
1   FTNT 2021-01-04 -0.01999597 -0.019995967
2   FTNT 2021-01-05 -0.02624343 -0.045714630
3   FTNT 2021-01-06  0.00000000 -0.070154165
4   FTNT 2021-01-07  0.00000000 -0.035548368
5   FTNT 2021-01-08  0.00000000 -0.002625732
6   FTNT 2021-01-11  0.00000000 -0.002423760

$MDB
   ticker       date           R       R_acum
7     MDB 2021-01-04 -0.02403633 -0.024036332
8     MDB 2021-01-05  0.01469705 -0.009692544
9     MDB 2021-01-06 -0.05698057 -0.066120823
10    MDB 2021-01-07  0.00000000 -0.020526949
11    MDB 2021-01-08  0.00000000  0.002283801
12    MDB 2021-01-11  0.00000000  0.006740146

$ZS
   ticker       date           R       R_acum
13     ZS 2021-01-04 -0.01812634 -0.018126338
14     ZS 2021-01-05 -0.01331021 -0.031195287
15     ZS 2021-01-06  0.00000000 -0.076561056
16     ZS 2021-01-07  0.00000000 -0.033348374
17     ZS 2021-01-08  0.00000000 -0.011717049
18     ZS 2021-01-11  0.00000000 -0.005057383

Here is the output of dput for df.list.

list(FTNT = structure(list(ticker = c("FTNT", "FTNT", "FTNT", 
"FTNT", "FTNT", "FTNT"), date = structure(c(18631, 18632, 18633, 
18634, 18635, 18638), class = "Date"), R = c(-0.0199959672793103, 
-0.0262434257521769, 0, 0, 0, 0), R_acum = c(-0.0199959672793103, 
-0.0457146303488496, -0.0701541646142473, -0.0355483675725332, 
-0.00262573219299644, -0.00242375952618179)), row.names = c(NA, 
6L), class = c("data.table", "data.frame")), MDB = structure(list(
    ticker = c("MDB", "MDB", "MDB", "MDB", "MDB", "MDB"), date = structure(c(18631, 
    18632, 18633, 18634, 18635, 18638), class = "Date"), R = c(-0.0240363323965936, 
    0.0146970518570011, -0.0569805661884384, 0, 0, 0), R_acum = c(-0.0240363323965936, 
    -0.00969254376327744, -0.066120823320278, -0.0205269491289479, 
    0.00228380119052418, 0.00674014577578741)), row.names = 7:12, class = c("data.table", 
"data.frame")), ZS = structure(list(ticker = c("ZS", "ZS", "ZS", 
"ZS", "ZS", "ZS"), date = structure(c(18631, 18632, 18633, 18634, 
18635, 18638), class = "Date"), R = c(-0.0181263375550329, -0.0133102149688453, 
0, 0, 0, 0), R_acum = c(-0.0181263375550329, -0.0311952870744229, 
-0.0765610558513475, -0.0333483739750707, -0.0117170493114047, 
-0.00505738302838277)), row.names = 13:18, class = c("data.table", 
"data.frame")))

If you want the original data frame back you can simply

df <- do.call(rbind, c(df.list, make.row.names = FALSE))
df

   ticker       date           R       R_acum
1    FTNT 2021-01-04 -0.01999597 -0.019995967
2    FTNT 2021-01-05 -0.02624343 -0.045714630
3    FTNT 2021-01-06  0.00000000 -0.070154165
4    FTNT 2021-01-07  0.00000000 -0.035548368
5    FTNT 2021-01-08  0.00000000 -0.002625732
6    FTNT 2021-01-11  0.00000000 -0.002423760
7     MDB 2021-01-04 -0.02403633 -0.024036332
8     MDB 2021-01-05  0.01469705 -0.009692544
9     MDB 2021-01-06 -0.05698057 -0.066120823
10    MDB 2021-01-07  0.00000000 -0.020526949
11    MDB 2021-01-08  0.00000000  0.002283801
12    MDB 2021-01-11  0.00000000  0.006740146
13     ZS 2021-01-04 -0.01812634 -0.018126338
14     ZS 2021-01-05 -0.01331021 -0.031195287
15     ZS 2021-01-06  0.00000000 -0.076561056
16     ZS 2021-01-07  0.00000000 -0.033348374
17     ZS 2021-01-08  0.00000000 -0.011717049
18     ZS 2021-01-11  0.00000000 -0.005057383

Upvotes: 0

dww
dww

Reputation: 31454

you can use

dt[, flag := rleid(R_acum < -0.03), by=ticker]               
dt[flag>1, R := c(R[1], rep(0, .N-1)), by=ticker]

Upvotes: 2

Related Questions