Reputation: 7714
Here is some data to clarify the desired input/output.
dt = structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS", "ZS", "ZS", "ZS", "ZS"),
date = structure(c(18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"),
R = c(-0.0199959672793103, -0.0262434257521769, -0.025610299646878, 0.0372167038069893, 0.0341361186736473, 0.000202504389108293, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384, 0.0488220267994761, 0.0232887982973971, 0.00444619037040206, -0.0181263375550329, -0.0133102149688453, -0.0468265360104722, 0.0467953860405097, 0.022377580589833, 0.00673862306172723),
R_acum = c(-0.0199959672793103, -0.0457146303488496, -0.0701541646142473, -0.0355483675725332, -0.00262573219299644, -0.00242375952618179, -0.0240363323965936, -0.00969254376327744, -0.066120823320278, -0.0205269491289479, 0.00228380119052418, 0.00674014577578741, -0.0181263375550329, -0.0311952870744229, -0.0765610558513475, -0.0333483739750707, -0.0117170493114047, -0.00505738302838277
)), row.names = c(NA, -18L), class = c("data.table", "data.frame"))
A sample of the data:
Objective:
For each ticker I'd like to set to zero the value of the R
variable for the following rows given a logical condition is satisfied on R_acum
.
In this case, if R_acum
is lower than -0.03 I'd like to set to zero the values of R
for the following rows, excluding the row when the condition is satisfied.
This is the desired output for this example:
This code generates the desired output but I'm sure there is a better/faster way to approach it.
threshold = -0.03
ls_dt = lapply(split(dt, dt$ticker), function(d){
idx = d[, ifelse(R_acum < threshold, .I, 0)]
idx = setdiff(idx, 0)
if(length(idx > 0)){
min_idx = min(idx)
d[, idx:=ifelse(.I > min_idx, 0, 1), by=ticker]
d[, R:=R * idx]
d[, idx:=NULL]
}
d
})
ls_dt
Upvotes: 2
Views: 72
Reputation: 102710
A data.table
option using shift
+ cumsum
dt[, R := R * (cumsum(shift(R_acum < -0.03, fill = FALSE)) == 0), ticker]
gives
> dt
ticker date R R_acum
1: FTNT 2021-01-04 -0.01999597 -0.019995967
2: FTNT 2021-01-05 -0.02624343 -0.045714630
3: FTNT 2021-01-06 0.00000000 -0.070154165
4: FTNT 2021-01-07 0.00000000 -0.035548368
5: FTNT 2021-01-08 0.00000000 -0.002625732
6: FTNT 2021-01-11 0.00000000 -0.002423760
7: MDB 2021-01-04 -0.02403633 -0.024036332
8: MDB 2021-01-05 0.01469705 -0.009692544
9: MDB 2021-01-06 -0.05698057 -0.066120823
10: MDB 2021-01-07 0.00000000 -0.020526949
11: MDB 2021-01-08 0.00000000 0.002283801
12: MDB 2021-01-11 0.00000000 0.006740146
13: ZS 2021-01-04 -0.01812634 -0.018126338
14: ZS 2021-01-05 -0.01331021 -0.031195287
15: ZS 2021-01-06 0.00000000 -0.076561056
16: ZS 2021-01-07 0.00000000 -0.033348374
17: ZS 2021-01-08 0.00000000 -0.011717049
18: ZS 2021-01-11 0.00000000 -0.005057383
Upvotes: 3
Reputation: 1159
I do not know what you mean with better, but I would suggest
> threshold <- -0.03
> row.idxs <- df[, .I[(min(which(R_acum < threshold))+1):.N], by = ticker]$V1
> df[row.idxs, R:= 0]
> df
ticker date R R_acum
1: FTNT 2021-01-04 -0.01999597 -0.019995967
2: FTNT 2021-01-05 -0.02624343 -0.045714630
3: FTNT 2021-01-06 0.00000000 -0.070154165
4: FTNT 2021-01-07 0.00000000 -0.035548368
5: FTNT 2021-01-08 0.00000000 -0.002625732
6: FTNT 2021-01-11 0.00000000 -0.002423760
7: MDB 2021-01-04 -0.02403633 -0.024036332
8: MDB 2021-01-05 0.01469705 -0.009692544
9: MDB 2021-01-06 -0.05698057 -0.066120823
10: MDB 2021-01-07 0.00000000 -0.020526949
11: MDB 2021-01-08 0.00000000 0.002283801
12: MDB 2021-01-11 0.00000000 0.006740146
13: ZS 2021-01-04 -0.01812634 -0.018126338
14: ZS 2021-01-05 -0.01331021 -0.031195287
15: ZS 2021-01-06 0.00000000 -0.076561056
16: ZS 2021-01-07 0.00000000 -0.033348374
17: ZS 2021-01-08 0.00000000 -0.011717049
18: ZS 2021-01-11 0.00000000 -0.005057383
> dput(df)
structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT",
"FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS",
"ZS", "ZS", "ZS", "ZS"), date = structure(c(18631, 18632, 18633,
18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638,
18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"),
R = c(-0.0199959672793103, -0.0262434257521769, 0, 0, 0,
0, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384,
0, 0, 0, -0.0181263375550329, -0.0133102149688453, 0, 0,
0, 0), R_acum = c(-0.0199959672793103, -0.0457146303488496,
-0.0701541646142473, -0.0355483675725332, -0.00262573219299644,
-0.00242375952618179, -0.0240363323965936, -0.00969254376327744,
-0.066120823320278, -0.0205269491289479, 0.00228380119052418,
0.00674014577578741, -0.0181263375550329, -0.0311952870744229,
-0.0765610558513475, -0.0333483739750707, -0.0117170493114047,
-0.00505738302838277)), row.names = c(NA, -18L), class = c("data.table",
"data.frame"))
This way your question is related to Subset by group with data.table compared to aggregate a data.table and Conditionally Select Rows within a Group with Data.Table.
Here is a version that only uses base R
.
df <- structure(list(ticker = c("FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "FTNT", "MDB", "MDB", "MDB", "MDB", "MDB", "MDB", "ZS", "ZS", "ZS", "ZS", "ZS", "ZS"),
date = structure(c(18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638, 18631, 18632, 18633, 18634, 18635, 18638), tzone = "UTC", tclass = "Date", class = "Date"),
R = c(-0.0199959672793103, -0.0262434257521769, -0.025610299646878, 0.0372167038069893, 0.0341361186736473, 0.000202504389108293, -0.0240363323965936, 0.0146970518570011, -0.0569805661884384, 0.0488220267994761, 0.0232887982973971, 0.00444619037040206, -0.0181263375550329, -0.0133102149688453, -0.0468265360104722, 0.0467953860405097, 0.022377580589833, 0.00673862306172723),
R_acum = c(-0.0199959672793103, -0.0457146303488496, -0.0701541646142473, -0.0355483675725332, -0.00262573219299644, -0.00242375952618179, -0.0240363323965936, -0.00969254376327744, -0.066120823320278, -0.0205269491289479, 0.00228380119052418, 0.00674014577578741, -0.0181263375550329, -0.0311952870744229, -0.0765610558513475, -0.0333483739750707, -0.0117170493114047, -0.00505738302838277
)), row.names = c(NA, -18L), class = c("data.table", "data.frame"))
threshold <- -0.03
df.list <- split(df, df$ticker)
df.list <- lapply(df.list,
FUN = function(df) {
idxs <- which(df$R_acum < threshold)
# The first match should be ignored
min.idx <- min(idxs) + 1
idxs <- min.idx:nrow(df)
df$R[idxs] <- 0
return(df)
})
df.list
$FTNT
ticker date R R_acum
1 FTNT 2021-01-04 -0.01999597 -0.019995967
2 FTNT 2021-01-05 -0.02624343 -0.045714630
3 FTNT 2021-01-06 0.00000000 -0.070154165
4 FTNT 2021-01-07 0.00000000 -0.035548368
5 FTNT 2021-01-08 0.00000000 -0.002625732
6 FTNT 2021-01-11 0.00000000 -0.002423760
$MDB
ticker date R R_acum
7 MDB 2021-01-04 -0.02403633 -0.024036332
8 MDB 2021-01-05 0.01469705 -0.009692544
9 MDB 2021-01-06 -0.05698057 -0.066120823
10 MDB 2021-01-07 0.00000000 -0.020526949
11 MDB 2021-01-08 0.00000000 0.002283801
12 MDB 2021-01-11 0.00000000 0.006740146
$ZS
ticker date R R_acum
13 ZS 2021-01-04 -0.01812634 -0.018126338
14 ZS 2021-01-05 -0.01331021 -0.031195287
15 ZS 2021-01-06 0.00000000 -0.076561056
16 ZS 2021-01-07 0.00000000 -0.033348374
17 ZS 2021-01-08 0.00000000 -0.011717049
18 ZS 2021-01-11 0.00000000 -0.005057383
Here is the output of dput
for df.list
.
list(FTNT = structure(list(ticker = c("FTNT", "FTNT", "FTNT",
"FTNT", "FTNT", "FTNT"), date = structure(c(18631, 18632, 18633,
18634, 18635, 18638), class = "Date"), R = c(-0.0199959672793103,
-0.0262434257521769, 0, 0, 0, 0), R_acum = c(-0.0199959672793103,
-0.0457146303488496, -0.0701541646142473, -0.0355483675725332,
-0.00262573219299644, -0.00242375952618179)), row.names = c(NA,
6L), class = c("data.table", "data.frame")), MDB = structure(list(
ticker = c("MDB", "MDB", "MDB", "MDB", "MDB", "MDB"), date = structure(c(18631,
18632, 18633, 18634, 18635, 18638), class = "Date"), R = c(-0.0240363323965936,
0.0146970518570011, -0.0569805661884384, 0, 0, 0), R_acum = c(-0.0240363323965936,
-0.00969254376327744, -0.066120823320278, -0.0205269491289479,
0.00228380119052418, 0.00674014577578741)), row.names = 7:12, class = c("data.table",
"data.frame")), ZS = structure(list(ticker = c("ZS", "ZS", "ZS",
"ZS", "ZS", "ZS"), date = structure(c(18631, 18632, 18633, 18634,
18635, 18638), class = "Date"), R = c(-0.0181263375550329, -0.0133102149688453,
0, 0, 0, 0), R_acum = c(-0.0181263375550329, -0.0311952870744229,
-0.0765610558513475, -0.0333483739750707, -0.0117170493114047,
-0.00505738302838277)), row.names = 13:18, class = c("data.table",
"data.frame")))
If you want the original data frame back you can simply
df <- do.call(rbind, c(df.list, make.row.names = FALSE))
df
ticker date R R_acum
1 FTNT 2021-01-04 -0.01999597 -0.019995967
2 FTNT 2021-01-05 -0.02624343 -0.045714630
3 FTNT 2021-01-06 0.00000000 -0.070154165
4 FTNT 2021-01-07 0.00000000 -0.035548368
5 FTNT 2021-01-08 0.00000000 -0.002625732
6 FTNT 2021-01-11 0.00000000 -0.002423760
7 MDB 2021-01-04 -0.02403633 -0.024036332
8 MDB 2021-01-05 0.01469705 -0.009692544
9 MDB 2021-01-06 -0.05698057 -0.066120823
10 MDB 2021-01-07 0.00000000 -0.020526949
11 MDB 2021-01-08 0.00000000 0.002283801
12 MDB 2021-01-11 0.00000000 0.006740146
13 ZS 2021-01-04 -0.01812634 -0.018126338
14 ZS 2021-01-05 -0.01331021 -0.031195287
15 ZS 2021-01-06 0.00000000 -0.076561056
16 ZS 2021-01-07 0.00000000 -0.033348374
17 ZS 2021-01-08 0.00000000 -0.011717049
18 ZS 2021-01-11 0.00000000 -0.005057383
Upvotes: 0
Reputation: 31454
you can use
dt[, flag := rleid(R_acum < -0.03), by=ticker]
dt[flag>1, R := c(R[1], rep(0, .N-1)), by=ticker]
Upvotes: 2