Reputation: 69
I have a large data.table
. I require:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
> data
group value
1: A -0.3260365
2: A 0.5524619
3: A -0.6749438
4: A 0.2143595
5: A 0.3107692
6: A 1.1739663
7: A 0.6187899
8: A -0.1127343
9: A 0.9170283
10: A -0.2232594
11: B 0.5264481
12: B -0.7948444
13: B 1.4277555
14: B -1.4668197
15: B -0.2366834
16: B -0.1933380
17: B -0.8497547
To determine the rolling quantile of the 'value' column I am using the function runquantile()
from the package caTools
which takes ~1 minute to execute on my data set.
For the same rolling window (here k=4
), how to obtain the rolling mean value below the moving quantile, when computation time is a concern? In the example, the result should look something like column 'mean_below_q'.
library(caTools)
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
> data
group value rolling_q mean_below_q
1: A -0.3260365 NA NA
2: A 0.5524619 NA NA
3: A -0.6749438 NA NA
4: A 0.2143595 -0.21795730 -0.50049020
5: A 0.3107692 0.23364141 -0.23029220
6: A 1.1739663 0.23364141 -0.23029220
7: A 0.6187899 0.37237334 0.26256430
8: A -0.1127343 0.37237334 0.09901745
9: A 0.9170283 0.67843754 0.25302780
10: A -0.2232594 0.03357052 -0.16799680
11: B 0.5264481 NA NA
12: B -0.7948444 NA NA
13: B 1.4277555 NA NA
14: B -1.4668197 -0.53058593 -1.13083200
15: B -0.2366834 -0.68321222 -1.13083200
16: B -0.1933380 -0.22801430 -0.85175150
17: B -0.8497547 -0.72714047 -1.15828700
Thank you very much.
As per @chinsoon12 brilliant script it solves the problem and takes roughly 15 minutes on the large data set.
# Option 1)
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
Lastly, I tried to use runquantile()
from the caTools
package in conjunction with the Option 1) code for the rolling mean below threshold. It seems to work fine and takes in total 2.5 minutes on the large data set.
# rolling_q via runquantile & mean_below_q via Option 1)
library(data.table)
library(caTools)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7),rep("C",13)), value=c(rnorm(17),rep(0,3),-0.9,rep(0,4),-0.5,-0.8,rep(0,3)))
data[,rolling_q := c(rep(NA,3),runquantile(value,k=4,0.4,endrule="trim")),group]
sz <- 4L
prob <- 0.4
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
> data
group value rolling_q sr er mean_below_q
1: A -0.3260365 NA -2 1 NA
2: A 0.5524619 NA -1 2 NA
3: A -0.6749438 NA 0 3 NA
4: A 0.2143595 -0.21795730 1 4 -0.50049017
5: A 0.3107692 0.23364141 2 5 -0.23029219
6: A 1.1739663 0.23364141 3 6 -0.23029219
7: A 0.6187899 0.37237334 4 7 0.26256434
8: A -0.1127343 0.37237334 5 8 0.09901745
9: A 0.9170283 0.67843754 6 9 0.25302777
10: A -0.2232594 0.03357052 7 10 -0.16799684
11: B 0.5264481 NA 8 11 NA
12: B -0.7948444 NA 9 12 NA
13: B 1.4277555 NA 10 13 NA
14: B -1.4668197 -0.53058593 11 14 -1.13083206
15: B -0.2366834 -0.68321222 12 15 -1.13083206
16: B -0.1933380 -0.22801430 13 16 -0.85175154
17: B -0.8497547 -0.72714047 14 17 -1.15828722
18: C 0.0000000 NA 15 18 NA
19: C 0.0000000 NA 16 19 NA
20: C 0.0000000 NA 17 20 NA
21: C -0.9000000 0.00000000 18 21 -0.22500000
22: C 0.0000000 0.00000000 19 22 -0.22500000
23: C 0.0000000 0.00000000 20 23 -0.22500000
24: C 0.0000000 0.00000000 21 24 -0.22500000
25: C 0.0000000 0.00000000 22 25 0.00000000
26: C -0.5000000 0.00000000 23 26 -0.12500000
27: C -0.8000000 -0.40000000 24 27 -0.65000000
28: C 0.0000000 -0.40000000 25 28 -0.65000000
29: C 0.0000000 -0.40000000 26 29 -0.65000000
30: C 0.0000000 0.00000000 27 30 -0.20000000
group value rolling_q sr er mean_below_q
Upvotes: 1
Views: 468
Reputation: 25225
I believe there is a typo as data is showing mean of values below percentile rather than above percentile.
Here are 2 options:
1) using frollapply
(as it returns a value of length 1), hence you need another non-equi join to calculate your mean of values below percentile.
data[, rolling_q := frollapply(value, sz, function(x) {
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
#.(q, sum(x[1L:lo]) / lo)
}), group]
data[, mean_below_q :=
data[data, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, mean(value[value<=i.rolling_q])]$V1
]
2) Using 1 non-equi join to calculate both values
data[, c("rolling_q", "mean_below_q") :=
.SD[.SD, on=.(group, er>=sr, er<=er), allow.cartesian=TRUE,
by=.EACHI, {
if (.N >= sz) {
x <- x.value
x <- sort(x, partial = unique(c(lo, hi)))
qs <- x[lo]
i <- which(index > lo & x[hi] != qs)
h <- (index - lo)[i]
qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
qs
.(qs, sum(x[1L:lo]) / lo)
} else
.(NA_real_, NA_real_)
}][, (1L:3L) := NULL]
]
output:
group value sr er rolling_q mean_below_q
1: A -0.3260365 -2 1 NA NA
2: A 0.5524619 -1 2 NA NA
3: A -0.6749438 0 3 NA NA
4: A 0.2143595 1 4 -0.21795730 -0.50049017
5: A 0.3107692 2 5 0.23364141 -0.23029219
6: A 1.1739663 3 6 0.23364141 -0.23029219
7: A 0.6187899 4 7 0.37237334 0.26256434
8: A -0.1127343 5 8 0.37237334 0.09901745
9: A 0.9170283 6 9 0.67843754 0.25302777
10: A -0.2232594 7 10 0.03357052 -0.16799684
11: B 0.5264481 8 11 NA NA
12: B -0.7948444 9 12 NA NA
13: B 1.4277555 10 13 NA NA
14: B -1.4668197 11 14 -0.53058593 -1.13083206
15: B -0.2366834 12 15 -0.68321222 -1.13083206
16: B -0.1933380 13 16 -0.22801430 -0.85175154
17: B -0.8497547 14 17 -0.72714047 -1.15828722
data:
library(data.table)
set.seed(101)
data <- data.table(group=c(rep("A",10),rep("B",7)), value=rnorm(17))
sz <- 4L
prob <- 0.4
#see stats:::quantile.default
index <- 1 + (sz - 1) * prob
lo <- floor(index)
hi <- ceiling(index)
data[, c("sr", "er") := .(.I - sz + 1L, .I)]
Upvotes: 1