Reputation: 151
I am working with some big time series datasets, with about 2million rows in each file. So far I've been using ddply to aggregate the data like I want it to but unfortunately it has become too slow and I really need a faster way. Here is my code:
DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
DF <- DF[which(DF$DateTime != 0),]
options(digits.secs = 3)
DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")
completecase <- DF[complete.cases(DF),]
midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
**# creating 10 millisecond time intervals**
cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
**#creating new Time variable with the 10 millisecond breaks**
midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
**#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
mp$Time = as.POSIXct(mp$Time)
mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts = makeReturns(mp_xts)
ddply is way too slow. I know I can do this with the data.table package or the dplyr package but what I cant seem to figure out is replicating the .drop = FALSE argument. Both the other packages drop all the empty time intervals but it is important for me to retain them for my analysis.
What I am trying to do is basically, aggregate my time series for every 10milliseconds, and fill the NA time frames with either 0 or locf. So if it does it right I should have 6.25*60*60*100 = 2,250,000 rows as my output.
Any ideas how I can do this faster while doing exactly the same thing as ddply?
> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806,
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808,
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813,
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82,
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85,
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L,
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L,
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L,
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L,
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L,
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L,
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L,
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L,
84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L,
84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L,
82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L,
82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L,
84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L,
84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L,
82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L,
82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L,
87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L,
84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L,
82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L),
BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L,
85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L,
84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5"), row.names = c(NA, 20L), class = "data.frame")
Please, let me know if I should post any other information and really appreciate the help.
> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8 LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] highfrequency_0.4 xts_0.9-7 zoo_1.7-12 data.table_1.9.7
loaded via a namespace (and not attached):
[1] tools_3.2.2 grid_3.2.2 lattice_0.20-33
Upvotes: 0
Views: 344
Reputation: 49448
The latest development version of data.table
adds non-equi joins which allow for very compact and fast solutions for these kinds of problems:
dt = as.data.table(df) # or convert in place using setDT
breaks = dt[, {tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01);
.(Start = tmp, End = c(tail(tmp, -1), Inf))}];
dt[breaks, on = .(DateTime >= Start, DateTime < End),
lapply(.SD, mean), by = .EACHI]
Upvotes: 3
Reputation: 35314
Can't do much better than data.table, except perhaps trying to write your own C++ implementation in Rcpp. Anyway, here's a data.table solution, using findInterval()
instead of cut()
, which should be slightly faster. Also, I've expanded the breaks outside the min and max to get some extra empty intervals to test.
library(data.table);
setDT(DF);
breaks <- seq.POSIXt(min(DF$DateTime)-0.01,max(DF$DateTime)+0.01,0.01);
breaks;
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [7] "2015-12-20 22:45:00 EST"
diff(breaks);
## Time differences in secs
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999
ints <- findInterval(DF$DateTime,breaks);
ints;
## [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6
agg <- DF[,lapply(.SD,mean),breaks[ints],.SDcols=-1L];
agg;
## breaks Seq BP1 BQ1 BO1 AP1 AQ1 AO1 BP2 BQ2 BO2 AP2 AQ2 AO2 BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4 BP5 BQ5 BO5 AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
emp <- setdiff(seq_along(breaks),ints);
emp;
## [1] 1 5 7
agg <- rbind(fill=T,agg,data.table(breaks=breaks[emp]));
agg;
## breaks Seq BP1 BQ1 BO1 AP1 AQ1 AO1 BP2 BQ2 BO2 AP2 AQ2 AO2 BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4 BP5 BQ5 BO5 AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
## 5: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 7: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Upvotes: 2