UtdMan
UtdMan

Reputation: 151

Replicating ddply with the dplyr package? ddply is too slow

I am working with some big time series datasets, with about 2million rows in each file. So far I've been using ddply to aggregate the data like I want it to but unfortunately it has become too slow and I really need a faster way. Here is my code:

    DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
    DF <- DF[which(DF$DateTime != 0),]
    options(digits.secs = 3)
    DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")     
    completecase <- DF[complete.cases(DF),]
    midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
    **# creating 10 millisecond time intervals**
    cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
   **#creating new Time variable with the 10 millisecond breaks**
    midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
   **#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
    mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
    mp$Time = as.POSIXct(mp$Time)
    mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
    mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
    mp_xts = makeReturns(mp_xts)

ddply is way too slow. I know I can do this with the data.table package or the dplyr package but what I cant seem to figure out is replicating the .drop = FALSE argument. Both the other packages drop all the empty time intervals but it is important for me to retain them for my analysis.

What I am trying to do is basically, aggregate my time series for every 10milliseconds, and fill the NA time frames with either 0 or locf. So if it does it right I should have 6.25*60*60*100 = 2,250,000 rows as my output.

Any ideas how I can do this faster while doing exactly the same thing as ddply?

> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806, 
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808, 
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813, 
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82, 
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85, 
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L, 
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L, 
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L, 
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L, 
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L, 
    84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 
    84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
    ), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L, 
    82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L, 
    82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L, 
    84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 
    84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
    ), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L, 
    82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L, 
    82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L, 
    87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L, 
    84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L, 
    0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L, 
    82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L), 
    BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L, 
    85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L, 
    84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1", 
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2", 
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4", 
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5", 
"AO5"), row.names = c(NA, 20L), class = "data.frame") 

Please, let me know if I should post any other information and really appreciate the help.

> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)

locale:
 [1] LC_CTYPE=en_IN.UTF-8       LC_NUMERIC=C               LC_TIME=en_IN.UTF-8        LC_COLLATE=en_IN.UTF-8     LC_MONETARY=en_IN.UTF-8    LC_MESSAGES=en_IN.UTF-8    LC_PAPER=en_IN.UTF-8      
 [8] LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] highfrequency_0.4 xts_0.9-7         zoo_1.7-12        data.table_1.9.7 

loaded via a namespace (and not attached):
[1] tools_3.2.2     grid_3.2.2      lattice_0.20-33

Upvotes: 0

Views: 344

Answers (2)

eddi
eddi

Reputation: 49448

The latest development version of data.table adds non-equi joins which allow for very compact and fast solutions for these kinds of problems:

dt = as.data.table(df) # or convert in place using setDT
breaks = dt[, {tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01);
               .(Start = tmp, End = c(tail(tmp, -1), Inf))}];

dt[breaks, on = .(DateTime >= Start, DateTime < End),
   lapply(.SD, mean), by = .EACHI]

Upvotes: 3

bgoldst
bgoldst

Reputation: 35314

Can't do much better than data.table, except perhaps trying to write your own C++ implementation in Rcpp. Anyway, here's a data.table solution, using findInterval() instead of cut(), which should be slightly faster. Also, I've expanded the breaks outside the min and max to get some extra empty intervals to test.

library(data.table);
setDT(DF);
breaks <- seq.POSIXt(min(DF$DateTime)-0.01,max(DF$DateTime)+0.01,0.01);
breaks;
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [7] "2015-12-20 22:45:00 EST"
diff(breaks);
## Time differences in secs
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999
ints <- findInterval(DF$DateTime,breaks);
ints;
##  [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6
agg <- DF[,lapply(.SD,mean),breaks[ints],.SDcols=-1L];
agg;
##                 breaks   Seq     BP1 BQ1 BO1   AP1  AQ1 AO1     BP2 BQ2 BO2     AP2 AQ2 AO2      BP3 BQ3 BO3   AP3 AQ3 AO3   BP4 BQ4 BO4   AP4 AQ4 AO4     BP5 BQ5 BO5   AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5   1   1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6  8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0   1   1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0   1   1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0   1   1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
emp <- setdiff(seq_along(breaks),ints);
emp;
## [1] 1 5 7
agg <- rbind(fill=T,agg,data.table(breaks=breaks[emp]));
agg;
##                 breaks   Seq     BP1 BQ1 BO1   AP1  AQ1 AO1     BP2 BQ2 BO2     AP2 AQ2 AO2      BP3 BQ3 BO3   AP3 AQ3 AO3   BP4 BQ4 BO4   AP4 AQ4 AO4     BP5 BQ5 BO5   AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5   1   1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6  8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0   1   1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0   1   1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0   1   1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
## 5: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA
## 6: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA
## 7: 2015-12-20 22:45:00    NA      NA  NA  NA    NA   NA  NA      NA  NA  NA      NA  NA  NA       NA  NA  NA    NA  NA  NA    NA  NA  NA    NA  NA  NA      NA  NA  NA    NA  NA  NA

Upvotes: 2

Related Questions