Daniel Krizian
Daniel Krizian

Reputation: 4716

data.table subset with irregular date range boundaries by group

I have different period starts for each instrument organised in this data.table ....

S

   Instrument       Date
1:        DBC 2006-02-08
2:        IEF 2002-08-02
3:        VEU 2007-03-10
4:        VNQ 2004-10-03
5:        VTI 2001-06-17

and another data.table.....

DT

   Instrument       Date   Open   High    Low  Close  Volume Adjusted
 1:        DBC 2006-02-06  24.55  24.66  24.10  24.20  771500    22.69
 2:        DBC 2006-02-07  24.05  24.20  23.45  23.50  674400    22.04
 3:        DBC 2006-02-08  23.49  23.67  23.33  23.40  335400    21.94
 4:        DBC 2006-02-09  23.56  23.82  23.51  23.62  705900    22.15
 5:        DBC 2006-02-10  23.65  23.69  23.10  23.19  276700    21.75
 6:        IEF 2002-07-31  82.05  82.58  82.05  82.52   32600    55.22
 7:        IEF 2002-08-01  82.54  82.90  82.52  82.86   71400    55.44
 8:        IEF 2002-08-02  83.02  83.70  82.90  83.50  120300    55.87
 9:        IEF 2002-08-05  83.68  83.92  83.53  83.92  159300    56.15
10:        IEF 2002-08-06  83.51  83.51  83.03  83.24   90100    55.70
11:        VEU 2007-03-08  50.71  51.10  50.71  50.96  113600    43.05
12:        VEU 2007-03-09  51.15  51.36  51.02  51.30  139100    43.34
13:        VEU 2007-03-12  51.25  51.59  51.20  51.54  108000    43.54
14:        VEU 2007-03-13  51.20  51.23  50.17  50.19  216900    42.40
15:        VEU 2007-03-14  50.00  50.26  49.39  50.25  154900    42.45
16:        VNQ 2004-10-01  50.35  51.25  50.25  51.18  129800    32.84
17:        VNQ 2004-10-04  51.39  51.55  51.39  51.40    8100    32.98
18:        VNQ 2004-10-05  51.40  51.50  51.35  51.39   11900    32.97
19:        VNQ 2004-10-06  51.50  51.63  51.49  51.63    9300    33.12
20:        VNQ 2004-10-07  51.65  51.65  51.10  51.10    7900    32.78
21:        VTI 2001-06-15 110.85 112.01 110.35 111.33 1067400    44.58
22:        VTI 2001-06-18 111.63 111.83 110.62 110.62  282600    44.29
23:        VTI 2001-06-19 112.30 112.33 110.47 110.93 1777600    44.42
24:        VTI 2001-06-20 110.75 112.15 110.75 112.10  476000    44.88
25:        VTI 2001-06-21 112.00 113.45 111.75 113.00  240400    45.24
    Instrument       Date   Open   High    Low  Close  Volume Adjusted

... , which I would like to subset, taking Date from S as starting points for each Instrument:


Reproducible data:

S <- structure(list(Instrument = c("DBC", "IEF", "VEU", "VNQ", "VTI"), Date = structure(c(13187, 11901, 13582, 12694, 11490), class = "Date")), row.names     =c(NA, -5L), class = c("data.table", "data.frame"), .Names = c("Instrument", "Date"), sorted = c("Instrument", "Date"))

DT <- structure(list(Instrument = c("DBC", "DBC", "DBC", "DBC", "DBC", "IEF", "IEF", "IEF", "IEF", "IEF", "VEU", "VEU", "VEU", "VEU", "VEU", "VNQ", "VNQ", "VNQ", "VNQ", "VNQ", "VTI", "VTI", "VTI", "VTI", "VTI"), Date = structure(c(13185, 13186, 13187, 13188, 13189, 11899,11900, 11901, 11904, 11905, 13580, 13581, 13584, 13585, 13586, 12692, 12695, 12696, 12697, 12698, 11488, 11491, 11492, 11493, 11494), class = c("IDate", "Date")), Open = c(24.55, 24.05, 23.49, 23.56, 23.65, 82.05, 82.54, 83.02, 83.68, 83.51, 50.71, 51.15, 51.25,51.2, 50, 50.35, 51.39, 51.4, 51.5, 51.65, 110.85, 111.63, 112.3, 110.75, 112), High = c(24.66, 24.2, 23.67, 23.82, 23.69, 82.58, 82.9, 83.7, 83.92, 83.51, 51.1, 51.36, 51.59,51.23, 50.26, 51.25, 51.55, 51.5, 51.63, 51.65, 112.01, 111.83, 112.33, 112.15, 113.45), Low = c(24.1, 23.45, 23.33, 23.51, 23.1, 82.05, 82.52, 82.9, 83.53, 83.03, 50.71, 51.02, 51.2, 50.17, 49.39, 50.25, 51.39, 51.35, 51.49, 51.1, 110.35, 110.62, 110.47, 110.75, 111.75), Close = c(24.2, 23.5, 23.4, 23.62, 23.19, 82.52, 82.86, 83.5, 83.92, 83.24, 50.96, 51.3, 51.54, 50.19, 50.25, 51.18, 51.4, 51.39, 51.63, 51.1, 111.33, 110.62, 110.93, 112.1, 113), Volume = c(771500, 674400, 335400, 705900, 276700, 32600, 71400, 120300, 159300, 90100, 113600, 139100, 108000, 216900, 154900, 129800, 8100, 11900, 9300, 7900,1067400, 282600, 1777600, 476000, 240400), Adjusted = c(22.69, 22.04, 21.94, 22.15, 21.75, 55.22, 55.44, 55.87, 56.15, 55.7, 43.05, 43.34, 43.54, 42.4, 42.45, 32.84, 32.98, 32.97, 33.12, 32.78, 44.58, 44.29, 44.42, 44.88, 45.24)), .Names = c("Instrument", "Date", "Open", "High", "Low", "Close", "Volume", "Adjusted"), row.names = c(NA, -25L), class = c("data.table", "data.frame"), sorted = c("Instrument", "Date"))

Upvotes: 2

Views: 260

Answers (2)

Daniel Krizian
Daniel Krizian

Reputation: 4716

Here is the summary and benchmark test of the three solutions provided by @agstudy and @Arun

  1. merge(DT,S)[,.SD[Date.x>=Date.y],Instrument]
  2. DT[S, .SD[Date >= i.Date]]
  3. DT[S][Date>=Date.1]

Benchmarks

require(rbenchmark)
setkey(DT, Instrument, Date) # initial condition
setkey(S, Instrument, Date) # initial condition

agstudy <- function() {
  setkey(DT, Instrument)
  setkey(S, Instrument)

  out <- merge(DT,S)[,.SD[Date.x>=Date.y],Instrument]

  out[,Date.y:=NULL]
  setnames(out,"Date.x", "Date")
  setkey(out, Instrument, Date)
  return(out)
}

Arun <- function(){
  setkey(S, Instrument)

  out <- DT[S, .SD[Date >= i.Date]]

  setkey(out, Instrument, Date)
  return(out)
}

DanielKrizian <- function(){
    setkey(S, Instrument)

    out <- DT[S][Date>=Date.1]

    setkey(out, Instrument, Date)
    out[,Date.1:=NULL]
    return(out)
}

setkey(DT, Instrument, Date) # initial condition
setkey(S, Instrument, Date) # initial condition
benchmark(ag <- agstudy(),replications=1000)

setkey(DT, Instrument, Date) 
setkey(S, Instrument, Date)
benchmark(ar <- Arun(),replications=1000)

setkey(DT, Instrument, Date)
setkey(S, Instrument, Date)
benchmark(dk <- DanielKrizian(),replications=1000)

Output

           test replications elapsed relative user.self sys.self user.child sys.child
ag <- agstudy()         1000    9.39        1      8.55     0.07         NA        NA
ar <- Arun()            1000    7.16        1      6.53     0.07         NA        NA
dk <- DanielKrizian()   1000    2.95        1       2.7        0         NA        NA

Check

all(identical(ag, ar),identical(ag,dk),identical(ar,dk))
[1]TRUE

Upvotes: 2

agstudy
agstudy

Reputation: 121608

You can merge the 2 sets , then filter by instrument:

setkey(S,Instrument)
setkey(DT,Instrument)
merge(DT,S)[,.SD[Date.x>=Date.y],Instrument]

  Instrument     Date.x   Open   High    Low  Close  Volume Adjusted     Date.y
 1:        DBC 2006-02-08  23.49  23.67  23.33  23.40  335400    21.94 2006-02-08
 2:        DBC 2006-02-09  23.56  23.82  23.51  23.62  705900    22.15 2006-02-08
 3:        DBC 2006-02-10  23.65  23.69  23.10  23.19  276700    21.75 2006-02-08
 4:        IEF 2002-08-02  83.02  83.70  82.90  83.50  120300    55.87 2002-08-02
 5:        IEF 2002-08-05  83.68  83.92  83.53  83.92  159300    56.15 2002-08-02
 6:        IEF 2002-08-06  83.51  83.51  83.03  83.24   90100    55.70 2002-08-02
 7:        VEU 2007-03-12  51.25  51.59  51.20  51.54  108000    43.54 2007-03-10
 8:        VEU 2007-03-13  51.20  51.23  50.17  50.19  216900    42.40 2007-03-10
 9:        VEU 2007-03-14  50.00  50.26  49.39  50.25  154900    42.45 2007-03-10
10:        VNQ 2004-10-04  51.39  51.55  51.39  51.40    8100    32.98 2004-10-03
11:        VNQ 2004-10-05  51.40  51.50  51.35  51.39   11900    32.97 2004-10-03
12:        VNQ 2004-10-06  51.50  51.63  51.49  51.63    9300    33.12 2004-10-03
13:        VNQ 2004-10-07  51.65  51.65  51.10  51.10    7900    32.78 2004-10-03
14:        VTI 2001-06-18 111.63 111.83 110.62 110.62  282600    44.29 2001-06-17
15:        VTI 2001-06-19 112.30 112.33 110.47 110.93 1777600    44.42 2001-06-17
16:        VTI 2001-06-20 110.75 112.15 110.75 112.10  476000    44.88 2001-06-17
17:        VTI 2001-06-21 112.00 113.45 111.75 113.00  240400    45.24 2001-06-17

Upvotes: 2

Related Questions