Create new column of closest values according to another column by groups

Question

I have the following data frame (called mydata_tsample):

cusip_id     trd_exctn_dt   trd_exctn_tm    price   contra_party_type **refPrice**
BUHADU       01.04.2016     01:10:50        101.00  C                 102.10
BUHADU       01.04.2016     02:10:50        101.50  C                 102.10    
BUHADU       01.04.2016     08:10:50        102.10  D                 102.10
BUHADU       01.04.2016     09:10:50        102.10  C                 102.10
BUHADU       02.04.2016     07:12:50        90.50   C                 90.85
BUHADU       02.04.2016     09:10:55        90.85   D                 90.85
BUHADU       02.04.2016     12:11:40        90.90   C                 91.00
BUHADU       02.04.2016     12:12:02        91.00   D                 91.00
XDSEOI       03.04.2016     06:52:51        50.00   D                 50.00 
XDSEOI       03.04.2016     08:40:58        50.20   C                 50.00  
XDSEOI       03.04.2016     15:10:51        51.00   C                 52.00
XDSEOI       03.04.2016     15:14:51        52.00   D                 52.00

I'd like to add a new column (called refPrice) to this data frame; I currently calculate it with a for loop. For each row, refPrice should be the price of the row that satisfies the following conditions:

same cusip_id
same trd_exctn_dt
contra_party_type = D
among those rows, take the price of the one whose trd_exctn_tm is closest to the current row's trd_exctn_tm

I wrote code that does exactly this:

# For every trade, find the dealer trade (contra_party_type "D") with the same
# cusip_id and trd_exctn_dt whose trd_exctn_tm is closest to the trade's own
# time, and store that dealer trade's price in RefPrice.
#
# Expects trd_exctn_tm to be of a type that supports subtraction
# (numeric, difftime, POSIXct, ...).

# Create the result column up front. Without this, the first
# `mydata_tsample$RefPrice[i] <- x` on a plain data.frame builds a short vector
# that `$<-` recycles over all rows, so rows that are skipped below would keep
# an unrelated price instead of NA.
if (!"RefPrice" %in% names(mydata_tsample)) {
  mydata_tsample$RefPrice <- rep(NA_real_, nrow(mydata_tsample))
}

# seq_len() (not 1:nrow()) so that an empty data frame runs zero iterations.
for (i in seq_len(nrow(mydata_tsample))) {
  # Columns are addressed by name rather than by position so the loop does
  # not silently break if the column order changes.
  same_cusip <- mydata_tsample[
    mydata_tsample$cusip_id %in% mydata_tsample$cusip_id[i],
  ]
  same_day <- same_cusip[
    same_cusip$trd_exctn_dt %in% mydata_tsample$trd_exctn_dt[i],
  ]
  dealer_trades <- same_day[same_day$contra_party_type %in% "D", ]

  # No dealer trade for this cusip/day: leave RefPrice untouched (NA).
  if (nrow(dealer_trades) == 0) {
    next
  }

  # which.min() returns the first minimum, so ties go to the earlier row.
  time_gap <- abs(dealer_trades$trd_exctn_tm - mydata_tsample$trd_exctn_tm[i])
  mydata_tsample$RefPrice[i] <- dealer_trades$price[which.min(time_gap)]
}

The problem I have is speed. It takes me a couple of hours to process 0.5 million rows, and in total I have 5 million rows.

I tried with doParallel, but it did not work out.

library(doParallel)
library(foreach)
registerDoParallel(cores = 4)

# Parallel version of the row-wise lookup: for every trade, take the price of
# the dealer trade (contra_party_type "D") with the same cusip_id and
# trd_exctn_dt whose trd_exctn_tm is closest to the trade's own time.
#
# %dopar% evaluates the body on worker processes, so assignments to
# mydata_tsample made inside the body never reach the calling session.
# Each iteration therefore *returns* its reference price; foreach combines the
# results in iteration order (.combine = c) and the vector is stored once.
mydata_tsample$RefPrice <- foreach(
  i = seq_len(nrow(mydata_tsample)),
  .combine = c
) %dopar% {
  same_cusip <- mydata_tsample[
    mydata_tsample$cusip_id %in% mydata_tsample$cusip_id[i],
  ]
  same_day <- same_cusip[
    same_cusip$trd_exctn_dt %in% mydata_tsample$trd_exctn_dt[i],
  ]
  dealer_trades <- same_day[same_day$contra_party_type %in% "D", ]

  # The foreach body is an expression, not a loop, so `next` is not allowed
  # here; a trade without any dealer trade simply yields NA.
  if (nrow(dealer_trades) == 0) {
    NA_real_
  } else {
    time_gap <- abs(
      dealer_trades$trd_exctn_tm - mydata_tsample$trd_exctn_tm[i]
    )
    dealer_trades$price[which.min(time_gap)]
  }
}

user3603486 · Accepted Answer

Here is a simple partial solution that runs in seconds, and gets the nearest previous price where contra_party_type=="D".

# Toy data: n random trades spread over 26 one-letter "cusips", the dates
# 2016-01-02 .. 2016-12-31, a random time of day stored as an "HH:MM:SS"
# string, a price drawn around 100, and a counterparty type A-D.
library(dplyr)
library(zoo)

n <- 500000
dfr <- dplyr::tibble(
  cusip_id = sample(LETTERS, size = n, replace = TRUE),
  trd_exctn_dt = as.Date(
    sample(365, size = n, replace = TRUE),
    origin = "2016-01-01"
  ),
  trd_exctn_tm = strftime(
    as.POSIXlt(
      sample(60 * 60 * 24, size = n, replace = TRUE),
      origin = "1970-01-01"
    ),
    "%H:%M:%S"
  ),
  price = round(rnorm(n, mean = 100, sd = 5), digits = 2),
  contra_party_type = sample(LETTERS[1:4], size = n, replace = TRUE)
)

# Within each cusip/day, sort by time (zero-padded "HH:MM:SS" strings sort
# chronologically), keep the price only on "D" rows, and carry the last seen
# "D" price forward. Rows before the first "D" trade of the day stay NA
# because na.rm = FALSE keeps leading NAs.
dfr <- dfr %>%
  group_by(cusip_id, trd_exctn_dt) %>%
  arrange(trd_exctn_tm, .by_group = TRUE) %>%
  mutate(
    refprice = zoo::na.locf(
      dplyr::if_else(contra_party_type == "D", price, NA_real_),
      na.rm = FALSE
    )
  )
dfr

# A tibble: 500,000 x 6
# Groups:   cusip_id, trd_exctn_dt [9,490]
   cusip_id trd_exctn_dt trd_exctn_tm price contra_party_type refprice
                                       
 1 A        2016-01-02   00:25:47      89.6 D                     89.6
 2 A        2016-01-02   01:19:37     101.  B                     89.6
 3 A        2016-01-02   01:22:34     108.  B                     89.6
 4 A        2016-01-02   01:28:14     102.  D                    102. 
 5 A        2016-01-02   01:35:36      95.9 A                    102. 
 6 A        2016-01-02   01:45:01     102.  C                    102.

To do exactly what you want, I would

calculate the time difference to the last previous instance where contra_party_type is D
calculate the time difference to the next future instance where contra_party_type is D
calculate the prices for each of those instances
choose the price based on the closest time difference, using ifelse

Create new column of closest values according to another column by groups

Answers (2)

Related Questions