user1766682
user1766682

Reputation: 400

Find whether event's timestamp is within a time interval

I have one data.frame with events and their arrival timestamps (microsecond precision). In the second dataframe I have the states with start and end times (= validity interval).

Below I wrote for loop routine which does the job but is really slow. I think that a combination of data.table, map/apply, parallelization (I have 12 cores available) could substantially reduce the processing time.

Could you please help to optimize my code?

Thanks!

options(digits.secs = 6)

start <- strptime("2019-10-16 08:00:00.789543 CET", "%Y-%m-%d %H:%M:%OS")
start <- format(start, "%Y-%m-%d %H:%M:%OS")

end <- strptime("2019-10-16 08:10:00.471123 CET", "%Y-%m-%d %H:%M:%OS")
end <- format(end, "%Y-%m-%d %H:%M:%OS")


#### events
event_timestamps <- seq.POSIXt(as.POSIXct(start), 
                    as.POSIXct(end), units = "seconds", by = .1)
events <- sprintf("event%s",seq(1:length(event_timestamps)))


events_df <- data.frame(event_timestamps, events, stringsAsFactors=FALSE)

#### states
states <- sprintf("state%s",seq(1:4))

state_start <- c("2019-10-16 07:00:00.000000 CEST",
                 "2019-10-16 08:03:00.765233 CEST",
                 "2019-10-16 08:05:03.765432 CEST",
                 "2019-10-16 08:05:03.765434")


state_end <- c("2019-10-16 08:03:00.765232 CEST",
               "2019-10-16 08:05:03.765431 CEST",
               "2019-10-16 08:05:03.765433 CEST",
               "2019-10-16 08:12:03.471122 CEST")

states_df <- data.frame(states, state_start = as.POSIXct(state_start),
                        state_end = as.POSIXct(state_end), stringsAsFactors=FALSE)

#The state dataframe contains states with non-overlapping start and end timestamps.
#That means that one event can fall into exactly one state
# the goal is for every event to find the state it belongs to

#########################################################################

library(lubridate)

# empty data.frame
resulting_df <- data.frame(events = character(),
                           state = character(),
                           stringsAsFactors=FALSE) 



# loop eventy by event
for(event in 1:nrow(events_df)) {
  # go with the event to the states data.frame
  for (state in 1:nrow(states_df)) {
    # define state's interval
    interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET') 
    # check for every event if its timestamp is within the state interval
    if (events_df$event_timestamps[event] %within% interv){

      # then write the temp data.frame
      temp <- data.frame(events = events_df$events[event],
                         state = states_df$states[state],
                         stringsAsFactors=FALSE)

      # collect events with states
      resulting_df <- dplyr::bind_rows(resulting_df, temp)
      rm(temp)
      # one event can only be in one state at a time
      # after we found the state for the event, break the inner state loop
      # and move to the next event
      break
      next
    }


  }

}

Upvotes: 1

Views: 476

Answers (5)

chinsoon12
chinsoon12

Reputation: 25225

Some timings for reference using functions from data.table:

library(data.table) #data.table_1.12.4

s <- as.POSIXct(strptime("2019-10-01 00:00:00.000000 CET", "%Y-%m-%d %H:%M:%OS"))
e <- as.POSIXct(strptime("2019-10-10 23:59:59.999999 CET", "%Y-%m-%d %H:%M:%OS"))

#8,640,000 rows
events <- data.table(TIME=seq.POSIXt(s, e, units="seconds", by=.1))[, EVENT := .I]

#863,999 rows
h <- seq.POSIXt(s, e, units="hour", by=1)
states <- data.table(STATE=seq_len(length(h)-1L), START=h[-length(h)], END=h[-1L],
    key=c("START","END"))

events_foverlap <- copy(events)[, c("START", "END") := TIME]
states_foverlap <- copy(states)
setkey(events, TIME)

dt_foverlap <- function() {
    ans <- foverlaps(events_foverlap, states_foverlap, type="any", mult="first")
    ans[, .N]
}

dt_nonequi <- function() {
    ans <- states[events, on=.(START<=TIME, END>=TIME), mult="first"]
    ans[,.N]
}

dt_roll <- function() {
    ans <- states[events, roll=TRUE]   
    ans[,.N]
}

bench::mark(dt_foverlap(), dt_nonequi(), dt_roll())

timings:

# A tibble: 3 x 13
  expression         min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result    memory             time     gc              
  <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>    <list>             <list>   <list>          
1 dt_foverlap()    2.99s    2.99s     0.335    1.24GB    1.00      1     3      2.99s <int [1]> <df[,3] [122 x 3]> <bch:tm> <tibble [1 x 3]>
2 dt_nonequi()     3.78s    3.78s     0.265  372.55MB    0.265     1     1      3.78s <int [1]> <df[,3] [43 x 3]>  <bch:tm> <tibble [1 x 3]>
3 dt_roll()        1.09s    1.09s     0.918  329.69MB    0.918     1     1      1.09s <int [1]> <df[,3] [33 x 3]>  <bch:tm> <tibble [1 x 3]>

Upvotes: 0

B. Christian Kamgang
B. Christian Kamgang

Reputation: 6489

You could use the function foverlaps in the data.table package as follows (It's very fast!):

setDT(states_df, key = c("state_start", "state_end"))
cols <- c("state_start", "state_end")
setDT(events_df)[, (cols) := event_timestamps]

foverlaps(events_df, states_df)[, paste0("i.", cols) := NULL]

To understand how the function foverlaps works, it's better to read its documentation here

Upvotes: 1

Ape
Ape

Reputation: 1169

You can hack something together using the intervals package. Here is my quick and messy attempt:

int_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", state_start)), 
                                     as.numeric(gsub("\\D","", state_end))))
point_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", event_timestamps)), 
                                       as.numeric(gsub("\\D","", event_timestamps))))
ls = intervals::interval_included(int_mat, point_mat)
# ls[[n]] are indices of points that belong to the n-th interval

Note that Intervals() only accepts numeric matrices, so first I convert the timestamps to integers. All timestamps need to be in exactly the same format and include leading/trailing zeros (or just use different way to convert them to integers than I did).

Upvotes: 0

user2474226
user2474226

Reputation: 1502

You could try the sqldf package. Not sure how efficient it is on your full dataset, but this should work:

library(sqldf)
sqldf('SELECT events_df.events, states_df.states 
       FROM events_df INNER JOIN states_df 
       ON events_df.event_timestamps BETWEEN states_df.state_start AND states_df.state_end')

Upvotes: 0

ClancyStats
ClancyStats

Reputation: 1231

You can use a rolling join in data.table for this. The idea here is that you set the key for each data.table to be either the event time or the starting time for the state. The join will then match each event to the most recent starting state time. And since you have non-overlapping states, this achieves what you want.

## Your creation code above
#########################################################################

library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(tictoc)
tic()
# empty data.frame
resulting_df <- data.frame(events = character(),
                           state = character(),
                           stringsAsFactors=FALSE) 



# loop eventy by event
for(event in 1:nrow(events_df)) {
  # go with the event to the states data.frame
  for (state in 1:nrow(states_df)) {
    # define state's interval
    interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET') 
    # check for every event if its timestamp is within the state interval
    if (events_df$event_timestamps[event] %within% interv){

      # then write the temp data.frame
      temp <- data.frame(events = events_df$events[event],
                         state = states_df$states[state],
                         stringsAsFactors=FALSE)

      # collect events with states
      resulting_df <- dplyr::bind_rows(resulting_df, temp)
      rm(temp)
      # one event can only be in one state at a time
      # after we found the state for the event, break the inner state loop
      # and move to the next event
      break
      next
    }


  }

}
toc()
#> 9.61 sec elapsed

library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#> 
#>     hour, isoweek, mday, minute, month, quarter, second, wday,
#>     week, yday, year
events_dt <- data.table(events_df)
states_dt <- data.table(states_df)
setkey(states_dt, state_start)
setkey(events_dt, event_timestamps)
tic()
resulting_dt <- states_dt[events_dt, roll = T][,.(events, states)]
toc()
#> 0 sec elapsed
all(data.table(resulting_df) == resulting_dt)
#> [1] TRUE

Created on 2019-10-16 by the reprex package (v0.3.0)

Upvotes: 1

Related Questions