Reputation: 400
I have one data.frame with events and their arrival timestamps (microsecond precision). In the second dataframe I have the states with start and end times (= validity interval).
Below I wrote for loop routine which does the job but is really slow. I think that a combination of data.table, map/apply, parallelization (I have 12 cores available) could substantially reduce the processing time.
Could you please help to optimize my code?
Thanks!
options(digits.secs = 6)
start <- strptime("2019-10-16 08:00:00.789543 CET", "%Y-%m-%d %H:%M:%OS")
start <- format(start, "%Y-%m-%d %H:%M:%OS")
end <- strptime("2019-10-16 08:10:00.471123 CET", "%Y-%m-%d %H:%M:%OS")
end <- format(end, "%Y-%m-%d %H:%M:%OS")
#### events
event_timestamps <- seq.POSIXt(as.POSIXct(start),
as.POSIXct(end), units = "seconds", by = .1)
events <- sprintf("event%s",seq(1:length(event_timestamps)))
events_df <- data.frame(event_timestamps, events, stringsAsFactors=FALSE)
#### states
states <- sprintf("state%s",seq(1:4))
state_start <- c("2019-10-16 07:00:00.000000 CEST",
"2019-10-16 08:03:00.765233 CEST",
"2019-10-16 08:05:03.765432 CEST",
"2019-10-16 08:05:03.765434")
state_end <- c("2019-10-16 08:03:00.765232 CEST",
"2019-10-16 08:05:03.765431 CEST",
"2019-10-16 08:05:03.765433 CEST",
"2019-10-16 08:12:03.471122 CEST")
states_df <- data.frame(states, state_start = as.POSIXct(state_start),
state_end = as.POSIXct(state_end), stringsAsFactors=FALSE)
#The state dataframe contains states with non-overlapping start and end timestamps.
#That means that one event can fall into exactly one state
# the goal is for every event to find the state it belongs to
#########################################################################
library(lubridate)
# empty data.frame
resulting_df <- data.frame(events = character(),
state = character(),
stringsAsFactors=FALSE)
# loop eventy by event
for(event in 1:nrow(events_df)) {
# go with the event to the states data.frame
for (state in 1:nrow(states_df)) {
# define state's interval
interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET')
# check for every event if its timestamp is within the state interval
if (events_df$event_timestamps[event] %within% interv){
# then write the temp data.frame
temp <- data.frame(events = events_df$events[event],
state = states_df$states[state],
stringsAsFactors=FALSE)
# collect events with states
resulting_df <- dplyr::bind_rows(resulting_df, temp)
rm(temp)
# one event can only be in one state at a time
# after we found the state for the event, break the inner state loop
# and move to the next event
break
next
}
}
}
Upvotes: 1
Views: 476
Reputation: 25225
Some timings for reference using functions from data.table
:
library(data.table) #data.table_1.12.4
s <- as.POSIXct(strptime("2019-10-01 00:00:00.000000 CET", "%Y-%m-%d %H:%M:%OS"))
e <- as.POSIXct(strptime("2019-10-10 23:59:59.999999 CET", "%Y-%m-%d %H:%M:%OS"))
#8,640,000 rows
events <- data.table(TIME=seq.POSIXt(s, e, units="seconds", by=.1))[, EVENT := .I]
#863,999 rows
h <- seq.POSIXt(s, e, units="hour", by=1)
states <- data.table(STATE=seq_len(length(h)-1L), START=h[-length(h)], END=h[-1L],
key=c("START","END"))
events_foverlap <- copy(events)[, c("START", "END") := TIME]
states_foverlap <- copy(states)
setkey(events, TIME)
dt_foverlap <- function() {
ans <- foverlaps(events_foverlap, states_foverlap, type="any", mult="first")
ans[, .N]
}
dt_nonequi <- function() {
ans <- states[events, on=.(START<=TIME, END>=TIME), mult="first"]
ans[,.N]
}
dt_roll <- function() {
ans <- states[events, roll=TRUE]
ans[,.N]
}
bench::mark(dt_foverlap(), dt_nonequi(), dt_roll())
timings:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 dt_foverlap() 2.99s 2.99s 0.335 1.24GB 1.00 1 3 2.99s <int [1]> <df[,3] [122 x 3]> <bch:tm> <tibble [1 x 3]>
2 dt_nonequi() 3.78s 3.78s 0.265 372.55MB 0.265 1 1 3.78s <int [1]> <df[,3] [43 x 3]> <bch:tm> <tibble [1 x 3]>
3 dt_roll() 1.09s 1.09s 0.918 329.69MB 0.918 1 1 1.09s <int [1]> <df[,3] [33 x 3]> <bch:tm> <tibble [1 x 3]>
Upvotes: 0
Reputation: 6489
You could use the function foverlaps
in the data.table
package as follows (It's very fast!):
setDT(states_df, key = c("state_start", "state_end"))
cols <- c("state_start", "state_end")
setDT(events_df)[, (cols) := event_timestamps]
foverlaps(events_df, states_df)[, paste0("i.", cols) := NULL]
To understand how the function foverlaps
works, it's better to read its documentation here
Upvotes: 1
Reputation: 1169
You can hack something together using the intervals
package. Here is my quick and messy attempt:
int_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", state_start)),
as.numeric(gsub("\\D","", state_end))))
point_mat = intervals::Intervals(cbind(as.numeric(gsub("\\D","", event_timestamps)),
as.numeric(gsub("\\D","", event_timestamps))))
ls = intervals::interval_included(int_mat, point_mat)
# ls[[n]] are indices of points that belong to the n-th interval
Note that Intervals()
only accepts numeric matrices, so first I convert the timestamps to integers. All timestamps need to be in exactly the same format and include leading/trailing zeros (or just use different way to convert them to integers than I did).
Upvotes: 0
Reputation: 1502
You could try the sqldf package. Not sure how efficient it is on your full dataset, but this should work:
library(sqldf)
sqldf('SELECT events_df.events, states_df.states
FROM events_df INNER JOIN states_df
ON events_df.event_timestamps BETWEEN states_df.state_start AND states_df.state_end')
Upvotes: 0
Reputation: 1231
You can use a rolling join in data.table
for this. The idea here is that you set the key for each data.table
to be either the event time or the starting time for the state. The join will then match each event to the most recent starting state time. And since you have non-overlapping states, this achieves what you want.
## Your creation code above
#########################################################################
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tictoc)
tic()
# empty data.frame
resulting_df <- data.frame(events = character(),
state = character(),
stringsAsFactors=FALSE)
# loop eventy by event
for(event in 1:nrow(events_df)) {
# go with the event to the states data.frame
for (state in 1:nrow(states_df)) {
# define state's interval
interv <- lubridate::interval(states_df$state_start[state], states_df$state_end[state], tzone = 'CET')
# check for every event if its timestamp is within the state interval
if (events_df$event_timestamps[event] %within% interv){
# then write the temp data.frame
temp <- data.frame(events = events_df$events[event],
state = states_df$states[state],
stringsAsFactors=FALSE)
# collect events with states
resulting_df <- dplyr::bind_rows(resulting_df, temp)
rm(temp)
# one event can only be in one state at a time
# after we found the state for the event, break the inner state loop
# and move to the next event
break
next
}
}
}
toc()
#> 9.61 sec elapsed
library(data.table)
#>
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:lubridate':
#>
#> hour, isoweek, mday, minute, month, quarter, second, wday,
#> week, yday, year
events_dt <- data.table(events_df)
states_dt <- data.table(states_df)
setkey(states_dt, state_start)
setkey(events_dt, event_timestamps)
tic()
resulting_dt <- states_dt[events_dt, roll = T][,.(events, states)]
toc()
#> 0 sec elapsed
all(data.table(resulting_df) == resulting_dt)
#> [1] TRUE
Created on 2019-10-16 by the reprex package (v0.3.0)
Upvotes: 1