Reputation: 5405
I have data with IDs, close times, and open times. I need to break out up each line that spans over an hour into multiple rows with for each ID, such that each row has an open and close time not spanning over the top of the hour. Ideally, this would leave rows that have open and close times within the same hour alone. It would also retain the ID throughout each row.
For example, if I have an open time from 11:55am to 1:10pm - I would want three rows to result from this respective column. One from 11:55 - 12, 12 - 1, and 1 - 1:10.
I believe I have come up with a solution, but it is convoluted:
dat <- tibble(ID = c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L),
open_time = structure(c( 1509378717, 1509475803, 1509460317, 1509372561, 1508445791, 1508962523, 1509483224, 1509483978, 1509483727),
tzone = "America/New_York",
class = c("POSIXct", "POSIXt")),
close_time = structure(c( 1509383226, 1509476435, 1509462052, 1509376589, 1508445791, 1508962523, 1509483543, 1509483983, 1509483727),
tzone = "America/New_York",
class = c("POSIXct", "POSIXt")))
split_by_hour <- function(open_time, close_time){
# get hours to span
hour_start <- lubridate::ceiling_date(open_time, 'hour')
hour_end <- lubridate::floor_date(close_time, 'hour')
# hour sequence to create rows
hour_seq <- seq(hour_start, hour_end, by = 'hour')
# create tibble
time_tbl <- tibble(
open_time = lubridate::ymd_hms(c(open_time, hour_seq), tz = "America/New_York"),
close_time = lubridate::ymd_hms(c(hour_seq, close_time), tz = "America/New_York")
)
time_tbl
}
row_hour_breakout <- function(rw){
if(lubridate::floor_date(rw$open_time, 'hour') != lubridate::floor_date(rw$close_time, 'hour')){
# if hours are different, use helper function and bind columns
time_splits <- split_by_hour(rw$open_time, rw$close_time)
dplyr::bind_cols(ID = rep(rw$ID, nrow(time_splits)),
time_splits)
} else{
# else return normal row
rw[, c("ID", "open_time", "close_time")]
}
}
break_tbl_hourly <- function(hour_dat){
purrr::by_row(hour_dat, row_hour_breakout, .labels = FALSE)[[1]] %>%
dplyr::bind_rows()
}
>dat
# A tibble: 9 x 3
ID open_time close_time
<int> <dttm> <dttm>
1 2 2017-10-30 11:51:57 2017-10-30 13:07:06
2 1 2017-10-31 14:50:03 2017-10-31 15:00:35
3 2 2017-10-31 10:31:57 2017-10-31 11:00:52
4 1 2017-10-30 10:09:21 2017-10-30 11:16:29
5 2 2017-10-19 16:43:11 2017-10-19 16:43:11
6 1 2017-10-25 16:15:23 2017-10-25 16:15:23
7 2 2017-10-31 16:53:44 2017-10-31 16:59:03
8 1 2017-10-31 17:06:18 2017-10-31 17:06:23
9 2 2017-10-31 17:02:07 2017-10-31 17:02:07
> break_tbl_hourly(dat)
# A tibble: 14 x 3
ID open_time close_time
<int> <dttm> <dttm>
1 2 2017-10-30 11:51:57 2017-10-30 12:00:00
2 2 2017-10-30 12:00:00 2017-10-30 13:00:00
3 2 2017-10-30 13:00:00 2017-10-30 13:07:06
4 1 2017-10-31 14:50:03 2017-10-31 15:00:00
5 1 2017-10-31 15:00:00 2017-10-31 15:00:35
6 2 2017-10-31 10:31:57 2017-10-31 11:00:00
7 2 2017-10-31 11:00:00 2017-10-31 11:00:52
8 1 2017-10-30 10:09:21 2017-10-30 11:00:00
9 1 2017-10-30 11:00:00 2017-10-30 11:16:29
10 2 2017-10-19 20:43:11 2017-10-19 20:43:11
11 1 2017-10-25 20:15:23 2017-10-25 20:15:23
12 2 2017-10-31 20:53:44 2017-10-31 20:59:03
13 1 2017-10-31 21:06:18 2017-10-31 21:06:23
14 2 2017-10-31 21:02:07 2017-10-31 21:02:07
Ultimately, I would like to know:
Thank you!
=====================================
follow up, i've written a function to do this, based on the solution above:
# break rows function
# df: data frame of interest
# begin_time_var: variable of beginning times
# end_time_var: variable of ending times
break_rows_hourly <- function(df, begin_time_var, end_time_var){
begin <- enquo(begin_time_var)
end <- enquo(end_time_var)
#######################################
#
# to be applied to each open/close time
#
#######################################
split_by_hour <- function(open_time, close_time){
# ensure open time is at least before close time
if(open_time <= close_time){
# get hours to span
hour_start <- lubridate::ceiling_date(open_time, 'hour')
hour_end <- lubridate::floor_date(close_time, 'hour')
# check if hourly difference
if(hour_start <= hour_end){
#
# if it is, then go on to create multiple rows
#
# hour sequence to create rows
hour_seq <- seq(hour_start, hour_end, by = 'hour')
# create tibble
time_tbl <- tibble(
open_time = lubridate::ymd_hms(c(open_time, hour_seq)),
close_time = lubridate::ymd_hms(c(hour_seq, close_time))
)
return(time_tbl)
} else {
#
# hour start > hour end, return 1 row
#
# create tibble
time_tbl <- tibble(
open_time = open_time,
close_time = close_time
)
return(time_tbl)
}
} else {
#
# open time greater than close time, error printed statement
#
print("Close Time Before Open Time")
}
}
#######################################
#
# applies split by row and creates a df
#
#######################################
row_hour_breakout <- function(rw){
# split row
time_splits <- split_by_hour(rw %>% select(!!begin) %>% pull(), rw %>% select(!!end) %>% pull())
# get orther columns
other_cols <- rw %>%
select(-!!begin, - !!end) %>%
map(function(x) rep(x, nrow(time_splits))) %>%
as.tibble()
dplyr::bind_cols(other_cols,
time_splits)
}
#######################################
#
# map to each row, rbind to return
#
#######################################
return_df <- purrr::by_row(df, row_hour_breakout, .labels = FALSE)[[1]] %>%
dplyr::bind_rows()
return(return_df)
}
> break_rows_hourly(dat, open_time, close_time)
# A tibble: 14 x 3
ID open_time close_time
<int> <dttm> <dttm>
1 2 2017-10-30 11:51:57 2017-10-30 12:00:00
2 2 2017-10-30 12:00:00 2017-10-30 13:00:00
3 2 2017-10-30 13:00:00 2017-10-30 13:07:06
4 1 2017-10-31 14:50:03 2017-10-31 15:00:00
5 1 2017-10-31 15:00:00 2017-10-31 15:00:35
6 2 2017-10-31 10:31:57 2017-10-31 11:00:00
7 2 2017-10-31 11:00:00 2017-10-31 11:00:52
8 1 2017-10-30 10:09:21 2017-10-30 11:00:00
9 1 2017-10-30 11:00:00 2017-10-30 11:16:29
10 2 2017-10-19 20:43:11 2017-10-19 20:43:11
11 1 2017-10-25 20:15:23 2017-10-25 20:15:23
12 2 2017-10-31 20:53:44 2017-10-31 20:59:03
13 1 2017-10-31 21:06:18 2017-10-31 21:06:23
14 2 2017-10-31 21:02:07 2017-10-31 21:02:07
Upvotes: 1
Views: 159
Reputation: 7839
You can use the split-apply-combine strategy. In this case, we have to process one by one each row in dat
. So the whole thing would look like
do.call(rbind, lapply(split(dat, seq(nrow(dat))), expand.row))
where expand.row
is a function that takes a data-frame containing
exactly one row and outputs a data frame containing one or more rows.
The split(...)
part creates a list of 1-row data-frames. lapply(..., expand.row)
applies expand.row
to each element in the list and collects the results in a different list. do.call(rbind, ...)
stacks the elements in the second list on top of one another in order to get the resulting data frame.
All we have to do now is write expand.row
.
expand.row <- function(x) {
with(x, {
h <- trunc(open_time, 'hour') + 3600 # nearest full hour > open_time
if (h > close_time)
p <- c(open_time, close_time)
else
p <- unique(c(open_time, seq(h, close_time, 3600), close_time))
n <- length(p)
data.frame(ID = ID, open_time = p[seq(1, n - 1)],
close_time = p[seq(2, n)])
})
}
Result:
do.call(rbind, lapply(split(dat, seq(nrow(dat))), expand.row))
# ID open_time close_time
#1.1 2 2017-10-30 16:51:57 2017-10-30 17:00:00
#1.2 2 2017-10-30 17:00:00 2017-10-30 18:00:00
#1.3 2 2017-10-30 18:00:00 2017-10-30 18:07:06
#2.1 1 2017-10-31 19:50:03 2017-10-31 20:00:00
#2.2 1 2017-10-31 20:00:00 2017-10-31 20:00:35
#3.1 2 2017-10-31 15:31:57 2017-10-31 16:00:00
#3.2 2 2017-10-31 16:00:00 2017-10-31 16:00:52
#4.1 1 2017-10-30 15:09:21 2017-10-30 16:00:00
#4.2 1 2017-10-30 16:00:00 2017-10-30 16:16:29
#5 2 2017-10-19 22:43:11 2017-10-19 22:43:11
#6 1 2017-10-25 22:15:23 2017-10-25 22:15:23
#7 2 2017-10-31 21:53:44 2017-10-31 21:59:03
#8 1 2017-10-31 22:06:18 2017-10-31 22:06:23
#9 2 2017-10-31 22:02:07 2017-10-31 22:02:07
Upvotes: 1