Reputation: 55
I have a dataframe which records at what time cabs are doing trips. I need to transform this data into 24 hour duration when they are doing trips and idle. Below I'm sharing a sample data and the required dataframe
df <- data.frame(cab_id = c("c1","c1","c2*","c3","c3","c3","c4","c4"),
trip_id = c("101","102","103","104","105","106","107","108"),
trip_start = c("15:00", "21:27", " 23:11", " 09:33", " 17:41", "22:11", "21:31", "23:47"),
trip_end = c("18:30", "23:33", "02:30", "12:11", "20:18", "01:15", "22:45", "02:12"))
reqd_df <- data.frame(time = c("0","0.25","0.5","0.75","1","1.25","1.5","1.75","2","2.25","2.5","2.75","3","3.25","3.5","3.75","4","4.25","4.5","4.75","5","5.25","5.5","5.75","6","6.25","6.5","6.75","7","7.25","7.5","7.75","8","8.25","8.5","8.75","9","9.25","9.5","9.75","10","10.25","10.5","10.75","11","11.25","11.5","11.75","12","12.25","12.5","12.75","13","13.25","13.5","13.75","14","14.25","14.5","14.75","15","15.25","15.5","15.75","16","16.25","16.5","16.75","17","17.25","17.5","17.75","18","18.25","18.5","18.75","19","19.25","19.5","19.75","20","20.25","20.5","20.75","21","21.25","21.5","21.75","22","22.25","22.5","22.75","23","23.25","23.5","23.75"),
c1 = c("0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1","1","1","1","1","1","1","1","1","1","1","1","1","1","1","0","0","0","0","0","0","0","0","0","0","0","1","1","1","1","1","1","1","1","1","0"),
c2 = c("1","1","1","1","1","1","1","1","1","1","1","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1","1","1"),
c3 = c("1","1","1","0","1","1","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1","1","1","1","1","1","1","1","1","1","1","1","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1","1","1","1","1","1","1","1","1","1","1","0","0","0","0","0","0","0","1","1","1","1","1","1","1"),
c4 = c("1","1","1","1","1","1","1","1","1","1","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1"))
reqd_df_2 <- melt(reqd_df,id = 1)
ggplot(reqd_df_2, aes(x = time, y = value, label = value, fill= variable ))+
geom_bar(stat = "identity")+
facet_grid(rows = vars(variable))
Q1: how to transform dataframe df to dataframe reqd_df, where time is rounded to nearby quarter of the hour , intervals like (1:15, 1:30, 1:45, 2:00). If cab is doing trip in a particular quarter of hour populate it by 1 and if cab is idle populate it by 0.
Note1: 0, 0.25, 0:50, 0:75, 1:00 represent 0:00, 0:15, 0:30, 0:45, 1:00 respectively and so on.
Note2: some trips start at end hour of the days like 22:11 hours and ends around 02:15 hour, from 22:15 to 23:45 and 00:00 to 02:15 data will be populated by 1
Thanks in advance
Upvotes: 0
Views: 75
Reputation: 25225
Using data.table::as.ITime
and nearest rolling join (i.e. roll="nearest"
):
library(data.table)
setDT(df)
#convert into ITime
cols <- c("trip_start", "trip_end")
df[, (cols) := lapply(.SD, function(x) as.ITime(x)), .SDcols=cols]
#sequence of times with 15 mins interval
sec15min <- 15 * 60
alltimes <- as.ITime(seq(as.ITime("00:00"), as.ITime("23:59"), sec15min))
intl <- data.table(INTL=alltimes)[, VAL:=INTL]
#find nearest 15min interval
df[, near_start := intl[df, on=.(INTL=trip_start), roll="nearest", VAL]]
df[, near_end := intl[df, on=.(INTL=trip_end), roll="nearest", VAL]]
#get straight to reqd_df_2
df[, {
if (near_start <= near_end)
biz <- seq(near_start, near_end, sec15min)
else
biz <- c(seq(as.ITime("00:00"), near_end, sec15min),
seq(near_start, as.ITime("23:59"), sec15min))
.(time=as.ITime(alltimes),
busy=replace(logical(length(alltimes)), alltimes %in% biz, TRUE))
},
by=.(cab_id, trip_id)][,
.(busy=+any(busy)), by=.(time, cab_id)]
output:
time cab_id busy
1: 00:00:00 c1 0
2: 00:15:00 c1 0
3: 00:30:00 c1 0
4: 00:45:00 c1 0
5: 01:00:00 c1 0
---
380: 22:45:00 c4 1
381: 23:00:00 c4 0
382: 23:15:00 c4 0
383: 23:30:00 c4 0
384: 23:45:00 c4 1
p.s.: I think OP is missing trip 107 in the desired output.
Upvotes: 1