h7681
h7681

Reputation: 355

R - Time Spent in Interval

I have a data frame of start and end dates/times, like so:

start_date <- c("20/09/2016 01:20" , "22/09/2016 01:20", "28/09/2016 22:16",  "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20" , "24/09/2016 22:40", "29/09/2016 03:20", "16/09/2016 23:01")
df <- data.frame(start_date, end_date)

And some time intervals:

interval_start <- "21:00"
interval_end   <- "02:00"

I would like to create a new column in df which calculates the total number of minutes each instance spent within the interval period. For example, row 1 spent 40 minutes in the interval period.

Does anyone know how this could be achieved? Thanks.

Upvotes: 2

Views: 1205

Answers (3)

shosaco
shosaco

Reputation: 6165

Package lubridate helps doing the job. The main problem to tackle is long time periods, where the interval occurs several times (I solved it with the inner for loop) and the key function is intersect, which gives the simple answer to the problem "Intersection of two intervals". Summing up all the intersections gives the solution per row.

library(lubridate)

start_date <- c("20/09/2016 01:20" , "22/09/2016 01:20", "28/09/2016 22:16",  "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20" , "24/09/2016 22:40", "29/09/2016 03:20", "16/09/2016 23:01")

start_date <- dmy_hm(start_date)
end_date <- dmy_hm(end_date)

df <- data.frame(start_date, end_date)

time_spent <- c()

# loop through each row
for (i in 1:nrow(df)){
  row <- df[i,]
  out <- 0

  period <- interval(row$start_date, row$end_date)

  #1. Set as many intervals for this time periods as there are days
  for(day in seq(day(row$start_date) - 1, day(row$end_date), 1)){
    myInterval <- interval(dmy_hm(paste(day, 
                                        month(row$start_date), 
                                        year(row$start_date),
                                        "21:00")),
                           dmy_hm(paste(day+1, 
                                        month(row$start_date), 
                                        year(row$start_date),
                                        "02:00")))

    # calculate intersection
    timedifference <- intersect(period, myInterval)

    if(!is.na(timedifference)){
      out <- out + as.numeric(timedifference)/60
    }

  }

  time_spent <- c(time_spent, out)
}

df$time_spent <- time_spent

The solution is

> df$time_spent
[1]  40 740 224 120

Upvotes: 1

Hack-R
Hack-R

Reputation: 23200

Please see code comments along the way. I used the lubridate package.

start_date <- c("20/09/2016 01:20" , "22/09/2016 01:20", "28/09/2016 22:16",  "16/09/2016 21:01")
end_date <- c("20/09/2016 06:20" , "24/09/2016 22:40", "29/09/2016 03:20", "16/09/2016 23:01")
df <- data.frame(start_date, end_date)


interval_start <- "21:00"
interval_end   <- "02:00"

# Convert strings to dates
library(lubridate)
df$start_date <- dmy_hm(df$start_date)
df$end_date   <- dmy_hm(df$end_date)

# Helper columns
df$day  <- day(df$start_date)
df$mo   <- month(df$start_date)
df$yr   <- year(df$start_date)
df$day1 <- day(df$end_date)
df$mo1  <- month(df$end_date)
df$yr1  <- year(df$end_date)

# Add custom start/end for first day in row
df$interval_start <- dmy_hm(paste0(df$day-1,"/",df$mo,"/",df$yr," ", interval_start))
df$interval_end   <- dmy_hm(paste0(df$day,"/",df$mo,"/",df$yr," ", interval_end))


# Add custom start/end for last day in row, if it is different
df$interval_start1 <- df$interval_start # this is just to initialize the column with the proper class
df$interval_end1   <- df$interval_end

for(i in 1:nrow(df)){
  if(!(df$mo[i] == df$mo1[i] & df$day[i] == df$day1[i])){
    df$interval_start1[i] <- dmy_hm(paste0(df$day1[i],"/",df$mo1[i],"/",df$yr1[i]," ", interval_start))
    df$interval_end1[i]   <- dmy_hm(paste0(df$day1[i],"/",df$mo1[i],"/",df$yr1[i]," ", interval_end))
  }else{
    df$interval_start1[i] <- NA
    df$interval_end1[i]   <- NA
  } 
} 

# Calculate time in intervals for first day
time1     <- difftime(df$start_date,df$interval_end, units="mins")
time1.cap <- difftime(df$interval_start, df$interval_end, units="mins")
time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

# initialize class of new col
df$time1 <- difftime(df$interval_start, df$interval_end, units="mins")

# Update time1
for(i in 1:nrow(df)){
  if(df$start_date[i] < df$interval_end[i]){
    time1     <- difftime(df$start_date,df$interval_end, units="mins")
    time1.cap <- difftime(df$interval_start, df$interval_end, units="mins")
    time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

    df$time1[i] <- time1[i]*-1
  } else{

    if(df$start_date[i] > df$interval_end[i])  {
      time1     <- difftime(df$start_date,df$interval_end+86400, units="mins")
      time1.cap <- difftime(df$interval_start, df$interval_end+86400, units="mins")
      time1[abs(time1) > abs(time1.cap)] <- time1.cap[abs(time1) > abs(time1.cap)]

      df$time1[i] <- time1[i]*-1
    }
  }
}

# initialize class of new col

df1 <- df[!is.na(df$interval_start1),]
df1$time2 <- difftime(df1$interval_start, df1$interval_end, units="mins")

# create time2 for last day, if different
for(i in 1:nrow(df1)){
  if(df1$end_date[i] < df1$interval_end1[i]){
    time2     <- difftime(df1$end_date,df1$interval_end1, units="mins")
    time2.cap <- difftime(df1$interval_start1, df1$interval_end1, units="mins")
    time2[abs(time2) > abs(time2.cap)] <- time2.cap[abs(time2) > abs(time2.cap)]

    df1$time2[i] <- time2[i]*-1
  } else{

    if(df1$end_date[i] > df1$interval_end1[i])  {
      time2     <- difftime(df1$interval_start1,df1$end_date, units="mins")
      time2.cap <- difftime(df1$interval_start1, df1$interval_end1+86400, units="mins")
      time2[abs(time2) > abs(time2.cap)] <- time2.cap[abs(time2) > abs(time2.cap)]

      df1$time2[i] <- time2[i]*-1
    }
  }
}

# See if there were any days in between first and last and if so add time
time2 <- minutes(300 * round(difftime(df1$end_date,df1$start_date, units = "days")))+minutes(time2)*-1

df$time2 <- as.period(NA)
df$time2[!is.na(df$interval_start1)]  <- time2
df$time2[is.na(df$interval_start1)]   <- 0

df$time_in_interval <- minutes(df$time1)+df$time2
df$time_in_interval

Note that 86,400 is the number of seconds in a day, so that's what that number was.

Upvotes: 0

J_F
J_F

Reputation: 10352

Here my short solution (in contrast to the other answers ;-) ) I also used the lubridate package:

library(lubridate)
df$start_date <- dmy_hm(df$start_date)
df$end_date <- dmy_hm(df$end_date)

df$ diff <- unlist(lapply(1:nrow(df), function(x){

   sequence <- seq(df$start_date[x],df$end_date[x], by = "min")
   cum_sum <- cumsum(format(sequence, format = "%H:%M") <= "02:00" | format(sequence, format = "%H:%M") >= "21:00")
   sum <- sum(format(sequence, format = "%H:%M") <= "02:00" | format(sequence, format = "%H:%M") >= "21:00")
   n_intervals <- length(unique(cum_sum[cum_sum %in% unique(cum_sum[duplicated(cum_sum)])]))

   ifelse(cum_sum[length(cum_sum)] - cum_sum[length(cum_sum)-1] != 0, return(sum - n_intervals-1), return(sum-1))
}))

#            start_date            end_date diff
# 1 2016-09-20 01:20:00 2016-09-20 06:20:00   40
# 2 2016-09-22 01:20:00 2016-09-24 22:40:00  740
# 3 2016-09-28 22:16:00 2016-09-29 03:20:00  224
# 4 2016-09-16 21:01:00 2016-09-16 23:01:00  120

The idea is the following (code in lapply):

  1. create a sequence from start to end of each interval by one minute
  2. Calculate the sum and the cumsum of the condition, that all times from this sequence are in the interval "21:00" to "02:00".
  3. Calculate the number of intervals in the cumsumto see how much different intervals are in this sequence.
  4. The difficult thing is, that when sum is 2 long, the difference in minutes is just 1, so we always have to subtract 1. We have to do this for every interval we have found. In the case, that the last value of cum_sum is different to the second to last one, this is a addition interval and we have to subtract 1 more.

It looks very complex, but the idea behind should be clear (I hope).

Upvotes: 1

Related Questions