Fred
Fred

Reputation: 430

adding rows for missing intervals between existing intervals in R

Problem: I have a data set that contains interval data with start and end dates and my goal is it to add rows to that data set that reflect the potential gaps between these intervals (per group). The complexity is that the already existing intervals may overlap and there might be an arbitrary number of gaps.

Example: We have start and end dates showing from when to when a person has occupied an apartment. There may have been multiple people at one apartment at the same time and there needs to be rows added for the time the apartment has been "empty".

Example data:

example_dates <- structure(list(apartment = c("A", "A", "A", "A", "B", "B", 
                                                "B", "C", "C", "C"), 
                                start_date = structure(c(1640995200, 1642291200, 
                                                         1649980800, 1655769600, 1644451200, 1646092800, 1659312000, 1646438400, 
                                                         1649376000, 1664582400), 
                                                       class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                                end_date = structure(c(1642204800, 1648166400, 1655683200, 
                                                       1668643200, 1655251200, 1653868800, 1667260800, 1654819200, 
                                                       1661385600, 1668470400), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                                status = c("person in apartment", "person in apartment", 
                                           "person in apartment", "person in apartment", "person in apartment", 
                                           "person in apartment", "person in apartment", "person in apartment", 
                                           "person in apartment", "person in apartment")), 
                           class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L))

Desired output:

desired_outcome <- structure(list(apartment = c("A", "A", "A", "A", "A", "B", 
                                                  "B", "B", "B", "C", "C", "C", "C"), 
                                  start_date = structure(c(1640995200, 1642291200, 1648252800, 
                                                           1649980800, 1655769600, 1644451200, 1646092800, 
                                                           1655337600, 1659312000, 1646438400, 1649376000, 1661472000, 1664582400), 
                                                         class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                                  end_date = structure(c(1642204800, 1648166400, 1649894400, 1655683200, 1668643200, 1655251200, 1653868800, 
                                                         1659225600, 1667260800, 1654819200, 1661385600, 1664496000, 1668470400),
                                                       class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
                                  status = c("person in apartment", "person in apartment", "apartment empty", 
                                             "person in apartment", "person in apartment", "person in apartment", 
                                             "person in apartment", "apartment empty", "person in apartment", "person in apartment", 
                                             "person in apartment", "apartment empty", "person in apartment"
                                                  )), 
                             class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -13L))

Upvotes: 2

Views: 145

Answers (2)

Davis Vaughan
Davis Vaughan

Reputation: 2960

I think this is a great use case for the ivs package that I created for working with intervals in the tidyverse. The main details here are:

  • Convert to dates rather than date-times
  • Add 1 to end_date to make ranges that match [, ), which is what ivs wants (and makes more sense here)
  • For each apartment, compute the interval complement with iv_complement(). These are the empty dates (it keeps in mind any overlaps)
  • Bind the complement with the original data and sort

The end dates in the final result will be +1 vs what is in your desired_outcome but I find these interval problems are easier to think about if you use right open intervals, [, ).

library(dplyr, warn.conflicts = FALSE)
library(ivs)

example_dates <- structure(
  list(
    apartment = c("A", "A", "A", "A", "B", "B", "B", "C", "C", "C"), 
    start_date = structure(
      c(
        1640995200, 1642291200, 1649980800, 1655769600, 1644451200,
        1646092800, 1659312000, 1646438400, 1649376000, 1664582400
      ), 
      class = c("POSIXct", "POSIXt"), 
      tzone = "UTC"
    ), 
    end_date = structure(
      c(
        1642204800, 1648166400, 1655683200, 1668643200, 1655251200, 
        1653868800, 1667260800, 1654819200, 1661385600, 1668470400
      ), 
      class = c("POSIXct", "POSIXt"), 
      tzone = "UTC"
    ), 
    status = c(
      "person in apartment", "person in apartment", "person in apartment", 
      "person in apartment", "person in apartment", "person in apartment",
      "person in apartment", "person in apartment", "person in apartment", 
      "person in apartment"
    )
  ), 
  class = c("tbl_df", "tbl", "data.frame"), 
  row.names = c(NA, -10L)
)

# Convert dates to dates rather than date-times
example_dates <- example_dates %>%
  mutate(
    start_date = as.Date(start_date),
    end_date = as.Date(end_date)
  ) %>%
  mutate(
    # Make `end_date` exclusive
    end_date = end_date + 1
  )

example_dates
#> # A tibble: 10 × 4
#>    apartment start_date end_date   status             
#>    <chr>     <date>     <date>     <chr>              
#>  1 A         2022-01-01 2022-01-16 person in apartment
#>  2 A         2022-01-16 2022-03-26 person in apartment
#>  3 A         2022-04-15 2022-06-21 person in apartment
#>  4 A         2022-06-21 2022-11-18 person in apartment
#>  5 B         2022-02-10 2022-06-16 person in apartment
#>  6 B         2022-03-01 2022-05-31 person in apartment
#>  7 B         2022-08-01 2022-11-02 person in apartment
#>  8 C         2022-03-05 2022-06-11 person in apartment
#>  9 C         2022-04-08 2022-08-26 person in apartment
#> 10 C         2022-10-01 2022-11-16 person in apartment

# Combine start/end into an interval vector
example_dates <- example_dates %>%
  mutate(range = iv(start_date, end_date), .keep = "unused")

example_dates
#> # A tibble: 10 × 3
#>    apartment status                                 range
#>    <chr>     <chr>                             <iv<date>>
#>  1 A         person in apartment [2022-01-01, 2022-01-16)
#>  2 A         person in apartment [2022-01-16, 2022-03-26)
#>  3 A         person in apartment [2022-04-15, 2022-06-21)
#>  4 A         person in apartment [2022-06-21, 2022-11-18)
#>  5 B         person in apartment [2022-02-10, 2022-06-16)
#>  6 B         person in apartment [2022-03-01, 2022-05-31)
#>  7 B         person in apartment [2022-08-01, 2022-11-02)
#>  8 C         person in apartment [2022-03-05, 2022-06-11)
#>  9 C         person in apartment [2022-04-08, 2022-08-26)
#> 10 C         person in apartment [2022-10-01, 2022-11-16)

# Compute the complement per apartment
empty_dates <- example_dates %>%
  group_by(apartment) %>%
  summarise(range = iv_complement(range)) %>%
  mutate(status = "apartment empty")

empty_dates
#> # A tibble: 3 × 3
#>   apartment                    range status         
#>   <chr>                   <iv<date>> <chr>          
#> 1 A         [2022-03-26, 2022-04-15) apartment empty
#> 2 B         [2022-06-16, 2022-08-01) apartment empty
#> 3 C         [2022-08-26, 2022-10-01) apartment empty

# Bind and sort
bind_rows(example_dates, empty_dates) %>%
  arrange(apartment, range) %>%
  mutate(start_date = iv_start(range), end_date = iv_end(range), .keep = "unused")
#> # A tibble: 13 × 4
#>    apartment status              start_date end_date  
#>    <chr>     <chr>               <date>     <date>    
#>  1 A         person in apartment 2022-01-01 2022-01-16
#>  2 A         person in apartment 2022-01-16 2022-03-26
#>  3 A         apartment empty     2022-03-26 2022-04-15
#>  4 A         person in apartment 2022-04-15 2022-06-21
#>  5 A         person in apartment 2022-06-21 2022-11-18
#>  6 B         person in apartment 2022-02-10 2022-06-16
#>  7 B         person in apartment 2022-03-01 2022-05-31
#>  8 B         apartment empty     2022-06-16 2022-08-01
#>  9 B         person in apartment 2022-08-01 2022-11-02
#> 10 C         person in apartment 2022-03-05 2022-06-11
#> 11 C         person in apartment 2022-04-08 2022-08-26
#> 12 C         apartment empty     2022-08-26 2022-10-01
#> 13 C         person in apartment 2022-10-01 2022-11-16

Upvotes: 3

Will
Will

Reputation: 942

what about this?

library(lubridate)
library(data.table)

setDT(example_dates)
setDT(desired_outcome)

example_dates[, start_date := lubridate::ymd(start_date)]
example_dates[, end_date := lubridate::ymd(end_date)]
example_dates[, start_date_int := as.numeric(start_date)]                     
example_dates[, end_date_int := as.numeric(end_date)]

agg <- example_dates[, g := c(0L, cumsum(shift(start_date_int, -1L) > cummax(end_date_int))[-.N]), apartment][,.(min(start_date_int), max(end_date_int)), .(g, apartment)]
colnames(agg) <- c("g", "apartment", "start_date", "end_date")
agg[, start_date := as_date(start_date, origin="1970-01-01")]
agg[, end_date := as_date(end_date, origin="1970-01-01")]


agg[, gap := start_date - shift(end_date) - 1, by = .(apartment)]
agg[, empty_start := shift(end_date) + lubridate::days(1)]
agg[, empty_end := empty_start + gap - lubridate::days(1)]

empty.dt <- agg[gap > 1, .(apartment, empty_start, empty_end)]
empty.dt$status <- "apartment empty"
colnames(empty.dt) <- c("apartment", "start_date", "end_date", "status")
full.dt <- example_dates[, .(apartment, start_date, end_date, status)]
res <- rbindlist(list(full.dt, empty.dt))

Upvotes: 0

Related Questions