Reputation: 35
I have the following data frame of projects:
id | start_date | end_date |
---|---|---|
01 | 6/11/2019 | 4/7/2021 |
02 | 8/12/2021 | 1/18/2022 |
03 | 10/24/2019 | 12/3/2019 |
04 | 9/20/2021 | 11/11/2021 |
05 | 11/11/2021 | 11/11/2021 |
I also have a list of dates:
date_list <- c(1/1/2020, 1/1/2021, 1/1,2022)
I want to count how many projects were 'active' during those specific dates.
For example: In 1/1/2020 there was only 1 project active which is project#1 because it started before and ended after 1/1/2020. in other words, it was an active project during that date.
for (date in date-list) {
projects %>% filter(start_date <= date & end_date > date) %>% count()
}
This is taking too long as I'm working with a list of thousands of dates and a list of thousands of projects.
Upvotes: 2
Views: 105
Reputation: 19163
Not exactly sure about the expected result, but this adds a count of how many date_list entries are active per project row.
Note: convert the dates to Date objects with projects$start_date <- as.Date(projects$start_date, "%m/%d/%Y")
and projects$end_date <- as.Date(projects$end_date, "%m/%d/%Y")
Preparation
library(data.table)
setDT(projects)
date_list <- c("1/1/2020", "1/1/2021", "1/1/2022")
projects$active <- rowSums(sapply(as.Date(date_list, "%m/%d/%Y"), \(x)
data.table::between(x, projects$start_date, projects$end_date)))
result
projects
id start_date end_date active
1: 1 2019-06-11 2021-04-07 2
2: 2 2021-08-12 2022-01-18 1
3: 3 2019-10-24 2019-12-03 0
4: 4 2021-09-20 2021-11-11 0
5: 5 2021-11-11 2021-11-11 0
projects <- structure(list(id = 1:5, start_date = structure(c(18058, 18851,
18193, 18890, 18942), class = "Date"), end_date = structure(c(18724,
19010, 18233, 18942, 18942), class = "Date")), row.names = c(NA,
-5L), class = "data.frame")
Upvotes: 0
Reputation: 4243
Another option using tidyr::crossing
library(dplyr)
library(tidyr)
crossing(date_list, projects) %>%
filter(between(date_list, start_date, end_date)) %>%
count(date_list)
#----
# A tibble: 3 x 2
date_list n
<date> <int>
1 2020-01-01 1
2 2021-01-01 1
3 2022-01-01 1
Example data
projects <- structure(list(id = 1:5, start_date = structure(c(18058, 18851,
18193, 18890, 18942), class = "Date"), end_date = structure(c(18724,
19010, 18233, 18942, 18942), class = "Date")), row.names = c(NA,
-5L), class = c("data.table", "data.frame"))
date_list <- as.Date(c("1/1/2020", "1/1/2021", "1/1/2022"), "%m/%d/%Y")
Since all combinations are created, this is likely not the most efficient solution. But a quick test on ~1,800 dates runs in a second or two
dates_tst <- seq(as.Date("2018-1-1"), as.Date("2023-1-1"), by = '1 day')
projects_tst <- data.frame(projects = 1:length(dates_tst),
start_date = dates_tst - 100,
end_date = dates_tst + 100)
crossing(dates_tst, projects_tst) %>% # 3.3M combinations
filter(between(dates_tst, start_date, end_date)) %>%
count(dates_tst)
#----
# A tibble: 1,827 x 2
dates_tst n
<date> <int>
1 2018-01-01 101
2 2018-01-02 102
3 2018-01-03 103
4 2018-01-04 104
5 2018-01-05 105
6 2018-01-06 106
7 2018-01-07 107
8 2018-01-08 108
9 2018-01-09 109
10 2018-01-10 110
# i 1,817 more rows
Upvotes: 0
Reputation: 17011
1
to the start date, -1
to the end date, and 0
to the lookup dateWith data.table
:
library(data.table)
date_list <- as.Date(c("1/1/2020", "1/1/2021", "1/1/2022"), "%m/%d/%Y")
setorder(
rbindlist(
list(
projects[,.(date = c(start_date, end_date), inc = rep(c(1L, -1L), each = .N))],
data.table(date = date_list, inc = 0L)
)
), date
)[,n := cumsum(inc)][inc == 0L][,inc := NULL][]
#> date n
#> 1: 2020-01-01 1
#> 2: 2021-01-01 1
#> 3: 2022-01-01 1
Data:
projects <- structure(list(id = 1:5, start_date = structure(c(18058, 18851,
18193, 18890, 18942), class = "Date"), end_date = structure(c(18724,
19010, 18233, 18942, 18942), class = "Date")), row.names = c(NA,
-5L), class = c("data.table", "data.frame"))
Upvotes: 0
Reputation: 20444
An efficient way to do this is to use data.table::foverlaps()
, a fast binary-search based overlap join of two data.tables.
First create two data.table
s for the join. The date_list
data.table
will have a dummy_date
field, as foverlaps()
can find overlapping ranges. But in this case the start and end of the range will be the same.
library(data.table)
setDT(projects)
# Create a data.table to join against
date_dt <- data.table(date = date_list, dummy_date = date_list)
# Set keys for join
setkey(projects, start_date, end_date)
setkey(date_dt, date, dummy_date)
Then we can do the join, group by date
and count how many projects were open on each date:
# Do the join
project_dates <- foverlaps(projects, date_dt, type = "any")
# Summarise by date
project_dates[!is.na(date), .N, date]
# date N
# <IDat> <int>
# 1: 2020-01-01 1
# 2: 2021-01-01 1
# 3: 2022-01-01 1
projects <- structure(list(id = 1:5, start_date = structure(c(18058L, 18851L,
18193L, 18890L, 18942L), class = c("IDate", "Date")), end_date = structure(c(18724L,
19010L, 18233L, 18942L, 18942L), class = c("IDate", "Date"))), class = "data.frame", row.names = c(NA, -5L))
date_list <- structure(c(18262L, 18628L, 18993L), class = c("IDate", "Date"))
Upvotes: 0
Reputation: 206401
You could first build a table of all open projects for a given date. Here's one way to do that
library(dplyr)
number_open <- rbind(
data.frame(date=dd$end_date+1, open=-1),
data.frame(date=dd$start_date, open=1)
) %>%
arrange(date) %>%
summarize(open=sum(open), .by=date) %>%
mutate(open=cumsum(open))
That returns
date open
1 2019-06-11 1
2 2019-10-24 2
3 2019-12-04 1
4 2021-04-08 0
5 2021-08-12 1
6 2021-09-20 2
7 2021-11-11 3
8 2021-11-12 1
9 2022-01-19 0
so for each date we can see when the number of projects changes.
We can this sorted lists of dates with findInterval
to look up the number of open events for a given date
date_list <- as.Date(c("1/1/2020", "1/1/2021", "1/1/2022"), "%m/%d/%Y")
number_open$open[findInterval(date_list, number_open$date)]
# [1] 1 1 1
So there was 1 project open on each of these dates. Tested with
dd <- read.table(text="
id start_date end_date
01 6/11/2019 4/7/2021
02 8/12/2021 1/18/2022
03 10/24/2019 12/3/2019
04 9/20/2021 11/11/2021
05 11/11/2021 11/11/2021", header=TRUE)
dd$start_date <- as.Date(dd$start_date, "%m/%d/%Y")
dd$end_date <- as.Date(dd$end_date, "%m/%d/%Y")
Upvotes: 1