Reputation: 61
I have a data frame with two columns: a Start date and an End date. I need a new data frame with two columns: a Date column and a Count column giving the number of observations from the first data frame where the date falls between Start and End.
Have:
dates<-data.frame("Start"=seq(as.Date("2017/1/1"),by="day", length.out = 5),"End"=seq(as.Date("2017/1/3"),by="day", length.out = 5))
Start End
2017-01-01 2017-01-03
2017-01-02 2017-01-04
2017-01-03 2017-01-05
2017-01-04 2017-01-06
2017-01-05 2017-01-07
Want:
Date Count
2017-01-01 1
2017-01-02 2
2017-01-03 3
2017-01-04 3
2017-01-05 3
2017-01-06 2
2017-01-07 1
I could use for loops, but is there a better way to do this in R?
Upvotes: 5
Views: 2929
Reputation: 11255
These base options avoid grouping. The first option uses logic to avoid Map or reshaping: the vector x is built from dates$Start, then dates$End wherever the difference between the two isn't 0 (i.e., if Start and End are the same date it shouldn't be double counted), and then dates$Start plus a sequence of day offsets (sequence(date_diffs - 1)), again only where the difference isn't 0.
date_diffs <- dates$End - dates$Start
x <- c(dates[['Start']],
with(subset(dates, subset = date_diffs > 0)
,c(End, rep(Start, date_diffs - 1) + sequence(date_diffs-1))
))
data.frame(table(x))
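For the example data every row spans three days (each date_diffs equals 2), so x should end up holding the five Start dates, the five End dates, and the five intermediate dates, which is exactly what produces the 1, 2, 3, 3, 3, 2, 1 counts from the question:
length(x)
# 15 for the 5-row example (5 Starts + 5 Ends + 5 intermediate dates)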
The second option Vectorize()s the seq.Date() function; then it's simply a matter of combining the results.
#or
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
Var1 Freq
1 2017-01-01 1
2 2017-01-02 2
3 2017-01-03 3
4 2017-01-04 3
5 2017-01-05 3
6 2017-01-06 2
7 2017-01-07 1
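As a side note, SIMPLIFY = FALSE is what keeps the combining step working: with Vectorize()'s default SIMPLIFY = TRUE, the equal-length Date sequences would be collapsed into a plain numeric matrix (losing the Date class), and do.call(c, ...) expects a list. A quick way to see this, assuming the dates data frame from the question:
# Default SIMPLIFY = TRUE returns a numeric matrix rather than a list of Dates
str(Vectorize(seq.Date, vectorize.args = c("from", "to"))(dates$Start, dates$End, 1))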
Performance:
For @akrun's base option, I wrapped the table() result in data.frame() so that every approach produced a data.frame of some sort.
# The original data set copied to make 1080 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 1629.3048 1647.52845 1680.82496 1664.07245 1697.4511 1828.4093 20
tmfmnk_lubrid 6882.1404 6959.20810 7018.69083 7002.27455 7064.0898 7276.2349 20
M_M_data.tab 103.4166 103.99925 108.33817 107.95715 108.6591 134.9388 20
Ronak_stack_base 131.0364 134.23370 137.46651 137.32235 141.4388 144.5971 20
akrun_purrr 133.4917 136.89080 138.92787 138.44575 140.7778 147.4172 20
akrun_base 130.4179 134.16735 137.21640 136.98225 140.1182 145.8873 20
cole_base 15.4396 15.60345 16.42558 16.74245 17.1322 17.4201 20
cole_vec_seq 114.7890 118.44795 130.69493 121.76430 124.0880 309.7418 20
cole_dt_version 15.9107 15.98035 16.56220 16.11790 17.0216 18.8438 20
# Original data set copied to make 180 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 275.6845 279.06390 281.871350 281.95420 284.16025 287.5412 20
tmfmnk_lubrid 1136.1109 1161.35915 1176.073630 1169.81685 1176.87130 1277.6444 20
M_M_data.tab 19.0258 19.33070 19.766890 19.45450 19.63410 24.7390 20
Ronak_stack_base 22.2327 22.56530 23.234095 22.85260 23.20790 27.0589 20
akrun_purrr 27.8797 28.50225 29.146325 28.71840 29.11915 33.3277 20
akrun_base 22.3477 22.61135 23.370780 22.81920 23.41300 28.6941 20
cole_base 3.4258 3.50735 3.642605 3.62470 3.67595 3.9780 20
cole_vec_seq 19.9366 20.08345 21.359275 20.17250 22.48055 25.7780 20
cole_dt_version 3.9992 4.09905 4.207690 4.16135 4.28265 4.5052 20
# Original data set copied to make 30 rows
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 51.2437 52.16495 54.524465 52.55520 56.19050 66.9461 20
tmfmnk_lubrid 192.1206 196.99550 198.501640 197.64815 201.42050 203.7031 20
M_M_data.tab 4.9511 5.05215 5.215670 5.19315 5.33075 5.7740 20
Ronak_stack_base 4.3609 4.51110 4.995405 4.54885 4.79490 8.8183 20
akrun_purrr 10.9024 10.96420 11.622235 11.07575 11.58300 15.7751 20
akrun_base 4.4919 4.55905 4.843730 4.60825 4.73760 8.4334 20
cole_base 1.4225 1.48635 1.738995 1.58685 1.60780 5.2324 20
cole_vec_seq 4.0648 4.16095 4.318665 4.24445 4.48420 4.7344 20
cole_dt_version 1.9733 2.06385 2.132040 2.13965 2.18945 2.3612 20
# Original 5-row data set
Unit: milliseconds
expr min lq mean median uq max neval
tmfmnk_complete 14.7549 14.90780 15.463195 15.10195 15.62030 18.9115 20
tmfmnk_lubrid 37.2571 37.58240 41.583090 38.18540 40.57435 86.6058 20
M_M_data.tab 2.6235 2.85145 3.037975 2.90815 2.97045 5.3476 20
Ronak_stack_base 1.3305 1.38490 1.465170 1.49175 1.53355 1.5978 20
akrun_purrr 7.7036 7.86260 8.212875 7.98790 8.18055 11.7898 20
akrun_base 1.4046 1.43715 1.501945 1.51890 1.56545 1.6176 20
cole_base 1.0560 1.09905 1.169260 1.16010 1.21595 1.3601 20
cole_vec_seq 1.3547 1.40685 1.452515 1.45645 1.51385 1.5328 20
cole_dt_version 1.5662 1.70555 1.813365 1.78930 1.84720 2.5267 20
Code for reference:
library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(microbenchmark)
library(lubridate)
dates<-data.frame("Start"=seq(as.Date("2017/1/1"),by="day", length.out = 5),"End"=seq(as.Date("2017/1/3"),by="day", length.out = 5))
dates_dt <- as.data.table(dates)
dates <- rbind(dates,dates,dates,dates,dates,dates) #repeat this as many times as you want
dates_dt <- as.data.table(dates)
vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
microbenchmark(
tmfmnk_complete = {
dates %>%
rowid_to_column() %>%
gather(var, Date, -rowid) %>%
group_by(rowid) %>%
complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
ungroup() %>%
count(Date)
}
, tmfmnk_lubrid = {
dates %>%
rowwise() %>%
mutate(Date = interval(Start, End)/days(1),
Date = list(Start + days(0:Date))) %>%
ungroup() %>%
unnest() %>%
count(Date)
}
, M_M_data.tab = {
dates_dt[ ,.(Date = seq(Start, End, by = "day")),
by = 1:nrow(dates_dt)][,
.(count = .N), by = Date]
}
, Ronak_stack_base = {
stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
}
, akrun_purrr = {
dates %>%
transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
unnest(Date) %>%
count(Date)
}
, akrun_base = {
lst1 <- do.call(Map, c(f = seq, unname(dates), by = "1 day"))
data.frame(table(do.call(c, lst1)))
}
, cole_base = {
date_diffs <- dates$End - dates$Start
x <- c(dates[['Start']],
with(subset(dates, subset = date_diffs > 0)
,c(End, rep(Start, date_diffs - 1) + sequence(date_diffs-1))
))
data.frame(table(x))
}
, cole_vec_seq = {
data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
}
, cole_dt_version = {
date_diffs <- dates$End - dates$Start
dates_dt[date_diffs > 0,
         data.frame(table({
           diff_sub <- End - Start - 1
           c(dates_dt[['Start']], End, rep(Start, diff_sub) + sequence(diff_sub))
         }))]
}
, times = 20
)
Upvotes: 5
Reputation: 887851
An option would be to use map2 to get the sequence of 'Date' between the corresponding 'Start' and 'End' columns, unnest the list output, and get the count:
library(dplyr)
library(tidyr)
library(purrr)
dates %>%
transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
unnest(Date) %>%
count(Date)
# A tibble: 7 x 2
# Date n
# <date> <int>
#1 2017-01-01 1
#2 2017-01-02 2
#3 2017-01-03 3
#4 2017-01-04 3
#5 2017-01-05 3
#6 2017-01-06 2
#7 2017-01-07 1
Or an option in base R
lst1 <- do.call(Map, c(f = seq, unname(dates), by = "1 day"))
table(do.call(c, lst1))
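table() here returns a named count vector; to get a data frame like the one the question asks for, the call can be wrapped in data.frame() (this is also how it was timed in the benchmark answer above):
data.frame(table(do.call(c, lst1)))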
Upvotes: 2
Reputation: 40171
A possibility involving dplyr and tidyr could be:
dates %>%
rowid_to_column() %>%
gather(var, Date, -rowid) %>%
group_by(rowid) %>%
complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
ungroup() %>%
count(Date)
Date n
<date> <int>
1 2017-01-01 1
2 2017-01-02 2
3 2017-01-03 3
4 2017-01-04 3
5 2017-01-05 3
6 2017-01-06 2
7 2017-01-07 1
Or with the addition of lubridate:
dates %>%
rowwise() %>%
mutate(Date = interval(Start, End)/days(1),
Date = list(Start + days(0:Date))) %>%
ungroup() %>%
unnest() %>%
count(Date)
Upvotes: 2
Reputation: 389235
Using base R, we can create a sequence between the Start and End dates and calculate the frequency of all the dates using table.
stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
# values ind
#1 1 2017-01-01
#2 2 2017-01-02
#3 3 2017-01-03
#4 3 2017-01-04
#5 3 2017-01-05
#6 2 2017-01-06
#7 1 2017-01-07
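If the exact Date/Count layout from the question is needed, the stacked result can be renamed and reordered; a small sketch, using res purely as an illustrative name:
res <- stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
names(res) <- c("Count", "Date")  # 'values' holds the count, 'ind' the date
res$Date <- as.Date(res$Date)     # 'ind' comes back as a factor
res[, c("Date", "Count")]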
Upvotes: 2
Reputation: 29202
Here's a possibility using data.table:
library(data.table)
setDT(dates)[ ,.(Date = seq(Start, End, by = "day")),
by = 1:nrow(dates)][,
.(count = .N), by = Date]
#> Date count
#> 1: 2017-01-01 1
#> 2: 2017-01-02 2
#> 3: 2017-01-03 3
#> 4: 2017-01-04 3
#> 5: 2017-01-05 3
#> 6: 2017-01-06 2
#> 7: 2017-01-07 1
Upvotes: 3