dpl136

Reputation: 61

Sum/count between two dates in R

I have a data frame with two columns: one is the Start date, the other is the End date. I need a new data frame with two columns: a Date column and a Count column holding the number of rows of the first data frame for which the date falls between Start and End (inclusive).

Have:

dates <- data.frame(Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))

     Start        End
2017-01-01 2017-01-03
2017-01-02 2017-01-04
2017-01-03 2017-01-05
2017-01-04 2017-01-06
2017-01-05 2017-01-07

Want:

Date         Count
2017-01-01     1
2017-01-02     2
2017-01-03     3
2017-01-04     3
2017-01-05     3
2017-01-06     2
2017-01-07     1

I could use for loops, but is there a better way to do this in R?
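
To make it concrete, here is roughly the loop I have in mind (just a sketch of the day-by-day tally):

all_days <- seq(min(dates$Start), max(dates$End), by = "day")
counts <- setNames(integer(length(all_days)), as.character(all_days))
for (i in seq_len(nrow(dates))) {
  # increment the tally for every date covered by row i
  days_i <- as.character(seq(dates$Start[i], dates$End[i], by = "day"))
  counts[days_i] <- counts[days_i] + 1
}
data.frame(Date = all_days, Count = as.integer(counts))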

Upvotes: 5

Views: 2929

Answers (5)

Cole

Reputation: 11255

These base options avoid grouping.

The first option uses date arithmetic to avoid Map() or reshaping. The logic, implemented in the snippet just below, is:

  1. Take all of the dates$Start values.
  2. Add dates$End for rows where the difference between the two isn't 0 (i.e., if Start and End are the same date, it shouldn't be double counted).
  3. Add the dates in between: dates$Start plus a sequence of day offsets, again only for rows where the difference isn't 0.
date_diffs <- dates$End - dates$Start

x <- c(dates[["Start"]],                            # 1. all of the Start dates
       with(subset(dates, subset = date_diffs > 0), # rows spanning more than one day
            # 2. the End dates and 3. every date strictly between Start and End
            c(End, rep(Start, date_diffs - 1) + sequence(date_diffs - 1))))

data.frame(table(x))

The second option Vectorize()s the seq.Date() function. Then it's simply a matter of combining the results:


vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)
data.frame(table(do.call(c, vec_seq(dates[["Start"]], dates[["End"]], 1))))

        Var1 Freq
1 2017-01-01    1
2 2017-01-02    2
3 2017-01-03    3
4 2017-01-04    3
5 2017-01-05    3
6 2017-01-06    2
7 2017-01-07    1

Performance: For @akrun's base option, I wrapped the table() result in data.frame() so everyone was producing a data.frame of some sort.

# The original data set copied to make 1080 rows
Unit: milliseconds
             expr       min         lq       mean     median        uq       max neval
  tmfmnk_complete 1629.3048 1647.52845 1680.82496 1664.07245 1697.4511 1828.4093    20
    tmfmnk_lubrid 6882.1404 6959.20810 7018.69083 7002.27455 7064.0898 7276.2349    20
     M_M_data.tab  103.4166  103.99925  108.33817  107.95715  108.6591  134.9388    20
 Ronak_stack_base  131.0364  134.23370  137.46651  137.32235  141.4388  144.5971    20
      akrun_purrr  133.4917  136.89080  138.92787  138.44575  140.7778  147.4172    20
       akrun_base  130.4179  134.16735  137.21640  136.98225  140.1182  145.8873    20
        cole_base   15.4396   15.60345   16.42558   16.74245   17.1322   17.4201    20
     cole_vec_seq  114.7890  118.44795  130.69493  121.76430  124.0880  309.7418    20
  cole_dt_version   15.9107   15.98035   16.56220   16.11790   17.0216   18.8438    20

# Original data set copied to make 180 rows

Unit: milliseconds
             expr       min         lq        mean     median         uq       max neval
  tmfmnk_complete  275.6845  279.06390  281.871350  281.95420  284.16025  287.5412    20
    tmfmnk_lubrid 1136.1109 1161.35915 1176.073630 1169.81685 1176.87130 1277.6444    20
     M_M_data.tab   19.0258   19.33070   19.766890   19.45450   19.63410   24.7390    20
 Ronak_stack_base   22.2327   22.56530   23.234095   22.85260   23.20790   27.0589    20
      akrun_purrr   27.8797   28.50225   29.146325   28.71840   29.11915   33.3277    20
       akrun_base   22.3477   22.61135   23.370780   22.81920   23.41300   28.6941    20
        cole_base    3.4258    3.50735    3.642605    3.62470    3.67595    3.9780    20
     cole_vec_seq   19.9366   20.08345   21.359275   20.17250   22.48055   25.7780    20
  cole_dt_version    3.9992    4.09905    4.207690    4.16135    4.28265    4.5052    20

# Original data set copied to make 30 rows

Unit: milliseconds
             expr      min        lq       mean    median        uq      max neval
  tmfmnk_complete  51.2437  52.16495  54.524465  52.55520  56.19050  66.9461    20
    tmfmnk_lubrid 192.1206 196.99550 198.501640 197.64815 201.42050 203.7031    20
     M_M_data.tab   4.9511   5.05215   5.215670   5.19315   5.33075   5.7740    20
 Ronak_stack_base   4.3609   4.51110   4.995405   4.54885   4.79490   8.8183    20
      akrun_purrr  10.9024  10.96420  11.622235  11.07575  11.58300  15.7751    20
       akrun_base   4.4919   4.55905   4.843730   4.60825   4.73760   8.4334    20
        cole_base   1.4225   1.48635   1.738995   1.58685   1.60780   5.2324    20
     cole_vec_seq   4.0648   4.16095   4.318665   4.24445   4.48420   4.7344    20
  cole_dt_version   1.9733   2.06385   2.132040   2.13965   2.18945   2.3612    20

# Original 5-row data set

Unit: milliseconds
             expr     min       lq      mean   median       uq     max neval
  tmfmnk_complete 14.7549 14.90780 15.463195 15.10195 15.62030 18.9115    20
    tmfmnk_lubrid 37.2571 37.58240 41.583090 38.18540 40.57435 86.6058    20
     M_M_data.tab  2.6235  2.85145  3.037975  2.90815  2.97045  5.3476    20
 Ronak_stack_base  1.3305  1.38490  1.465170  1.49175  1.53355  1.5978    20
      akrun_purrr  7.7036  7.86260  8.212875  7.98790  8.18055 11.7898    20
       akrun_base  1.4046  1.43715  1.501945  1.51890  1.56545  1.6176    20
        cole_base  1.0560  1.09905  1.169260  1.16010  1.21595  1.3601    20
     cole_vec_seq  1.3547  1.40685  1.452515  1.45645  1.51385  1.5328    20
  cole_dt_version  1.5662  1.70555  1.813365  1.78930  1.84720  2.5267    20

Code for reference:

library(data.table)
library(dplyr)
library(purrr)
library(tidyverse)
library(microbenchmark)
library(lubridate)

dates <- data.frame(Start = seq(as.Date("2017/1/1"), by = "day", length.out = 5),
                    End   = seq(as.Date("2017/1/3"), by = "day", length.out = 5))

dates <- rbind(dates, dates, dates, dates, dates, dates) # repeat as many times as you want
dates_dt <- as.data.table(dates)

vec_seq <- Vectorize(seq.Date, vectorize.args = c("from", "to"), SIMPLIFY = FALSE)

microbenchmark(
  tmfmnk_complete = {
    dates %>%
      rowid_to_column() %>%
      gather(var, Date, -rowid) %>%
      group_by(rowid) %>%
      complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
      ungroup() %>%
      count(Date)
  }
  , tmfmnk_lubrid = {
    dates %>%
      rowwise() %>%
      mutate(Date = interval(Start, End)/days(1),
             Date = list(Start + days(0:Date))) %>%
      ungroup() %>%
      unnest() %>%
      count(Date)
  }
  , M_M_data.tab = {
    dates_dt[, .(Date = seq(Start, End, by = "day")), by = 1:nrow(dates_dt)][
      , .(count = .N), by = Date]
  }
  , Ronak_stack_base = {
    stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
  }
  , akrun_purrr = {
    dates %>%
      transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
      unnest(Date) %>%
      count(Date)
  }
  , akrun_base = {
    lst1 <- do.call(Map, c(f = seq,  unname(dates), by = "1 day"))
    data.frame(table(do.call(c, lst1)))
  }
  , cole_base = {
    date_diffs <- dates$End - dates$Start
    x <- c(dates[['Start']],
           with(subset(dates, subset = date_diffs > 0)
                ,c(End, rep(Start, date_diffs - 1) + sequence(date_diffs-1))
           ))
    data.frame(table(x))
  }
  , cole_vec_seq = {
    data.frame(table(do.call(c, vec_seq(dates[['Start']], dates[['End']], 1))))
  }
  , cole_dt_version = {
    date_diffs <- dates$End - dates$Start
    dates_dt[date_diffs > 0,
             data.frame(table({
               diff_sub <- End - Start - 1
               c(dates_dt[["Start"]], End, rep(Start, diff_sub) + sequence(diff_sub))
             }))]
  }
, times = 20
  )

Upvotes: 5

akrun

Reputation: 887851

An option would be to use map2 to get the sequence of dates between the corresponding 'Start' and 'End' columns, unnest the list output, and get the count.

library(dplyr)
library(tidyr)
library(purrr)
dates %>%
   transmute(Date = map2(Start, End, seq, by = "1 day")) %>%
   unnest(Date) %>%
   count(Date)
# A tibble: 7 x 2
#  Date           n
#  <date>     <int>
#1 2017-01-01     1
#2 2017-01-02     2
#3 2017-01-03     3
#4 2017-01-04     3
#5 2017-01-05     3
#6 2017-01-06     2
#7 2017-01-07     1

Or an option in base R

lst1 <- do.call(Map, c(f = seq,  unname(dates), by = "1 day"))
table(do.call(c, lst1))
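
If the two-column data frame from the question is wanted rather than a named table, the table() result can be wrapped in data.frame() and the default column names replaced; a minimal follow-up sketch (the name choices are just illustrative):

res <- data.frame(table(do.call(c, lst1)))
names(res) <- c("Date", "Count")
res$Date <- as.Date(res$Date)  # table() stores the dates as factor labels
res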

Upvotes: 2

tmfmnk

Reputation: 40171

A possibility involving dplyr and tidyr could be:

dates %>%
 rowid_to_column() %>%
 gather(var, Date, -rowid) %>%
 group_by(rowid) %>%
 complete(Date = seq(min(Date), max(Date), by = "1 day")) %>%
 ungroup() %>%
 count(Date)

  Date           n
  <date>     <int>
1 2017-01-01     1
2 2017-01-02     2
3 2017-01-03     3
4 2017-01-04     3
5 2017-01-05     3
6 2017-01-06     2
7 2017-01-07     1

Or with the addition of lubridate:

dates %>%
 rowwise() %>%
 mutate(Date = interval(Start, End)/days(1),
        Date = list(Start + days(0:Date))) %>%
 ungroup() %>%
 unnest() %>%
 count(Date)
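
One note on dependencies: rowid_to_column() comes from tibble, and interval() and days() from lubridate, so both pipes assume something like the following is attached in addition to dplyr and tidyr:

library(dplyr)
library(tidyr)
library(tibble)     # rowid_to_column()
library(lubridate)  # interval(), days()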

Upvotes: 2

Ronak Shah

Reputation: 389235

Using base R, we can create a sequence between the Start and End dates with Map and calculate the frequency of each date using table.

stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))

#  values        ind
#1      1 2017-01-01
#2      2 2017-01-02
#3      3 2017-01-03
#4      3 2017-01-04
#5      3 2017-01-05
#6      2 2017-01-06
#7      1 2017-01-07
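
stack() names the columns values and ind, in that order; to match the Date/Count layout asked for, the columns can be reordered and renamed, e.g. (a small follow-up sketch):

res <- stack(table(do.call(c, Map(seq, dates$Start, dates$End, by = "1 day"))))
res <- setNames(res[, c("ind", "values")], c("Date", "Count"))
res$Date <- as.Date(res$Date)  # ind comes back as a factor of date labels
res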

Upvotes: 2

M--

Reputation: 29202

Here's a possibility using data.table:

library(data.table)
setDT(dates)[, .(Date = seq(Start, End, by = "day")), by = 1:nrow(dates)][
  , .(count = .N), by = Date]

#>          Date count
#> 1: 2017-01-01     1
#> 2: 2017-01-02     2
#> 3: 2017-01-03     3
#> 4: 2017-01-04     3
#> 5: 2017-01-05     3
#> 6: 2017-01-06     2
#> 7: 2017-01-07     1
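
Note that setDT() converts dates to a data.table by reference, so the original data.frame is modified in place. If it should stay untouched, as.data.table() makes a copy instead; a minimal variant, assuming that is the intent:

library(data.table)
dates_dt <- as.data.table(dates)  # copy; `dates` stays a plain data.frame
dates_dt[, .(Date = seq(Start, End, by = "day")), by = 1:nrow(dates_dt)][
  , .(count = .N), by = Date]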

Upvotes: 3
