Reputation: 16478
Consider a data frame of the form
idnum start end
1993.1 17 1993-01-01 1993-12-31
1993.2 17 1993-01-01 1993-12-31
1993.3 17 1993-01-01 1993-12-31
with start
and end
being of type Date
$ idnum : int 17 17 17 17 27 27
$ start : Date, format: "1993-01-01" "1993-01-01" "1993-01-01" "1993-01-01" ...
$ end : Date, format: "1993-12-31" "1993-12-31" "1993-12-31" "1993-12-31" ...
I would like to create a new dataframe, that has instead monthly observations for every row, for every month in between start
and end
(including the boundaries):
Desired Output
idnum month
17 1993-01-01
17 1993-02-01
17 1993-03-01
...
17 1993-11-01
17 1993-12-01
I'm not sure what format month
should have, I will at some point want to group by idnum
, month
for regressions on the rest of the data set.
So far, for every single row, seq(from=test[1,'start'], to=test[1, 'end'], by='1 month')
gives me the right sequence - but as soon as I try to apply that to the whole data frame, it will not work:
> foo <- apply(test, 1, function(x) seq(x['start'], to=x['end'], by='1 month'))
Error in to - from : non-numeric argument to binary operator
Upvotes: 51
Views: 29291
Reputation: 39858
One option creating a sequence per every row using dplyr
and tidyr
could be:
df %>%
rowwise() %>%
transmute(idnum,
date = list(seq(start, end, by = "month"))) %>%
unnest(date)
idnum date
<int> <date>
1 17 1993-01-01
2 17 1993-02-01
3 17 1993-03-01
4 17 1993-04-01
5 17 1993-05-01
6 17 1993-06-01
7 17 1993-07-01
8 17 1993-08-01
9 17 1993-09-01
10 17 1993-10-01
# … with 26 more rows
Or creating the sequence using a grouping ID:
df %>%
group_by(idnum) %>%
transmute(date = list(seq(min(start), max(end), by = "month"))) %>%
unnest(date)
Or when the goal is to create only one unique sequence per ID:
df %>%
group_by(idnum) %>%
summarise(start = min(start),
end = max(end)) %>%
transmute(date = list(seq(min(start), max(end), by = "month"))) %>%
unnest(date)
date
<date>
1 1993-01-01
2 1993-02-01
3 1993-03-01
4 1993-04-01
5 1993-05-01
6 1993-06-01
7 1993-07-01
8 1993-08-01
9 1993-09-01
10 1993-10-01
11 1993-11-01
12 1993-12-01
Or using reframe()
since dplyr 1.1.0
:
df %>%
rowwise() %>%
reframe(idnum,
date = seq(start, end, by = "month"))
Upvotes: 10
Reputation: 1253
A vectorised solution which utilises lubridate
for the month calculations.
time_seq_v()
is a vectorised version of seq()
specifically for date and datetime calculations.
library(lubridate)
library(data.table)
# remotes::install_github("NicChr/timeplyr")
library(timeplyr)
df <- data.frame(idnum = c(1993.1, 1993.2, 1993.3),
start = ymd(rep(19930101, 3)),
end = ymd(rep(19931231, 3)))
setDT(df)
df[, list(month = time_seq_v(start, end, by = "month"))]
#> month
#> 1: 1993-01-01
#> 2: 1993-02-01
#> 3: 1993-03-01
#> 4: 1993-04-01
#> 5: 1993-05-01
#> 6: 1993-06-01
#> 7: 1993-07-01
#> 8: 1993-08-01
#> 9: 1993-09-01
#> 10: 1993-10-01
#> 11: 1993-11-01
#> 12: 1993-12-01
#> 13: 1993-01-01
#> 14: 1993-02-01
#> 15: 1993-03-01
#> 16: 1993-04-01
#> 17: 1993-05-01
#> 18: 1993-06-01
#> 19: 1993-07-01
#> 20: 1993-08-01
#> 21: 1993-09-01
#> 22: 1993-10-01
#> 23: 1993-11-01
#> 24: 1993-12-01
#> 25: 1993-01-01
#> 26: 1993-02-01
#> 27: 1993-03-01
#> 28: 1993-04-01
#> 29: 1993-05-01
#> 30: 1993-06-01
#> 31: 1993-07-01
#> 32: 1993-08-01
#> 33: 1993-09-01
#> 34: 1993-10-01
#> 35: 1993-11-01
#> 36: 1993-12-01
#> month
Created on 2023-05-16 with reprex v2.0.2
Upvotes: 0
Reputation: 123818
And yet another tidyverse
approach would be to use tidyr::expand
:
library(dplyr, warn = FALSE)
library(tidyr)
df |>
mutate(
row = row_number()
) |>
group_by(row) |>
expand(idnum, date = seq(start, end, "month")) |>
ungroup() |>
select(-row)
#> # A tibble: 36 × 2
#> idnum date
#> <int> <date>
#> 1 17 1993-01-01
#> 2 17 1993-02-01
#> 3 17 1993-03-01
#> 4 17 1993-04-01
#> 5 17 1993-05-01
#> 6 17 1993-06-01
#> 7 17 1993-07-01
#> 8 17 1993-08-01
#> 9 17 1993-09-01
#> 10 17 1993-10-01
#> # … with 26 more rows
Upvotes: 6
Reputation: 886938
With new versions of purrr
(0.3.0
) and dplyr
(0.8.0
), this can be done with map2
library(dplyr)
library(purrr)
test %>%
# sequence of monthly dates for each corresponding start, end elements
transmute(idnum, month = map2(start, end, seq, by = "1 month")) %>%
# unnest the list column
unnest %>%
# remove any duplicate rows
distinct
Based on @Ananda Mahto's comments
res1 <- melt(setNames(lapply(1:nrow(test), function(x) seq(test[x, "start"],
test[x, "end"], by = "1 month")), test$idnum))
Also,
res2 <- setNames(do.call(`rbind`,
with(test,
Map(`expand.grid`,idnum,
Map(`seq`, start, end, by='1 month')))), c("idnum", "month"))
head(res1)
# idnum month
#1 17 1993-01-01
#2 17 1993-02-01
#3 17 1993-03-01
#4 17 1993-04-01
#5 17 1993-05-01
#6 17 1993-06-01
Upvotes: 12
Reputation: 118779
Using data.table
:
require(data.table) ## 1.9.2+
setDT(df)[ , list(idnum = idnum, month = seq(start, end, by = "month")), by = 1:nrow(df)]
# you may use dot notation as a shorthand alias of list in j:
setDT(df)[ , .(idnum = idnum, month = seq(start, end, by = "month")), by = 1:nrow(df)]
setDT
converts df
to a data.table
. Then for each row, by = 1:nrow(df)
, we create idnum
and month
as required.
Upvotes: 48
Reputation: 13581
tidyverse
answer
Data
df <- structure(list(idnum = c(17L, 17L, 17L), start = structure(c(8401,
8401, 8401), class = "Date"), end = structure(c(8765, 8765, 8765
), class = "Date")), class = "data.frame", .Names = c("idnum",
"start", "end"), row.names = c(NA, -3L))
Answer and output
library(tidyverse)
df %>%
nest(start, end) %>%
mutate(data = map(data, ~seq(unique(.x$start), unique(.x$end), 1))) %>%
unnest(data)
# # A tibble: 365 x 2
# idnum data
# <int> <date>
# 1 17 1993-01-01
# 2 17 1993-01-02
# 3 17 1993-01-03
# 4 17 1993-01-04
# 5 17 1993-01-05
# 6 17 1993-01-06
# 7 17 1993-01-07
# 8 17 1993-01-08
# 9 17 1993-01-09
# 10 17 1993-01-10
# # ... with 355 more rows
Upvotes: 4
Reputation: 49033
Using dplyr
:
test %>%
group_by(idnum) %>%
summarize(start=min(start),end=max(end)) %>%
do(data.frame(idnum=.$idnum, month=seq(.$start,.$end,by="1 month")))
Note that here I don't generate a sequence between start
and end
for each row, instead it is a sequence between min(start)
and max(end)
for each idnum
. If you want the former :
test %>%
rowwise() %>%
do(data.frame(idnum=.$idnum, month=seq(.$start,.$end,by="1 month")))
Upvotes: 27