Reputation: 163
I have a dataset in which each row is a minister ID, for each row I have columns with the date (day, month, and year) the minister entered the government (begin) and exit the government (exit). This is a sample on how it looks like right now:
data <- structure(list(id_min = c("1030015", "1030028"), begin_day = c("29",
"4"), begin_month = c("12", "1"), begin_year = c("2019", "2020"
), exit_day = c("3", "10"), exit_month = c("1", "1"), exit_year = c("2020",
"2020")), row.names = c(NA, -2L), class = c("data.frame"))
I want to expand the rows between the "begin date" and "exit date", and also create a new column (number_days) counting the number of days the minister was in the government. The other variables in the dataset (omitted here) should just be repeated in the expanded version of the data). This is the data frame output I am looking for:
I was able to put the dates together in the format %Y-%m-%d, and create the new column "number_days" with the following codes
data$begin_date <- as.Date(with(data, paste(begin_year, begin_month, begin_day,sep="-")), "%Y-%m-%d")
data$exit_date <- as.Date(with(data, paste(exit_year, exit_month, exit_day,sep="-")), "%Y-%m-%d")
data$number_days <- data$exit_date - data$begin_date
But I am not having success expanding the rows between the two dates (begin_date and exit_date). I was trying to do it using tidyr::complete()
.
Upvotes: 0
Views: 623
Reputation: 388982
A tidyverse
solution :
library(tidyverse)
df %>%
#Combine begin columns to have start_day
unite(start_day, begin_year, begin_month, begin_day, sep = '-', remove = FALSE) %>%
#Combine exit columns to have end_day
unite(end_day, exit_year, exit_month, exit_day, sep = '-', remove = FALSE) %>%
#Convert to date
mutate_at(vars(start_day, end_day), as.Date) %>%
#Count number of days between them
mutate(number_days = as.integer(end_day - start_day) + 1) %>%
#Create a sequence between two dates
mutate(date = map2(start_day, end_day, seq, by = 'day')) %>%
#Add the data in long format
unnest(date) %>%
#separate date into year, month and date.
separate(date, c('begin_year', 'begin_month', 'begin_day'), sep = '-') %>%
select(-start_day, -end_day)
# A tibble: 13 x 8
# id_min exit_day exit_month exit_year number_days begin_year begin_month begin_day
# <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
# 1 1030015 3 1 2020 6 2019 12 29
# 2 1030015 3 1 2020 6 2019 12 30
# 3 1030015 3 1 2020 6 2019 12 31
# 4 1030015 3 1 2020 6 2020 01 01
# 5 1030015 3 1 2020 6 2020 01 02
# 6 1030015 3 1 2020 6 2020 01 03
# 7 1030028 10 1 2020 7 2020 01 04
# 8 1030028 10 1 2020 7 2020 01 05
# 9 1030028 10 1 2020 7 2020 01 06
#10 1030028 10 1 2020 7 2020 01 07
#11 1030028 10 1 2020 7 2020 01 08
#12 1030028 10 1 2020 7 2020 01 09
#13 1030028 10 1 2020 7 2020 01 10
Upvotes: 4
Reputation: 24790
One approach is with the uncount
function:
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(start_date = mdy(paste(begin_month,begin_day,begin_year,sep="-")),
end_date = mdy(paste(exit_month,exit_day,exit_year,sep="-")),
number_days = as.integer(end_date-start_date + 1)) %>%
uncount(as.integer(number_days)) %>%
group_by(id_min) %>%
mutate(begin_day = day(seq(start_date[1], end_date[1], by = "days")),
begin_month = month(seq(start_date[1], end_date[1], by = "days")),
begin_year = year(seq(start_date[1], end_date[1], by = "days"))) %>%
dplyr::select(-start_date, -end_date)
# A tibble: 13 x 8
# Groups: id_min [2]
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <int> <int> <int> <chr> <chr> <chr> <int>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 1 1 2020 3 1 2020 6
5 1030015 2 1 2020 3 1 2020 6
6 1030015 3 1 2020 3 1 2020 6
7 1030028 4 1 2020 10 1 2020 7
8 1030028 5 1 2020 10 1 2020 7
9 1030028 6 1 2020 10 1 2020 7
10 1030028 7 1 2020 10 1 2020 7
11 1030028 8 1 2020 10 1 2020 7
12 1030028 9 1 2020 10 1 2020 7
13 1030028 10 1 2020 10 1 2020 7
Upvotes: 1
Reputation: 3178
You can use the fill_gaps()
function from the tsibble
package as well.
library(tsibble)
library(dplyr)
library(tidyr)
data %>%
mutate(
exit_date = as.Date(paste(exit_year, exit_month, exit_day, sep = "-")),
begin_date = as.Date(paste(begin_year, begin_month, begin_day, sep = "-")),
number_days = as.numeric(exit_date - begin_date) + 1
) %>%
pivot_longer(cols = ends_with("date"), names_to = "event",values_to = "date") %>%
as_tsibble(key = id_min, index = date) %>%
fill_gaps() %>%
mutate(begin_day = format(date, "%d"),
begin_month = format(date, "%m"),
begin_year = format(date, "%Y")) %>%
as_tibble() %>%
select(-event, -date) %>%
fill(everything())
# A tibble: 13 x 8
id_min begin_day begin_month begin_year exit_day exit_month exit_year number_days
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 1030015 29 12 2019 3 1 2020 6
2 1030015 30 12 2019 3 1 2020 6
3 1030015 31 12 2019 3 1 2020 6
4 1030015 01 01 2020 3 1 2020 6
5 1030015 02 01 2020 3 1 2020 6
6 1030015 03 01 2020 3 1 2020 6
7 1030028 04 01 2020 10 1 2020 7
8 1030028 05 01 2020 10 1 2020 7
9 1030028 06 01 2020 10 1 2020 7
10 1030028 07 01 2020 10 1 2020 7
11 1030028 08 01 2020 10 1 2020 7
12 1030028 09 01 2020 10 1 2020 7
13 1030028 10 01 2020 10 1 2020 7
Upvotes: 2