Reputation: 241
I have a list where each element is a data frame that flags dates in a trip duration for each individual.
group.l <- list(
data.frame(DATE = seq.Date(as.Date("2017-01-01"),
as.Date("2017-12-31"),
"days")) %>%
mutate(PERSON1_TRIP1 = ifelse(DATE %in% seq.Date(as.Date("2017-01-05"),
as.Date("2017-03-04"),
"days"), 1, 0)),
data.frame(DATE = seq.Date(as.Date("2017-01-01"),
as.Date("2017-12-31"),
"days")) %>%
mutate(PERSON1_TRIP2 = ifelse(DATE %in% seq.Date(as.Date("2017-04-15"),
as.Date("2017-06-04"),
"days"), 1, 0)),
data.frame(DATE = seq.Date(as.Date("2017-01-01"),
as.Date("2017-12-31"),
"days")) %>%
mutate(PERSON2_TRIP1 = ifelse(DATE %in% seq.Date(as.Date("2017-01-01"),
as.Date("2017-01-05"),
"days"), 1, 0)),
data.frame(DATE = seq.Date(as.Date("2017-01-01"),
as.Date("2017-12-31"),
"days")) %>%
mutate(PERSON2_TRIP2 = ifelse(DATE %in% seq.Date(as.Date("2017-11-15"),
as.Date("2017-12-04"),
"days"), 1, 0))
)
A sample of the 3rd element of the list looks like this
I would like to have a data frame where there is one column for Person, one column for trip number, and the remaining columns would be each date that would take a 1 if person i's trip x fell on that column date.
maybe a transpose combination with a str_split would work? transpose would make the date column into the new individual date columns and the str_split would split up the original PERSON_TRIP column name into two individual columns.
My real data set spans over 2004 to 2019 with thousands of PERSON_TRIP combinations so I would need the most efficient code as a way to save on computational time.
Upvotes: 0
Views: 159
Reputation: 887008
We can bind the list
elements with bind_rows
, use pivot_longer
to reshape the data into 'long' format, separate
the column 'name' at the delimiter _
and reshape it back to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
bind_rows(group.l) %>%
pivot_longer(cols = -DATE, values_drop_na = TRUE) %>%
separate(name, into = c("PERSON", "TRIP")) %>%
mutate(rn = rowid(PERSON, TRIP, DATE)) %>%
pivot_wider(names_from = DATE, values_from = value, values_fill = 0)
-output
# A tibble: 4 x 368
PERSON TRIP rn `2017-01-01` `2017-01-02` `2017-01-03` `2017-01-04` `2017-01-05` `2017-01-06` `2017-01-07` `2017-01-08` `2017-01-09`
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 PERSO… TRIP1 1 0 0 0 0 1 1 1 1 1
2 PERSO… TRIP2 1 0 0 0 0 0 0 0 0 0
3 PERSO… TRIP1 1 1 1 1 1 1 0 0 0 0
4 PERSO… TRIP2 1 0 0 0 0 0 0 0 0 0
# … with 356 more variables: `2017-01-10` <dbl>, `2017-01-11` <dbl>, `2017-01-12` <dbl>, `2017-01-13` <dbl>, `2017-01-14` <dbl>,
# `2017-01-15` <dbl>, `2017-01-16` <dbl>, `2017-01-17` <dbl>, `2017-01-18` <dbl>, `2017-01-19` <dbl>, `2017-01-20` <dbl>, `2017-01-21` <dbl>,
# `2017-01-22` <dbl>, `2017-01-23` <dbl>, `2017-01-24` <dbl>, `2017-01-25` <dbl>, `2017-01-26` <dbl>, `2017-01-27` <dbl>, `2017-01-28` <dbl>,
# `2017-01-29` <dbl>, `2017-01-30` <dbl>, `2017-01-31` <dbl>, `2017-02-01` <dbl>, `2017-02-02` <dbl>, `2017-02-03` <dbl>, `2017-02-04` <dbl>,
# `2017-02-05` <dbl>, `2017-02-06` <dbl>, `2017-02-07` <dbl>, `2017-02-08` <dbl>, `2017-02-09` <dbl>, `2017-02-10` <dbl>, `2017-02-11` <dbl>,
# `2017-02-12` <dbl>, `2017-02-13` <dbl>, `2017-02-14` <dbl>, `2017-02-15` <dbl>, `2017-02-16` <dbl>, `2017-02-17` <dbl>, `2017-02-18` <dbl>,
# `2017-02-19` <dbl>, `2017-02-20` <dbl>, `2017-02-21` <dbl>, `2017-02-22` <dbl>, `2017-02-23` <dbl>, `2017-02-24` <dbl>, `2017-02-25` <dbl>,
# `2017-02-26` <dbl>, `2017-02-27` <dbl>, `2017-02-28` <dbl>, `2017-03-01` <dbl>, `2017-03-02` <dbl>, `2017-03-03` <dbl>, `2017-03-04` <dbl>,
# `2017-03-05` <dbl>, `2017-03-06` <dbl>, `2017-03-07` <dbl>, `2017-03-08` <dbl>, `2017-03-09` <dbl>, `2017-03-10` <dbl>, `2017-03-11` <dbl>,
# `2017-03-12` <dbl>, `2017-03-13` <dbl>, `2017-03-14` <dbl>, `2017-03-15` <dbl>, `2017-03-16` <dbl>, `2017-03-17` <dbl>, `2017-03-18` <dbl>,
# `2017-03-19` <dbl>, `2017-03-20` <dbl>, `2017-03-21` <dbl>, `2017-03-22` <dbl>, `2017-03-23` <dbl>, `2017-03-24` <dbl>, `2017-03-25` <dbl>,
# `2017-03-26` <dbl>, `2017-03-27` <dbl>, `2017-03-28` <dbl>, `2017-03-29` <dbl>, `2017-03-30` <dbl>, `2017-03-31` <dbl>, `2017-04-01` <dbl>,
# `2017-04-02` <dbl>, `2017-04-03` <dbl>, `2017-04-04` <dbl>, `2017-04-05` <dbl>, `2017-04-06` <dbl>, `2017-04-07` <dbl>, `2017-04-08` <dbl>,
# `2017-04-09` <dbl>, `2017-04-10` <dbl>, `2017-04-11` <dbl>, `2017-04-12` <dbl>, `2017-04-13` <dbl>, `2017-04-14` <dbl>, `2017-04-15` <dbl>,
# `2017-04-16` <dbl>, `2017-04-17` <dbl>, `2017-04-18` <dbl>, `2017-04-19` <dbl>, …
>
Upvotes: 2
Reputation: 1579
Use stringr::str_split_fixed()
:
data <-
map_dfr(
group.l,
function(df) {
df <- df %>%
mutate(
person_trip = names(df)[2]
)
df <-
bind_cols(
df %>% select(-person_trip) %>% setNames(c("date", "value")),
as.data.frame(stringr::str_split_fixed(df$person_trip, "_", 2)) %>%
setNames(c("person", "trip"))
)
}
) %>%
tidyr::pivot_wider(names_from = date, values_from = value) # this will give you what you want although I don't think spread the date value is a good idea
> data
# A tibble: 4 x 367
person trip `2017-01-01` `2017-01-02` `2017-01-03` `2017-01-04` `2017-01-05` `2017-01-06` `2017-01-07` `2017-01-08` `2017-01-09` `2017-01-10`
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 PERSO… TRIP1 0 0 0 0 1 1 1 1 1 1
2 PERSO… TRIP2 0 0 0 0 0 0 0 0 0 0
3 PERSO… TRIP1 1 1 1 1 1 0 0 0 0 0
4 PERSO… TRIP2 0 0 0 0 0 0 0 0 0 0
# … with 355 more variables: `2017-01-11` <dbl>, `2017-01-12` <dbl>, `2017-01-13` <dbl>, `2017-01-14` <dbl>, `2017-01-15` <dbl>,
# `2017-01-16` <dbl>, `2017-01-17` <dbl>, `2017-01-18` <dbl>, `2017-01-19` <dbl>, `2017-01-20` <dbl>, `2017-01-21` <dbl>, `2017-01-22` <dbl>,
# `2017-01-23` <dbl>, `2017-01-24` <dbl>, `2017-01-25` <dbl>, `2017-01-26` <dbl>, `2017-01-27` <dbl>, `2017-01-28` <dbl>, `2017-01-29` <dbl>,
# `2017-01-30` <dbl>, `2017-01-31` <dbl>, `2017-02-01` <dbl>, `2017-02-02` <dbl>, `2017-02-03` <dbl>, `2017-02-04` <dbl>, `2017-02-05` <dbl>,
# `2017-02-06` <dbl>, `2017-02-07` <dbl>, `2017-02-08` <dbl>, `2017-02-09` <dbl>, `2017-02-10` <dbl>, `2017-02-11` <dbl>, `2017-02-12` <dbl>,
# `2017-02-13` <dbl>, `2017-02-14` <dbl>, `2017-02-15` <dbl>, `2017-02-16` <dbl>, `2017-02-17` <dbl>, `2017-02-18` <dbl>, `2017-02-19` <dbl>,
# `2017-02-20` <dbl>, `2017-02-21` <dbl>, `2017-02-22` <dbl>, `2017-02-23` <dbl>, `2017-02-24` <dbl>, `2017-02-25` <dbl>, `2017-02-26` <dbl>,
# `2017-02-27` <dbl>, `2017-02-28` <dbl>, `2017-03-01` <dbl>, `2017-03-02` <dbl>, `2017-03-03` <dbl>, `2017-03-04` <dbl>, `2017-03-05` <dbl>,
# `2017-03-06` <dbl>, `2017-03-07` <dbl>, `2017-03-08` <dbl>, `2017-03-09` <dbl>, `2017-03-10` <dbl>, `2017-03-11` <dbl>, `2017-03-12` <dbl>,
# `2017-03-13` <dbl>, `2017-03-14` <dbl>, `2017-03-15` <dbl>, `2017-03-16` <dbl>, `2017-03-17` <dbl>, `2017-03-18` <dbl>, `2017-03-19` <dbl>,
# `2017-03-20` <dbl>, `2017-03-21` <dbl>, `2017-03-22` <dbl>, `2017-03-23` <dbl>, `2017-03-24` <dbl>, `2017-03-25` <dbl>, `2017-03-26` <dbl>,
# `2017-03-27` <dbl>, `2017-03-28` <dbl>, `2017-03-29` <dbl>, `2017-03-30` <dbl>, `2017-03-31` <dbl>, `2017-04-01` <dbl>, `2017-04-02` <dbl>,
# `2017-04-03` <dbl>, `2017-04-04` <dbl>, `2017-04-05` <dbl>, `2017-04-06` <dbl>, `2017-04-07` <dbl>, `2017-04-08` <dbl>, `2017-04-09` <dbl>,
# `2017-04-10` <dbl>, `2017-04-11` <dbl>, `2017-04-12` <dbl>, `2017-04-13` <dbl>, `2017-04-14` <dbl>, `2017-04-15` <dbl>, `2017-04-16` <dbl>,
# `2017-04-17` <dbl>, `2017-04-18` <dbl>, `2017-04-19` <dbl>, `2017-04-20` <dbl>, …
Upvotes: 2