Reputation: 1493
How can I convert my column "payment" from long to wide format while keeping the other columns unchanged?
For each level of "letter", when the cell is before the value of "payment", then when in the wide format this row of the corresponding new variable "e.g., dollar" will have "0"; otherwise "1".
I tried output_format_test<-input_format%>%tidyr::pivot_wider(names_from = age, values_from = payment)
, but it does not produce the intended result.
##Input format
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
input_format
# A tibble: 11 x 3
letter age payment
<chr> <dbl> <chr>
1 A 2 NA
2 A 3 dollar
3 A 4 NA
4 D 2 euro
5 D 3 dollar
6 D 4 NA
7 F 2 NA
8 F 3 euro
9 F 3 dollar
10 F 4 NA
11 F 4 NA
##output format
output_format <- readr::read_table2(
"letter age payment dollar euro
A 2 NA 0 0
A 3 dollar 1 0
A 4 NA 1 0
D 2 euro 0 1
D 3 dollar 1 1
D 4 NA 1 1
F 2 NA 0 0
F 3 euro 0 1
F 3 dollar 1 1
F 4 NA 1 1
F 4 NA 1 1
")
output_format
# A tibble: 11 x 5
letter age payment dollar euro
<chr> <dbl> <chr> <dbl> <dbl>
1 A 2 NA 0 0
2 A 3 dollar 1 0
3 A 4 NA 1 0
4 D 2 euro 0 1
5 D 3 dollar 1 1
6 D 4 NA 1 1
7 F 2 NA 0 0
8 F 3 euro 0 1
9 F 3 dollar 1 1
10 F 4 NA 1 1
11 F 4 NA 1 1
Thanks. Edited.
Upvotes: 4
Views: 688
Reputation: 21908
You can also use the following tidyverse
solution:
library(dplyr)
library(tidyr)
library(stringr)
input_format %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = payment, values_from = payment,
values_fn = length) %>%
select(- c(id, `NA`)) %>%
bind_cols(input_format$payment) %>%
rename_with(~ str_replace(., "\\.\\.\\.\\d+", "payment"), contains(fixed("..."))) %>%
relocate(letter, age, payment) %>%
group_by(letter) %>%
replace_na(list(dollar = 0, euro = 0)) %>%
mutate(across(dollar:euro, ~ cummax(.x))) -> input2
# A tibble: 11 x 5
# Groups: letter [3]
letter age payment dollar euro
<chr> <dbl> <chr> <dbl> <dbl>
1 A 2 NA 0 0
2 A 3 dollar 1 0
3 A 4 NA 1 0
4 D 2 euro 0 1
5 D 3 dollar 1 1
6 D 4 NA 1 1
7 F 2 NA 0 0
8 F 3 euro 0 1
9 F 3 dollar 1 1
10 F 4 NA 1 1
11 F 4 NA 1 1
After discussion in the comment you can use the following solution to get your desired output:
input2 %>%
group_by(letter, age) %>%
add_count() %>%
group_by(letter, age) %>%
filter((n == 2 & if_all(dollar:euro, ~ .x == 1)) | n == 1) %>%
select(-n) %>%
group_by(letter, age) %>%
add_count() %>%
group_split(letter, age) %>%
map_dfr(~ if(.x$n[1] == 2) {
.x %>% slice_tail(n = 1)
} else {
.x
})
# A tibble: 9 x 6
letter age payment dollar euro n
<chr> <dbl> <chr> <dbl> <dbl> <int>
1 A 2 NA 0 0 1
2 A 3 dollar 1 0 1
3 A 4 NA 1 0 1
4 D 2 euro 0 1 1
5 D 3 dollar 1 1 1
6 D 4 NA 1 1 1
7 F 2 NA 0 0 1
8 F 3 dollar 1 1 1
9 F 4 NA 1 1 2
Upvotes: 3
Reputation: 18551
To add another approach:
We could use map_dfc
and set_names
to loop over a named vector of payment
s.
library(dplyr)
library(purrr)
input_format %>%
group_by(letter) %>%
mutate(map_dfc(unique(.$payment) %>% set_names(., .),
~ cumsum(!(payment != .x | is.na(payment)))
)) %>%
select(- `...1`)
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> # A tibble: 11 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <int> <int>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 D 2 euro 0 1
#> 5 D 3 dollar 1 1
#> 6 D 4 <NA> 1 1
#> 7 F 2 <NA> 0 0
#> 8 F 3 euro 0 1
#> 9 F 3 dollar 1 1
#> 10 F 4 <NA> 1 1
#> 11 F 4 <NA> 1 1
We could simplify the code above with a package I have on github and use over
instead of map_dfc
and dist_values
instead of unique
library(dplyover) # https://github.com/TimTeaFan/dplyover
input_format %>%
group_by(letter) %>%
mutate(over(dist_values(.$payment),
~ cumsum(!(payment != .x | is.na(payment)))
))
#> # A tibble: 11 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 D 2 euro 0 1
#> 5 D 3 dollar 1 1
#> 6 D 4 <NA> 1 1
#> 7 F 2 <NA> 0 0
#> 8 F 3 euro 0 1
#> 9 F 3 dollar 1 1
#> 10 F 4 <NA> 1 1
#> 11 F 4 <NA> 1 1
Created on 2021-06-04 by the reprex package (v0.3.0)
However, neither my answer nor the accepted answer by @AnilGoyal can handle data when a payment method is mentioned more than once per group. I don't know if the desired answer should account for this case. At the moment only the answers by @Wietse de Vries and @Anoushiravan R work with this kind of data:
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
A 5 dollar # this line is new
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
We could easily adjust the approach above as follows to account for this case:
input_format %>%
group_by(letter) %>%
mutate(over(dist_values(.$payment),
~ ifelse(
cumsum(!(payment != .x | is.na(payment))) >= 1,
1, 0)
))
#> # A tibble: 12 x 5
#> # Groups: letter [3]
#> letter age payment dollar euro
#> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 A 2 <NA> 0 0
#> 2 A 3 dollar 1 0
#> 3 A 4 <NA> 1 0
#> 4 A 5 dollar 1 0
#> 5 D 2 euro 0 1
#> 6 D 3 dollar 1 1
#> 7 D 4 <NA> 1 1
#> 8 F 2 <NA> 0 0
#> 9 F 3 euro 0 1
#> 10 F 3 dollar 1 1
#> 11 F 4 <NA> 1 1
#> 12 F 4 <NA> 1 1
Created on 2021-06-04 by the reprex package (v0.3.0)
Upvotes: 2
Reputation: 2783
using zoo (and data.table but not required):
input_format <- fread("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
output_format <- copy(input_format)[payment == "dollar", dollar := 1][, dollar := na.locf0(dollar), by=.(letter)]
output_format[payment == "euro", euro := 1][, euro := na.locf0(euro), by=.(letter)]
output_format[, c("dollar", "euro")][is.na(output_format[, c("dollar", "euro")])] <- 0
which produces:
> output_format
letter age payment dollar euro
1: A 2 <NA> 0 0
2: A 3 dollar 1 0
3: A 4 <NA> 1 0
4: D 2 euro 0 1
5: D 3 dollar 1 1
6: D 4 <NA> 1 1
7: F 2 <NA> 0 0
8: F 3 euro 0 1
9: F 3 dollar 1 1
10: F 4 <NA> 1 1
11: F 4 <NA> 1 1
Upvotes: 2
Reputation: 26218
Tidyverse approach
input_format <- readr::read_table2("letter age payment
A 2 NA
A 3 dollar
A 4 NA
D 2 euro
D 3 dollar
D 4 NA
F 2 NA
F 3 euro
F 3 dollar
F 4 NA
F 4 NA")
library(tidyverse)
input_format %>% mutate(rowid = row_number(),
payment1 = payment,
dummy = 1) %>%
pivot_wider(id_cols = -c(payment1, dummy), names_from = payment1, values_from = dummy, values_fill = 0, values_fn = length) %>%
select(-`NA`) %>%
group_by(letter) %>%
mutate(across(c('dollar', 'euro'), cumsum))
#> # A tibble: 11 x 6
#> # Groups: letter [3]
#> letter age payment rowid dollar euro
#> <chr> <dbl> <chr> <int> <int> <int>
#> 1 A 2 <NA> 1 0 0
#> 2 A 3 dollar 2 1 0
#> 3 A 4 <NA> 3 1 0
#> 4 D 2 euro 4 0 1
#> 5 D 3 dollar 5 1 1
#> 6 D 4 <NA> 6 1 1
#> 7 F 2 <NA> 7 0 0
#> 8 F 3 euro 8 0 1
#> 9 F 3 dollar 9 1 1
#> 10 F 4 <NA> 10 1 1
#> 11 F 4 <NA> 11 1 1
Created on 2021-06-04 by the reprex package (v2.0.0)
Upvotes: 2