Krantz
Krantz

Reputation: 1493

How to spread a single column into wide format with 0 and 1 as values defined conditionally?

How can I convert my column "payment" from long to wide format while keeping the other columns unchanged?

For each level of "letter", when the cell is before the value of "payment", then when in the wide format this row of the corresponding new variable "e.g., dollar" will have "0"; otherwise "1".

I tried output_format_test<-input_format%>%tidyr::pivot_wider(names_from = age, values_from = payment), but it does not produce the intended result.

##Input format

input_format <- readr::read_table2("letter age payment
    A 2 NA
    A 3 dollar
    A 4 NA
    D 2 euro
    D 3 dollar
    D 4 NA
    F 2 NA
    F 3 euro
    F 3 dollar
    F 4 NA
    F 4 NA")
    
input_format 
# A tibble: 11 x 3
   letter   age payment
   <chr>  <dbl> <chr>  
 1 A          2 NA     
 2 A          3 dollar 
 3 A          4 NA     
 4 D          2 euro   
 5 D          3 dollar 
 6 D          4 NA     
 7 F          2 NA     
 8 F          3 euro   
 9 F          3 dollar 
10 F          4 NA     
11 F          4 NA 

##output format

output_format <- readr::read_table2(
"letter age payment dollar euro
A 2 NA 0 0
A 3 dollar 1 0
A 4 NA 1 0
D 2 euro 0 1
D 3 dollar 1 1
D 4 NA 1 1
F 2 NA 0 0
F 3 euro 0 1
F 3 dollar 1 1
F 4 NA 1 1
F 4 NA 1 1
")
    
output_format
# A tibble: 11 x 5
   letter   age payment dollar  euro
   <chr>  <dbl> <chr>    <dbl> <dbl>
 1 A          2 NA           0     0
 2 A          3 dollar       1     0
 3 A          4 NA           1     0
 4 D          2 euro         0     1
 5 D          3 dollar       1     1
 6 D          4 NA           1     1
 7 F          2 NA           0     0
 8 F          3 euro         0     1
 9 F          3 dollar       1     1
10 F          4 NA           1     1
11 F          4 NA           1     1

Thanks. Edited.

Upvotes: 4

Views: 688

Answers (4)

Anoushiravan R
Anoushiravan R

Reputation: 21908

You can also use the following tidyverse solution:

library(dplyr)
library(tidyr)
library(stringr)

input_format %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = payment, values_from = payment, 
              values_fn = length) %>%
  select(- c(id, `NA`)) %>%
  bind_cols(input_format$payment) %>%
  rename_with(~ str_replace(., "\\.\\.\\.\\d+", "payment"), contains(fixed("..."))) %>%
  relocate(letter, age, payment) %>%
  group_by(letter) %>%
  replace_na(list(dollar = 0, euro = 0)) %>%
  mutate(across(dollar:euro, ~ cummax(.x))) -> input2


# A tibble: 11 x 5
# Groups:   letter [3]
   letter   age payment dollar  euro
   <chr>  <dbl> <chr>    <dbl> <dbl>
 1 A          2 NA           0     0
 2 A          3 dollar       1     0
 3 A          4 NA           1     0
 4 D          2 euro         0     1
 5 D          3 dollar       1     1
 6 D          4 NA           1     1
 7 F          2 NA           0     0
 8 F          3 euro         0     1
 9 F          3 dollar       1     1
10 F          4 NA           1     1
11 F          4 NA           1     1

After discussion in the comment you can use the following solution to get your desired output:

input2 %>%
  group_by(letter, age) %>%
  add_count() %>%
  group_by(letter, age) %>%
  filter((n == 2 & if_all(dollar:euro, ~ .x == 1)) | n == 1) %>%
  select(-n) %>%
  group_by(letter, age) %>%
  add_count() %>%
  group_split(letter, age) %>%
  map_dfr(~ if(.x$n[1] == 2) {
    .x %>% slice_tail(n = 1)
  } else {
    .x
  })

# A tibble: 9 x 6
  letter   age payment dollar  euro     n
  <chr>  <dbl> <chr>    <dbl> <dbl> <int>
1 A          2 NA           0     0     1
2 A          3 dollar       1     0     1
3 A          4 NA           1     0     1
4 D          2 euro         0     1     1
5 D          3 dollar       1     1     1
6 D          4 NA           1     1     1
7 F          2 NA           0     0     1
8 F          3 dollar       1     1     1
9 F          4 NA           1     1     2

Upvotes: 3

TimTeaFan
TimTeaFan

Reputation: 18551

To add another approach:

We could use map_dfc and set_names to loop over a named vector of payments.

library(dplyr)
library(purrr)

input_format %>% 
  group_by(letter) %>% 
  mutate(map_dfc(unique(.$payment) %>% set_names(., .),
                 ~ cumsum(!(payment != .x | is.na(payment)))
  )) %>% 
  select(- `...1`)

#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> New names:
#> * NA -> ...1
#> # A tibble: 11 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <int> <int>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 D          2 euro         0     1
#>  5 D          3 dollar       1     1
#>  6 D          4 <NA>         1     1
#>  7 F          2 <NA>         0     0
#>  8 F          3 euro         0     1
#>  9 F          3 dollar       1     1
#> 10 F          4 <NA>         1     1
#> 11 F          4 <NA>         1     1

We could simplify the code above with a package I have on github and use over instead of map_dfc and dist_values instead of unique

library(dplyover) # https://github.com/TimTeaFan/dplyover

input_format %>% 
  group_by(letter) %>% 
  mutate(over(dist_values(.$payment),
              ~ cumsum(!(payment != .x | is.na(payment)))
              ))

#> # A tibble: 11 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <dbl> <dbl>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 D          2 euro         0     1
#>  5 D          3 dollar       1     1
#>  6 D          4 <NA>         1     1
#>  7 F          2 <NA>         0     0
#>  8 F          3 euro         0     1
#>  9 F          3 dollar       1     1
#> 10 F          4 <NA>         1     1
#> 11 F          4 <NA>         1     1

Created on 2021-06-04 by the reprex package (v0.3.0)

However, neither my answer nor the accepted answer by @AnilGoyal can handle data when a payment method is mentioned more than once per group. I don't know if the desired answer should account for this case. At the moment only the answers by @Wietse de Vries and @Anoushiravan R work with this kind of data:

input_format <- readr::read_table2("letter age payment
    A 2 NA
    A 3 dollar
    A 4 NA
    A 5 dollar # this line is new
    D 2 euro
    D 3 dollar
    D 4 NA
    F 2 NA
    F 3 euro
    F 3 dollar
    F 4 NA
    F 4 NA")

We could easily adjust the approach above as follows to account for this case:

input_format %>% 
  group_by(letter) %>% 
  mutate(over(dist_values(.$payment),
              ~ ifelse(
                  cumsum(!(payment != .x | is.na(payment))) >= 1,
                  1, 0)
  ))

#> # A tibble: 12 x 5
#> # Groups:   letter [3]
#>    letter   age payment dollar  euro
#>    <chr>  <dbl> <chr>    <dbl> <dbl>
#>  1 A          2 <NA>         0     0
#>  2 A          3 dollar       1     0
#>  3 A          4 <NA>         1     0
#>  4 A          5 dollar       1     0
#>  5 D          2 euro         0     1
#>  6 D          3 dollar       1     1
#>  7 D          4 <NA>         1     1
#>  8 F          2 <NA>         0     0
#>  9 F          3 euro         0     1
#> 10 F          3 dollar       1     1
#> 11 F          4 <NA>         1     1
#> 12 F          4 <NA>         1     1

Created on 2021-06-04 by the reprex package (v0.3.0)

Upvotes: 2

koolmees
koolmees

Reputation: 2783

using zoo (and data.table but not required):

input_format <- fread("letter age payment
     A 2 NA
     A 3 dollar
     A 4 NA
     D 2 euro
     D 3 dollar
     D 4 NA
     F 2 NA
     F 3 euro
     F 3 dollar
     F 4 NA
     F 4 NA")

output_format <- copy(input_format)[payment == "dollar", dollar := 1][, dollar := na.locf0(dollar), by=.(letter)]
output_format[payment == "euro", euro := 1][, euro := na.locf0(euro), by=.(letter)]
output_format[, c("dollar", "euro")][is.na(output_format[, c("dollar", "euro")])] <- 0

which produces:

 > output_format
    letter age payment dollar euro
 1:      A   2    <NA>      0    0
 2:      A   3  dollar      1    0
 3:      A   4    <NA>      1    0
 4:      D   2    euro      0    1
 5:      D   3  dollar      1    1
 6:      D   4    <NA>      1    1
 7:      F   2    <NA>      0    0
 8:      F   3    euro      0    1
 9:      F   3  dollar      1    1
10:      F   4    <NA>      1    1
11:      F   4    <NA>      1    1

Upvotes: 2

AnilGoyal
AnilGoyal

Reputation: 26218

Tidyverse approach

input_format <- readr::read_table2("letter age payment
    A 2 NA
    A 3 dollar
    A 4 NA
    D 2 euro
    D 3 dollar
    D 4 NA
    F 2 NA
    F 3 euro
    F 3 dollar
    F 4 NA
    F 4 NA")

library(tidyverse)

input_format %>% mutate(rowid = row_number(),
                        payment1 = payment,
                        dummy = 1) %>%
  pivot_wider(id_cols = -c(payment1, dummy), names_from = payment1, values_from = dummy, values_fill = 0, values_fn = length) %>%
  select(-`NA`) %>%
  group_by(letter) %>%
  mutate(across(c('dollar', 'euro'), cumsum))
#> # A tibble: 11 x 6
#> # Groups:   letter [3]
#>    letter   age payment rowid dollar  euro
#>    <chr>  <dbl> <chr>   <int>  <int> <int>
#>  1 A          2 <NA>        1      0     0
#>  2 A          3 dollar      2      1     0
#>  3 A          4 <NA>        3      1     0
#>  4 D          2 euro        4      0     1
#>  5 D          3 dollar      5      1     1
#>  6 D          4 <NA>        6      1     1
#>  7 F          2 <NA>        7      0     0
#>  8 F          3 euro        8      0     1
#>  9 F          3 dollar      9      1     1
#> 10 F          4 <NA>       10      1     1
#> 11 F          4 <NA>       11      1     1

Created on 2021-06-04 by the reprex package (v2.0.0)

Upvotes: 2

Related Questions