Eisen
Eisen

Reputation: 1887

Creating dummy variables in tidyverse syntax?

I have a data frame like so:

# Example data: two numeric 0/1 flags plus one categorical column.
dat <- tibble(
  isHot = c(1, 0, 1, 0),
  isCrispy = c(0, 0, 1, 0),
  Restaurant = c("A", "B", "B", "C")
)

> dat
# A tibble: 4 × 3
  isHot isCrispy Restaurant
  <dbl>    <dbl> <chr>     
1     1        0 A         
2     0        0 B         
3     1        1 B         
4     0        0 C    

I want to create dummy variables for all categorical variables to get the following output:

  isHot isCrispy Restaurant_A Restaurant_B     
1     1        0 1            0
2     0        0 0            1
3     1        1 0            1
4     0        0 0            0

Can I do this using only tidyverse syntax? I don't want to use recipes, fastDummies, or other packages.

EDIT:

I want the code to work for all categorical variables. In this example there is only one categorical variable, but what if there are more? I want to be able to take each feature name and create dummies from it. For example, if there is another categorical feature called City, I would get dummy variables such as City_A, City_B, etc.

Upvotes: 1

Views: 3850

Answers (3)

Eric
Eric

Reputation: 41

tidyr - all categorical columns

To obtain dummy variables for all categorical columns using a 'tidy' approach, we can use pivot_longer prior to pivot_wider.

# Example data with an explicit row id and two categorical columns.
dat <- tibble(
  id = c(1, 2, 3, 4),
  isHot = c(1, 0, 1, 0),
  isCrispy = c(0, 0, 1, 0),
  Restaurant = c("A", "B", "B", "C"),
  City = c("B", "B", "A", "A")
)

> dat
# # A tibble: 4 x 5
#      id isHot isCrispy Restaurant City 
#   <dbl> <dbl>    <dbl> <chr>      <chr>
# 1     1     1        0 A          B    
# 2     2     0        0 B          B    
# 3     3     1        1 B          A    
# 4     4     0        0 C          A  

# Reshape long so each character column contributes (name, level) pairs,
# tag every pair with a 1, then spread back out: one 0/1 column per pair.
dat %>%
  pivot_longer(
    cols = where(is.character),
    names_to = "dummy_names",
    values_to = "dummy_levels"
  ) %>%
  mutate(dummy_value = 1) %>%
  pivot_wider(
    names_from = c(dummy_names, dummy_levels),
    names_sep = "_", # spelled-out default: yields e.g. Restaurant_A
    values_from = dummy_value,
    values_fill = 0 # name/level pairs absent from a row become 0
  )
# # A tibble: 4 x 8
#      id isHot isCrispy Restaurant_A City_B Restaurant_B City_A Restaurant_C
#   <dbl> <dbl>    <dbl>        <dbl>  <dbl>        <dbl>  <dbl>        <dbl>
# 1     1     1        0            1      1            0      0            0
# 2     2     0        0            0      1            1      0            0
# 3     3     1        1            0      0            1      1            0
# 4     4     0        0            0      0            0      1            1

Note that the dummy columns appear in the order in which each name/level combination is first encountered in the original data. It isn't difficult to reorder the columns to your own liking. For example, something like this:

# Same long-then-wide reshape as before, followed by a column reorder.
dat %>%
  pivot_longer(
    cols = where(is.character),
    names_to = "dummy_names",
    values_to = "dummy_levels"
  ) %>%
  mutate(dummy_value = 1) %>%
  pivot_wider(
    names_from = c(dummy_names, dummy_levels),
    names_sep = "_", # spelled-out default separator
    values_from = dummy_value,
    values_fill = 0
  ) %>%
  # The first three columns are placed by hand; order() then supplies the
  # positions of all columns in sorted-name order, and select() keeps each
  # column only at its first mention.
  select(id, isCrispy, isHot, order(names(.)))
# # A tibble: 4 x 8
#      id isCrispy isHot City_A City_B Restaurant_A Restaurant_B Restaurant_C
#   <dbl>    <dbl> <dbl>  <dbl>  <dbl>        <dbl>        <dbl>        <dbl>
# 1     1        0     1      0      1            1            0            0
# 2     2        0     0      0      1            0            1            0
# 3     3        1     1      1      0            0            1            0
# 4     4        0     0      1      0            0            0            1

Upvotes: 4

r2evans
r2evans

Reputation: 160407

dplyr

As requested,

library(dplyr)
library(tidyr) # pivot_wider
# `rn` gives every row a unique key so identical rows are not merged by
# pivot_wider(); `val` is the 1 written into the matching dummy column.
dat %>%
  mutate(rn = row_number(), val = 1L) %>%
  pivot_wider(
    # id_cols has to be named: passing it by position is deprecated in tidyr.
    id_cols = c(rn, isHot, isCrispy),
    names_from = Restaurant,
    values_from = val,
    names_prefix = "Restaurant_",
    values_fill = 0L
  ) %>%
  select(-rn)
# # A tibble: 4 x 5
#   isHot isCrispy Restaurant_A Restaurant_B Restaurant_C
#   <dbl>    <dbl>        <int>        <int>        <int>
# 1     1        0            1            0            0
# 2     0        0            0            1            0
# 3     1        1            0            1            0
# 4     0        0            0            0            1

but this seems more complex than it needs to be ...

Base R

# One indicator column per Restaurant level; removing the intercept ("- 1")
# keeps a column for every level.
model.matrix(~ Restaurant - 1, data = dat)
#   RestaurantA RestaurantB RestaurantC
# 1           1           0           0
# 2           0           1           0
# 3           0           1           0
# 4           0           0           1
# attr(,"assign")
# [1] 1 1 1
# attr(,"contrasts")
# attr(,"contrasts")$Restaurant
# [1] "contr.treatment"

which leads to dplyr-ification as

# model.matrix() returns a matrix; converting it to a data frame and passing
# it to mutate() unnamed splices its columns into the result.
dat %>%
  mutate(
    # pick() replaces cur_data(), which is deprecated as of dplyr 1.1.0.
    as.data.frame(model.matrix(~ 0 + Restaurant, data = pick(Restaurant)))
  ) %>%
  select(-Restaurant)
# # A tibble: 4 x 5
#   isHot isCrispy RestaurantA RestaurantB RestaurantC
#   <dbl>    <dbl>       <dbl>       <dbl>       <dbl>
# 1     1        0           1           0           0
# 2     0        0           0           1           0
# 3     1        1           0           1           0
# 4     0        0           0           0           1

Edit for your "dynamic factors" request. Since you did not update your sample data, I'll add a column:

dat$QUUX <- c("x", "x", "y", "y")

# Flag the categorical (character or factor) columns. vapply() pins the
# result to exactly one logical per column.
isfac <- vapply(dat, inherits, logical(1), what = c("character", "factor"))
isfac
#      isHot   isCrispy Restaurant       QUUX 
#      FALSE      FALSE       TRUE       TRUE 

# Build one intercept-free indicator matrix per categorical column. Looping
# over the names (instead of paste()-ing them up front) gives an empty list,
# not a malformed formula, when no column is categorical.
fac_names <- names(isfac)[isfac]
dummies <- lapply(fac_names, function(nm) {
  model.matrix(formula(paste("~ 0 +", nm)), data = dat)
})
out <- do.call(cbind, c(list(dat), dummies))
# Drop the original categorical columns; drop = FALSE keeps a data frame even
# if only a single column remains.
out[, !names(out) %in% fac_names, drop = FALSE]
#   isHot isCrispy RestaurantA RestaurantB RestaurantC QUUXx QUUXy
# 1     1        0           1           0           0     1     0
# 2     0        0           0           1           0     1     0
# 3     1        1           0           1           0     0     1
# 4     0        0           0           0           1     0     1

Upvotes: 1

Quixotic22
Quixotic22

Reputation: 2924

Here's one way with dplyr (plus tidyr's pivot_wider)

# `rn` makes every row unique so that pivot_wider() does not collapse
# duplicated rows into list-columns; `k` is the dummy column name and `v`
# the value written into it.
dat %>%
  mutate(
    rn = row_number(),
    k = paste0("Restaurant_", Restaurant),
    v = 1
  ) %>%
  pivot_wider(names_from = k, values_from = v, values_fill = 0) %>%
  select(-Restaurant, -rn)

Upvotes: 0

Related Questions