Evan
Evan

Reputation: 1499

How do I create a binary column based off characters in another column in R?

ROW   ID       SEX               RACE               
2  REC1000023   F                1.Black
7  REC1000032   M                6.White
8  REC1000066   M                4.Asian
9  REC1000078   M                6.White
10 REC1000099   M                5.Multiracial 

I would like to create a binary variable "Black" and have it be 0 or 1 depending on the value in the "RACE" column. I would also like a "White" column and an "Other" column. Like so:

ROW   ID       SEX               RACE           Black   White  Other        
2  REC1000023   F                1.Black         1      0      0
7  REC1000032   M                6.White         0      1      0
8  REC1000066   M                4.Asian         0      0      1
9  REC1000078   M                6.White         0      1      0
10 REC1000099   M                5.Multiracial   0      0      1

Upvotes: 2

Views: 682

Answers (5)

jared_mamrot
jared_mamrot

Reputation: 26505

Using ifelse:

library(tidyverse)
# Example data
df <- data.frame(
  stringsAsFactors = FALSE,
               ROW = c(2L, 7L, 8L, 9L, 10L),
                ID = c("REC1000023","REC1000032",
                       "REC1000066","REC1000078","REC1000099"),
               SEX = c("F", "M", "M", "M", "M"),
              RACE = c("1.Black","6.White","4.Asian",
                       "6.White","5.Multiracial")
)

# Create new columns
df2 <- df %>% 
  mutate(Black = ifelse(RACE == "1.Black", 1, 0),
         White = ifelse(RACE == "6.White", 1, 0),
         Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
df2
#  ROW         ID SEX          RACE Black White Other
#1   2 REC1000023   F       1.Black     1     0     0
#2   7 REC1000032   M       6.White     0     1     0
#3   8 REC1000066   M       4.Asian     0     0     1
#4   9 REC1000078   M       6.White     0     1     0
#5  10 REC1000099   M 5.Multiracial     0     0     1

--

Not sure if speed is a factor in your application, but here's a benchmark using the example dataset:

ronak_func <- function(df){
  df %>%
    mutate(col = sub('\\d+\\.', '', RACE), 
           col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
    pivot_wider(names_from = col, values_from = col, 
                values_fn = length, values_fill = 0)
}

jared_func <- function(df){
  df %>% 
    mutate(Black = ifelse(RACE == "1.Black", 1, 0),
           White = ifelse(RACE == "6.White", 1, 0),
           Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
}

karthik_func <- function(df){
  df %>% mutate(Black = +str_detect(RACE,'Black'),
                White = +str_detect(RACE,'White'),
                Other = +(!str_detect(RACE,'Black|White')))
}

jpdugo17_func <- function(df){
  map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>% 
    mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
}

GKi1_func <- function(df) {
  df$Black <- +(df$RACE == "1.Black")
  df$White <- +(df$RACE == "6.White")
  df$Other <- 1 - (df$Black | df$White)
  df
}

GKi2_func <- function(df) {
  df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
  df$White <- +grepl("White", df$RACE, fixed = TRUE)
  df$Other <- 1 - (df$Black | df$White)
  df
}

jared_func_dt <- function(df){
  setDT(df)
  df[, Black := +(df$RACE == "1.Black")][, White := +(df$RACE == "6.White")][, Other :=  1 - (df$Black | df$White)]
}

res <- microbenchmark::microbenchmark(ronak_func(df),
                                      jared_func(df),
                                      karthik_func(df),
                                      jpdugo17_func(df),
                                      GKi1_func(df),
                                      GKi2_func(df),
                                      jared_func_dt(df))
autoplot(res)

example_4.png

And a benchmark using an example dataset with 10k rows:

df2 <- data.frame(stringsAsFactors = FALSE,
                  ROW = 1:10000,
                  ID = rep(c("REC1000023","REC1000032",
                             "REC1000066","REC1000078",
                             "REC1000099"), times = 2000),
                  SEX = sample(c("F", "M"),
                               replace = TRUE,
                               size = 10000),
                  RACE = sample(c("1.Black","6.White","4.Asian",
                           "6.White","5.Multiracial"),
                           replace = TRUE,
                           size = 10000))
res <- microbenchmark::microbenchmark(ronak_func(df2),
                                      jared_func(df2),
                                      karthik_func(df2),
                                      jpdugo17_func(df2),
                                      GKi1_func(df2),
                                      GKi2_func(df2),
                                      jared_func_dt(df2))
autoplot(res)

example_3.png

Upvotes: 1

GKi
GKi

Reputation: 39657

In case Black is always coded as 1.Black and White is always coded as 6.White you can compare by using == and turn the TRUE/FALSE vector in 1/0 using +:

df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")

In case the other characters are changing then you can use grepl:

df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)

To get the remaining column Other just use what is already in Black and White:

df$Other <- 1 - (df$Black | df$White)

Result:

df
#  ROW         ID SEX          RACE Black White Other
#1   2 REC1000023   F       1.Black     1     0     0
#2   7 REC1000032   M       6.White     0     1     0
#3   8 REC1000066   M       4.Asian     0     0     1
#4   9 REC1000078   M       6.White     0     1     0
#5  10 REC1000099   M 5.Multiracial     0     0     1

Upvotes: 4

jpdugo17
jpdugo17

Reputation: 7106

library(tidyverse)
df <- 
read_table(file = "ROW   ID       SEX               RACE               
2  REC1000023   F                1.Black
7  REC1000032   M                6.White
8  REC1000066   M                4.Asian
9  REC1000078   M                6.White
10 REC1000099   M                5.Multiracial ")

map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>% 
    mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
#>        ROW   ID SEX          RACE Black White other
#> 1 2  REC1000023   F       1.Black     1     0     0
#> 2 7  REC1000032   M       6.White     0     1     0
#> 3 8  REC1000066   M       4.Asian     0     0     1
#> 4 9  REC1000078   M       6.White     0     1     0
#> 5 10 REC1000099   M 5.Multiracial     0     0     1

Created on 2021-06-16 by the reprex package (v2.0.0)

Upvotes: 1

Karthik S
Karthik S

Reputation: 11584

Does this work:

library(dplyr)
library(stringr)
df %>% mutate(Black = +str_detect(RACE,'Black'),
              White = +str_detect(RACE,'White'),
              Other = +(!str_detect(RACE,'Black|White')))
# A tibble: 5 x 7
    ROW ID         SEX   RACE          Black White Other
  <dbl> <chr>      <chr> <chr>         <int> <int> <int>
1     2 REC1000023 F     1.Black           1     0     0
2     7 REC1000032 M     6.White           0     1     0
3     8 REC1000066 M     4.Asian           0     0     1
4     9 REC1000078 M     6.White           0     1     0
5    10 REC1000099 M     5.Multiracial     0     0     1

Upvotes: 3

Ronak Shah
Ronak Shah

Reputation: 388962

Create a new column where any value apart from c('Black', 'White') is changed to 'Other' and use pivot_wider.

library(dplyr)
library(tidyr)

df %>%
  mutate(col = sub('\\d+\\.', '', RACE), 
         col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
  pivot_wider(names_from = col, values_from = col, 
              values_fn = length, values_fill = 0)

#    ROW ID         SEX   RACE          Black White Other
#  <int> <chr>      <chr> <chr>         <int> <int> <int>
#1     2 REC1000023 F     1.Black           1     0     0
#2     7 REC1000032 M     6.White           0     1     0
#3     8 REC1000066 M     4.Asian           0     0     1
#4     9 REC1000078 M     6.White           0     1     0
#5    10 REC1000099 M     5.Multiracial     0     0     1

Upvotes: 2

Related Questions