Reputation: 1499
ROW ID SEX RACE
2 REC1000023 F 1.Black
7 REC1000032 M 6.White
8 REC1000066 M 4.Asian
9 REC1000078 M 6.White
10 REC1000099 M 5.Multiracial
I would like to create a binary variable "Black" and have it be 0 or 1 depending on the value in the "RACE" column. I would also like a "White" column and an "Other" column. Like so:
ROW ID SEX RACE Black White Other
2 REC1000023 F 1.Black 1 0 0
7 REC1000032 M 6.White 0 1 0
8 REC1000066 M 4.Asian 0 0 1
9 REC1000078 M 6.White 0 1 0
10 REC1000099 M 5.Multiracial 0 0 1
Upvotes: 2
Views: 682
Reputation: 26505
Using ifelse:
library(tidyverse)
# Example data
df <- data.frame(
stringsAsFactors = FALSE,
ROW = c(2L, 7L, 8L, 9L, 10L),
ID = c("REC1000023","REC1000032",
"REC1000066","REC1000078","REC1000099"),
SEX = c("F", "M", "M", "M", "M"),
RACE = c("1.Black","6.White","4.Asian",
"6.White","5.Multiracial")
)
# Create new columns
df2 <- df %>%
mutate(Black = ifelse(RACE == "1.Black", 1, 0),
White = ifelse(RACE == "6.White", 1, 0),
Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
df2
# ROW ID SEX RACE Black White Other
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
--
Not sure if speed is a factor in your application, but here's a benchmark using the example dataset:
ronak_func <- function(df){
df %>%
mutate(col = sub('\\d+\\.', '', RACE),
col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
pivot_wider(names_from = col, values_from = col,
values_fn = length, values_fill = 0)
}
jared_func <- function(df){
df %>%
mutate(Black = ifelse(RACE == "1.Black", 1, 0),
White = ifelse(RACE == "6.White", 1, 0),
Other = ifelse(RACE != "1.Black" & RACE != "6.White", 1, 0))
}
karthik_func <- function(df){
df %>% mutate(Black = +str_detect(RACE,'Black'),
White = +str_detect(RACE,'White'),
Other = +(!str_detect(RACE,'Black|White')))
}
jpdugo17_func <- function(df){
map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>%
mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
}
GKi1_func <- function(df) {
df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")
df$Other <- 1 - (df$Black | df$White)
df
}
GKi2_func <- function(df) {
df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)
df$Other <- 1 - (df$Black | df$White)
df
}
jared_func_dt <- function(df){
setDT(df)
df[, Black := +(df$RACE == "1.Black")][, White := +(df$RACE == "6.White")][, Other := 1 - (df$Black | df$White)]
}
res <- microbenchmark::microbenchmark(ronak_func(df),
jared_func(df),
karthik_func(df),
jpdugo17_func(df),
GKi1_func(df),
GKi2_func(df),
jared_func_dt(df))
autoplot(res)
And a benchmark using an example dataset with 10k rows:
df2 <- data.frame(stringsAsFactors = FALSE,
ROW = 1:10000,
ID = rep(c("REC1000023","REC1000032",
"REC1000066","REC1000078",
"REC1000099"), times = 2000),
SEX = sample(c("F", "M"),
replace = TRUE,
size = 10000),
RACE = sample(c("1.Black","6.White","4.Asian",
"6.White","5.Multiracial"),
replace = TRUE,
size = 10000))
res <- microbenchmark::microbenchmark(ronak_func(df2),
jared_func(df2),
karthik_func(df2),
jpdugo17_func(df2),
GKi1_func(df2),
GKi2_func(df2),
jared_func_dt(df2))
autoplot(res)
Upvotes: 1
Reputation: 39657
In case Black is always coded as 1.Black
and White is always coded as 6.White
you can compare by using ==
and turn the TRUE/FALSE vector in 1/0 using +
:
df$Black <- +(df$RACE == "1.Black")
df$White <- +(df$RACE == "6.White")
In case the other characters are changing then you can use grepl
:
df$Black <- +grepl("Black", df$RACE, fixed = TRUE)
df$White <- +grepl("White", df$RACE, fixed = TRUE)
To get the remaining column Other just use what is already in Black
and White
:
df$Other <- 1 - (df$Black | df$White)
Result:
df
# ROW ID SEX RACE Black White Other
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
Upvotes: 4
Reputation: 7106
library(tidyverse)
df <-
read_table(file = "ROW ID SEX RACE
2 REC1000023 F 1.Black
7 REC1000032 M 6.White
8 REC1000066 M 4.Asian
9 REC1000078 M 6.White
10 REC1000099 M 5.Multiracial ")
map_dfc(list('1.Black', '6.White'), ~ transmute(df, '{str_sub(.x, 3, -1)}' := if_else(RACE == .x, 1, 0))) %>%
mutate(other = if_else(Black + White == 1, 0, 1)) %>% cbind(df, .)
#> ROW ID SEX RACE Black White other
#> 1 2 REC1000023 F 1.Black 1 0 0
#> 2 7 REC1000032 M 6.White 0 1 0
#> 3 8 REC1000066 M 4.Asian 0 0 1
#> 4 9 REC1000078 M 6.White 0 1 0
#> 5 10 REC1000099 M 5.Multiracial 0 0 1
Created on 2021-06-16 by the reprex package (v2.0.0)
Upvotes: 1
Reputation: 11584
Does this work:
library(dplyr)
library(stringr)
df %>% mutate(Black = +str_detect(RACE,'Black'),
White = +str_detect(RACE,'White'),
Other = +(!str_detect(RACE,'Black|White')))
# A tibble: 5 x 7
ROW ID SEX RACE Black White Other
<dbl> <chr> <chr> <chr> <int> <int> <int>
1 2 REC1000023 F 1.Black 1 0 0
2 7 REC1000032 M 6.White 0 1 0
3 8 REC1000066 M 4.Asian 0 0 1
4 9 REC1000078 M 6.White 0 1 0
5 10 REC1000099 M 5.Multiracial 0 0 1
Upvotes: 3
Reputation: 388962
Create a new column where any value apart from c('Black', 'White')
is changed to 'Other'
and use pivot_wider
.
library(dplyr)
library(tidyr)
df %>%
mutate(col = sub('\\d+\\.', '', RACE),
col = replace(col, !col %in% c('Black', 'White'), 'Other')) %>%
pivot_wider(names_from = col, values_from = col,
values_fn = length, values_fill = 0)
# ROW ID SEX RACE Black White Other
# <int> <chr> <chr> <chr> <int> <int> <int>
#1 2 REC1000023 F 1.Black 1 0 0
#2 7 REC1000032 M 6.White 0 1 0
#3 8 REC1000066 M 4.Asian 0 0 1
#4 9 REC1000078 M 6.White 0 1 0
#5 10 REC1000099 M 5.Multiracial 0 0 1
Upvotes: 2