Reputation: 37
How do I create a data frame of dummy (Yes/No indicator) columns, as shown in the output below?
INPUT:
ID Colours Shapes
1 Red, Blue Triangle
2 Yellow Square
3 Green, Black Circle, Oval
OUTPUT:
ID Red Blue Yellow Green Black Triangle Square Circle Oval
1 YES YES NO NO NO YES NO NO NO
2 NO NO YES NO NO NO YES NO NO
3 NO NO NO YES YES NO NO YES YES
Upvotes: 1
Views: 225
Reputation: 968
# Generate example data (the outer parentheses also print the data frame)
(df1 <- structure(list(ID = 1:3, Colours = c("Red, Blue", "Yellow", "Green, Black"), Shapes = c("Triangle", "Square", "Circle, Oval")), class = "data.frame", row.names = c(NA, -3L)))

# Solve the problem

# Split every cell into its individual items once, so membership can be tested
# exactly. Pattern matching with grepl() would also flag partial matches
# (e.g. "Red" inside "Dark Red") and would treat the item as a regex.
colour_items <- strsplit(df1$Colours, ", ", fixed = TRUE)
shape_items <- strsplit(df1$Shapes, ", ", fixed = TRUE)
Unique_Colours <- unique(unlist(colour_items))
Unique_Shapes <- unique(unlist(shape_items))

# Build a "Yes"/"No" character matrix with one row per element of `items`
# (a list of character vectors) and one column per entry of `levels`.
# The matrix is assembled explicitly so the shape stays rows x levels even
# when there is a single row or a single level.
make_dummies <- function(items, levels) {
  flags <- unlist(lapply(items, function(row_items) levels %in% row_items))
  matrix(
    ifelse(flags, "Yes", "No"),
    nrow = length(items),
    ncol = length(levels),
    byrow = TRUE,
    dimnames = list(NULL, levels)
  )
}

df2 <- make_dummies(colour_items, Unique_Colours)
df3 <- make_dummies(shape_items, Unique_Shapes)
df4 <- cbind(df2, df3)

# data.frame() keeps ID in its original (integer) type; cbind()-ing it onto a
# character matrix would silently turn it into text.
final_df <- data.frame(
  ID = df1$ID,
  df4,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
final_df
#   ID Red Blue Yellow Green Black Triangle Square Circle Oval
# 1  1 Yes  Yes     No    No    No      Yes     No     No   No
# 2  2  No   No    Yes    No    No       No    Yes     No   No
# 3  3  No   No     No   Yes   Yes       No     No    Yes  Yes
Upvotes: 0
Reputation: 388982
Using dplyr and tidyr, you can do:
library(dplyr)
library(tidyr)

# Turn the comma-separated Colours and Shapes of each ID into one Yes/No
# indicator column per distinct item.
df %>%
  # Create a dummy column; it supplies the cell value for items that are present
  mutate(col = "Yes") %>%
  # Get data in long format first so colour and shape strings share one column
  pivot_longer(cols = c(Colours, Shapes)) %>%
  # Split that single column on commas and create new rows. Splitting Colours
  # and Shapes together in one separate_rows() call expands them in lock-step,
  # which fails when a row holds, say, 2 colours and 3 shapes.
  separate_rows(value, sep = ",\\s*") %>%
  # Remove the original column names
  select(-name) %>%
  # Keep only unique values
  distinct() %>%
  # Get data in wide format; items absent for an ID are filled with "No"
  pivot_wider(names_from = value, values_from = col, values_fill = "No")
#     ID Red   Blue  Triangle Yellow Square Green Black Circle Oval
#  <int> <chr> <chr> <chr>    <chr>  <chr>  <chr> <chr> <chr>  <chr>
#1     1 Yes   Yes   Yes      No     No     No    No    No     No
#2     2 No    No    No       Yes    Yes    No    No    No     No
#3     3 No    No    No       No     No     Yes   Yes   Yes    Yes
Data (the `df` used in the pipeline above):
# Example input: one row per ID, with comma-separated colours and shapes.
df <- data.frame(
  ID = 1:3,
  Colours = c("Red,Blue", "Yellow", "Green,Black"),
  Shapes = c("Triangle", "Square", "Circle,Oval"),
  stringsAsFactors = FALSE
)
Upvotes: 2