Orion
Orion

Reputation: 1104

Convert comma-separated keywords into columns in a data frame

I have the following data frame:

id    keywords
---   --------
1     Red
2     Red, Blue
3     Green, Blue

I want to reshape it into:

id    Red     Blue    Green
---   -----   -----   -----   
1     T       F       F
2     T       T       F
3     F       T       T

Any idea?

Upvotes: 3

Views: 188

Answers (5)

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193627

The cSplit_e function from my "splitstackshape" package does this (converting to 1s and 0s instead of TRUEs and FALSEs):

library(splitstackshape)
cSplit_e(d, "keywords", ",", type = "character", fill = 0)
#   id   keywords keywords_Blue keywords_Green keywords_Red
# 1  1        Red             0              0            1
# 2  2   Red,Blue             1              0            1
# 3  3 Green,Blue             1              1            0

There's also a drop argument if you want to drop the original column.

Upvotes: 0

Rich Scriven
Rich Scriven

Reputation: 99351

Here's another possible approach

## vector for matching
nm <- c("Red", "Blue", "Green")

dc <- do.call(rbind, lapply(strsplit(df$keywords, ", "), function(x) {
    length(x) <- nrow(df)
    setNames(nm %in% x, nm)
}))
cbind(df[1], dc)
#   id   Red  Blue Green
# 1  1  TRUE FALSE FALSE
# 2  2  TRUE  TRUE FALSE
# 3  3 FALSE  TRUE  TRUE

Update: Here's a shorter, more efficient method using a function from the stringi package

mat <- vapply(nm, stringi::stri_detect_fixed, logical(3L), str = df$keywords)
cbind(df[1], mat)
#   id   Red  Blue Green
# 1  1  TRUE FALSE FALSE
# 2  2  TRUE  TRUE FALSE
# 3  3 FALSE  TRUE  TRUE

Upvotes: 2

akrun
akrun

Reputation: 887431

You can try mtabulate from qdapTools after splitting the string to substrings

library(qdapTools)
data.frame(id=df1$id,!!mtabulate(strsplit(df1$keywords, ', ')))
#   id  Blue Green   Red
#1  1 FALSE FALSE  TRUE
#2  2  TRUE FALSE  TRUE
#3  3  TRUE  TRUE FALSE

Or using base R

data.frame(id=df1$id,!!t(sapply(strsplit(df1$keywords, ', '),
   function(x) table(factor(x, levels=c('Red', 'Blue', 'Green'))))))
#  id   Red  Blue Green
#1  1  TRUE FALSE FALSE
#2  2  TRUE  TRUE FALSE
#3  3 FALSE  TRUE  TRUE

Upvotes: 4

Petr Matousu
Petr Matousu

Reputation: 3140

# using R base 
#
# your data
d <- data.frame(
id=1:3,
keywords=c('Red','Red,Blue','Green,Blue'),
stringsAsFactors=F)
# 
cls <- unique(unlist(strsplit(d$keywords,split=',')))
dl <- sapply(cls,function(n)sapply(d$keywords,grepl,pattern=n))
# print result
d <- cbind(d,dl)
d
  id   keywords   Red  Blue Green
1  1        Red  TRUE FALSE FALSE
2  2   Red,Blue  TRUE  TRUE FALSE
3  3 Green,Blue FALSE  TRUE  TRUE
# d$Green is atomic

Upvotes: 3

LyzandeR
LyzandeR

Reputation: 37879

A solution with dplyr:

library(dplyr)
df %>%
  #just one mutate function really needed as below
  mutate(Red  = grepl('Red'  , df$keywords),
         Blue = grepl('Blue' , df$keywords),
         Green= grepl('Green', df$keywords)) %>%
  #select the columns you need 
  select(id, Red, Blue, Green)

Output:

  id   Red  Blue Green
1  1  TRUE FALSE FALSE
2  2  TRUE  TRUE FALSE
3  3 FALSE  TRUE  TRUE

Or an alternative if you have many variables:

keywords <- c('Red', 'Blue', 'Green')
#one lapply function
df2 <-do.call(data.frame, lapply(keywords, function(x) grepl(x, df$keywords)))
colnames(df2) <- keywords
df <- cbind(df[-2], df2)

Output:

> df
  id   Red  Blue Green
1  1  TRUE FALSE FALSE
2  2  TRUE  TRUE FALSE
3  3 FALSE  TRUE  TRUE

Upvotes: 1

Related Questions