piper180
piper180

Reputation: 361

How to I use the separate function in R to split a column into as many columns as needed rather than specifying?

I have a column of data in a dataframe that is formatted with commas as shown (first table). I am able to separate it into three columns (second table), however, would like to use this function so I can split the data into as many columns as there may be in the data rather than having to specify in this case 5 columns for the second table. How would I go about doing this with this function or similar functions? Thanks!

split_data <- separate(df,"M1", c("M1.1","M1.2","M1.3","M1.4","M1.5"), sep = ",", fill = "right")
id M1
1 blue,green,yellow
2 orange,green
3 orange,purple,blue,pink,white
id M1.1 M1.2 M1.3 M1.4 M1.5
1 blue green yellow NA NA
2 orange green NA NA NA
3 orange purple blue pink white

Upvotes: 0

Views: 1059

Answers (2)

r2evans
r2evans

Reputation: 160437

dplyr

I'm inferring tidyverse from your use of separate (though I won't be using that function

library(dplyr)
library(tidyr) # unnest, pivot_wider
df %>%
  mutate(M1 = strsplit(M1, "[,\\s]+")) %>%
  unnest(M1) %>%
  group_by(id) %>%
  mutate(v = paste0("M1.", row_number())) %>%
  pivot_wider(id, names_from = "v", values_from = "M1") %>%
  ungroup()
# # A tibble: 3 x 6
#      id M1.1   M1.2   M1.3   M1.4  M1.5 
#   <int> <chr>  <chr>  <chr>  <chr> <chr>
# 1     1 blue   green  yellow <NA>  <NA> 
# 2     2 orange green  <NA>   <NA>  <NA> 
# 3     3 orange purple blue   pink  white

base R

M1s <- strsplit(df$M1, "[,\\s]+")
M1s <- do.call(rbind, lapply(M1s, `length<-`, max(lengths(M1s))))
colnames(M1s) <- paste0("M1.", seq_len(ncol(M1s)))
cbind(subset(df, select = -M1), M1s)
#   id   M1.1   M1.2   M1.3 M1.4  M1.5
# 1  1   blue  green yellow <NA>  <NA>
# 2  2 orange  green   <NA> <NA>  <NA>
# 3  3 orange purple   blue pink white

data.table

library(data.table)
DT <- as.data.table(df)
cbind(DT[, .(id)], DT[, tstrsplit(M1, "[,\\s]+")])
#       id     V1     V2     V3     V4     V5
#    <int> <char> <char> <char> <char> <char>
# 1:     1   blue  green yellow   <NA>   <NA>
# 2:     2 orange  green   <NA>   <NA>   <NA>
# 3:     3 orange purple   blue   pink  white

(and then use setnames to rename them).


Data

df <- structure(list(id = 1:3, M1 = c("blue,green,yellow", "orange,green", "orange,purple,blue,pink,white")), class = "data.frame", row.names = c(NA, -3L))

Upvotes: 3

Samet S&#246;kel
Samet S&#246;kel

Reputation: 2670

data <- read.table(
textConnection("id  M1
1   blue,green,yellow
2   orange,green
3   orange,purple,blue,pink,white"),header=T)

library(stringr)
library(dplyr)
library(tidyr)


max_size <- data %>% select(M1) %>% pull %>%sapply(function(x) str_count(x,pattern = ',')) %>%max + 1
                                                   
data %>%separate('M1', paste("M1", 1:max_size, sep="."), sep=",", extra="drop")

output;

     id M1.1   M1.2   M1.3   M1.4  M1.5 
  <int> <chr>  <chr>  <chr>  <chr> <chr>
1     1 blue   green  yellow NA    NA   
2     2 orange green  NA     NA    NA   
3     3 orange purple blue   pink  white

Upvotes: 0

Related Questions