Reputation: 361
I have a column of data in a dataframe that is formatted with commas as shown (first table). I am able to separate it into three columns (second table), however, would like to use this function so I can split the data into as many columns as there may be in the data rather than having to specify in this case 5 columns for the second table. How would I go about doing this with this function or similar functions? Thanks!
split_data <- separate(df,"M1", c("M1.1","M1.2","M1.3","M1.4","M1.5"), sep = ",", fill = "right")
id | M1 |
---|---|
1 | blue,green,yellow |
2 | orange,green |
3 | orange,purple,blue,pink,white |
id | M1.1 | M1.2 | M1.3 | M1.4 | M1.5 |
---|---|---|---|---|---|
1 | blue | green | yellow | NA | NA |
2 | orange | green | NA | NA | NA |
3 | orange | purple | blue | pink | white |
Upvotes: 0
Views: 1059
Reputation: 160437
I'm inferring tidyverse from your use of separate
(though I won't be using that function
library(dplyr)
library(tidyr) # unnest, pivot_wider
df %>%
mutate(M1 = strsplit(M1, "[,\\s]+")) %>%
unnest(M1) %>%
group_by(id) %>%
mutate(v = paste0("M1.", row_number())) %>%
pivot_wider(id, names_from = "v", values_from = "M1") %>%
ungroup()
# # A tibble: 3 x 6
# id M1.1 M1.2 M1.3 M1.4 M1.5
# <int> <chr> <chr> <chr> <chr> <chr>
# 1 1 blue green yellow <NA> <NA>
# 2 2 orange green <NA> <NA> <NA>
# 3 3 orange purple blue pink white
M1s <- strsplit(df$M1, "[,\\s]+")
M1s <- do.call(rbind, lapply(M1s, `length<-`, max(lengths(M1s))))
colnames(M1s) <- paste0("M1.", seq_len(ncol(M1s)))
cbind(subset(df, select = -M1), M1s)
# id M1.1 M1.2 M1.3 M1.4 M1.5
# 1 1 blue green yellow <NA> <NA>
# 2 2 orange green <NA> <NA> <NA>
# 3 3 orange purple blue pink white
library(data.table)
DT <- as.data.table(df)
cbind(DT[, .(id)], DT[, tstrsplit(M1, "[,\\s]+")])
# id V1 V2 V3 V4 V5
# <int> <char> <char> <char> <char> <char>
# 1: 1 blue green yellow <NA> <NA>
# 2: 2 orange green <NA> <NA> <NA>
# 3: 3 orange purple blue pink white
(and then use setnames
to rename them).
Data
df <- structure(list(id = 1:3, M1 = c("blue,green,yellow", "orange,green", "orange,purple,blue,pink,white")), class = "data.frame", row.names = c(NA, -3L))
Upvotes: 3
Reputation: 2670
data <- read.table(
textConnection("id M1
1 blue,green,yellow
2 orange,green
3 orange,purple,blue,pink,white"),header=T)
library(stringr)
library(dplyr)
library(tidyr)
max_size <- data %>% select(M1) %>% pull %>%sapply(function(x) str_count(x,pattern = ',')) %>%max + 1
data %>%separate('M1', paste("M1", 1:max_size, sep="."), sep=",", extra="drop")
output;
id M1.1 M1.2 M1.3 M1.4 M1.5
<int> <chr> <chr> <chr> <chr> <chr>
1 1 blue green yellow NA NA
2 2 orange green NA NA NA
3 3 orange purple blue pink white
Upvotes: 0