Reputation: 796

R separate string column to multiple columns and align by column name

my issue is quite simple. I want to split column by delimiter, but aligned with column name.

Example:

library(tidyr)
library(dplyr)
df <- data.frame(col = c("A", "B", "A,B", "C"))
df %>% separate(col, c("A", "B", "C"))

I get:

  A    B    C
1 A <NA> <NA>
2 B <NA> <NA>
3 A    B <NA>
4 C <NA> <NA>

and I expect:

  A    B    C
1 A   <NA> <NA>
2 <NA> B   <NA>
3 A    B   <NA>
4 <NA><NA>  C

A to A column, B to B column... etc. How to make it done?

Update with data

My actual data is the following:

structure(list(Shortcuts = structure(c(37L, 5L, 9L, 27L, 28L, 
    27L, 8L, 19L, 15L, 11L, 29L, 7L, 38L, 18L, 23L, 27L, 27L, 24L, 
    19L, 21L, 4L, 27L, 34L, 2L, 28L, 33L, 26L, 27L, 14L, 27L, 37L, 
    8L, 17L, 40L, 37L, 27L, 25L, 22L, 36L, 27L, 34L, 27L, 27L, 19L, 
    32L, 6L, 14L, 27L, 30L, 21L, 12L, 15L, 27L, 14L, 39L, 16L, 27L, 
    19L, 21L, 6L, 20L, 35L, 27L, 31L, 8L, 27L, 27L, 10L, 27L, 13L, 
    28L), .Label = c("ALL", "ALL, NS, N, SS", "ALL, NS, SS", "LG", 
    "LG, NG, RH, ONM", "LG, RH, NMC", "LG, RH, NS", "N", "N, ONM, LG", 
    "NG", "NMC", "NMC, NS", "NMC, RH", "NS", "NS, RH", "NS, SS", 
    "NS, WTW", "NS, WTW, SS", "O", "OBN, RH", "ONM", "ONM, LG", "ONM, LG, RH", 
    "ONM, LG, RH, N, Aut", "ONM, N", "ONM, NS", "ONN", "RH", "RH, LG", 
    "RH, LG, NG, ONM", "RH, NS", "RH, NS, NMC", "RH, ONM", "SS", 
    "SS, RH", "SS, SW", "WTW", "WTW, NS, N", "WTW, SS, RH, NS", "ZNM, RH, WTW, NW, NMC"
    ), class = "factor")), row.names = c(NA, -71L), class = c("tbl_df", 
    "tbl", "data.frame"))

Upvotes: 0

Answers (3)

veka

Reputation: 37

I do not know if this issue is solved by now but since no answer is marked as the final answer I want to show you my approach which should work for you.

# Pull all the different categories and in your case shortcuts
conditions <- separate_rows(df,Shortcuts,convert=T) %>%
  distinct(Shortcuts) %>%
  pull(Shortcuts)

clean <- function(data,cond){
  ls <- apply(data,1,function(x) sapply(cond,function(y) str_detect(x,paste0("\\b",y,"\\b")))) %>%
    t() %>%
    as.data.frame() %>%
    setNames(.,cond) %>%
    mutate_all(function(x) ifelse(x==T,1,0)) %>%
    cbind(data,.) %>%
    as_tibble() %>%
  return(ls)
}

final_df <- clean(df,conditions)

The outcome looks like this then (I used dummy variables to show which shortcuts can be found. You can easily change that to give the 1s the name of the columns):

   Shortcuts         WTW    LG    NG    RH   ONM     N   ONN     O    NS   NMC    SS   Aut   ALL   ZNM    NW    SW   OBN
   <fct>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 WTW                 1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 2 LG, NG, RH, ONM     0     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0
 3 N, ONM, LG          0     1     0     0     1     1     0     0     0     0     0     0     0     0     0     0     0
 4 ONN                 0     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
 5 RH                  0     0     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0
 6 ONN                 0     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
 7 N                   0     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0
 8 O                   0     0     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 9 NS, RH              0     0     0     1     0     0     0     0     1     0     0     0     0     0     0     0     0
10 NMC                 0     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0

Upvotes: 0

Ronak Shah

Reputation: 389145

You can use separate_rows + pivot_wider -

library(dplyr)
library(tidyr)

df %>%
  mutate(row = row_number()) %>%
  separate_rows(Shortcuts, sep = ',\\s+') %>%
  pivot_wider(names_from = Shortcuts, values_from = Shortcuts) %>%
  select(-row)

#   WTW   LG    NG    RH    ONM   N     ONN   O     NS    NMC   SS    Aut   ALL   ZNM  
#   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 WTW   NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
# 2 NA    LG    NG    RH    ONM   NA    NA    NA    NA    NA    NA    NA    NA    NA   
# 3 NA    LG    NA    NA    ONM   N     NA    NA    NA    NA    NA    NA    NA    NA   
# 4 NA    NA    NA    NA    NA    NA    ONN   NA    NA    NA    NA    NA    NA    NA   
# 5 NA    NA    NA    RH    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
# 6 NA    NA    NA    NA    NA    NA    ONN   NA    NA    NA    NA    NA    NA    NA   
# 7 NA    NA    NA    NA    NA    N     NA    NA    NA    NA    NA    NA    NA    NA   
# 8 NA    NA    NA    NA    NA    NA    NA    O     NA    NA    NA    NA    NA    NA   
# 9 NA    NA    NA    RH    NA    NA    NA    NA    NS    NA    NA    NA    NA    NA   
#10 NA    NA    NA    NA    NA    NA    NA    NA    NA    NMC   NA    NA    NA    NA   
# … with 61 more rows, and 3 more variables: NW <chr>, SW <chr>, OBN <chr>

Upvotes: 1

Matias Andina

Reputation: 4230

This produces the outcome you are looking for. I'm not sure that it would scale well for your application purpose. I modified your data to account for the "A,C" case.

df <- data.frame(col = c("A,C", "B", "A,B", "C"))

df %>%
  separate(col, c("A", "B", "C")) %>% 
  mutate(C=ifelse(is.na(C) & A=="C" | B=="C", "C", NA_character_),
         B=ifelse(A=="B" | B=="B", "B", NA_character_),
         A=ifelse(A!="A", NA_character_, A))

     A    B    C
1    A <NA> <NA>
2 <NA>    B <NA>
3    A    B <NA>
4 <NA> <NA>    C

Alternative

I coded a little function that will give you a rowwise boolean of what you want. You can apply that using dplyr::rowwise() to your data frame like so

df %>%
 rowwise() %>% 
 mutate(adjust_col(col)) %>% 
 mutate(A=ifelse(A, "A", NA_character_), 
        B=ifelse(B, "B", NA_character_), 
        C=ifelse(C, "C", NA_character_)) %>%
 select(-col)

There's probably a clever way of using dplyr::across() to select and codifying a function to mutate the TRUE value into the corresponding column name.

Here's the helper I coded, modify the dictionary to serve your purpose.

adjust_col <- function(x, dictionary = LETTERS[1:3]){
  x <- as.character(x)
  df <- data.frame(matrix(ncol=length(dictionary)))
  names(df) <- dictionary
  
  contents <- unlist(strsplit(x, split = ","))
  results <- sapply(contents, function(letter) letter %in% dictionary)
  return(bind_rows(df[-1, ], results))
}

Update

Let's create the boolean data.frame first.

df %>% rowwise() %>%
  mutate(adjust_col(col)) %>% 
  select(-col) -> tt 

tt

tt
# A tibble: 4 x 3
# Rowwise: 
  A     B     C    
  <lgl> <lgl> <lgl>
1 TRUE  NA    TRUE 
2 NA    TRUE  NA   
3 TRUE  TRUE  NA   
4 NA    NA    TRUE

Then programmatically replace the TRUE values for the column names using purrr::map2_df()

purrr::map2_df(tt, names(tt), ~  replace(.x, .x==1, .y))

  A     B     C    
  <chr> <chr> <chr>
1 A     NA    C    
2 NA    B     NA   
3 A     B     NA   
4 NA    NA    C

The gist of the replacement comes from this other answer on a related question.

Update with data :)

So, now that we have your data, we can get the dictionary properly. Check that your data has , (comma space) instead of just comma as a sep. That's why you get the error.

Here's how I have it working on my end (notice I adjusted the helper function to add the space!):

adjust_col <- function(x, dictionary = LETTERS[1:3]){
  x <- as.character(x)
  df <- data.frame(matrix(ncol=length(dictionary)))
  names(df) <- dictionary
  
  contents <- unlist(strsplit(x, split = ", "))
  results <- sapply(contents, function(letter) letter %in% dictionary)
  return(bind_rows(df[-1, ], results))
}

my_dict <- unlist(str_split(df$Shortcuts, ", ")) %>%
  unique()

df %>%
  rowwise() %>%
  mutate(adjust_col(Shortcuts, my_dict)) %>% 
  select(-Shortcuts) -> tt

purrr::map2_df(tt, names(tt), ~  replace(.x, .x==1, .y))

Which produces

# A tibble: 71 x 17
   WTW   LG    NG    RH    ONM   N     ONN   O     NS   
   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
 1 WTW   NA    NA    NA    NA    NA    NA    NA    NA   
 2 NA    LG    NG    RH    ONM   NA    NA    NA    NA   
 3 NA    LG    NA    NA    ONM   N     NA    NA    NA   
 4 NA    NA    NA    NA    NA    NA    ONN   NA    NA   
 5 NA    NA    NA    RH    NA    NA    NA    NA    NA   
 6 NA    NA    NA    NA    NA    NA    ONN   NA    NA   
 7 NA    NA    NA    NA    NA    N     NA    NA    NA   
 8 NA    NA    NA    NA    NA    NA    NA    O     NA   
 9 NA    NA    NA    RH    NA    NA    NA    NA    NS   
10 NA    NA    NA    NA    NA    NA    NA    NA    NA   
# … with 61 more rows, and 8 more variables: NMC <chr>,
#   SS <chr>, Aut <chr>, ALL <chr>, ZNM <chr>, NW <chr>,
#   SW <chr>, OBN <chr>