Reputation: 796
my issue is quite simple. I want to split column by delimiter, but aligned with column name.
Example:
library(tidyr)
library(dplyr)
df <- data.frame(col = c("A", "B", "A,B", "C"))
df %>% separate(col, c("A", "B", "C"))
I get:
A B C
1 A <NA> <NA>
2 B <NA> <NA>
3 A B <NA>
4 C <NA> <NA>
and I expect:
A B C
1 A <NA> <NA>
2 <NA> B <NA>
3 A B <NA>
4 <NA><NA> C
A to A column, B to B column... etc. How to make it done?
My actual data is the following:
structure(list(Shortcuts = structure(c(37L, 5L, 9L, 27L, 28L,
27L, 8L, 19L, 15L, 11L, 29L, 7L, 38L, 18L, 23L, 27L, 27L, 24L,
19L, 21L, 4L, 27L, 34L, 2L, 28L, 33L, 26L, 27L, 14L, 27L, 37L,
8L, 17L, 40L, 37L, 27L, 25L, 22L, 36L, 27L, 34L, 27L, 27L, 19L,
32L, 6L, 14L, 27L, 30L, 21L, 12L, 15L, 27L, 14L, 39L, 16L, 27L,
19L, 21L, 6L, 20L, 35L, 27L, 31L, 8L, 27L, 27L, 10L, 27L, 13L,
28L), .Label = c("ALL", "ALL, NS, N, SS", "ALL, NS, SS", "LG",
"LG, NG, RH, ONM", "LG, RH, NMC", "LG, RH, NS", "N", "N, ONM, LG",
"NG", "NMC", "NMC, NS", "NMC, RH", "NS", "NS, RH", "NS, SS",
"NS, WTW", "NS, WTW, SS", "O", "OBN, RH", "ONM", "ONM, LG", "ONM, LG, RH",
"ONM, LG, RH, N, Aut", "ONM, N", "ONM, NS", "ONN", "RH", "RH, LG",
"RH, LG, NG, ONM", "RH, NS", "RH, NS, NMC", "RH, ONM", "SS",
"SS, RH", "SS, SW", "WTW", "WTW, NS, N", "WTW, SS, RH, NS", "ZNM, RH, WTW, NW, NMC"
), class = "factor")), row.names = c(NA, -71L), class = c("tbl_df",
"tbl", "data.frame"))
Upvotes: 0
Views: 403
Reputation: 37
I do not know if this issue is solved by now but since no answer is marked as the final answer I want to show you my approach which should work for you.
# Pull all the different categories and in your case shortcuts
conditions <- separate_rows(df,Shortcuts,convert=T) %>%
distinct(Shortcuts) %>%
pull(Shortcuts)
clean <- function(data,cond){
ls <- apply(data,1,function(x) sapply(cond,function(y) str_detect(x,paste0("\\b",y,"\\b")))) %>%
t() %>%
as.data.frame() %>%
setNames(.,cond) %>%
mutate_all(function(x) ifelse(x==T,1,0)) %>%
cbind(data,.) %>%
as_tibble() %>%
return(ls)
}
final_df <- clean(df,conditions)
The outcome looks like this then (I used dummy variables to show which shortcuts can be found. You can easily change that to give the 1s the name of the columns):
Shortcuts WTW LG NG RH ONM N ONN O NS NMC SS Aut ALL ZNM NW SW OBN
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 WTW 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 LG, NG, RH, ONM 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
3 N, ONM, LG 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
4 ONN 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
5 RH 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
6 ONN 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
7 N 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
8 O 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
9 NS, RH 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
10 NMC 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
Upvotes: 0
Reputation: 389145
You can use separate_rows
+ pivot_wider
-
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
separate_rows(Shortcuts, sep = ',\\s+') %>%
pivot_wider(names_from = Shortcuts, values_from = Shortcuts) %>%
select(-row)
# WTW LG NG RH ONM N ONN O NS NMC SS Aut ALL ZNM
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 WTW NA NA NA NA NA NA NA NA NA NA NA NA NA
# 2 NA LG NG RH ONM NA NA NA NA NA NA NA NA NA
# 3 NA LG NA NA ONM N NA NA NA NA NA NA NA NA
# 4 NA NA NA NA NA NA ONN NA NA NA NA NA NA NA
# 5 NA NA NA RH NA NA NA NA NA NA NA NA NA NA
# 6 NA NA NA NA NA NA ONN NA NA NA NA NA NA NA
# 7 NA NA NA NA NA N NA NA NA NA NA NA NA NA
# 8 NA NA NA NA NA NA NA O NA NA NA NA NA NA
# 9 NA NA NA RH NA NA NA NA NS NA NA NA NA NA
#10 NA NA NA NA NA NA NA NA NA NMC NA NA NA NA
# … with 61 more rows, and 3 more variables: NW <chr>, SW <chr>, OBN <chr>
Upvotes: 1
Reputation: 4230
This produces the outcome you are looking for. I'm not sure that it would scale well for your application purpose. I modified your data to account for the "A,C" case.
df <- data.frame(col = c("A,C", "B", "A,B", "C"))
df %>%
separate(col, c("A", "B", "C")) %>%
mutate(C=ifelse(is.na(C) & A=="C" | B=="C", "C", NA_character_),
B=ifelse(A=="B" | B=="B", "B", NA_character_),
A=ifelse(A!="A", NA_character_, A))
A B C
1 A <NA> <NA>
2 <NA> B <NA>
3 A B <NA>
4 <NA> <NA> C
I coded a little function that will give you a rowwise boolean of what you want. You can apply that using dplyr::rowwise()
to your data frame like so
df %>%
rowwise() %>%
mutate(adjust_col(col)) %>%
mutate(A=ifelse(A, "A", NA_character_),
B=ifelse(B, "B", NA_character_),
C=ifelse(C, "C", NA_character_)) %>%
select(-col)
There's probably a clever way of using dplyr::across()
to select and codifying a function to mutate the TRUE value into the corresponding column name.
Here's the helper I coded, modify the dictionary to serve your purpose.
adjust_col <- function(x, dictionary = LETTERS[1:3]){
x <- as.character(x)
df <- data.frame(matrix(ncol=length(dictionary)))
names(df) <- dictionary
contents <- unlist(strsplit(x, split = ","))
results <- sapply(contents, function(letter) letter %in% dictionary)
return(bind_rows(df[-1, ], results))
}
Let's create the boolean data.frame
first.
df %>% rowwise() %>%
mutate(adjust_col(col)) %>%
select(-col) -> tt
tt
tt
# A tibble: 4 x 3
# Rowwise:
A B C
<lgl> <lgl> <lgl>
1 TRUE NA TRUE
2 NA TRUE NA
3 TRUE TRUE NA
4 NA NA TRUE
Then programmatically replace the TRUE values for the column names using purrr::map2_df()
purrr::map2_df(tt, names(tt), ~ replace(.x, .x==1, .y))
A B C
<chr> <chr> <chr>
1 A NA C
2 NA B NA
3 A B NA
4 NA NA C
The gist of the replacement comes from this other answer on a related question.
So, now that we have your data, we can get the dictionary properly. Check that your data has ,
(comma space) instead of just comma as a sep. That's why you get the error.
Here's how I have it working on my end (notice I adjusted the helper function to add the space!):
adjust_col <- function(x, dictionary = LETTERS[1:3]){
x <- as.character(x)
df <- data.frame(matrix(ncol=length(dictionary)))
names(df) <- dictionary
contents <- unlist(strsplit(x, split = ", "))
results <- sapply(contents, function(letter) letter %in% dictionary)
return(bind_rows(df[-1, ], results))
}
my_dict <- unlist(str_split(df$Shortcuts, ", ")) %>%
unique()
df %>%
rowwise() %>%
mutate(adjust_col(Shortcuts, my_dict)) %>%
select(-Shortcuts) -> tt
purrr::map2_df(tt, names(tt), ~ replace(.x, .x==1, .y))
Which produces
# A tibble: 71 x 17
WTW LG NG RH ONM N ONN O NS
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 WTW NA NA NA NA NA NA NA NA
2 NA LG NG RH ONM NA NA NA NA
3 NA LG NA NA ONM N NA NA NA
4 NA NA NA NA NA NA ONN NA NA
5 NA NA NA RH NA NA NA NA NA
6 NA NA NA NA NA NA ONN NA NA
7 NA NA NA NA NA N NA NA NA
8 NA NA NA NA NA NA NA O NA
9 NA NA NA RH NA NA NA NA NS
10 NA NA NA NA NA NA NA NA NA
# … with 61 more rows, and 8 more variables: NMC <chr>,
# SS <chr>, Aut <chr>, ALL <chr>, ZNM <chr>, NW <chr>,
# SW <chr>, OBN <chr>
Upvotes: 1