Create columns that extract and flag a detected string pattern from multiple selected columns

I have the below dataframe, where each observation has a string of key and values (k:v)s of different objects. Each object is separated by a \n

df = tribble(
  ~col, ~summary,
  "a",   "Oranges: 1\nApples: 1\nPeaches: 6\nBananas: 2",
  "b",   "Apples: 2\nBananas: 1",
  "c",   "Oranges: 4\nPeaches: 2\nBananas: 5",
  "d",   "Peaches: 2"
)

I have separated those values as below:

df_sep = df %>% 
  separate(summary,c("col1","col2","col3","col4"),sep = "\n")

I would like to have the below output using dplyr possibly with across and any if applciable.

# A tibble: 4 x 10
  col   summary                                       Apples_flag Bananas_flag oranges_flag Peaches_flag_value Apples_value Bananas_value oranges_value Peaches_value
  <chr> <chr>                                               <dbl>        <dbl>        <dbl>              <dbl>        <dbl>         <dbl>         <dbl>         <dbl>
1 b     Apples: 2, Bananas: 1                                   1            1            0                  0            2             1             0             0
2 a     Oranges: 1, Apples: 1, Peaches: 6, Bananas: 2           1            1            1                  1            1             2             1             6
3 c     Oranges: 4, Peaches: 2, Bananas: 5                      0            1            1                  1            0             5             4             2
4 d     Peaches: 2                                              0            0            0                  1            0             0             0             2

Upvotes: 0

Views: 271

Answers (3)

Onyambu
Onyambu

Reputation: 79288

Another way would be:

df %>%
  left_join(df%>%
  separate_rows(summary, sep='\n') %>%
  separate(summary, c('name', 'value'), convert = TRUE) %>%
  pivot_wider(col,names_glue = '{name}_value', values_fill = 0) %>%
  mutate(across(-col, sign, .names = '{.col}_flag')))

Upvotes: 3

The below is another possible solution within the tidyverse utilizing c_cross, any, str_detect and str_match.

It does the flagging by applying str_detect to any of the specified columns from the match argument and adding 1 as a flag using + to the appropriate column.

It extracts the value by applying a regex using str_match and adds it to the appropriate column.

df %>% 
  mutate(summary_clone = summary) %>% 
  separate(summary_clone,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>% 
  rowwise() %>%
# Flagging
  mutate(Apples_flag = +any(str_detect(c_across(matches("col[1-4]")), "Apples")),
         Oranges_flag = +any(str_detect(c_across(matches("col[1-4]")), "oranges")),
         Bananas_flag = +any(str_detect(c_across(matches("col[1-4]")), "Bananas")),
         Peaches_flag = +any(str_detect(c_across(matches("col[1-4]")), "Peaches"))
         ) %>% 
  mutate(summary = str_replace_all(summary,"\\n"," ")) %>% 
# Parse matched values
  mutate(Apples_value =  str_match(summary,"Apples: (\\d+)")[2],
         Oranges_value =  str_match(summary,"Oranges: (\\d+)")[2],
         Bananas_value =  str_match(summary,"Bananas: (\\d+)")[2],
         Peaches_value =  str_match(summary,"Peaches: (\\d+)")[2]
         )  %>% 
  select(-matches("col[1-4]")) %>% 
  mutate(across(3:10, .fns = ~replace_na(as.numeric(.),0))) 

# A tibble: 4 x 10
# Rowwise: 
 col   summary                                    Apples_flag Oranges_flag Bananas_flag Peaches_flag Apples_value Oranges_value Bananas_value Peaches_value
 <chr> <chr>                                            <dbl>        <dbl>        <dbl>        <dbl>        <dbl>         <dbl>         <dbl>         <dbl>
1 a     Oranges: 1 Apples: 1 Peaches: 6 Bananas: 2           1            1            1            1            1             0             2             6
2 b     Apples: 2 Bananas: 1                                 1            0            1            0            2             0             1             0
3 c     Oranges: 4 Peaches: 2 Bananas: 5                     0            1            1            1            0             0             5             2
4 d     Peaches: 2                                           0            0            0            0            0             0             0             0 

Upvotes: 0

Cettt
Cettt

Reputation: 11981

Here is one possibility within the tidyverse which relies on pivoting the data:

df %>% 
  separate(summary,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
  gather(dummy, value, -col) %>%
  filter(!is.na(value)) %>% 
  group_by(col) %>%
  transmute(
    col,
    summary = paste(value, collapse = ", "),
    fruit = gsub(": \\d+", "", value),
    n = parse_number(value)
  ) %>%
  arrange(fruit) %>% 
  pivot_wider(names_from = fruit, values_from = n, values_fill =  0) %>%
  mutate(across(-summary, list(flag = sign)))


# A tibble: 4 x 10
# Groups:   col [4]
  col   summary                                       Apples Bananas oranges Peaches Apples_flag Bananas_flag oranges_flag Peaches_flag
  <chr> <chr>                                          <dbl>   <dbl>   <dbl>   <dbl>       <dbl>        <dbl>        <dbl>        <dbl>
1 b     Apples: 2, Bananas: 1                              2       1       0       0           1            1            0            0
2 a     oranges: 1, Apples: 1, Peaches: 6, Bananas: 2      1       2       1       6           1            1            1            1
3 c     oranges: 4, Peaches: 2, Bananas: 5                 0       5       4       2           0            1            1            1
4 d     Peaches: 2                                         0       0       0       2           0            0            0            1

Upvotes: 1

Related Questions