Reputation: 287
I have the below dataframe, where each observation has a string of key and values (k:v)s of different objects. Each object is separated by a \n
df = tribble(
~col, ~summary,
"a", "Oranges: 1\nApples: 1\nPeaches: 6\nBananas: 2",
"b", "Apples: 2\nBananas: 1",
"c", "Oranges: 4\nPeaches: 2\nBananas: 5",
"d", "Peaches: 2"
)
I have separated those values as below:
df_sep = df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n")
I would like to have the below output using dplyr possibly with across
and any
if applciable.
# A tibble: 4 x 10
col summary Apples_flag Bananas_flag oranges_flag Peaches_flag_value Apples_value Bananas_value oranges_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 1 1 0 0 2 1 0 0
2 a Oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 1 1 1 1 2 1 6
3 c Oranges: 4, Peaches: 2, Bananas: 5 0 1 1 1 0 5 4 2
4 d Peaches: 2 0 0 0 1 0 0 0 2
Upvotes: 0
Views: 271
Reputation: 79288
Another way would be:
df %>%
left_join(df%>%
separate_rows(summary, sep='\n') %>%
separate(summary, c('name', 'value'), convert = TRUE) %>%
pivot_wider(col,names_glue = '{name}_value', values_fill = 0) %>%
mutate(across(-col, sign, .names = '{.col}_flag')))
Upvotes: 3
Reputation: 287
The below is another possible solution within the tidyverse
utilizing c_cross
, any
, str_detect
and str_match
.
It does the flagging by applying str_detect
to any
of the specified columns from the match
argument and adding 1 as a flag using +
to the appropriate column.
It extracts the value by applying a regex using str_match
and adds it to the appropriate column.
df %>%
mutate(summary_clone = summary) %>%
separate(summary_clone,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
rowwise() %>%
# Flagging
mutate(Apples_flag = +any(str_detect(c_across(matches("col[1-4]")), "Apples")),
Oranges_flag = +any(str_detect(c_across(matches("col[1-4]")), "oranges")),
Bananas_flag = +any(str_detect(c_across(matches("col[1-4]")), "Bananas")),
Peaches_flag = +any(str_detect(c_across(matches("col[1-4]")), "Peaches"))
) %>%
mutate(summary = str_replace_all(summary,"\\n"," ")) %>%
# Parse matched values
mutate(Apples_value = str_match(summary,"Apples: (\\d+)")[2],
Oranges_value = str_match(summary,"Oranges: (\\d+)")[2],
Bananas_value = str_match(summary,"Bananas: (\\d+)")[2],
Peaches_value = str_match(summary,"Peaches: (\\d+)")[2]
) %>%
select(-matches("col[1-4]")) %>%
mutate(across(3:10, .fns = ~replace_na(as.numeric(.),0)))
# A tibble: 4 x 10
# Rowwise:
col summary Apples_flag Oranges_flag Bananas_flag Peaches_flag Apples_value Oranges_value Bananas_value Peaches_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a Oranges: 1 Apples: 1 Peaches: 6 Bananas: 2 1 1 1 1 1 0 2 6
2 b Apples: 2 Bananas: 1 1 0 1 0 2 0 1 0
3 c Oranges: 4 Peaches: 2 Bananas: 5 0 1 1 1 0 0 5 2
4 d Peaches: 2 0 0 0 0 0 0 0 0
Upvotes: 0
Reputation: 11981
Here is one possibility within the tidyverse
which relies on pivoting the data:
df %>%
separate(summary,c("col1","col2","col3","col4"),sep = "\n", fill = "right") %>%
gather(dummy, value, -col) %>%
filter(!is.na(value)) %>%
group_by(col) %>%
transmute(
col,
summary = paste(value, collapse = ", "),
fruit = gsub(": \\d+", "", value),
n = parse_number(value)
) %>%
arrange(fruit) %>%
pivot_wider(names_from = fruit, values_from = n, values_fill = 0) %>%
mutate(across(-summary, list(flag = sign)))
# A tibble: 4 x 10
# Groups: col [4]
col summary Apples Bananas oranges Peaches Apples_flag Bananas_flag oranges_flag Peaches_flag
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 b Apples: 2, Bananas: 1 2 1 0 0 1 1 0 0
2 a oranges: 1, Apples: 1, Peaches: 6, Bananas: 2 1 2 1 6 1 1 1 1
3 c oranges: 4, Peaches: 2, Bananas: 5 0 5 4 2 0 1 1 1
4 d Peaches: 2 0 0 0 2 0 0 0 1
Upvotes: 1