Reputation: 4201
I need to build a table that mutates additional columns based on many rowwise conditions. I'm trying to write the very minimal code needed and therefore settled on a certain way. I store all the conditions inside a table up front, and then draw everything from the table. However, I don't know how to iteratively circle over the conditions in the table when approaching the mutation stage.
Let's say that I want to classify geometric shapes based on their properties. Instead of first writing out code for creating the properties table and then writing very similar code for the conditions, I can specify it once.
library(tibble)
my_table <-
tribble(~shape_id, ~string_of_conditions, ~shape_classification,
"a", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\""), "rectangle",
"b", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\""), "square",
"c", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\""), "rhombus",
"d", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\""), "trapezoid",
"e", c("is_curve == \"TRUE\" & equidistant_from_center == \"TRUE\""), "circle",
"f", c("is_curve == \"TRUE\" & equidistant_from_center == \"FALSE\""), "ellipse")
my_table
#> # A tibble: 6 x 3
#> shape_id string_of_conditions shape_classificat~
#> <chr> <chr> <chr>
#> 1 a "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ rectangle
#> 2 b "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ square
#> 3 c "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ rhombus
#> 4 d "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ trapezoid
#> 5 e "is_curve == \"TRUE\" & equidistant_from_center =~ circle
#> 6 f "is_curve == \"TRUE\" & equidistant_from_center =~ ellipse
Created on 2021-02-19 by the reprex package (v0.3.0)
Then I can build the dataset based on my_table
:
library(rlang)
library(dplyr)
library(purrr)
convert_to_named_vector <- function(x) {
gsub("==", "=", x) %>%
gsub("&", ",", .) %>%
paste0("c(", ., ")") %>%
parse_expr(.) %>%
eval(.)
}
new_data <-
my_table %>%
mutate(as_named_vec = map(.x = string_of_conditions, .f = convert_to_named_vector)) %>%
pull(as_named_vec) %>%
bind_rows()
> new_data
## # A tibble: 6 x 7
## n_of_vertices n_of_angles all_angles_right n_of_sides all_sides_equal is_curve equidistant_from_center
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4 4 TRUE 4 FALSE FALSE NA
## 2 4 4 TRUE 4 TRUE FALSE NA
## 3 4 4 FALSE 4 TRUE FALSE NA
## 4 4 4 FALSE 4 FALSE FALSE NA
## 5 NA NA NA NA NA TRUE TRUE
## 6 NA NA NA NA NA TRUE FALSE
So far so good.
Here is my question: for each row in new_data
, I want to mutate a new column that will hold the classification label for that row. I want to use case_when
where the LHS will be based on the condition stored in my_table["string_of_conditions"]
, and the RHS on the value in my_table["shape_classification"]
.
How can I iteratively run over my_table
&new_data
to do this using case_when
?
Otherwise, a tedious, undesired 1-by-1 would be:
cond_shape_a <-
my_table %>%
slice(1) %>%
pull(string_of_conditions) %>%
parse_expr()
classification_shape_a <-
my_table %>%
slice(1) %>%
pull(shape_classification)
new_data %>%
mutate(figured_out_shape_class = case_when(!!cond_shape_a ~ classification_shape_a))
## # A tibble: 6 x 8
## n_of_vertices n_of_angles all_angles_right n_of_sides all_sides_equal is_curve equidistant_from_center figured_out_shape_class
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4 4 TRUE 4 FALSE FALSE NA rectangle
## 2 4 4 TRUE 4 TRUE FALSE NA NA
## 3 4 4 FALSE 4 TRUE FALSE NA NA
## 4 4 4 FALSE 4 FALSE FALSE NA NA
## 5 NA NA NA NA NA TRUE TRUE NA
## 6 NA NA NA NA NA TRUE FALSE NA
Is there a way to achieve this easily enough using purrr
maybe?
I'm aware this question might seem silly because I could have simply written out the desired output from the beginning using tribble
. But in my real data this gets cumbersome very quickly, and I need a programmatic solution.
EDIT
@Jon Spring added in the comments that I could avoid case_when()
altogether if I had included shape_classification
within string_of_conditions
column instead of a dedicated column.
library(tibble)
library(purrr)
library(rlang)
library(dplyr)
my_table_2 <-
tribble(~shape_id, ~string_of_conditions,
"a", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\" , shape_classification == \"rectangle\""),
"b", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\", shape_classification == \"square\""),
"c", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\", shape_classification == \"rhombus\""),
"d", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\" , shape_classification == \"trapezoid\""),
"e", c("is_curve == \"TRUE\" & equidistant_from_center == \"TRUE\", shape_classification == \"circle\""),
"f", c("is_curve == \"TRUE\" & equidistant_from_center == \"FALSE\", shape_classification == \"ellipse\""))
my_table_2
#> # A tibble: 6 x 2
#> shape_id string_of_conditions
#> <chr> <chr>
#> 1 a "n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right ==~
#> 2 b "n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right ==~
#> 3 c "n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right ==~
#> 4 d "n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right ==~
#> 5 e "is_curve == \"TRUE\" & equidistant_from_center == \"TRUE\", shape_c~
#> 6 f "is_curve == \"TRUE\" & equidistant_from_center == \"FALSE\", shape_~
convert_to_named_vector <- function(x) {
gsub("==", "=", x) %>%
gsub("&", ",", .) %>%
paste0("c(", ., ")") %>%
parse_expr(.) %>%
eval(.)
}
new_data_2 <-
my_table_2 %>%
mutate(as_named_vec = map(.x = string_of_conditions, .f = convert_to_named_vector)) %>%
pull(as_named_vec) %>%
bind_rows()
new_data_2
#> # A tibble: 6 x 8
#> n_of_vertices n_of_angles all_angles_right n_of_sides all_sides_equal is_curve
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 4 4 TRUE 4 FALSE FALSE
#> 2 4 4 TRUE 4 TRUE FALSE
#> 3 4 4 FALSE 4 TRUE FALSE
#> 4 4 4 FALSE 4 FALSE FALSE
#> 5 <NA> <NA> <NA> <NA> <NA> TRUE
#> 6 <NA> <NA> <NA> <NA> <NA> TRUE
#> # ... with 2 more variables: shape_classification <chr>,
#> # equidistant_from_center <chr>
Created on 2021-02-19 by the reprex package (v0.3.0)
The problem with such solution is that it works fine with simple shape_classification
outputs such as strings (e.g., "square", "rectangle", etc.), but not scalable if we want shape_classification
to be complex structures such as lists nesting lists, etc.
In such cases we have to resort to a dedicated column for the output:
library(tibble)
my_table_3 <-
tribble(~shape_id, ~string_of_conditions, ~shape_classification,
"a", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\""), list(list(shape_name = "rectangle", shape_aspects = tribble(~aspect, ~value,
"type_of_geometry", "Euclidean plane geometry",
"sum_of_angles", "360",
"other_names", "oblong"))),
"b", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"TRUE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\""), list(list(shape_name = "square", shape_aspects = tribble(~aspect, ~value,
"type_of_geometry", "Euclidean plane geometry",
"sum_of_angles", "360",
"n_of_diagonals", "2"))),
"c", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"TRUE\" & is_curve == \"FALSE\""), list(list(shape_name = "rhombus", shape_aspects = tribble(~aspect, ~value,
"type_of_geometry", "Euclidean plane geometry",
"plural_form", "rhombi"))),
"d", c("n_of_vertices == \"4\" & n_of_angles == \"4\" & all_angles_right == \"FALSE\" & n_of_sides == \"4\" & all_sides_equal == \"FALSE\" & is_curve == \"FALSE\""), list(list(shape_name = "trapezoid", shape_aspects = tribble(~aspect, ~value,
"sum_of_angles", "360",
"n_of_diagonals", "2"))),
"e", c("is_curve == \"TRUE\" & equidistant_from_center == \"TRUE\""), list(c("circle")),
"f", c("is_curve == \"TRUE\" & equidistant_from_center == \"FALSE\""), list(c("ellipse")))
my_table_3
#> # A tibble: 6 x 3
#> shape_id string_of_conditions shape_classificat~
#> <chr> <chr> <list>
#> 1 a "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ <list [1]>
#> 2 b "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ <list [1]>
#> 3 c "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ <list [1]>
#> 4 d "n_of_vertices == \"4\" & n_of_angles == \"4\" & ~ <list [1]>
#> 5 e "is_curve == \"TRUE\" & equidistant_from_center =~ <list [1]>
#> 6 f "is_curve == \"TRUE\" & equidistant_from_center =~ <list [1]>
Created on 2021-02-19 by the reprex package (v0.3.0)
Upvotes: 0
Views: 364
Reputation: 887223
Perhaps this helps
library(dplyr)
library(purrr)
bind_cols(my_table, new_data) %>%
transmute(tmp = map2(string_of_conditions,
shape_classification, ~
case_when(eval(parse_expr(.x)) ~ .y))) %>%
pull(tmp) %>%
reduce(coalesce) %>%
bind_cols(new_data, figured_out_shape_class = .)
-output
# A tibble: 6 x 8
# n_of_vertices n_of_angles all_angles_right n_of_sides all_sides_equal is_curve equidistant_from_center figured_out_shape_class
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 4 4 TRUE 4 FALSE FALSE <NA> rectangle
#2 4 4 TRUE 4 TRUE FALSE <NA> square
#3 4 4 FALSE 4 TRUE FALSE <NA> rhombus
#4 4 4 FALSE 4 FALSE FALSE <NA> trapezoid
#5 <NA> <NA> <NA> <NA> <NA> TRUE TRUE circle
#6 <NA> <NA> <NA> <NA> <NA> TRUE FALSE ellipse
Upvotes: 2