Reputation: 151
I built a prediction model using logistic regression which works well. But when I analyze the estimates calculated on the test dataset, I can see the variable I used to stratify the split comes up when I want it to be excluded of the model as a predictor. update_role()
doesn't do that...
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(vital ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(strata_var, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_smote(vital)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = strata_var)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05,
0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
In the last plot I can see the strata variable coming up.
Upvotes: 0
Views: 126
Reputation: 3185
You got this result because of the combination of role selection functions you used in step_dummy()
. (full reprex at the end of post)
You used the following selections. Which selects all nominal, but not any outcomes. This selected the strata variables because it is both a nominal variable and not an outcome.
all_nominal(), -all_outcomes()
A better option would be to use all_nominal_predictors()
which won't select id/strata variables.
library(tidymodels)
data("penguins")
rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal(), -all_outcomes())
rec_spec1 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 4
#> body_mass_g species island_Dream island_Torgersen
#> <int> <fct> <dbl> <dbl>
#> 1 3750 Adelie 0 1
#> 2 3800 Adelie 0 1
#> 3 3250 Adelie 0 1
#> 4 NA Adelie 0 1
#> 5 3450 Adelie 0 1
#> 6 3650 Adelie 0 1
#> 7 3625 Adelie 0 1
#> 8 4675 Adelie 0 1
#> 9 3475 Adelie 0 1
#> 10 4250 Adelie 0 1
#> # … with 334 more rows
rec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>%
update_role(island, new_role = "strata") %>%
step_dummy(all_nominal_predictors())
rec_spec2 %>%
prep() %>%
bake(new_data = NULL)
#> # A tibble: 344 × 3
#> island body_mass_g species
#> <fct> <int> <fct>
#> 1 Torgersen 3750 Adelie
#> 2 Torgersen 3800 Adelie
#> 3 Torgersen 3250 Adelie
#> 4 Torgersen NA Adelie
#> 5 Torgersen 3450 Adelie
#> 6 Torgersen 3650 Adelie
#> 7 Torgersen 3625 Adelie
#> 8 Torgersen 4675 Adelie
#> 9 Torgersen 3475 Adelie
#> 10 Torgersen 4250 Adelie
#> # … with 334 more rows
library(tidymodels)
library(themis)
library(forcats)
data("penguins")
penguins0 <- penguins %>%
mutate(ids = row_number(),
species = factor(species == "Adelie")) %>%
drop_na()
data_split <- initial_split(penguins0, prop = 3/4, strata = island)
# Create training and testing datasets:
train_data <- training(data_split)
test_data <- testing(data_split)
# Build model
mldata_recipe <-
recipe(species ~ ., data = train_data) %>%
update_role(ids, new_role = "ID") %>%
update_role(island, new_role = "strata") %>%
step_zv(all_predictors()) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(species)
set.seed(456)
# 10 fold cross validation
mldata_folds <- vfold_cv(train_data, strata = island)
glmnet_spec <-
logistic_reg(penalty = tune(), mixture = tune()) %>%
set_mode("classification") %>%
set_engine("glmnet")
glmnet_workflow <-
workflow() %>%
add_recipe(mldata_recipe) %>%
add_model(glmnet_spec)
glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20),
mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1))
set.seed(789)
glmnet_tune <-
tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid)
final_glmnet <- glmnet_workflow %>%
finalize_workflow(select_best(glmnet_tune, "roc_auc"))
glmnet_results <- final_glmnet %>%
fit_resamples(
resamples = mldata_folds,
metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
control = control_resamples(save_pred = TRUE)
)
set.seed(789)
final_fit <- final_glmnet %>%
last_fit(data_split)
final_fit %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
arrange(desc(abs(estimate))) %>%
filter(abs(estimate) >0) %>%
ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+
geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) +
geom_point() +
scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) +
theme_minimal()+
ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
Created on 2021-08-20 by the reprex package (v2.0.1)
Upvotes: 1