Reputation: 69
I ran three machine learning algorithms (logistic regression, XGBoost, random forest) on my data 'muvr_model_1' using the tidymodels package in R, with the following code;
library(dplyr)
library(magrittr)
library(ggplot2)
library(tidyverse)
library(tidymodels)
####
#Logistic regression
set.seed(9011) #For reproducibility. Splits the same way every time, so less randomness in the evaluation
library(tidymodels)
muvr_model_1_split <- initial_split(muvr_model_1, strata = Label_numeric_response, prop = 0.7) #strata to ensure approximately equal percentage of outcomes in both training data and test data. 70% goes to training data
muvr_model_1_train <- training(muvr_model_1_split) #70% of data
muvr_model_1_test <- testing(muvr_model_1_split) #30% of data
##Heuristic – logistic regression
log_model <- logistic_reg() %>% #We want a logistic regression model
set_engine('glm') %>% #Fit it with the glm engine; there are other engines for logistic regression models
set_mode('classification') #The model should classify
log_fit <- log_model %>% #Takes and fits our model to training data
fit(Label_numeric_response ~ ., data = muvr_model_1_train) #Modeling response against all other factors
tidy(log_fit) #Summarizes the model fitted to the training data
log_class_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'class') #What outcome does it predict?
print(log_class_pred, n = 68)
log_prob_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'prob') # What probability for each class?
#Heuristic result
log_results <- muvr_model_1_test %>% select(Label_numeric_response) %>% #Getting the true class
bind_cols(log_class_pred, log_prob_pred) #Adds predicted class and probabilities
#Confusion matrix
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class)
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class) %>%
summary()
log_results %>% roc_curve(truth = Label_numeric_response, .pred_No) %>% autoplot()
roc_auc(log_results, truth = Label_numeric_response, .pred_No)
#Streamlines workflow in tidymodels
log_last_fit <- log_model %>%
last_fit(Label_numeric_response ~ ., split = muvr_model_1_split)
log_last_fit %>% collect_metrics()
#Includes Cross validation part
set.seed(9011)
muvr_model_1_folds_5 <- vfold_cv(muvr_model_1_train, v = 5, repeats = 5)
log_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% step_normalize(all_numeric()) %>% step_dummy(all_nominal(), -all_outcomes()) #The binomial family comes from the glm engine, not the recipe
log_recipe %>% prep() #prep() estimates the preprocessing steps from the training data
#After resampling we have validated, i.e. evaluated the model on held-out parts of the training data
log_reg_model <- logistic_reg() %>%
set_mode('classification') %>%
set_engine('glm')
logistic_wf <- workflow() %>%
add_recipe(log_recipe) %>%
add_model(log_reg_model)
log_reg_resamp <- fit_resamples(logistic_wf, muvr_model_1_folds_5) ## TRAIN with Cross validation
collect_metrics(log_reg_resamp)
#With last_fit, we predict the test data
logistic_wf %>% last_fit(muvr_model_1_split) %>% collect_metrics() ## TEST
######
#####
#XG-BOOST
library(xgboost)
xgb_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% update_role(Label_numeric_response, new_role = "outcome") %>% step_normalize(all_numeric_predictors()) %>% step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors())
hf_xgb_model <- boost_tree( mtry = tune(),
trees = tune(), min_n = tune(),
tree_depth = tune(),
learn_rate = tune()) %>%
set_mode('classification') %>%
set_engine('xgboost')
hf_xgb_wflow <- workflow() %>% add_recipe(xgb_recipe) %>% add_model(hf_xgb_model)
set.seed(9192)
hf_xgb_grid <- grid_latin_hypercube(
finalize(mtry(), muvr_model_1_train),
trees(),
min_n(),
tree_depth(),
learn_rate(),
size = 100)
library(finetune)
auc_metric <- metric_set(yardstick::roc_auc) #We want to measure the predictive ability of the models with ROC AUC
xgb_ctrl <- control_race(
save_pred = TRUE,
save_workflow = TRUE,
verbose_elim = TRUE)
set.seed(9011)
hf_xgb_result <- hf_xgb_wflow %>%
tune_race_anova( #tune_race_anova() from finetune speeds up tuning by dropping poor candidates early; the workflow is piped into it rather than passed as an argument
resamples = muvr_model_1_folds_5, #resampling scheme
grid = hf_xgb_grid, #Which grid should it be evaluated on?
metrics = auc_metric, #Which metrics should the models be evaluated with?
control = xgb_ctrl,
verbose = FALSE) #Don't print progress to the console while running
collect_metrics(hf_xgb_result) %>%
print(n = 18)
show_best(hf_xgb_result, metric = "roc_auc") ##TRAIN
#Visualize hyperparameter tuning
hf_xgb_result %>% collect_metrics() %>%
filter(.metric == "roc_auc") %>% select(mean, mtry:learn_rate) %>% pivot_longer(mtry:learn_rate,
names_to = "parameter", values_to = "value") %>%
ggplot(aes(value, mean, color = parameter)) + geom_point(show.legend = FALSE) + facet_wrap(~parameter, scales = "free_x") + labs(x = NULL, y = "AUC")
#Choose the best model and predict in test data
best_xgb <- select_best(hf_xgb_result, metric = "roc_auc")
final_xgb <- finalize_workflow(hf_xgb_wflow, best_xgb)
final_xgb_res <- last_fit(final_xgb, muvr_model_1_split, metrics = auc_metric)
collect_metrics(final_xgb_res) ##TEST
show_best(final_xgb_res, "roc_auc")
final_xgb_res %>%
print(n =6)
#Feature importance
library(vip)
final_xgb %>%
fit(data = muvr_model_1_train) %>%
extract_fit_parsnip() %>% #Pull the fitted parsnip model out of the workflow
vip(geom = "col")
######
#####
#Random forest
set.seed(9011) #For reproducibility. Splits the same way every time, so less randomness in the evaluation
CM_metric <- metric_set(yardstick::roc_auc)
CM_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
rand_model <- rand_forest(
mtry = tune(),
trees = tune(),
min_n = tune()) %>%
set_mode('classification') %>%
set_engine('ranger')
rand_wflow <- workflow() %>%
add_recipe(CM_recipe) %>%
add_model(rand_model)
set.seed(9293)
rand_grid <- grid_latin_hypercube(
finalize(mtry(), muvr_model_1_train),
trees(),
min_n(),
size = 100 )
rand_ctrl <- control_race(
save_pred = TRUE,
save_workflow = TRUE,
verbose_elim = TRUE)
set.seed(9394)
rand_result <- rand_wflow %>% tune_race_anova(
resamples = muvr_model_1_folds_5,
grid = rand_grid,
metrics = CM_metric,
control = rand_ctrl,
verbose = TRUE )
collect_metrics(rand_result)
show_best(rand_result, metric = "roc_auc") ##TRAIN
best_rand <- select_best(rand_result, metric = "roc_auc")
best_rand
final_rand <- finalize_workflow(
rand_wflow, best_rand )
final_rand_res <- last_fit(final_rand,
muvr_model_1_split, metrics = CM_metric)
collect_metrics(final_rand_res) ###TEST
######
Question: I want to pull out the information from all three algorithms to create ROC curves, separately for model performance on the TRAINING and TEST data, and overlay all three train ROC curves in one plot and all three test ROC curves in a second plot, something like the following;
What is the best way to do it in R? This is a snippet of my data 'muvr_model_1';
structure(list(Label_numeric_response = structure(c(1L, 2L, 1L,
1L, 1L, 1L), levels = c("No", "Yes"), class = "factor"), Acet = c(-0.128692417994666,
-0.482249202992499, 2.26777375399065, -0.470214149992573, -0.480794796992508,
0.102343519003918), Val = c(0.0385410601921631, 0.407464020291187,
-0.75256524002018, -1.36200022018376, -0.790162200030271, -1.01311664009012
), DA = c(-0.379477365588001, -0.230197042960625, 1.60781692938895,
-0.127566821154304, 0.394914308041514, 0.712134993624688), Mal = c(0.257350020059903,
0.357840680030046, 0.0171883801312568, -0.412952569740945, 1.25095511976469,
1.4077690597181), X3.Hydr = c(1.66045651392454,
-0.38828213999918, -0.830745459982707, -0.904767569979951, -0.0552838890115781,
0.760685376958043), BAM = c(0.0601078484051758, -0.296157370351331,
0.814551841066014, 0.206805291422561, -0.463811590942629, 0.500200177457331
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
Upvotes: 0
Views: 456
Reputation: 14331
A small reproducible example is always helpful :-)
yardstick metric functions are group-aware, so you can just group_by() the predictions and then use autoplot() (see the example below). To add the legend you can do the same with roc_auc() and then use geom_label() to add the label.
library(tidymodels)
tidymodels_prefer()
set.seed(1)
split <- initial_split(two_class_dat)
lr_res <-
last_fit(
logistic_reg(),
Class ~ .,
split = split)
cart_res <-
last_fit(
decision_tree(cost_complexity = 0) %>% set_mode("classification"),
Class ~ .,
split = split)
bind_rows(
collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
collect_predictions(cart_res) %>% mutate(model = "CART")
) %>%
group_by(model) %>%
roc_curve(Class, .pred_Class1) %>%
autoplot()
Created on 2024-02-20 with reprex v2.0.2
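The geom_label() step mentioned above is not shown in the reprex; a minimal sketch of one way it could look, assuming the combined predictions are first stored in an object (the name preds and the label positions below are invented for illustration):
preds <- bind_rows(
  collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
  collect_predictions(cart_res) %>% mutate(model = "CART")
)
#Per-model AUC, turned into a text label for the plot
auc_labels <- preds %>%
  group_by(model) %>%
  roc_auc(Class, .pred_Class1) %>%
  mutate(label = paste0(model, ": AUC = ", round(.estimate, 3)))
#Same grouped ROC curves as above, with one AUC label per model added on top
preds %>%
  group_by(model) %>%
  roc_curve(Class, .pred_Class1) %>%
  autoplot() +
  geom_label(
    data = auc_labels,
    aes(x = 0.7, y = c(0.25, 0.15), label = label, colour = model),
    inherit.aes = FALSE, show.legend = FALSE
  )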
Upvotes: 0
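To get the two overlaid plots asked for in the question (one for TRAIN, one for TEST), the same idea can be applied to the three workflows above. A sketch, assuming logistic_wf, final_xgb, final_rand, muvr_model_1_train and muvr_model_1_test from the question are still in the environment (fits, get_probs and roc_data are invented helper names):
library(tidymodels)
#Fit each finalized workflow on the training data
fits <- list(
  "logistic reg"  = fit(logistic_wf, muvr_model_1_train),
  "xgboost"       = fit(final_xgb, muvr_model_1_train),
  "random forest" = fit(final_rand, muvr_model_1_train)
)
#Class probabilities plus the true label for a given data set
get_probs <- function(wf_fit, dat) {
  predict(wf_fit, new_data = dat, type = "prob") %>%
    bind_cols(dat %>% select(Label_numeric_response))
}
#One grouped ROC-curve tibble per data set
roc_data <- function(dat) {
  purrr::imap_dfr(fits, ~ get_probs(.x, dat) %>% mutate(model = .y)) %>%
    group_by(model) %>%
    roc_curve(Label_numeric_response, .pred_No)
}
roc_data(muvr_model_1_train) %>% autoplot() + ggtitle("TRAIN")
roc_data(muvr_model_1_test) %>% autoplot() + ggtitle("TEST")
Note that predicting back on the training data gives optimistic (resubstitution) TRAIN curves; grouping collect_predictions() from the resampling or tuning results is a less optimistic alternative for the TRAIN plot.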