Tabbi

Reputation: 69

How to plot multiple ROC curves in one plot in R from three machine learning algorithms run in tidymodels?

I ran three machine learning algorithms (logistic regression, XGBoost, random forest) on my data 'muvr_model_1' using the tidymodels package in R, with the following code:

library(dplyr)
library(magrittr)
library(ggplot2)
library(tidyverse)
library(tidymodels)


####
#Logistic regression

set.seed(9011) #For reproducibility: the data are split the same way every time, so chance plays less of a role in the evaluation

library(tidymodels)

muvr_model_1_split <- initial_split(muvr_model_1, strata = Label_numeric_response, prop = 0.7) #strata to ensure approximately equal proportions of the outcome in training and test data. 70% goes to training data

muvr_model_1_train <- training(muvr_model_1_split) #70% of data
muvr_model_1_test <- testing(muvr_model_1_split) #30% of data

##Heuristic – logistic regression

log_model <- logistic_reg() %>% #We want a logistic regression model
  
  set_engine('glm') %>% #Use the glm engine; there are other engines for fitting logistic regression models
  
  set_mode('classification') #The model should classify

log_fit <- log_model %>% #Takes and fits our model to training data
  
  fit(Label_numeric_response ~ ., data = muvr_model_1_train) #Modeling response against all other factors

tidy(log_fit) #Summarizes the model fitted to the training data

log_class_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'class') #What outcome does it predict?

print(log_class_pred, n = 68)

log_prob_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'prob') # What probability for each class?

#Heuristic result
log_results <- muvr_model_1_test %>% select(Label_numeric_response) %>% #Getting the true class
  bind_cols(log_class_pred, log_prob_pred) #Adds predicted class and probabilities

#Confusion matrix
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class)
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class) %>%
  summary()

log_results %>% roc_curve(truth = Label_numeric_response, .pred_No) %>% autoplot()
roc_auc(log_results, truth = Label_numeric_response, .pred_No)


#Streamlines workflow in tidymodels

log_last_fit <- log_model %>%
  last_fit(Label_numeric_response ~ ., split = muvr_model_1_split)

log_last_fit %>% collect_metrics()

#Cross-validation part

set.seed(9011)

muvr_model_1_folds_5 <- vfold_cv(muvr_model_1_train, v = 5, repeats = 5)

log_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% #recipe() has no family argument; the binomial family comes from the glm engine
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())

log_recipe %>% prep() #Estimates the recipe steps on the training data and prints a summary

#After resampling we have validated the model, i.e. evaluated it on held-out parts of the training data

log_reg_model <- logistic_reg() %>%
  set_mode('classification') %>%
  set_engine('glm')

logistic_wf <- workflow() %>%
  add_recipe(log_recipe) %>% 
  add_model(log_reg_model)

log_reg_resamp <- fit_resamples(logistic_wf, muvr_model_1_folds_5) ## TRAIN with Cross validation

collect_metrics(log_reg_resamp)

#With last_fit, we predict the test data

logistic_wf %>% last_fit(muvr_model_1_split) %>% collect_metrics() ## TEST
######

#####
#XG-BOOST

library(xgboost)

xgb_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>%
  update_role(Label_numeric_response, new_role = "outcome") %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

hf_xgb_model <- boost_tree( mtry = tune(),
                            trees = tune(), min_n = tune(),
                            tree_depth = tune(),
                            learn_rate = tune()) %>%
  set_mode('classification') %>%
  set_engine('xgboost')

hf_xgb_wflow <- workflow() %>% add_recipe(xgb_recipe) %>% add_model(hf_xgb_model)


set.seed(9192)

hf_xgb_grid <- grid_latin_hypercube(
  finalize(mtry(), muvr_model_1_train),
  trees(),
  min_n(),
  tree_depth(),
  learn_rate(),
  size = 100)

library(finetune)

auc_metric <- metric_set(yardstick::roc_auc) #We want to measure the predictive ability of the model with ROC AUC

xgb_ctrl <- control_race(
  save_pred = TRUE,
  save_workflow = TRUE,
  verbose_elim = TRUE)

set.seed(9011)

hf_xgb_result <- hf_xgb_wflow %>%
  tune_race_anova( #tune_race_anova races the grid for faster tuning; it is called on the workflow instead of tune_grid()
    resamples = muvr_model_1_folds_5, # resampling
    grid = hf_xgb_grid, # Which grid should it be evaluated on?
    metrics = auc_metric, #which metrics should the models be evaluated with?
    control = xgb_ctrl,
    verbose = F) #Don't print progress to the console while running

collect_metrics(hf_xgb_result) %>%
  print(n = 18)

show_best(hf_xgb_result, "roc_auc") ##TRAIN

#Visualize hyperparameter tuning

hf_xgb_result %>% collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, mtry:learn_rate) %>%
  pivot_longer(mtry:learn_rate, names_to = "parameter", values_to = "value") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")


#Choose the best model and predict in test data

best_xgb <- select_best(hf_xgb_result, "roc_auc")
final_xgb <- finalize_workflow(hf_xgb_wflow, best_xgb)
final_xgb_res <- last_fit(final_xgb, muvr_model_1_split, metrics = auc_metric)
collect_metrics(final_xgb_res)    ##TEST

show_best(final_xgb_res, "roc_auc")

final_xgb_res %>%
  print(n =6)

#Feature importance

library(vip)
final_xgb %>%
  fit(data = muvr_model_1_train) %>%
  extract_fit_parsnip() %>% #pull_workflow_fit() is deprecated in recent workflows versions
  vip(geom = "col")

######


#####
#Random forest

set.seed(9011) #For reproducibility: the same random draws every time, so chance plays less of a role in the evaluation

CM_metric <- metric_set(yardstick::roc_auc)
CM_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())

rand_model <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()) %>%
  set_mode('classification') %>%
  set_engine('ranger')

rand_wflow <- workflow() %>%
  add_recipe(CM_recipe) %>%
  add_model(rand_model)

set.seed(9293)

rand_grid <- grid_latin_hypercube(
  finalize(mtry(), muvr_model_1_train),
  trees(),
  min_n(),
  size = 100 )

rand_ctrl <- control_race(
  save_pred = TRUE,
  save_workflow = TRUE,
  verbose_elim = TRUE)

set.seed(9394)

rand_result <- rand_wflow %>% tune_race_anova(
  resamples = muvr_model_1_folds_5,
  grid = rand_grid,
  metrics = CM_metric,
  control = rand_ctrl,
  verbose = TRUE )

collect_metrics(rand_result)

show_best(rand_result, "roc_auc") ##TRAIN

best_rand <- select_best(rand_result, "roc_auc")

best_rand

final_rand <- finalize_workflow(
  rand_wflow, best_rand )

final_rand_res <- last_fit(final_rand,
                           muvr_model_1_split, metrics = CM_metric)

collect_metrics(final_rand_res) ###TEST

######

Question: I want to pull out the information from all three algorithms to create ROC curves, separately for model performance on the TRAINING and TEST data, and I want to overlay all three training ROC curves in one plot and all three test ROC curves in a second plot. Something like the following:

[image: example plot with the three ROC curves overlaid]

What is the best way to do this in R? Here is a snippet of my data 'muvr_model_1' (dput output):

structure(list(Label_numeric_response = structure(c(1L, 2L, 1L, 
1L, 1L, 1L), levels = c("No", "Yes"), class = "factor"), Acet = c(-0.128692417994666, 
-0.482249202992499, 2.26777375399065, -0.470214149992573, -0.480794796992508, 
0.102343519003918), Val = c(0.0385410601921631, 0.407464020291187, 
-0.75256524002018, -1.36200022018376, -0.790162200030271, -1.01311664009012
), DA = c(-0.379477365588001, -0.230197042960625, 1.60781692938895, 
-0.127566821154304, 0.394914308041514, 0.712134993624688), Mal = c(0.257350020059903, 
0.357840680030046, 0.0171883801312568, -0.412952569740945, 1.25095511976469, 
1.4077690597181), X3.Hydr = c(1.66045651392454, 
-0.38828213999918, -0.830745459982707, -0.904767569979951, -0.0552838890115781, 
0.760685376958043), BAM = c(0.0601078484051758, -0.296157370351331, 
0.814551841066014, 0.206805291422561, -0.463811590942629, 0.500200177457331
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))

Upvotes: 0

Views: 456

Answers (1)

topepo

Reputation: 14331

A small reproducible example is always helpful :-)

yardstick metric functions are group-aware, so you can just group_by() the model identifier and then use autoplot() (see the example below). To add a legend with the AUC values, you can do the same with roc_auc() and then use geom_label() to add the labels (a sketch of this follows the example).

library(tidymodels)
tidymodels_prefer()

set.seed(1)
split <- initial_split(two_class_dat)

lr_res <- 
  last_fit(
    logistic_reg(),
    Class ~ ., 
    split = split)

cart_res <- 
  last_fit(
    decision_tree(cost_complexity = 0) %>% set_mode("classification"),
    Class ~ ., 
    split = split)

bind_rows(
    collect_predictions(lr_res) %>% mutate(model = "logistic reg"), 
    collect_predictions(cart_res) %>% mutate(model = "CART")
    ) %>% 
  group_by(model) %>% 
  roc_curve(Class, .pred_Class1) %>% 
  autoplot()

Created on 2024-02-20 with reprex v2.0.2
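
For the AUC legend mentioned above, a minimal sketch along these lines should work, reusing the lr_res and cart_res objects from the example (the label positions are arbitrary and can be adjusted):

preds <- bind_rows(
  collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
  collect_predictions(cart_res) %>% mutate(model = "CART")
  )

#One AUC value per model, turned into a plot label
aucs <- preds %>%
  group_by(model) %>%
  roc_auc(Class, .pred_Class1) %>%
  ungroup() %>%
  mutate(label = paste0(model, ": AUC = ", round(.estimate, 3)),
         x = 0.75,            #arbitrary label positions; adjust to taste
         y = c(0.15, 0.25))

preds %>%
  group_by(model) %>%
  roc_curve(Class, .pred_Class1) %>%
  autoplot() +
  geom_label(data = aucs,
             aes(x = x, y = y, label = label, colour = model),
             inherit.aes = FALSE, show.legend = FALSE)

The same pattern covers both plots in the question: collect_predictions() also works on resampling and tuning results, as long as the predictions were saved (save_pred = TRUE in control_resamples()/control_race()), so the TRAIN curves can be built from the out-of-sample predictions and the TEST curves from the last_fit() objects. A sketch for the training-data plot with the objects from the question, assuming save_pred = TRUE was also used when calling fit_resamples() for the logistic regression:

#Out-of-sample ("TRAIN") predictions from the three models
train_preds <- bind_rows(
  collect_predictions(log_reg_resamp) %>% mutate(model = "logistic regression"),
  collect_predictions(hf_xgb_result, parameters = best_xgb) %>% mutate(model = "xgboost"),
  collect_predictions(rand_result, parameters = best_rand) %>% mutate(model = "random forest")
)

train_preds %>%
  group_by(model) %>%
  roc_curve(Label_numeric_response, .pred_No) %>%
  autoplot()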

Upvotes: 0
