Switching binary classification python scikit-learn model to multi-class classification model

Question

I'm currently having trouble switching the following code to fit a multiclass variable (3 levels).

# data import
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# fetch breast cancer dataset
bc = fetch_ucirepo(id=17)

# data (as pandas dataframes)
bc_X = bc.data.features
bc_y = bc.data.targets

# fetch heart disease dataset
hd = fetch_ucirepo(id=45)
# data (as pandas dataframes)
hd_X = hd.data.features
hd_y = hd.data.targets


# fetch iris dataset
ir = fetch_ucirepo(id=53)
# data (as pandas dataframes)
ir_X = ir.data.features
ir_y = ir.data.targets


# fetch wine quality dataset
wq = fetch_ucirepo(id=186)
# data (as pandas dataframes)
wq_X = wq.data.features
wq_y = wq.data.targets


# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(hd_X, hd_y, test_size=0.2, random_state=42)

# cap values function
def cap_values(values):
    # Convert input to a numpy array
    values = np.array(values)
    # Cap values at 1 using numpy's clip function
    capped_values = np.clip(values, None, 1)
    return capped_values

# Find the indices of rows with missing values in X_train
missing_indices = X_train.isnull().any(axis=1)
missing_indices2 = X_test.isnull().any(axis=1)

# Remove rows with missing values from X_train and y_train
X_train = X_train[~missing_indices]
y_train = y_train[~missing_indices]
X_test = X_test[~missing_indices2]
y_test = y_test[~missing_indices2]

# Apply the cap_values function to y_test and y_train
y_test = cap_values(y_test)
y_train = cap_values(y_train)

# Step 3: Fit each model to the training data

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Step 4: Evaluate the models using the testing data
models = {
    'Decision Tree': dt_model,
    'Naïve Bayes': nb_model,
    'Random Forest': rf_model,
    'Support Vector Machine': svm_model
}

# Initialize a dictionary to store the performance metrics
performance_metrics = {}

# Iterate through each model and calculate the metrics
for name, model in models.items():
    # Get predictions
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix: {name}')
    plt.show()

    # Calculate sensitivity (recall) and specificity
    if cm.shape == (2, 2):
      # Binary classification metrics
      tn, fp, fn, tp = cm.ravel()
      sensitivity = tp / (tp + fn)
      specificity = tn / (tn + fp)
    else:
      # Multi-class classification metrics
      sensitivity = recall_score(y_test, y_pred, average='macro')
      specificity = None
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    # Calculate balanced accuracy
    balanced_accuracy = (sensitivity + specificity) / 2

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Calculate AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)

    # Store the performance metrics
    performance_metrics[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Balanced Accuracy': balanced_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc
    }

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
    plt.title(f'ROC Curve: {name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

# Display the performance metrics in a DataFrame
metrics_df = pd.DataFrame(performance_metrics).transpose()
print(metrics_df)

I need the accuracy, sensitivity, specificity, balanced accuracy, precision, recall, f1 score, auc, but, most importantly, and roc plot of the model. This works perfectly for binary classification so I'm confused on why it wont work for multiclass. Flattening the variable did not work.

Thanks for any help

I tried flattening the variable, I tried doing a for loop to do 3 separate binary classifications and taking the average of each, I tried splitting the data into 3 datasets (each with only 2 of the classes so it was essentially a binary classification problem).

Switching binary classification python scikit-learn model to multi-class classification model

Answers (1)

Related Questions