Reputation: 71
I was wondering how to run a multi-class, multi-label, ordinal classification with sklearn. I want to predict a ranking of target groups, ranging from the one that is most prevalent at a certain location (1) to the one that is least prevalent (7). I don't seem to be able to get it right. Could you please help me out?
# Random Forest Classification
# Import
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Import dataset
dataset = pd.read_excel('alle_probs_edit.v2.xlsx')
# Features: every column from the fifth one up to (not including) the last.
X = dataset.iloc[:, 4:-1].values
# Target: the last column.
Y = dataset.iloc[:, -1].values

# Split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Scale the features (bring all variables onto a common scale); whether this
# is necessary depends on the chosen method. The scaler is fitted on the
# training split only and then re-used on the test split.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Create classifier
classifier = RandomForestClassifier(criterion='entropy')

# Choose some parameter combinations to try.
# 'auto' was removed from max_features: for classifiers it was only an alias
# of 'sqrt' (so it doubled the grid for no benefit) and scikit-learn >= 1.3
# rejects it outright.
parameters = {
    'bootstrap': [True, False],
    'max_depth': [50],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [9, 10, 11, 12, 13],
    'n_estimators': [500, 1000, 1500],
}

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer, cv=3,
                        n_jobs=-1)
grid_obj = grid_obj.fit(X_train, Y_train)

# Set the classifier to the best combination of parameters. With the default
# refit=True, GridSearchCV has already re-fitted this estimator on the whole
# training set, so no additional classifier.fit(...) call is needed.
classifier = grid_obj.best_estimator_

# Predict the test data
Y_pred = classifier.predict(X_test)

# Confusion matrix
cm = pd.DataFrame(confusion_matrix(Y_test, Y_pred))

# Accuracy on the held-out test set
accuracy1 = accuracy_score(Y_test, Y_pred)
print("Accuracy1: %.2f%%" % (accuracy1 * 100.0))

# k-fold cross-validation on the training set. The per-fold scores get their
# own name so the hold-out accuracy above is not overwritten, and the spread
# is kept instead of being computed and discarded.
cv_scores = cross_val_score(estimator=classifier, X=X_train, y=Y_train, cv=10)
kfold = cv_scores.mean()
kfold_std = cv_scores.std()
print("k-fold accuracy: %.2f%% (+/- %.2f%%)"
      % (kfold * 100.0, kfold_std * 100.0))
Upvotes: 7
Views: 10949
Reputation: 1149
This may not be the precise answer you're looking for, but this article outlines a technique as follows:
We can take advantage of the ordered class value by transforming a k-class ordinal regression problem into k-1 binary classification problems. We convert an ordinal attribute A* with ordinal values V1, V2, V3, … Vk into k-1 binary attributes, one for each of the original attribute’s first k − 1 values. The ith binary attribute represents the test A* > Vi.
Essentially, aggregate multiple binary classifiers (predict target > 1, target > 2, target > 3, target > 4) to be able to predict whether a target is 1, 2, 3, 4 or 5. The author creates an OrdinalClassifier class that stores multiple binary classifiers in a Python dictionary.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import accuracy_score
class OrdinalClassifier(BaseEstimator, ClassifierMixin):
    """Ordinal classifier assembled from k-1 binary "is y > V_i?" models.

    For the sorted class values V_1 < ... < V_k found in ``y``, one clone of
    ``clf`` is trained per threshold V_1 .. V_(k-1) on the binary target
    ``y > V_i``. Class scores are then derived from the predicted
    probabilities of those binary models.

    Parameters
    ----------
    clf : estimator
        Template binary classifier. It is cloned once per threshold and must
        provide ``fit`` and ``predict_proba``.

    Attributes
    ----------
    clfs : dict
        Maps the threshold position ``i`` to the model fitted on
        ``y > unique_class[i]``.
    unique_class : ndarray
        Sorted distinct class values seen in ``fit`` (``np.nan`` before
        fitting).
    classes_ : ndarray
        Same array as ``unique_class``, exposed under the scikit-learn
        attribute name; only set by ``fit``.
    """

    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        self.unique_class = np.nan

    def fit(self, X, y):
        """Fit one binary classifier per ordinal threshold and return self."""
        # Accept plain Python sequences: the comparison below needs an array.
        y = np.asarray(y)
        self.unique_class = np.sort(np.unique(y))
        self.classes_ = self.unique_class
        # Start from an empty dict so a refit never keeps stale models.
        self.clfs = {}
        # One model per threshold V_1 .. V_(k-1). This also covers the plain
        # two-class case, which simply yields a single model.
        for i, threshold in enumerate(self.unique_class[:-1]):
            binary_y = (y > threshold).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class scores.

        Columns follow the order of ``unique_class``. The scores are not
        renormalised, so a row is not guaranteed to sum to one.
        """
        # exceeds[i] holds Pr(y > V_i) for every sample.
        exceeds = {i: model.predict_proba(X)[:, 1]
                   for i, model in self.clfs.items()}
        last = len(self.unique_class) - 1
        columns = []
        for i in range(len(self.unique_class)):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                columns.append(1 - exceeds[0])
            elif i < last:
                # Vi = Pr(y <= Vi) * Pr(y > Vi-1)
                columns.append((1 - exceeds[i]) * exceeds[i - 1])
            else:
                # Vk = Pr(y > Vk-1)
                columns.append(exceeds[last - 1])
        return np.vstack(columns).T

    def predict(self, X):
        """Return, for each sample, the class value with the highest score."""
        return self.unique_class[np.argmax(self.predict_proba(X), axis=1)]

    def score(self, X, y, sample_weight=None):
        """Return the (optionally weighted) accuracy of ``predict(X)``."""
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
The technique originates in A Simple Approach to Ordinal Classification
Upvotes: 7
Reputation: 444
Building off David Diaz, the white paper, and Kartik above, along with others linked to on Medium and attributed in the readme, I'm working on an OrdinalClassifier that is built on the sklearn framework and which works well with sklearn pipelines, scoring, and cross validation.
The OC performs very well vs. standard non-ordinal multi-class classification and gives greater control over optimizing for precision/recall on the positive class (i.e. "high" in, for example, the diabetes disease progression with low < medium < high classes). It supports any sklearn classifier that supports predict_proba. Cross-validation scores are shown in the repo.
OrdinalClassifier based on sklearn
https://github.com/leeprevost/OrdinalClassifier
At this time, I would not call it multi-label.
Upvotes: 1
Reputation: 457
Here is an example using KNN that should be tuneable in an sklearn pipeline or grid search.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted, check_array
from sklearn.utils.multiclass import check_classification_targets
class KNeighborsOrdinalClassifier(BaseEstimator, ClassifierMixin):
    """Ordinal k-nearest-neighbours classifier.

    For the sorted class values V_1 < ... < V_k found in ``y``, one
    ``KNeighborsClassifier`` is trained per threshold V_1 .. V_(k-1) on the
    binary target ``y > V_i``. All constructor arguments are forwarded
    unchanged to ``KNeighborsClassifier``, so they can be tuned through
    ``get_params`` / ``set_params`` like any other estimator parameter.

    Attributes set by ``fit``
    -------------------------
    clf_ : KNeighborsClassifier
        Unfitted template built from this estimator's parameters.
    clfs_ : dict
        Maps the threshold position ``i`` to the model fitted on
        ``y > classes_[i]``.
    classes_ : ndarray
        Sorted distinct class values seen in ``fit``.
    """

    def __init__(self, n_neighbors=5, *, weights='uniform',
                 algorithm='auto', leaf_size=30, p=2,
                 metric='minkowski', metric_params=None, n_jobs=None):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit one binary KNN model per ordinal threshold and return self."""
        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.clf_ = KNeighborsClassifier(**self.get_params())
        self.clfs_ = {}
        self.classes_ = np.sort(np.unique(y))
        # One model per threshold V_1 .. V_(k-1). This also covers the plain
        # two-class case, which simply yields a single model.
        for i, threshold in enumerate(self.classes_[:-1]):
            binary_y = (y > threshold).astype(np.uint8)
            clf = clone(self.clf_)
            clf.fit(X, binary_y)
            self.clfs_[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class scores.

        Columns follow the order of ``classes_``.
        """
        check_is_fitted(self, ['classes_', 'clf_', 'clfs_'])
        X = check_array(X)
        # exceeds[i] holds Pr(y > V_i) for every sample.
        exceeds = {i: model.predict_proba(X)[:, 1]
                   for i, model in self.clfs_.items()}
        columns = []
        # The binary models are keyed by threshold *position*, so they are
        # looked up with the position i and never with the class label: the
        # labels need not be the integers 0..k-1 (e.g. ranks 1..7).
        for i in range(len(self.classes_)):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                columns.append(1 - exceeds[0])
            elif i in exceeds:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                columns.append(exceeds[i - 1] - exceeds[i])
            else:
                # Vk = Pr(y > Vk-1)
                columns.append(exceeds[i - 1])
        return np.vstack(columns).T

    def predict(self, X):
        """Return, for each sample, the class label with the highest score."""
        check_is_fitted(self, ['classes_', 'clf_', 'clfs_'])
        X = check_array(X)
        # Map the arg-max column back to the original label instead of
        # returning the bare column index.
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
Upvotes: 3