Reputation: 1543
I'm trying to ensemble SVMs with Scikit-learn, specifically optimizing hyperparameters. Quite randomly, I'm getting the following error:
File "C:\Users\jakub\anaconda3\envs\SVM_ensembles\lib\site-packages\sklearn\svm\_base.py", line 250, in _dense_fit
self.probB_, self.fit_status_ = libsvm.fit(
File "sklearn\svm\_libsvm.pyx", line 191, in sklearn.svm._libsvm.fit
ValueError: Invalid input - all samples with positive weights have the same label.
From what I understand, this comes from the file https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/svm/src/libsvm/svm.cpp and has something to do with examples from only one class being passed to an SVM. I'm using stratified K-fold cross-validation and have a fairly balanced dataset (45% one class, 55% the other), so this should not happen anyway.
What can I do?
The optimizing code that throws the error:
def get_best_ensemble_params(X_train, y_train, X_test, y_test, n_tries=5):
    search_spaces = {
        "max_samples": Real(0.1, 1, "uniform"),
        "max_features": Real(0.1, 1, "uniform"),
        "kernel": Categorical(["linear", "poly", "rbf", "sigmoid"]),
        "C": Real(1e-6, 1e+6, "log-uniform"),
        "gamma": Real(1e-6, 1e+1, "log-uniform")
    }

    best_accuracy = 0
    best_model = None
    for i in range(n_tries):
        done = False
        while not done:
            try:
                optimizer = BayesSearchCV(SVMEnsemble(), search_spaces, cv=3, n_iter=10, n_jobs=-1, n_points=10,
                                          verbose=1)
                optimizer.fit(X_train, y_train)  # <- ERROR HERE
                accuracy = accuracy_score(y_test, optimizer)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model = optimizer
                done = True
                print(i, "job done")
            except:
                pass

    return best_model.best_params_


if __name__ == "__main__":
    dataset_name = "acute_inflammations"
    loading_functions = {
        "acute_inflammations": load_acute_inflammations,
        "breast_cancer_coimbra": load_breast_cancer_coimbra,
        "breast_cancer_wisconsin": load_breast_cancer_wisconsin
    }

    X, y = loading_functions[dataset_name]()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    params = get_best_ensemble_params(X_train, y_train, X_test, y_test)
    params["n_jobs"] = -1
    params["random_state"] = 0

    model = SVMEnsemble(n_estimators=20, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
My custom SVMEnsemble is just a BaggingClassifier with a hard-coded SVC:
import inspect

import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from skopt import BayesSearchCV

svm_possible_args = {"C", "kernel", "degree", "gamma", "coef0", "shrinking", "probability", "tol", "cache_size",
                     "class_weight", "max_iter", "decision_function_shape", "break_ties"}

bagging_possible_args = {"n_estimators", "max_samples", "max_features", "bootstrap", "bootstrap_features",
                         "oob_score", "warm_start", "n_jobs"}

common_possible_args = {"random_state", "verbose"}


class SVMEnsemble(BaggingClassifier):
    def __init__(self, voting_method="hard", n_jobs=-1,
                 n_estimators=10, max_samples=1.0, max_features=1.0,
                 C=1.0, kernel="linear", gamma="scale",
                 **kwargs):
        if voting_method not in {"hard", "soft"}:
            raise ValueError(f"voting_method {voting_method} is not recognized.")

        self._voting_method = voting_method
        self._C = C
        self._gamma = gamma
        self._kernel = kernel

        passed_args = {
            "n_jobs": n_jobs,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "C": C,
            "gamma": gamma,
            "cache_size": 1024,
        }

        kwargs.update(passed_args)

        svm_args = {
            "probability": True if voting_method == "soft" else False,
            "kernel": kernel
        }
        bagging_args = dict()

        for arg_name, arg_val in kwargs.items():
            if arg_name in svm_possible_args:
                svm_args[arg_name] = arg_val
            elif arg_name in bagging_possible_args:
                bagging_args[arg_name] = arg_val
            elif arg_name in common_possible_args:
                svm_args[arg_name] = arg_val
                bagging_args[arg_name] = arg_val
            else:
                raise ValueError(f"argument {voting_method} is not recognized.")

        self.svm_args = svm_args
        self.bagging_args = bagging_args

        base_estimator = SVC(**svm_args)
        super().__init__(base_estimator=base_estimator, **bagging_args)

    @property
    def voting_method(self):
        return self._voting_method

    @voting_method.setter
    def voting_method(self, new_voting_method):
        if new_voting_method == "soft":
            self._voting_method = new_voting_method
            self.svm_args["probability"] = True
            base_estimator = SVC(**self.svm_args)
            super().__init__(base_estimator=base_estimator, **self.bagging_args)
        elif self._voting_method == "soft":
            self._voting_method = new_voting_method
            self.svm_args["probability"] = False
            base_estimator = SVC(**self.svm_args)
            super().__init__(base_estimator=base_estimator, **self.bagging_args)
        else:
            self._voting_method = new_voting_method

    @property
    def C(self):
        return self._C

    @C.setter
    def C(self, new_C):
        self._C = new_C
        self.svm_args["C"] = new_C
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    @property
    def gamma(self):
        return self._gamma

    @gamma.setter
    def gamma(self, new_gamma):
        self._gamma = new_gamma
        self.svm_args["gamma"] = new_gamma
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    @property
    def kernel(self):
        return self._kernel

    @kernel.setter
    def kernel(self, new_kernel):
        self._kernel = new_kernel
        self.svm_args["kernel"] = new_kernel
        base_estimator = SVC(**self.svm_args)
        super().__init__(base_estimator=base_estimator, **self.bagging_args)

    def predict(self, X):
        if self._voting_method == "hard":
            return super().predict(X)
        elif self._voting_method == "soft":
            probabilities = np.zeros((X.shape[0], self.classes_.shape[0]))
            for estimator in self.estimators_:
                estimator_probabilities = estimator.predict_proba(X)
                probabilities += estimator_probabilities
            return self.classes_[probabilities.argmax(axis=1)]
        else:
            raise ValueError(f"voting_method {self._voting_method} is not recognized.")
Upvotes: 0
Views: 1431
Reputation: 4990
From the way you describe your problem (that you are getting it "quite randomly"), the description of your data, and the code, I'm almost positive that the problem is the bagging classifier occasionally selecting a random sub-sample of training examples that contains only one class. Stratified K-fold splitting won't help you here, because it only controls the original split(s) of your data into training/test, not how BaggingClassifier picks its random subsample of max_samples examples from the training set. If you look at the code of how BaggingClassifier picks a subsample, you'll notice there is no protection against such an issue.
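Here's a minimal sketch that mimics your situation - it is not your script, and the toy data, shapes and seed loop are purely illustrative assumptions - but it shows how bagging over SVC with a small max_samples on a roughly 55%/45%, 96-example dataset will sooner or later put positive weight on only one class and hit exactly this libsvm error:

import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Toy data shaped like the training fold discussed below: 96 examples, ~55%/45% split.
rng = np.random.RandomState(0)
X = rng.randn(96, 5)
y = np.array([0] * 53 + [1] * 43)

for seed in range(300):
    clf = BaggingClassifier(
        base_estimator=SVC(kernel="linear"),  # called "estimator" in newer scikit-learn versions
        n_estimators=10,
        max_samples=0.1,                      # only ~10 examples per estimator
        random_state=seed,
    )
    try:
        clf.fit(X, y)
    except ValueError as e:
        # e.g. "Invalid input - all samples with positive weights have the same label."
        print("seed", seed, "->", e)
        break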
One very easy way to tell for sure is to replace "max_samples": Real(0.1, 1, "uniform") with some smaller numbers, e.g. "max_samples": Real(0.02, 0.03, "uniform") (or set it to some fixed smaller value), and check that you start getting the error much more frequently.
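In terms of your code that is a one-line change to the search space (assuming the same search_spaces dict and Real from skopt.space):

from skopt.space import Real

# Diagnostic only: tiny subsamples make single-class draws (and hence the error) much more frequent.
search_spaces["max_samples"] = Real(0.02, 0.03, "uniform")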
I'm not sure whether you really run it with n_tries=5 and n_iter=10 (that seems a bit small for all the hyperparameters you have) or with larger values, and/or maybe you run the whole script multiple times with different random seeds, but in any case let's just compute the probability of hitting this problem with max_samples=0.1 on a dataset of 120 examples with a 55%/45% split. Say you get 96 examples in your training set with that 55/45 split, i.e. 53+43 examples. Now with bootstrap enabled, each time a bagging classifier trains an estimator it randomly picks, say, 10 examples out of 96 (with replacement, since bootstrap is enabled by default). The chance of picking all of them from the larger class is (53/96)^10, i.e. approximately 0.26%. That means that if you train 50 classifiers in a row like this, the chance of one of them hitting the issue is about 12.5%. And if you keep running searches like that, you're pretty much bound to run into this problem eventually (I fixed max_samples=0.1 here for simplicity, which is not exactly correct since the search varies it, but you'll likely get close to that value frequently enough).
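If you want to double-check that arithmetic, it is a two-liner (using the same simplifying assumptions: exactly 10 draws per fit, 53 examples in the larger class out of 96):

p_single_fit = (53 / 96) ** 10            # all 10 draws land in the larger class: ~0.26%
p_over_50 = 1 - (1 - p_single_fit) ** 50  # at least one failure across 50 such fits: ~12%
print(f"{p_single_fit:.2%}  {p_over_50:.1%}")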
The last question is what to do with the issue. There are a few possible answers: for example, you can adjust max_samples in your searches or make it dependent on the number of examples. There are other possibilities too - e.g. after you split your data into train/test you can artificially inflate your training data by replacing each sample with N identical copies (where N is e.g. 2 or 10), to reduce the chance of the bagging classifier randomly picking a subsample with only one class; a sketch of this idea follows below.
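A minimal sketch of that last idea, assuming the X_train/y_train arrays and the SVMEnsemble class from your script (the factor N is just a placeholder):

import numpy as np

N = 10  # how many identical copies of each training example to create
X_train_inflated = np.repeat(X_train, N, axis=0)
y_train_inflated = np.repeat(y_train, N, axis=0)

# The same max_samples fraction now corresponds to N times as many drawn examples,
# so a subsample containing only one class becomes far less likely.
model = SVMEnsemble(n_estimators=20, max_samples=0.1)
model.fit(X_train_inflated, y_train_inflated)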
Upvotes: 1