Reputation: 21
I am wondering how to use cross-validation in Python to improve the accuracy of my logistic regression model. The dataset being used is called 'iris'. I have already successfully used cross-validation for an SVM model, but I am struggling to adjust my code to do the same for the logistic regression model. Here's my code so far:
import numpy as np
from sklearn import datasets, svm, linear_model
from sklearn.model_selection import StratifiedKFold, cross_val_score

iris = datasets.load_iris()
x_iris = iris.data
y_iris = iris.target

svc = svm.SVC(C=1, kernel='linear')
k_fold = StratifiedKFold(n_splits=10)  # 10 stratified folds (class proportions preserved)
# for train, test in k_fold.split(x_iris, y_iris):
#     print(train, test)

# svc.fit() is called repeatedly inside cross_val_score()
scores = cross_val_score(svc, x_iris, y_iris, cv=k_fold, scoring='accuracy')
print(scores)
print('average score =', np.mean(scores))
print('std of scores =', np.std(scores))
What adjustments must I make to the code to achieve successful cross validation for my logistic regression model?
Thanks for any help.
Upvotes: 2
Views: 2908
Reputation: 597
For example, binary classification with LogisticRegressionCV:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# a 2-feature synthetic dataset so the decision surface can be plotted
# (you could equally use load_iris(return_X_y=True) here)
X, y = make_classification(200, n_features=2, n_informative=2,
                           n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# plot the predicted probabilities for a 2-feature dataset
def plot_decision_function(classifier, axis, title):
    h = .02  # step size in the mesh
    # create a mesh to plot in
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]
    # probabilities of the positive class over the mesh
    probs = classifier.predict_proba(grid)[:, 1].reshape(xx.shape)
    # plot the probability surface and the data points
    contour = axis.contourf(xx, yy, probs, 25, cmap="RdGy", vmin=0, vmax=1)
    ax_c = axis.figure.colorbar(contour, ax=axis)
    ax_c.set_label("$P(y = 1)$")
    axis.scatter(X[:, 0], X[:, 1], c=y, alpha=0.9,
                 cmap=plt.cm.bone, edgecolors="black")
    axis.set(aspect="equal",
             xlim=(x_min, x_max), ylim=(y_min, y_max),
             xlabel="$X_1$", ylabel="$X_2$")
    axis.set_title(title)
    plt.show()

# array of C values (inverse of regularization strength) used for cross-validation
Cs_arr = np.logspace(-5, 5, 20)
lr = LogisticRegressionCV(cv=5, tol=0.01, solver="saga", random_state=10)
pipe = Pipeline([("sc", StandardScaler()), ("model", lr)])
res = (pipe.set_params(model__Cs=Cs_arr, model__class_weight='balanced')
           .fit(X_train, y_train)
           .score(X_test, y_test))
print('score', res)
print(f"Optimal C for clf: {pipe[-1].C_[0]:.2f}")

f, axes = plt.subplots(figsize=(8, 6))
# pass the whole pipeline so the mesh grid is scaled like the training data
plot_decision_function(pipe, axes, "Logistic Regression")
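Side note: LogisticRegressionCV runs the cross-validation over the Cs grid internally, which is roughly equivalent to wrapping a plain LogisticRegression in a grid search over C; after fitting, pipe[-1].C_ holds the C value selected for each class.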
Upvotes: 0
Reputation: 2768
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()
scores = cross_val_score(lg, x_iris, y_iris, cv=k_fold, scoring='accuracy')
print(scores)
print('average score =', np.mean(scores))
print('std of scores =', np.std(scores))
Creating the LogisticRegression classifier with default values works fine for me. The output is slightly lower than the SVM approach: 0.953333333333 vs. 0.973333333333.
But for parameter tuning you can always use GridSearchCV, which automatically performs cross-validation with cv folds (in the next example I'll use 10, as you did before), trying all possible combinations of parameters. Example:
from sklearn.model_selection import GridSearchCV

parameters = {
    'penalty': ['l2'],
    'C': [1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
}
GS = GridSearchCV(lg, parameters, cv=10, verbose=10)
GS.fit(x_iris, y_iris)
print(GS.best_params_)  # output: {'penalty': 'l2', 'C': 100, 'solver': 'liblinear'}
print(GS.best_score_)   # output: 0.98
By doing this, creating your classifier with the best parameters, LogisticRegression(penalty='l2', C=100, solver='liblinear'), will give you 0.98 accuracy.
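As a side note, with the default refit=True, GridSearchCV also refits the best combination on all the data it was given, so you can use the tuned model directly instead of re-creating it:

best_lg = GS.best_estimator_    # already fitted with the best parameters found
print(best_lg.predict(x_iris[:5]))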
A gentle warning: when performing cross-validation, you should set aside a portion of your data for testing that is never part of the learning process. Otherwise your learning algorithm has, one way or another, seen all the data, and you can easily end up overfitting.
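For instance, a minimal sketch of that workflow (reusing lg and parameters from above; the split names x_tr/x_te are just illustrative):

from sklearn.model_selection import GridSearchCV, train_test_split
# hold out 30% of the data that the grid search never sees
x_tr, x_te, y_tr, y_te = train_test_split(x_iris, y_iris, test_size=0.3,
                                          random_state=0, stratify=y_iris)
GS = GridSearchCV(lg, parameters, cv=10)
GS.fit(x_tr, y_tr)                           # tuning happens on the training split only
print(GS.best_estimator_.score(x_te, y_te))  # accuracy on truly unseen data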
Upvotes: 3