Reputation: 21
I would like to apply Naive Bayes with 10-fold stratified cross-validation to my data, and then I want to see how the model performs on the test data I set aside initially.
However, the results I am getting (i.e. the predicted outcome and probability values y_pred_nb2
and y_scores_nb2
) are identical to when I run the code without any cross-validation.
QUESTION: How can I correct this?
The code is below, where X_train
consists of 75% of the entire dataset and X_test
consists of 25%.
from sklearn.model_selection import StratifiedKFold
# empty parameter grid: the search below has exactly one candidate, the default GaussianNB()
params = {}
# intent: grid search over hyperparameters, keeping the classifier with the highest recall score
# NOTE(review): no `scoring` argument is passed to GridSearchCV below, so recall is not the
# selection metric here -- presumably the estimator's default score is used; pass
# scoring="recall" if recall is what should be optimised (confirm against the GridSearchCV docs).
# NOTE(review): GridSearchCV and GaussianNB are used but their imports are not shown in this snippet.
skf = StratifiedKFold(n_splits=10)
nb2 = GridSearchCV(GaussianNB(), cv=skf, param_grid=params)
# `%time` is an IPython/Jupyter magic (not plain Python); it times the fit on the 75% training split
%time nb2.fit(X_train, y_train)
# predict values on the test set
y_pred_nb2 = nb2.predict(X_test)
print(y_pred_nb2)
# predicted probabilities on the test set
# keeps column 1 of the probability matrix (presumably the positive class -- confirm via nb2.classes_)
y_scores_nb2 = nb2.predict_proba(X_test)[:, 1]
print(y_scores_nb2)
Upvotes: 2
Views: 17299
Reputation: 16079
First off GaussianNB
only accepts priors
as an argument, so unless you have some priors to set for your model ahead of time, you will have nothing to grid search over.
Furthermore, your param_grid
is set to an empty dictionary which ensures that you only fit one estimator with GridSearchCV
. This is the same as fitting an estimator without using a grid search (in the example below I use MultinomialNB
in order to show the use of hyperparameters):
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB
# 10 stratified folds, the same splitter the question uses
skf = StratifiedKFold(n_splits=10)
# empty grid: only one parameter setting ({}) will be evaluated
params = {}
nb = MultinomialNB()
# return_train_score=True so the per-split train scores show up in cv_results_
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)
data = load_iris()
# NOTE(review): no random_state is passed, so the exact scores shown in this answer may not reproduce
x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)
gs.fit(x_train, y_train)
# every array in the result has a single entry, because only one parameter setting was tried
gs.cv_results_
{'mean_fit_time': array([0.]),
'mean_score_time': array([0.]),
'mean_test_score': array([0.85714286]),
'mean_train_score': array([0.85992157]),
'params': [{}],
'rank_test_score': array([1]),
'split0_test_score': array([0.91666667]),
'split0_train_score': array([0.84]),
'split1_test_score': array([0.75]),
'split1_train_score': array([0.86]),
'split2_test_score': array([0.83333333]),
'split2_train_score': array([0.84]),
'split3_test_score': array([0.91666667]),
'split3_train_score': array([0.83]),
'split4_test_score': array([0.83333333]),
'split4_train_score': array([0.85]),
'split5_test_score': array([0.91666667]),
'split5_train_score': array([0.84]),
'split6_test_score': array([0.9]),
'split6_train_score': array([0.88235294]),
'split7_test_score': array([0.8]),
'split7_train_score': array([0.88235294]),
'split8_test_score': array([0.8]),
'split8_train_score': array([0.89215686]),
'split9_test_score': array([0.9]),
'split9_train_score': array([0.88235294]),
'std_fit_time': array([0.]),
'std_score_time': array([0.]),
'std_test_score': array([0.05832118]),
'std_train_score': array([0.02175538])}
# fit the bare estimator directly on the training split, with no grid search involved
nb.fit(x_train, y_train)
nb.score(x_test, y_test)
0.8157894736842105
# the empty-grid search gives the same test-set score as the bare estimator
gs.score(x_test, y_test)
0.8157894736842105
# now give the search two alpha values to compare, and fit again
gs.param_grid = {'alpha': [0.1, 2]}
gs.fit(x_train, y_train)
# the test-set score now differs from the 0.8157... obtained with the empty grid
gs.score(x_test, y_test)
0.8421052631578947
# every array in the result now has two entries, one per alpha value
gs.cv_results_
{'mean_fit_time': array([0.00090394, 0.00049713]),
'mean_score_time': array([0.00029924, 0.0003005 ]),
'mean_test_score': array([0.86607143, 0.85714286]),
'mean_train_score': array([0.86092157, 0.85494118]),
'param_alpha': masked_array(data=[0.1, 2],
mask=[False, False],
fill_value='?',
dtype=object),
'params': [{'alpha': 0.1}, {'alpha': 2}],
'rank_test_score': array([1, 2]),
'split0_test_score': array([0.91666667, 0.91666667]),
'split0_train_score': array([0.84, 0.83]),
'split1_test_score': array([0.75, 0.75]),
'split1_train_score': array([0.86, 0.86]),
'split2_test_score': array([0.83333333, 0.83333333]),
'split2_train_score': array([0.85, 0.84]),
'split3_test_score': array([0.91666667, 0.91666667]),
'split3_train_score': array([0.83, 0.81]),
'split4_test_score': array([0.83333333, 0.83333333]),
'split4_train_score': array([0.85, 0.84]),
'split5_test_score': array([0.91666667, 0.91666667]),
'split5_train_score': array([0.84, 0.84]),
'split6_test_score': array([0.9, 0.9]),
'split6_train_score': array([0.88235294, 0.88235294]),
'split7_test_score': array([0.9, 0.8]),
'split7_train_score': array([0.88235294, 0.88235294]),
'split8_test_score': array([0.8, 0.8]),
'split8_train_score': array([0.89215686, 0.89215686]),
'split9_test_score': array([0.9, 0.9]),
'split9_train_score': array([0.88235294, 0.87254902]),
'std_fit_time': array([0.00030147, 0.00049713]),
'std_score_time': array([0.00045711, 0.00045921]),
'std_test_score': array([0.05651628, 0.05832118]),
'std_train_score': array([0.02103457, 0.02556351])}
Upvotes: 4
Reputation: 5460
How about something like this
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
# Only var_smoothing is tuned here: run a 10-fold cross-validation for each
# candidate var_smoothing value and compare the mean accuracies.
def cross_val(params):
    """Return the mean 10-fold cross-validated accuracy of a GaussianNB model.

    Args:
        params: dict of GaussianNB parameters, applied via ``set_params``.

    Returns:
        The mean of the 10 per-fold accuracy scores.

    Reads ``X_train`` and ``y_train`` from the enclosing (module) scope; the
    held-out test set is never used here.
    """
    model = GaussianNB()
    model.set_params(**params)
    cv_results = cross_val_score(
        model,
        X_train,
        y_train,
        # 10 folds; per the scikit-learn documentation an integer cv with a
        # classifier uses stratified folds
        cv=10,
        scoring="accuracy",
        verbose=2,
    )
    # return the mean of the 10 fold cross validation
    return cv_results.mean()


# baseline parameters
params = {
    # must be the None object: the string "None" is not a valid value for
    # priors and makes GaussianNB reject the parameter / fail when fitting
    "priors": None,
    "var_smoothing": 1e-9,
}

# create a list of var_smoothing values to cross validate
steps = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]

# will contain the cv results, in the same order as `steps`
results = []
for step in steps:
    params["var_smoothing"] = step
    results.append(cross_val(params))

# convert results to a pandas dataframe for easier visualization
df = pd.DataFrame({"var_smoothing": steps, "accuracy": results})

# sort so the best accuracy comes first
df_sorted = df.sort_values("accuracy", ascending=False)

# reset the index of the sorted dataframe
df_sorted.reset_index(inplace=True, drop=True)
df_sorted.head()
Upvotes: 1