
Reputation: 133

Perform cross validation without cross_val_score

In order to have full access to the inner and outer scores, I would like to create a nested cross-validation with grid search without using cross_val_score.

I have followed examples I found online like this

I am having doubts that the inner nest is ok. I am not sure if I have to split the data before calling GridSearchCV:

    # Manually split the outer training portion into inner train/test folds.
    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner = X_train_outer[train_index_inner]
        y_train_inner = y_train_outer[train_index_inner]
        X_test_inner = X_train_outer[test_index_inner]
        y_test_inner = y_train_outer[test_index_inner]

        # inner cross-validation: fit every grid search on the inner training
        # fold, then score its predictions on the inner test fold
        for name, gs_est in sorted(gridcvs.items()):
            gs_est.fit(X_train_inner, y_train_inner)
            y_pred = gs_est.predict(X_test_inner)
            inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
            # Store the score; without this cv_scores stays empty.
            cv_scores[name].append(inner_score)
            # Per-candidate grid-search results can be read from
            # gs_est.cv_results_['mean_test_score'] and
            # gs_est.cv_results_['params'].
    print('print cvscores for model:', cv_scores)
    outer_counter = outer_counter + 1

The whole code:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import operator

perf_list = []     # list with the performance
hp_list = []       # hyperparameter list
algo_familiy = []  # algorithm family list

average_scores_across_outer_folds_for_each_model = dict()

# Seed used by the hold-out split and by the inner KFold splitter.
randomState = 42

X, y = make_regression(n_samples=1000, n_features=10)

# X_gtest, y_gtest = TEST SET (20 % hold-out, only used at the very end)
# X_train, y_train = TRAIN & VALIDATION SET (80 %, used for cross-validation)
X_train, X_gtest, y_train, y_gtest = train_test_split(
    X, y, train_size=0.8, random_state=randomState)

# Regressors you want to use
reg1 = KNeighborsRegressor()
reg2 = RandomForestRegressor()

# Building the pipelines (transformer, regressor)
pipe1 = Pipeline([('std', StandardScaler()),
                  ('reg1', reg1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('reg2', reg2)])

# Setting up parameters for grid; keys are '<pipeline step>__<parameter>'
param_grid1 = [{'reg1__n_neighbors': list(range(7, 10))}]

param_grid2 = [{'reg2__max_depth': [50, 20]}]

# Cross-validation splitters
outer_counter = 1
outer_cv = KFold(n_splits=3, shuffle=True)
inner_cv = KFold(n_splits=2, shuffle=True, random_state=randomState)

# One grid search per algorithm, keyed by a short display name.
gridcvs = {}
for pgrid, est, name in zip((param_grid1, param_grid2),
                            (pipe1, pipe2),
                            ('KNN', 'RF')):
    # scoring='r2' matches the r2_score metric used for evaluation below;
    # refit=True is required for .predict() and .best_estimator_.
    regressor_that_optimizes_its_hyperparams = GridSearchCV(
        estimator=est,
        param_grid=pgrid,
        scoring='r2',
        cv=inner_cv,
        refit=True)
    gridcvs[name] = regressor_that_optimizes_its_hyperparams

# outer cross-validation
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
    print('outer_cv', outer_counter)
    X_train_outer = X_train[train_index_outer]
    y_train_outer = y_train[train_index_outer]
    # NOTE(review): the outer test fold is extracted but never scored below.
    X_test_outer = X_train[test_index_outer]
    y_test_outer = y_train[test_index_outer]

    # Scores collected during this outer fold, one list per algorithm.
    cv_scores = {name: [] for name in gridcvs}

    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner = X_train_outer[train_index_inner]
        y_train_inner = y_train_outer[train_index_inner]
        X_test_inner = X_train_outer[test_index_inner]
        y_test_inner = y_train_outer[test_index_inner]

        # inner cross-validation
        # NOTE(review): each grid search already splits with cv=inner_cv when
        # .fit() is called, so this manual inner split adds a further level.
        for name, gs_est in sorted(gridcvs.items()):
            gs_est.fit(X_train_inner, y_train_inner)
            y_pred = gs_est.predict(X_test_inner)
            inner_score = r2_score(y_true=y_test_inner, y_pred=y_pred)
            # Store the score; without this cv_scores stays empty and the
            # summary below prints nan.
            cv_scores[name].append(inner_score)
            # Per-candidate grid-search results can be read from
            # gs_est.cv_results_['mean_test_score'] and
            # gs_est.cv_results_['params'].

    print('print cvscores for model:', cv_scores)
    outer_counter = outer_counter + 1

# Looking at the results
# cv_scores is re-created for every outer fold, so this summarises the scores
# collected during the last outer fold only.
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))

many_stars = '\n' + '*' * 100 + '\n'
print(many_stars + 'Now we choose the best model and refit on the whole dataset' + many_stars)

# Fitting a model to the whole training set
# using the "best" algorithm
best_algo = gridcvs['RF']
best_algo.fit(X_train, y_train)
train_acc = r2_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = r2_score(y_true=y_gtest, y_pred=best_algo.predict(X_gtest))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RF'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

# Fitting a model to the whole dataset
# using the "best" algorithm and hyperparameter settings
best_clf = best_algo.best_estimator_
final_model = best_clf.fit(X, y)

Upvotes: 0

Views: 1387

Answers (1)


Reputation: 5367

In general you can obtain the nested cross-validation using the code you posted.

# Outer loop: every split of the training data gives one outer train/test pair.
for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
    print('outer_cv', outer_counter)
    X_train_outer, y_train_outer = X_train[train_index_outer], y_train[train_index_outer]
    X_test_outer, y_test_outer = X_train[test_index_outer], y_train[test_index_outer]

    # Inner loop: the outer training portion is split once more.
    for train_index_inner, test_index_inner in inner_cv.split(X_train_outer, y_train_outer):
        X_train_inner, y_train_inner = (X_train_outer[train_index_inner],
                                        y_train_outer[train_index_inner])
        X_test_inner, y_test_inner = (X_train_outer[test_index_inner],
                                      y_train_outer[test_index_inner])
        # fit something on X_train_inner
        # evaluate it on X_test_inner

Or you could do the following: if you pass `cv=inner_cv` to GridSearchCV, it will automatically perform the inner split when you call the .fit() method. When the fit is complete, you can explore the `.cv_results_` attribute to get each candidate model's score on each of the automatically generated inner folds.

   for train_index_outer, test_index_outer in outer_cv.split(X_train, y_train):
        X_train_outer = X_train[train_index_outer]
        y_train_outer = y_train[train_index_outer]
        X_test_outer  = X_train[test_index_outer]
        y_test_outer  = y_train[test_index_outer]

        # Passing cv=inner_cv lets GridSearchCV perform the inner splits
        # itself when .fit() is called on the outer training portion.
        cv = GridSearchCV(estimator=est, param_grid=pgrid, cv=inner_cv)
        cv.fit(X_train_outer, y_train_outer)

        # Inner scores: cv.cv_results_ holds the score of every candidate on
        # each inner fold (e.g. 'split0_test_score', 'mean_test_score').
        inner_results = cv.cv_results_

        # Outer score: evaluate the refitted best candidate on the outer
        # test fold, which was not seen during the grid search.
        outer_score = cv.score(X_test_outer, y_test_outer)

Upvotes: 1

Related Questions