Reputation: 1521
How to amend this code (which is a reproducible example):
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, RFECV, mutual_info_classif
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.utils import shuffle
from numpy import mean, std
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

full_X_train, full_y_train = make_classification(n_samples=500, n_features=20, random_state=1, n_informative=10, n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file='plt.png', model_name=RandomForestClassifier(), X_train=full_X_train, y_train=full_y_train, model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train, split_x_test = X_train[train_ix, :], X_train[test_ix, :]  # add in .iloc
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]  # add in .iloc
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        rfecv = RFECV(estimator=model, step=1, cv=5, scoring='roc_auc')
        search = GridSearchCV(model, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', search)])
        result = pipeline.fit(split_x_train, split_y_train)
        #result = search.fit(split_x_train, split_y_train)
        best_model = result.best_estimator_
        yhat = best_model.predict(split_x_test)
        print('>acc=%.3f,est=%.3f,cfg=%s' % (accuracy, result.best_score_, result.best_params_))
    return
param_grid = [{
    'min_samples_leaf': [1, 3, 5],
}]
run_model_with_grid_search(param_grid=param_grid)
Generates:
AttributeError: 'Pipeline' object has no attribute 'best_estimator_'
The ultimate aim is to perform nested cross-validation, hyperparameter optimization, and feature selection in this function, and I was trying to follow this example.
How can this function be edited to do that correctly?
Upvotes: 1
Views: 5416
Reputation: 2851
Normally, you'd run the grid search on the pipeline, not the pipeline on the grid search. Is there a particular reason you want it the other way round?
pipeline = Pipeline([('feature_sele', rfecv), ('clf', model)])
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
best_model = result.best_estimator_
(param_grid will require the clf__ prefix on the hyperparameter names, of course.)
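With that prefix, the grid from the question becomes:

param_grid = [{
    'clf__min_samples_leaf': [1, 3, 5],
}]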
On an unrelated note, accuracy seems to be undefined.
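Putting both fixes together, here is a minimal sketch of the corrected body of the outer loop, assuming the per-fold test accuracy implied by the original print statement is what you want (computed with accuracy_score on the held-out outer fold); variable names follow the question:

cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
rfecv = RFECV(estimator=model, step=1, cv=5, scoring='roc_auc')
# The grid search wraps the whole pipeline, so RFECV is refit for each candidate
pipeline = Pipeline([('feature_sele', rfecv), ('clf', model)])
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
best_model = result.best_estimator_            # the refit pipeline, not a bare classifier
yhat = best_model.predict(split_x_test)
accuracy = accuracy_score(split_y_test, yhat)  # define accuracy before printing it
print('>acc=%.3f,est=%.3f,cfg=%s' % (accuracy, result.best_score_, result.best_params_))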
Upvotes: 5