Slowat_Kela
Slowat_Kela

Reputation: 1511

TypeError: Singleton array array(X) cannot be considered a valid collection in sklearn

Can anyone explain to me why this code:

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification 
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve 
import matplotlib.pyplot as plt 
import pickle
#import neptune.new as neptune
import pandas as pd


# Toy dataset: two numeric measurements per person plus a two-class team label.
df = pd.DataFrame(
    dict(
        Height=[167, 175, 170, 186, 190, 188, 158, 169, 183, 180],
        Weight=[65, 70, 72, 80, 86, 94, 50, 58, 78, 85],
        Team=['A', 'A', 'B', 'B', 'B', 'B', 'A', 'A', 'B', 'A'],
    )
)

# Every column but the last is a feature; the last column is the target.
full_X_train = df.iloc[:, :-1]
full_y_train = df.iloc[:, -1]


def create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml'):
    """Fit ``model_name`` on each stratified fold, pickle it, and collect F1 scores.

    Args:
        X_train: Feature table; rows are selected positionally via ``.iloc``.
        y_train: Labels aligned row-for-row with ``X_train``.
        model_name: Estimator *instance* (despite the parameter name) that
            provides ``fit`` and ``predict``. The same object is refit on
            every fold.
        n_splits: Number of stratified folds.
        file_name: Prefix of the per-fold dump, written as
            ``<file_name>.<fold>.fold.json`` with folds numbered from 1.

    Returns:
        list: One F1 score per fold, in fold order.
    """
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    f1 = []

    # The splitter yields positional indices, so labels must be selected by
    # position as well; plain ``y_train[idx]`` on a Series is label-based and
    # only works with a default RangeIndex. Non-pandas labels (e.g. ndarrays)
    # are indexed directly.
    y_rows = y_train.iloc if hasattr(y_train, 'iloc') else y_train

    for fold_number, (train_index, test_index) in enumerate(k_fold.split(X_train, y_train), start=1):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_rows[train_index], y_rows[test_index]

        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)

        # The payload is a pickle even though the suffix says ".json"; the
        # file name is kept so existing consumers still find the files.
        save_mod = file_name + '.' + str(fold_number) + '.fold.json'
        with open(save_mod, 'wb') as dump_file:
            pickle.dump(clf, dump_file)

        # NOTE(review): f1_score is called with its defaults; with string
        # labels such as 'A'/'B' it presumably needs an explicit pos_label
        # (or a non-binary ``average``) -- confirm against the sklearn docs.
        f1.append(f1_score(y_test_fold, y_pred))
    return f1




def get_scores(model,output_file = 'output.txt'):
    """Append a one-line F1 summary (mean, std, count) to ``output_file``.

    Args:
        model: Indexable result whose element at index 2 is a sequence of
            F1 scores; mean and std are multiplied by 100 before printing.
        output_file: Path of the text file to append to.

    Returns:
        None.
    """
    # NOTE(review): create_model in this file returns a flat list of F1
    # scores, in which case model[2] would be a single float -- confirm the
    # intended layout of ``model`` with the caller.
    f1_scores = model[2]
    summary = 'F1: mean=%.2f std=%.2f, n=%d' % (mean(f1_scores)*100, std(f1_scores)*100, len(f1_scores))

    # Context manager guarantees the handle is flushed and closed even if the
    # write fails; the summary is computed first so a bad ``model`` does not
    # leave a freshly created empty file behind.
    with open(output_file, 'a') as open_output:
        open_output.write(summary + '\n')
    return


def run_model_with_grid_search(model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='test_id', n_splits=5, output_file='', param_grid={}):
    """Grid-search ``model_name`` over ``param_grid`` and return the fitted search.

    The search uses 3-fold cross-validation scored by accuracy and refits the
    best candidate on the full data. ``model_id``, ``n_splits`` and
    ``output_file`` are accepted but not used by this function.

    Returns:
        tuple: ``(fitted_search, best_params, best_score)``.
    """
    grid_options = dict(
        cv=3,
        param_grid=param_grid,
        scoring='accuracy',
        refit=True,
    )
    fitted = GridSearchCV(model_name, **grid_options).fit(X_train, y_train)
    return fitted, fitted.best_params_, fitted.best_score_


# Tune a RandomForestClassifier (the function's default estimator) over max_depth 5..8.
fit_model,params,best_score = run_model_with_grid_search(param_grid=[{'max_depth':list(range(5,9))}])
# NOTE: create_model's first positional parameter is X_train, so fit_model is
# bound to X_train here rather than to model_name.
model = create_model(fit_model) #n_jobs=-1 
# get_scores ends with a bare `return`, so this prints None.
print(get_scores(model)) 

Returns:

  File "ml_models.py", line 84, in <module>
    model = create_model(fit_model) #n_jobs=-1 
  File "ml_models.py", line 50, in create_model
    for train_index,test_index in k_fold.split(X_train,y_train): 
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/model_selection/_split.py", line 324, in split
    X, y, groups = indexable(X, y, groups)
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 299, in indexable
    check_consistent_length(*result)
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in check_consistent_length
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 259, in <listcomp>
    lengths = [_num_samples(X) for X in arrays if X is not None]
  File "/Users/slowat/anaconda/envs/nlp_course/lib/python3.7/site-packages/sklearn/utils/validation.py", line 203, in _num_samples
    " a valid collection." % x)
TypeError: Singleton array array(GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [5, 6, 7, 8]}], scoring='accuracy'),
      dtype=object) cannot be considered a valid collection.

I have seen this answer, but I don't think this applies to me?

(In case it matters, the overall aim is to implement a grid search with feature selection in a Pipeline object - but I haven't figured out how to do that yet because of this error).

Upvotes: 1

Views: 1383

Answers (1)

user2246849
user2246849

Reputation: 4407

You are passing fit_model as a positional argument to create_model. The create_model function has this signature:

create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='test_ml')

So, currently X_train will have the value fit_model and gets passed to k_fold.split(X_train, y_train), which raises this error because a GridSearchCV object is not a valid collection of samples. To fix it, you should use the keyword argument:

model = create_model(model_name=fit_model)

Upvotes: 1

Related Questions