Reputation: 1
I am trying to implement a pipeline with sklearn combining a column transformer for numeric and categorical data and sequential feature selection. The issue is when doing the complete pipeline it gets an error.
Code:
X = data_features
y = data_labels
# To have the name of all the features
features_name = list(data_features.columns)
# To have the name of the categorical features
categorical_features = ['sex','num_comorbidities','sarcopenia','osteoporosis']
categorical_features_idx = [features_name.index(i) for i in categorical_features]
# To have the numeric features
numeric_features = [feature for feature in features_name if feature not in categorical_features]
numeric_features_idx = [features_name.index(i) for i in numeric_features]
'''To have the data pre-procesing steps'''
#To create separated pipelines for categorical and numeric variables
numeric_transformer = Pipeline(steps = [
('scaler1', QuantileTransformer(n_quantiles = 20, output_distribution = 'normal', random_state = 1)),
('scaler2', Normalizer())
])
categorical_transformer = Pipeline(steps = [
('encoder', OneHotEncoder(categories = 'auto', handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers = [
('numeric', numeric_transformer, numeric_features_idx),
('cat', categorical_transformer, categorical_features_idx)
])
pipe = Pipeline(steps = [
('preprocessor', preprocessor),
('tomekU', SMOTETomek(tomek = TomekLinks(sampling_strategy = 'majority'))),
('classifier', RandomForestClassifier(n_estimators = 100,
criterion = 'gini',
max_depth = 90,
max_features = 'auto',
min_samples_leaf = 3,
min_samples_split = 10,
bootstrap = True,
class_weight = 'balanced'))
])
'''To do feature selection'''
sfs = SequentialFeatureSelector(pipe, n_features_to_select = 20, direction = 'backward', scoring = 'roc_auc', cv = 20)
X_transformed = sfs.fit_transform(X, np.ravel(y))
I get the following error at the last line above
ValueError: all features must be in \[0, 156\] or \[-157, 0\]
I have the following data:\
X.shape:\
(414, 158)
y.shape:\
(414, 1)
categorical_features_idx:\
\[0, 2, 3, 4\]
numeric_features_idx:\
[1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157\]
Traceback
{
"name": "ValueError",
"message": "
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 414, in _get_column_indices
idx = _safe_indexing(np.arange(n_columns), key)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 356, in _safe_indexing
return _array_indexing(X, indices, indices_dtype, axis=axis)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 185, in _array_indexing
return array[key] if axis == 0 else array[:, key]
IndexError: index 157 is out of bounds for axis 0 with size 157
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\imblearn\\pipeline.py\", line 293, in fit
Xt, yt = self._fit(X, y, **fit_params_steps)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\imblearn\\pipeline.py\", line 240, in _fit
X, fitted_transformer = fit_transform_one_cached(
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\joblib\\memory.py\", line 349, in __call__
return self.func(*args, **kwargs)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\pipeline.py\", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\_set_output.py\", line 140, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 724, in fit_transform
self._validate_column_callables(X)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 426, in _validate_column_callables
transformer_to_input_indices[name] = _get_column_indices(X, columns)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 416, in _get_column_indices
raise ValueError(
ValueError: all features must be in [0, 156] or [-157, 0]
",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File g:\\My Drive\\GitHub_Projects\\Balance_data_processing\\Balance_hyperparameter_tuning_v4.py:267
265 # ...
266 if __name__ == '__main__':
--> 267 features_sfs = main()
File g:\\My Drive\\GitHub_Projects\\Balance_data_processing\\Balance_hyperparameter_tuning_v4.py:249
246 [X, y] = read_data()
248 # Sequential feature selection
--> 249 features_sfs = select_sfs(X, y, 'Random_Forest', 'backward', 'roc_auc')
251 # Recursive feature elimination
252 #X_selected = select_rfe(X, y, 'random_forest', 'balanced_accuracy')
253
(...)
260 # To do the gridsearch
261 #grid = grid_search(X_selected, y, 'Random_Forest', 'roc_auc')
263 return features_sfs
File g:\\My Drive\\GitHub_Projects\\Balance_data_processing\\Balance_hyperparameter_tuning_v4.py:126
124 sfs = SequentialFeatureSelector(pipe, n_features_to_select = 20, direction = sfs_direction, scoring = sfs_scoring)
125 # To fit and transform
--> 126 X_transformed = sfs.fit_transform(X, np.ravel(y))
127 # get features
128 features_sfs = list(sfs.get_feature_names_out())
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\_set_output.py:140, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
138 @wraps(f)
139 def wrapped(self, X, *args, **kwargs):
--> 140 data_to_wrap = f(self, X, *args, **kwargs)
141 if isinstance(data_to_wrap, tuple):
142 # only wrap the first output for cross decomposition
143 return (
144 _wrap_data_with_container(method, data_to_wrap[0], X, self),
145 *data_to_wrap[1:],
146 )
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\base.py:881, in TransformerMixin.fit_transform(self, X, y, **fit_params)
878 return self.fit(X, **fit_params).transform(X)
879 else:
880 # fit method of arity 2 (supervised transformation)
--> 881 return self.fit(X, y, **fit_params).transform(X)
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\feature_selection\\_sequential.py:276, in SequentialFeatureSelector.fit(self, X, y)
274 is_auto_select = self.tol is not None and self.n_features_to_select == \"auto\"
275 for _ in range(n_iterations):
--> 276 new_feature_idx, new_score = self._get_best_new_feature_score(
277 cloned_estimator, X, y, current_mask
278 )
279 if is_auto_select and ((new_score - old_score) < self.tol):
280 break
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\feature_selection\\_sequential.py:307, in SequentialFeatureSelector._get_best_new_feature_score(self, estimator, X, y, current_mask)
305 candidate_mask = ~candidate_mask
306 X_new = X[:, candidate_mask]
--> 307 scores[feature_idx] = cross_val_score(
308 estimator,
309 X_new,
310 y,
311 cv=self.cv,
312 scoring=self.scoring,
313 n_jobs=self.n_jobs,
314 ).mean()
315 new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
316 return new_feature_idx, scores[new_feature_idx]
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:515, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
512 # To ensure multimetric format is not supported
513 scorer = check_scoring(estimator, scoring=scoring)
--> 515 cv_results = cross_validate(
516 estimator=estimator,
517 X=X,
518 y=y,
519 groups=groups,
520 scoring={\"score\": scorer},
521 cv=cv,
522 n_jobs=n_jobs,
523 verbose=verbose,
524 fit_params=fit_params,
525 pre_dispatch=pre_dispatch,
526 error_score=error_score,
527 )
528 return cv_results[\"test_score\"]
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:285, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
(...)
282 for train, test in cv.split(X, y, groups)
283 )
--> 285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
290 if callable(scoring):
File c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\model_selection\\_validation.py:367, in _warn_or_raise_about_fit_failures(results, error_score)
360 if num_failed_fits == num_fits:
361 all_fits_failed_message = (
362 f\"\
All the {num_fits} fits failed.\
\"
363 \"It is very likely that your model is misconfigured.\
\"
364 \"You can try to debug the error by setting error_score='raise'.\
\
\"
365 f\"Below are more details about the failures:\
{fit_errors_summary}\"
366 )
--> 367 raise ValueError(all_fits_failed_message)
369 else:
370 some_fits_failed_message = (
371 f\"\
{num_failed_fits} fits failed out of a total of {num_fits}.\
\"
372 \"The score on these train-test partitions for these parameters\"
(...)
376 f\"Below are more details about the failures:\
{fit_errors_summary}\"
377 )
ValueError:
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 414, in _get_column_indices
idx = _safe_indexing(np.arange(n_columns), key)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 356, in _safe_indexing
return _array_indexing(X, indices, indices_dtype, axis=axis)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 185, in _array_indexing
return array[key] if axis == 0 else array[:, key]
IndexError: index 157 is out of bounds for axis 0 with size 157
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\imblearn\\pipeline.py\", line 293, in fit
Xt, yt = self._fit(X, y, **fit_params_steps)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\imblearn\\pipeline.py\", line 240, in _fit
X, fitted_transformer = fit_transform_one_cached(
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\joblib\\memory.py\", line 349, in __call__
return self.func(*args, **kwargs)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\pipeline.py\", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\_set_output.py\", line 140, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 724, in fit_transform
self._validate_column_callables(X)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py\", line 426, in _validate_column_callables
transformer_to_input_indices[name] = _get_column_indices(X, columns)
File \"c:\\ProgramData\\Anaconda3\\envs\\env1\\lib\\site-packages\\sklearn\\utils\\__init__.py\", line 416, in _get_column_indices
raise ValueError(
ValueError: all features must be in [0, 156] or [-157, 0]
"
}
Upvotes: 0
Views: 172