Reputation: 11
I'm trying to build a pipeline with my own functions. To do so I inherited BaseEstimator and TransformerMixin from sklearn base and defined my own transform methods.
When I do pipeline.fit(X,y), it works fine.
The problem is when I try to create a GridSearchCV object with the pipeline. I get the following error: ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36).
730 Is just the number of lines of matrix X divided by 'cv' = 2, the number of folds I choose for the cross-validation in the GridSearchCV.
I have no idea how to debug that. I've tried some prints in the middle of my functions, and the result is pretty weird.
I'm attaching the functions I created as well as the pipeline. I'd be really glad if someone could help.
Here are the functions I created for the Pipeline:
from sklearn.base import BaseEstimator, TransformerMixin
class MissingData(BaseEstimator, TransformerMixin):
def fit( self, X, y = None ):
return self
def transform(self, X , y = None, strategies = ( "most_frequent", "mean") ):
print('Started MissingData')
X_ = X.copy()
#Categorical Variables handling
categorical_variables = list(X_.select_dtypes(include=['category','object']))
imp_category = SimpleImputer(strategy = strategies[0])
X_[categorical_variables] = pd.DataFrame(imp_category.fit_transform(X_[categorical_variables]))
#Numeric varialbes handling
numerical_variables = list(set(X_.columns) - set(categorical_variables))
imp_numerical = SimpleImputer(strategy = strategies[1])
X_[numerical_variables] = pd.DataFrame(imp_numerical.fit_transform(X_[numerical_variables]))
print('Finished MissingData')
print('Inf: ',X_.isnull().sum().sum())
return X_
class OHEncode(BaseEstimator, TransformerMixin):
def fit(self, X, y = None ):
return self
def encode_and_drop_original_and_first_dummy(self,df, feature_to_encode):
dummies = pd.get_dummies(df[feature_to_encode] , prefix = feature_to_encode, drop_first=True) #Drop first equals true will take care of the dummies variables trap
res = pd.concat([df, dummies], axis=1)
res = res.drop([feature_to_encode], axis=1)
return(res)
def transform(self, X , y = None, categorical_variables = None ):
X_ = X.copy()
if categorical_variables == None:
categorical_variables = list(X_.select_dtypes(include=['category','object']))
print('Started Encoding')
#Let's update the matrix X with the one hot ecoded version of all features in categorical_variables
for feature_to_encode in categorical_variables:
X_ = self.encode_and_drop_original_and_first_dummy(X_ , feature_to_encode)
print('Finished Encoding')
print('Inf: ',X_.isnull().sum().sum())
return X_
Here is the Pipeline with the GridSearchCV:
pca = PCA(n_components=10)
pipeline = Pipeline([('MissingData', MissingData()), ('OHEncode', OHEncode()) ,
('scaler', StandardScaler()) , ('pca', pca), ('rf', LinearRegression())])
parameters = {'pca__n_components': [5, 15, 30, 45, 64]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv = 2)
grid.fit(X, y)
And finally here is the full output including my prints and the error:
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
Started MissingData
Finished MissingData
Inf: 0
Started Encoding
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:765: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:706: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs)
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
FitFailedWarning)
Finished Encoding
Inf: 0
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-67-f78b56dad89d> in <module>
15
16 #pipeline.set_params(rf__n_estimators = 50)
---> 17 grid.fit(X, y)
18
19 #rf_val_predictions = pipeline.predict(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
710 return results
711
--> 712 self._run_search(evaluate_candidates)
713
714 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1151 def _run_search(self, evaluate_candidates):
1152 """Search all candidates in param_grid"""
-> 1153 evaluate_candidates(ParameterGrid(self.param_grid))
1154
1155
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
689 for parameters, (train, test)
690 in product(candidate_params,
--> 691 cv.split(X, y, groups)))
692
693 if len(out) < 1:
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
611 Xt = X
612 for _, name, transform in self._iter(with_final=False):
--> 613 Xt = transform.transform(Xt)
614 score_params = {}
615 if sample_weight is not None:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_data.py in transform(self, X, copy)
804 else:
805 if self.with_mean:
--> 806 X -= self.mean_
807 if self.with_std:
808 X /= self.scale_
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36)
Upvotes: 1
Views: 417
Reputation: 5467
The first point, I would should you to use OneHotEncoder
(OHE) class from sklearn. Then, define in the constructor of OHEncode
an object of OHE and fit it with the all categorical values you have (to make them "seen" at each GridSearch iteration). Then in transform
fuction of OHEncode
, apply transform using the object of OHE.
DON'T fit the OHE object Inside the fit
function because then you will have the same error; at each GridSearch iteration, the fit and transform functions are applied.
Upvotes: 1