Reputation: 369
I am very new to Python and ML. I have been doing a few courses on Kaggle and working with pipelines. Everything seemed to work fine without the pipelines, but I get an XGBoostError when I pipe it all together. There is an issue with my code, but I cannot figure it out. Here is the code, followed by the error:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from xgboost import XGBRegressor

X_full = pd.read_csv(train_path).copy()
X_test = pd.read_csv(test_path).copy()

def cleaning(var):
    # Drop Fare outliers using the 1.5 * IQR rule
    q1, q3 = np.percentile(var['Fare'], [25, 75])
    iqr = q3 - q1
    lower_bound_val = q1 - (1.5 * iqr)
    upper_bound_val = q3 + (1.5 * iqr)
    var = var[(var['Fare'] >= lower_bound_val) & (var['Fare'] < upper_bound_val)].copy()
    # Engineer family_size and drop unused columns
    var['family_size'] = var.SibSp + var.Parch
    drop_cols = ['PassengerId', 'Name', 'Parch', 'SibSp', 'Ticket', 'Cabin', 'Embarked']
    var = var.drop(drop_cols, axis=1)
    return var

get_cleaning = FunctionTransformer(cleaning, validate=False)

age_transformer = SimpleImputer(missing_values=np.nan, strategy='median')
age_col = ['Age']
sex_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
sex_col = ['Sex']

# Define the model
xgboost_m = XGBRegressor(random_state=0)

prepro_col = ColumnTransformer(
    transformers=[
        ('age', age_transformer, age_col),
        ('sex', sex_transformer, sex_col)
    ])

pl = Pipeline(steps=[('get_cleaning', get_cleaning),
                     ('prepro_col', prepro_col),
                     ('XGBoost', xgboost_m)
                     ])

# Assign the target to y and drop it from X_full
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

pl.fit(X_train, y_train)
And here is the error:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-887-676d922c8ba5> in <module>
----> 1 pl.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, callbacks)
546 obj=obj, feval=feval,
547 verbose_eval=verbose, xgb_model=xgb_model,
--> 548 callbacks=callbacks)
549
550 if evals_result:
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
210 evals=evals,
211 obj=obj, feval=feval,
--> 212 xgb_model=xgb_model, callbacks=callbacks)
213
214
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
73 # Skip the first update if it is a recovery step.
74 if version % 2 == 0:
---> 75 bst.update(dtrain, i, obj)
76 bst.save_rabit_checkpoint()
77 version += 1
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
1159 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1160 ctypes.c_int(iteration),
-> 1161 dtrain.handle))
1162 else:
1163 pred = self.predict(dtrain, output_margin=True, training=True)
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
186 """
187 if ret != 0:
--> 188 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
189
190
XGBoostError: [22:28:42] ../src/data/data.cc:530: Check failed: labels_.Size() == num_row_ (712 vs. 622) : Size of labels must equal to number of rows.
Stack trace:
[bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xa5dc4) [0x7f27232f2dc4]
[bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x106c92) [0x7f2723353c92]
[bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1a84b7) [0x7f27233f54b7]
[bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1aae4e) [0x7f27233f7e4e]
[bt] (4) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x55) [0x7f27232e4f35]
[bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2783ff0630]
[bt] (6) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2783feffed]
[bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f278323c60e]
[bt] (8) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x13044) [0x7f278323d044]
Upvotes: 1
Views: 422
Reputation: 2782
The error labels_.Size() == num_row_ (712 vs. 622) means XGBoost received 622 feature rows but 712 labels, and the two must be equal. In your dataset, y = X_full.Survived is the label/target output. The mismatch comes from the get_cleaning step: cleaning() drops Fare outlier rows from X inside the pipeline, but y is never filtered to match, so by the time the data reaches XGBRegressor there are fewer rows (622) than labels (712).
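One way to fix it is to run the row-dropping cleaning outside the pipeline, while the Survived column is still attached, so the labels are filtered together with the features; only row-preserving transformers then stay inside the pipeline. Below is a minimal sketch based on the posted code, not a tested solution; full and full_clean are illustrative names.
full = pd.read_csv(train_path)
# cleaning() drops Fare outliers but keeps 'Survived', so labels stay aligned with the remaining rows
full_clean = cleaning(full)
y = full_clean.Survived
X = full_clean.drop(['Survived'], axis=1)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
# The pipeline now contains only steps that keep the number of rows unchanged
pl = Pipeline(steps=[('prepro_col', prepro_col),
                     ('XGBoost', xgboost_m)])
pl.fit(X_train, y_train)
In general, scikit-learn pipeline steps are expected to transform columns, not remove rows, because y is passed through unchanged; any filtering of rows should happen before fit is called.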
Upvotes: 1