Reputation: 11
I tried to use GridSearchCV
to train pipeline model but I met this error. When I trained without GridSearchCV
, there is no error, so I don't understand what is the problem.
class FeatureEngineering(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, x):
transformed_data = x.toarray()
feature_names = preprocessor_1.get_feature_names_out()
df_transformed = pd.DataFrame(x.toarray(), columns=preprocessor_1.get_feature_names_out())
if transformed_data.shape[1] != len(feature_names):
raise ValueError(f"Mismatch: Data has {transformed_data.shape[1]} columns but {len(feature_names)} feature names.")
df_transformed['total_room'] = df_transformed['num_dcr__bedrooms'] + df_transformed['num_dcr__bathrooms']
df_transformed['total_room_add_floors'] = df_transformed['total_room'] + df_transformed['num_dcr__floors']
df_transformed['bedrooms_multi_area'] = df_transformed['num_dcr__bedrooms'] * df_transformed['num_cont__area']
df_transformed['bathrooms_multi_area'] = df_transformed['num_dcr__bathrooms'] * df_transformed['num_cont__area']
df_transformed['area_floors'] = df_transformed['num_cont__area'] * df_transformed['num_dcr__floors']
print(df_transformed.shape)
return df_transformed
num_dcr_cols = ['bathrooms', 'bedrooms', 'floors']
num_cont_cols = ['area', 'frontage', 'access_road']
cat_nom_cols = ['legal_status', 'province']
cat_ord_cols = ['furniture_state']
comp_fe = ['bathrooms', 'bedrooms', 'floors', 'area']
cat_ord = ["Full", "Basic", "No furniture"]
# descret cols
num_dcr_transformer = Pipeline(steps=[
('imputer', dcr_imputer),
('scaler', StandardScaler())
])
# continious cols
num_cont_transformer = Pipeline(steps=[
('imputer', cont_imputer),
('scaler', StandardScaler())
])
# nominal cols
cat_nom_transformer = Pipeline(steps=[
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
# ordinal cols
cat_ord_transformer = Pipeline(steps=[
('encoder', OrdinalEncoder(categories=[cat_ord]))
])
# new cols
fe = Pipeline(steps=[
('fe', FeatureEngineering())
])
# ColumnTransformer
preprocessor_1 = ColumnTransformer(
transformers=[
('cat_nom', cat_nom_transformer, cat_nom_cols),
('cat_ord', cat_ord_transformer, cat_ord_cols),
('num_dcr', num_dcr_transformer, num_dcr_cols),
('num_cont', num_cont_transformer, num_cont_cols),
])
preprocessor_2 = Pipeline(steps=[
('preprocessor_1', preprocessor_1),
('fe', fe)
])
pipeline_rf = Pipeline(
steps=[
('preprocessor_2', preprocessor_2),
('rf', RandomForestRegressor(random_state=1))
]
)
params = {
"rf__n_estimators": [50,100],
"rf__max_depth": [10,20],
"rf__min_samples_split": [5, 10]
}
model_rf_cv = GridSearchCV(pipeline_rf, param_grid=params, cv=3, n_jobs=-1,verbose=4, scoring="r2")
model_rf_cv.fit(x_train, y_train)
y_pred = model_rf_cv.predict(x_test)
print("The best param: ", model_rf_cv.best_params_)
print("Performance on validation set: ", model_rf_cv.best_score_)
print("Perfromance on test set: ", r2_score(y_test, y_pred))
This is my error:
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") ValueError: Shape of passed values is (12610, 54), indices imply (12610, 55)
I tried train without GridSearchCV
and it worked well. I do not know where the error come from, this is my data link:
Link data
Upvotes: 1
Views: 29