Đỗ Nhật Nam
Đỗ Nhật Nam

Reputation: 11

ERROR: raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") when using GridSearchCV

I tried to use GridSearchCV to train pipeline model but I met this error. When I trained without GridSearchCV, there is no error, so I don't understand what is the problem.

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, x):
        transformed_data = x.toarray()
        feature_names = preprocessor_1.get_feature_names_out()
        df_transformed = pd.DataFrame(x.toarray(), columns=preprocessor_1.get_feature_names_out())

        if transformed_data.shape[1] != len(feature_names):
          raise ValueError(f"Mismatch: Data has {transformed_data.shape[1]} columns but {len(feature_names)} feature names.")
    
        df_transformed['total_room'] = df_transformed['num_dcr__bedrooms'] + df_transformed['num_dcr__bathrooms']
        df_transformed['total_room_add_floors'] = df_transformed['total_room'] + df_transformed['num_dcr__floors']
        df_transformed['bedrooms_multi_area'] = df_transformed['num_dcr__bedrooms'] * df_transformed['num_cont__area']
        df_transformed['bathrooms_multi_area'] = df_transformed['num_dcr__bathrooms'] * df_transformed['num_cont__area']
        df_transformed['area_floors'] = df_transformed['num_cont__area'] * df_transformed['num_dcr__floors']
        print(df_transformed.shape)
        return df_transformed

num_dcr_cols = ['bathrooms', 'bedrooms', 'floors']
num_cont_cols = ['area', 'frontage', 'access_road']
cat_nom_cols = ['legal_status', 'province']
cat_ord_cols = ['furniture_state']
comp_fe = ['bathrooms', 'bedrooms', 'floors', 'area']

cat_ord = ["Full", "Basic", "No furniture"]

# descret cols
num_dcr_transformer = Pipeline(steps=[
    ('imputer', dcr_imputer),
    ('scaler', StandardScaler())
])

# continious cols
num_cont_transformer = Pipeline(steps=[
    ('imputer', cont_imputer),
    ('scaler', StandardScaler())
])

# nominal cols
cat_nom_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# ordinal cols
cat_ord_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(categories=[cat_ord]))
])

# new cols
fe = Pipeline(steps=[
    ('fe', FeatureEngineering())
])

# ColumnTransformer
preprocessor_1 = ColumnTransformer(
    transformers=[
        ('cat_nom', cat_nom_transformer, cat_nom_cols),
        ('cat_ord', cat_ord_transformer, cat_ord_cols),
        ('num_dcr', num_dcr_transformer, num_dcr_cols),
        ('num_cont', num_cont_transformer, num_cont_cols),
    ])

preprocessor_2 = Pipeline(steps=[
    ('preprocessor_1', preprocessor_1),
    ('fe', fe)
])

pipeline_rf = Pipeline(
    steps=[
        ('preprocessor_2', preprocessor_2),
        ('rf', RandomForestRegressor(random_state=1))
    ]
)

params = {
    "rf__n_estimators": [50,100],
    "rf__max_depth": [10,20],
    "rf__min_samples_split": [5, 10]
}

model_rf_cv = GridSearchCV(pipeline_rf, param_grid=params, cv=3, n_jobs=-1,verbose=4, scoring="r2")
model_rf_cv.fit(x_train, y_train)
y_pred = model_rf_cv.predict(x_test)
print("The best param: ", model_rf_cv.best_params_)
print("Performance on validation set: ", model_rf_cv.best_score_)
print("Perfromance on test set: ", r2_score(y_test, y_pred))

This is my error:

raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") ValueError: Shape of passed values is (12610, 54), indices imply (12610, 55)

I tried train without GridSearchCV and it worked well. I do not know where the error come from, this is my data link: Link data

Upvotes: 1

Views: 29

Answers (0)

Related Questions