Reputation: 45
Here is the outline of my Machine Learning / Python project:
Steps 1, 2 and 3 run fine. When I try Step 4 (getting the xgb_pipe to make a prediction on a single observation), I get the error: ValueError: Feature shape mismatch, expected: 10, got 6.
I believe that this is because One Hot Encoding makes more features for the full dataset than it would for the single observation, but I'm not sure how to get around this.
It's difficult to include the code here, so I've created a smaller dataset and reproduced the issue in a notebook available in this repository.
I have seen an example before where One Hot Encoding was applied in the pipeline, the pipeline was trained on a dataset, and then the pipeline was able to make predictions on a single observation. I know it's possible to achieve this; I'm just not sure how.
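For what it's worth, here is a minimal sketch of what I think is happening (toy data, not my actual dataset): pd.get_dummies builds its columns from whatever categories appear in the data it is given, so a single observation yields fewer columns than the full dataset.

import pandas as pd

full = pd.DataFrame({'item_id': ['001', '002', '003']})
single = full.iloc[[0]]

print(pd.get_dummies(full['item_id']).shape)    # (3, 3) -> three dummy columns
print(pd.get_dummies(single['item_id']).shape)  # (1, 1) -> only one dummy column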
Upvotes: 0
Views: 190
Reputation: 101
The problem is that the one-hot encoding performed on a test sample may not match the encoding learned on your entire training set: if the test set contains fewer distinct categories than the training set, pd.get_dummies produces fewer columns. You should therefore save the list of distinct categories, unique_items_id, obtained during training:
if self.training:
    # Save unique items identifier only during training phase
    self.unique_items_id = ["item_id_" + item for item in clean_df_copy["transformed_item_id"].unique()]
and use it to perform the encoding on the test set in the transform method with:
one_hot_encoded = one_hot_encoded.T.reindex(self.unique_items_id).T.fillna(0)
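The double transpose is just a way of reindexing the columns; if you prefer, the following one-liner should be equivalent and a little clearer:

# Equivalent: reindex the columns directly, filling missing categories with 0
one_hot_encoded = one_hot_encoded.reindex(columns=self.unique_items_id, fill_value=0)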
Here is the full code:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
# Load the datasets
clean_df = pd.read_csv('sales_train_10.csv')
events_df = pd.read_csv('calendar_events.csv')
class EventTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, events_df):
        self.events_df = events_df

    def fit(self, clean_df, y=None):
        return self

    def transform(self, clean_df):
        clean_df_copy = clean_df.merge(self.events_df, on='date', how='left')
        clean_df_copy['event_name'] = clean_df_copy['event_name'].fillna('NoEvent')
        clean_df_copy['event_type'] = clean_df_copy['event_type'].fillna('NoEvent')
        # One-hot encode 'event_name' and 'event_type'
        event_name_encoded = pd.get_dummies(clean_df_copy['event_name'], prefix='event_name', dtype=int)
        event_type_encoded = pd.get_dummies(clean_df_copy['event_type'], prefix='event_type', dtype=int)
        # Concatenate the one-hot encoded columns with the original DataFrame
        transformed_df = pd.concat([clean_df_copy, event_name_encoded, event_type_encoded], axis=1)
        # Drop the original 'date', 'event_name' and 'event_type' columns
        transformed_df.drop(['date', 'event_name', 'event_type'], axis=1, inplace=True)
        print("\n-------------------\n EventTransformer\n-------------------\n {0} \n-------------------".format(transformed_df))
        return transformed_df
class ItemIdTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.unique_items_id = []
        # Flag indicating whether we are in the learning phase
        self.training = False

    def fit(self, clean_df, y=None):
        # learning phase
        self.training = True
        return self

    def transform(self, clean_df):
        clean_df_copy = clean_df.copy()
        # Extract the last 3 digits of the item_id, e.g. 'HOBBIES_1_001' becomes '001'
        clean_df_copy['transformed_item_id'] = clean_df_copy['item_id'].str.extract(r'_(\d+)$').astype(str)
        if self.training:
            # Save unique items identifier only during training phase
            self.unique_items_id = ["item_id_" + item for item in clean_df_copy["transformed_item_id"].unique()]
            self.training = False
        # One-hot encode the 'transformed_item_id' column using pd.get_dummies
        one_hot_encoded = pd.get_dummies(clean_df_copy['transformed_item_id'], prefix='item_id', dtype=int)
        # Reorder the one-hot encoded columns to match 'self.unique_items_id';
        # if a category is missing from this sample, fill its column with 0
        one_hot_encoded = one_hot_encoded.T.reindex(self.unique_items_id).T.fillna(0)
        # Concatenate the one-hot encoded columns with the original DataFrame
        transformed_df = pd.concat([clean_df_copy, one_hot_encoded], axis=1)
        # Drop the original 'transformed_item_id' and 'item_id' columns
        transformed_df.drop(['transformed_item_id', 'item_id'], axis=1, inplace=True)
        print("\n-------------------\n ItemIdTransformer\n-------------------\n {0} \n-------------------".format(transformed_df))
        return transformed_df
cat_transformer = Pipeline(
    steps=[
        ('one_hot_encoder', OneHotEncoder())
    ]
)
cat_cols = ['dept_id', 'store_id']
# Instantiate transformers
event_transformer = EventTransformer(events_df)
item_id_transformer = ItemIdTransformer()
# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('event_transformer', event_transformer, ['date']),
        ('item_id_transformer', item_id_transformer, ['item_id']),
        ('cat_transformer', cat_transformer, cat_cols)
    ]
)
xgb_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgb', XGBRegressor(objective='reg:squarederror', n_estimators=10))
    ]
)
target = clean_df['sales']
xgb_pipe.fit(clean_df, target)
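A side note on the design: the self.training flag works here because Pipeline.fit calls fit and then transform on the training data, but the more conventional scikit-learn pattern is to learn state in fit itself. A minimal sketch of that alternative (hypothetical class name, same transform logic minus the flag):

class ItemIdTransformerAlt(BaseEstimator, TransformerMixin):
    def fit(self, clean_df, y=None):
        # Learn the category list during fit, so transform needs no flag
        ids = clean_df['item_id'].str.extract(r'_(\d+)$', expand=False).astype(str)
        self.unique_items_id = ["item_id_" + item for item in ids.unique()]
        return self

    def transform(self, clean_df):
        # identical to the transform above, but without the 'if self.training' block
        ...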
Testing on the first row, for example:
# Sample test set
sample_test_set = clean_df.iloc[[0]]
print("-------------------\n sample_test_set\n-------------------\n {0} \n-------------------".format(sample_test_set))
# Use the preprocessor learned from training data
X_test_transformed = xgb_pipe.named_steps['preprocessor'].transform(sample_test_set)
# Use the model to make predictions about test data
predictions = xgb_pipe.named_steps['xgb'].predict(X_test_transformed)
print("\n> predictions : {0}".format(predictions[0]))
The result is as follows:
-------------------
sample_test_set
-------------------
item_id dept_id store_id sales date
0 HOBBIES_1_001 HOBBIES_1 CA_1 0 2012-01-01
-------------------
-------------------
EventTransformer
-------------------
event_name_NewYear event_type_National
0 1 1
-------------------
-------------------
ItemIdTransformer
-------------------
item_id_001 item_id_002 item_id_003 item_id_004 item_id_005
0 1.0 0.0 0.0 0.0 0.0
-------------------
> predictions : 0.016016611829400063
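Note that since the whole pipeline has been fitted, the two manual steps above can also be collapsed into a single call:

# Pipeline.predict applies the fitted preprocessor, then the fitted model
predictions = xgb_pipe.predict(sample_test_set)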
Upvotes: 0