Reputation: 3
The code below runs SARIMAX to predict product demand by customer. First, the product types by customer are considered as a single combination, and then STL decomposition is performed on each combination, and the residuals from the decomposition results are used as dependent variables. Next, various variables are used as explanatory variables to run SARIMAX, and finally, the final demand is generated by multiplying the SARIMAX results by seasonality and trend. However, when I do this, the predicted value is set to be significantly higher than the actual value. Is there something wrong with the code? The data is normal, and there are no particular outliers
# SARIMAX
import time
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import STL
scaler = MinMaxScaler()
residual_logs_dict = {}
trend_logs_dict = {}
seasonal_logs_dict = {}
exog_train_dict = {}
exog_test_dict = {}
results_table = []
# Grouping
for group_key, train_group in train_grouped:
test_group = test_grouped.get_group(group_key) if group_key in test_grouped.groups else None
if test_group is None:
continue
industry, customer, box_type = group_key
train_agg = train_group.groupby('ARV_REQ_DAT')['REQ_QTY'].sum().reindex(
pd.date_range(start=train_data['ARV_REQ_DAT'].min(), end=train_data['ARV_REQ_DAT'].max(), freq='D'), fill_value=0
).sort_index()
test_agg = test_group.groupby('ARV_REQ_DAT')['REQ_QTY'].sum().reindex(
pd.date_range(start=test_data['ARV_REQ_DAT'].min(), end=test_data['ARV_REQ_DAT'].max(), freq='D'), fill_value=0
).sort_index()
train_agg.fillna(0, inplace=True)
test_agg.fillna(0, inplace=True)
train_agg_log = np.log1p(train_agg)
stl = STL(train_agg_log, period=90, seasonal=15, robust=True).fit()
trend_logs_dict[group_key] = stl.trend
seasonal_logs_dict[group_key] = stl.seasonal
residual_logs_dict[group_key] = stl.resid
# explantory variables (train)
exog_train = pd.DataFrame({
'IS_WEEKDAY': train_group.groupby('ARV_REQ_DAT')['IS_WEEKDAY'].mean().reindex(train_agg.index, fill_value=0),
'IS_HOLIDAY': train_group.groupby('ARV_REQ_DAT')['IS_HOLIDAY'].mean().reindex(train_agg.index, fill_value=0),
'IS_HOLIDAY2_PERIOD': train_group.groupby('ARV_REQ_DAT')
}).reindex(train_agg.index, fill_value=0)
# explantory variables (test)
exog_test = pd.DataFrame({
'Industry_Trend': industry_trend_scaled_test,
'Industry_Seasonal': industry_seasonal_scaled_test,
'IS_WEEKDAY': test_group.groupby('ARV_REQ_DAT')['IS_WEEKDAY'].mean().reindex(test_agg.index, fill_value=0),
'IS_HOLIDAY': test_group.groupby('ARV_REQ_DAT')['IS_HOLIDAY'].mean().reindex(test_agg.index, fill_value=0),
}).reindex(test_agg.index, fill_value=0)
exog_train_dict[group_key] = exog_train
exog_test_dict[group_key] = exog_test
# Training and forecasting
sarimax_models = {}
predicted_residuals_dict = {}
for group_key in residual_logs_dict.keys():
residual_log = residual_logs_dict[group_key]
exog_train = exog_train_dict[group_key]
exog_test = exog_test_dict[group_key]
try:
sarimax_model = SARIMAX(
residual_log, exog=exog_train, order=(1, 1, 1),
seasonal_order=(1, 1, 1, 90), enforce_stationarity=False, enforce_invertibility=False
).fit(disp=False)
sarimax_models[group_key] = sarimax_model
predicted_residuals = sarimax_model.predict(
start=len(residual_log), end=len(residual_log) + len(exog_test) - 1, exog=exog_test
)
predicted_residuals_dict[group_key] = predicted_residuals
except Exception as e:
print(f"Error in group {group_key}: {e}")
final_demand_dict = {}
for group_key, predicted_residuals in predicted_residuals_dict.items():
trend_log = trend_logs_dict[group_key].reindex_like(predicted_residuals)
seasonal_log = seasonal_logs_dict[group_key].reindex_like(predicted_residuals)
log_final_forecast = trend_log + seasonal_log + predicted_residuals
final_forecast = np.expm1(log_final_forecast)
Upvotes: 0
Views: 28