Reputation: 61
I am training an ML model from a dataset that contains 10 partitions so that I don't run out of available memory. I currently train 3 different models on each partition and then put all of them into a VotingRegressor, which I then fit again; however, I am unable to fit it on the whole training set because of how much memory that uses. Here's a small snippet:
# Names of the 79 feature columns, zero-padded: "feature_00" .. "feature_78".
all_feature_cols = [f"feature_{col:02d}" for col in range(79)]
if TRAINING:
    # (name, fitted_estimator) pairs collected across all partitions.
    lgbm_models = []
    xgb_models = []
    cat_models = []

    # Train one fresh model of each kind per partition so the full
    # dataset never has to be resident in memory at once.
    for partition in range(10):
        start_time = time.time()

        # Fresh estimator instances for this partition.  Because they are
        # new objects each iteration, storing them below needs no deepcopy.
        lgbm = LGBMRegressor(num_leaves=127, n_estimators=200, max_depth=3,
                             learning_rate=0.05, device_type='gpu', verbose=-1,
                             reg_alpha=0.1, reg_lambda=0.1)
        xgb = XGBRegressor(n_estimators=200, min_child_weight=5, max_depth=7,
                           learning_rate=0.01, device='cpu',
                           reg_alpha=0.1, reg_lambda=0.1)
        cat = CatBoostRegressor(n_estimators=200, max_depth=7, learning_rate=0.05,
                                reg_lambda=0.1, task_type='GPU', verbose=False)

        # Lazily filter to the current partition, then collect only the
        # target and the feature columns (never the whole frame).
        partition_df = df.filter(pl.col("partition_id") == partition)
        y = partition_df.select("responder_6").collect().to_numpy().ravel()
        X = partition_df.select(all_feature_cols).collect().to_numpy()

        # Temporal 80/20 split — no shuffling, so ordering is preserved.
        split = int(len(X) * 0.8)
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        # Train each model on this partition's training slice.
        lgbm.fit(X_train, y_train)
        xgb.fit(X_train, y_train)
        cat.fit(X_train, y_train)

        # Store the fitted estimators directly; deepcopy was unnecessary
        # since new instances are created every iteration.
        lgbm_models.append((f'lgbm_{partition}', lgbm))
        xgb_models.append((f'xgb_{partition}', xgb))
        cat_models.append((f'cat_{partition}', cat))

        # Report per-partition wall-clock time.
        elapsed_str = str(timedelta(seconds=int(time.time() - start_time)))
        print(f"Partition {partition} completed in {elapsed_str}")

        # Free partition data.  The last partition keeps its hold-out
        # slice (X_test / y_test) for the ensemble evaluation below.
        if partition < 9:
            del X, y, X_train, X_test, y_train, y_test, partition_df
        else:
            del X, y, X_train, y_train, partition_df
        gc.collect()

    # BUG FIX: VotingRegressor.fit() clones and RE-TRAINS every estimator
    # on whatever data is passed to fit(), discarding all the per-partition
    # training above and re-fitting 30 models at once — which is exactly
    # the memory blow-up described.  Since every estimator is already
    # fitted, build the ensemble without calling fit() by assigning the
    # fitted-estimator attribute that predict() reads.
    named_estimators = lgbm_models + xgb_models + cat_models
    model = VotingRegressor(named_estimators)
    model.estimators_ = [est for _, est in named_estimators]

    # Evaluate the pre-fitted ensemble on the last partition's hold-out.
    y_pred = model.predict(X_test)
    print(r2_score(y_test, y_pred))

    # Persist the final ensemble.
    dump(model, "/kaggle/working/JS_model.joblib")
```
Upvotes: 0
Views: 35