How do I train a partitioned dataset when there is not an option for partial fitting?

Question

I am training a ML model from a dataset which contains 10 partitions so that I don't run out of available memory. I am currently training 3 different models on each partition and then putting those into a VotingRegressor which I am then fitting again, however I am unable to fit on the whole training set because of how much memory it uses. Here's a little snipit

all_feature_cols = [f"feature_{i:02d}" for i in range(79)]

if TRAINING:
    # Initialize lists to store models
    lgbm_models = []
    xgb_models = []
    cat_models = []

    # Train each model incrementally
    for partition in range(10):
        start_time = time.time()

        # Create new model instances for each partition
        lgbm = LGBMRegressor(num_leaves=127, n_estimators=200, max_depth=3, 
                            learning_rate=0.05, device_type='gpu', verbose=-1,
                            reg_alpha = 0.1, reg_lambda = 0.1)
        xgb = XGBRegressor(n_estimators=200, min_child_weight=5, max_depth=7, 
                          learning_rate=0.01, device='cpu',
                          reg_alpha = 0.1, reg_lambda = 0.1)
        cat = CatBoostRegressor(n_estimators=200, max_depth=7, learning_rate=0.05, 
                               reg_lambda = 0.1, task_type='GPU', verbose=False)

        # Filter for current partition and collect target separately
        partition_df = df.filter(pl.col("partition_id") == partition)

        # Extract and collect target column before preprocessing
        y = partition_df.select("responder_6").collect().to_numpy().ravel()

        # Preprocess features (excluding target)
        X = partition_df.select(all_feature_cols).collect().to_numpy()

        # Split into train/validation maintaining temporal order
        train_idx = int(len(X) * 0.8)
        X_train, X_test = X[:train_idx], X[train_idx:]
        y_train, y_test = y[:train_idx], y[train_idx:]

        # Train models on this partition
        lgbm.fit(X_train, y_train)
        xgb.fit(X_train, y_train)
        cat.fit(X_train, y_train)

        # Save trained models with partition identifier
        lgbm_models.append((f'lgbm_{partition}', deepcopy(lgbm)))
        xgb_models.append((f'xgb_{partition}', deepcopy(xgb)))
        cat_models.append((f'cat_{partition}', deepcopy(cat)))

        # Calculate elapsed time
        end_time = time.time()
        elapsed_time = end_time - start_time
        elapsed_str = str(timedelta(seconds=int(elapsed_time)))
        print(f"Partition {partition} completed in {elapsed_str}")

        # Clean up memory
        if partition < 9:
            del X, y, X_train, X_test, y_train, y_test, partition_df
        else:
            del X, y, X_train, y_train, partition_df
        gc.collect()

    # Create and fit VotingRegressor with all trained models
    model = VotingRegressor(lgbm_models + xgb_models + cat_models)

    # Fitting model
    model.fit(X_test, y_test)

    y_pred = model.predict(X_test)
    print(r2_score(y_test, y_pred))

    # Save the final model 
    dump(model, "/kaggle/working/JS_model.joblib")
```

How do I train a partitioned dataset when there is not an option for partial fitting?

Answers (0)

Related Questions