Angelika

Reputation: 216

Using Optuna for CatBoost with batches: got nan on second trial

I am trying to tune CatBoost's hyperparameters with Optuna. I need to train my CatBoost model in batches because the training data is too big to fit in memory at once.

Here is my code:

import math

import numpy as np
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def expand_embeddings(df, embedding_col="embeddings"):
    embeddings = np.array(df[embedding_col].to_list(), dtype=np.float32)
    other_features = df.drop(columns=[embedding_col]).to_numpy(dtype=np.float32)
    return np.hstack([other_features, embeddings])

def batch_generator(df, target_col, batch_size):
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        y = batch[target_col].to_numpy(dtype=np.float32)
        X = batch.drop(columns=[target_col])
        X = expand_embeddings(X)
        yield X, y

train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)
num_batches = 1300
batch_size_train = math.ceil(train_data.shape[0] / num_batches)
batch_size_test = math.ceil(val_data.shape[0] / num_batches)
train_batches_regressor = batch_generator(train_data, target_col="weight", batch_size=batch_size_train)
val_batches_regressor = batch_generator(val_data, target_col="weight", batch_size=batch_size_test)

def objective_regressor(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'eval_metric': 'RMSE'}

    model = CatBoostRegressor(
        **params,
        task_type='CPU',
        random_seed=42,
        verbose=0)

    rmse = []

    iterations = 0

    for X_batch, y_batch in tqdm(train_batches_regressor):
        X_val_batch, y_val_batch = next(val_batches_regressor)
        if iterations == 0:
            model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch), 
                      use_best_model=True, verbose=0)
        else:
            model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch), 
                      use_best_model=True, verbose=0, init_model=model)
        y_pred = model.predict(X_val_batch)
        rmse.append(mean_squared_error(y_val_batch, y_pred))
        iterations += 1
    
    return np.mean(rmse)

study_regressor = optuna.create_study(direction='minimize')
study_regressor.optimize(objective_regressor, n_trials=20)

The first trial (trial 0) runs fine, but on the next one I get the following error:

/opt/anaconda3/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/opt/anaconda3/lib/python3.12/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide
  ret = ret.dtype.type(ret / rcount)
[W 2024-11-29 05:47:52,256] Trial 1 failed with parameters: {'iterations': 1053, 'depth': 9, 'learning_rate': 0.03843036523508586, 'l2_leaf_reg': 7.883891260457, 'bagging_temperature': 0.5680668697003115, 'random_strength': 5.111730514165936} because of the following error: The value nan is not acceptable.
[W 2024-11-29 05:47:52,256] Trial 1 failed with value nan.

How should I tune the parameters correctly?

Upvotes: 0

Views: 65

Answers (2)

Angelika

Reputation: 216

It turned out that the problem was not in the Optuna library. I used a generator to produce the batches and created the generator instances outside the objective function. A generator can only be iterated once, so after the first trial both generators are exhausted: the training loop never executes, rmse stays empty, and np.mean(rmse) returns NaN (hence the "Mean of empty slice" warning in the traceback).

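The failure is easy to reproduce in isolation. A minimal sketch (plain Python plus NumPy) showing that a second pass over an exhausted generator yields nothing, and that np.mean of an empty list is NaN, with exactly the "Mean of empty slice" warning from the traceback above:

import numpy as np

def gen():
    yield 1
    yield 2

g = gen()
print([x for x in g])  # [1, 2] -- the first trial consumes the generator
print([x for x in g])  # []     -- the second trial gets nothing
print(np.mean([]))     # nan, plus a "Mean of empty slice" RuntimeWarning
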
The corrected code is:

def expand_embeddings(df, embedding_col="embeddings"):
    embeddings = np.array(df[embedding_col].to_list(), dtype=np.float32)
    other_features = df.drop(columns=[embedding_col]).to_numpy(dtype=np.float32)
    return np.hstack([other_features, embeddings])

def batch_generator(df, target_col, batch_size):
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i + batch_size]
        y = batch[target_col].to_numpy(dtype=np.float32)
        X = batch.drop(columns=[target_col])
        X = expand_embeddings(X)
        yield X, y

train_data, val_data = train_test_split(result, test_size=0.1, random_state=42)
num_batches = 1300
batch_size_train = math.ceil(train_data.shape[0] / num_batches)
batch_size_test = math.ceil(val_data.shape[0] / num_batches)

def objective_regressor(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'eval_metric': 'RMSE'}

    model = CatBoostRegressor(
        **params,
        task_type='CPU',
        random_seed=42,
        verbose=0)

    rmse = []

    # create the generators inside the objective so each trial gets fresh ones
    train_batches_regressor = batch_generator(train_data, target_col="weight",
                                              batch_size=batch_size_train)
    val_batches_regressor = batch_generator(val_data, target_col="weight",
                                            batch_size=batch_size_test)

    iterations = 0

    for X_batch, y_batch in tqdm(train_batches_regressor):
        X_val_batch, y_val_batch = next(val_batches_regressor)
        if iterations == 0:
            model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch), 
                      use_best_model=True, verbose=0)
        else:
            model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch), 
                      use_best_model=True, verbose=0, init_model=model)
        y_pred = model.predict(X_val_batch)
        rmse.append(np.sqrt(mean_squared_error(y_val_batch, y_pred)))  # sqrt for RMSE, not MSE
        iterations += 1
    
    return np.mean(rmse)

study_regressor = optuna.create_study(direction='minimize')
study_regressor.optimize(objective_regressor, n_trials=20)

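A note on the design choice: if the expanded batches fit in memory (which may not be the case here, since batching was introduced precisely because the data is large), an alternative is to materialize them once and reuse the lists across trials, avoiding the repeated DataFrame slicing. A sketch:

# Precompute the batches once; lists, unlike generators, can be re-iterated.
train_batches = list(batch_generator(train_data, target_col="weight", batch_size=batch_size_train))
val_batches = list(batch_generator(val_data, target_col="weight", batch_size=batch_size_test))

# inside objective_regressor, iterate the lists instead of fresh generators:
# for (X_batch, y_batch), (X_val_batch, y_val_batch) in zip(train_batches, val_batches):
#     ...
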
Upvotes: 0

WBLord

Reputation: 1025

You might get a NaN when averaging the RMSE values if your training batches run out of data or if there is a problem with your validation batches.

Try something like this:

def objective_regressor(trial):
    # Define hyperparameters to tune
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'eval_metric': 'RMSE'
    }

    # Initialize the CatBoostRegressor with the parameters
    model = CatBoostRegressor(**params, task_type='CPU', random_seed=42, verbose=0)
    rmse = []  # Initialize a list to store RMSE values

    # Create fresh generators on every trial; generators created outside
    # the objective are exhausted after the first trial
    train_batches_regressor = batch_generator(train_data, target_col="weight",
                                              batch_size=batch_size_train)
    val_batches_regressor = batch_generator(val_data, target_col="weight",
                                            batch_size=batch_size_test)

    # Iterate through training batches
    for i, (X_batch, y_batch) in enumerate(tqdm(train_batches_regressor)):
        try:
            # Get the next validation batch
            X_val_batch, y_val_batch = next(val_batches_regressor)
        except StopIteration:
            break  # Exit if there are no more validation batches

        # Fit on the current batch, continuing from the previous batches
        model.fit(X_batch, y_batch, eval_set=(X_val_batch, y_val_batch),
                  use_best_model=True, verbose=0,
                  init_model=model if i > 0 else None)

        # Make predictions on the validation batch
        y_pred = model.predict(X_val_batch)

        # Calculate and store the RMSE for the current validation batch
        rmse.append(np.sqrt(mean_squared_error(y_val_batch, y_pred)))

    # Return the mean RMSE; if no RMSE was computed, return infinity
    return np.mean(rmse) if rmse else float('inf')

Make sure your generators work correctly and that the rmse list is not empty before taking the mean.
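
For example, a quick check (using the names from the question) that a fresh validation generator actually yields non-empty, aligned batches before starting the study:

n_batches = 0
for X, y in batch_generator(val_data, target_col="weight", batch_size=batch_size_test):
    assert len(X) == len(y) > 0  # every batch should be non-empty and aligned
    n_batches += 1
print(f"validation generator yielded {n_batches} batches")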

Upvotes: -1
