Rocco Boyd
Rocco Boyd

Reputation: 1

GPBoost and Bayesian optimization

I am trying to create a parameter grid for a GPBoost model. Currently, I am tuning with a grid search, but I have been reading about Bayesian optimization with the rBayesianOptimization package. However, there seems to be no simple way to connect a GPBoost model to the Bayesian tuning. I am able to get the parameter search to start; however, the value is returned as zero. I figured I would ask here whether anyone knows how to better connect these processes.

Here is the code.

Index <- createDataPartition(y = mean_tl_scale$fb_mean_tl_mm, p = 0.75, list = FALSE)
LM_train_tl_mean <- mean_tl_scale[Index, ]
LM_test_tl_mean <- mean_tl_scale[-Index, ]
# Convert data frames to matrices
LM_matrix_train <- data.matrix(LM_train_tl_mean)
LM_matrix_test <- data.matrix(LM_test_tl_mean)
# Extract feature data
features_train <- LM_matrix_train[, !colnames(LM_matrix_train) %in% c("year")]
features_test <- LM_matrix_test[, !colnames(LM_matrix_test) %in% c("year")]
colnames <- colnames(features_test)[-3]
# Train the gpmodel
gp_model <- GPModel(likelihood = "gaussian", cov_function = "exponential", group_data = LM_matrix_train[, c("year")])
boost_data <- gpb.Dataset(data = features_train[, colnames], label = features_train[, "fb_mean_tl_mm"])
gpb_boost <- gpb.Dataset.construct(boost_data)
# Define parameter bounds for Bayesian optimization
bounds <- list(
  learning_rate = c(0.05, 0.15),
  max_depth = c(5L, 7L),
  min_child_weight = c(5L, 7L),
  subsample = c(0.3, 0.5),
  colsample_bytree = c(0.5, 0.9),
  num_iterations = c(800L, 1000L),
  lambda_l2 = c(0, 5)
)
# Define the optimization function
opt_func <- function(learning_rate, max_depth, min_child_weight, subsample, colsample_bytree, num_iterations, lambda_l2) {
  params <- list(
    objective = "regression",
    learning_rate = learning_rate,
    max_depth = max_depth,
    min_child_weight = min_child_weight,
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    num_iterations = num_iterations,
    lambda_l2 = lambda_l2
  )
  cv_result <- gpb.cv(
    params = params,
    data = gpb_boost,
    gp_model = gp_model,
    nrounds = 500,
    nfold = 5,
    verbose = 0,
    eval = "rmse"
  )
  
  return(list(Score = -min(cv_result$evaluation_log$test_rmse_mean), Pred = NULL))
}
# Run Bayesian optimization
set.seed(68)
opt_result <- BayesianOptimization(
  FUN = opt_func,
  bounds = bounds,
  init_points = 10,
  n_iter = 20,
  acq = "ei",
  kappa = 2.576,
  eps = 0.0,
  verbose = TRUE
)
# Print the optimum set of parameters
opt_result
# Test the model with the output from Bayesian optimization
best_params <- list(
  objective = "regression",
  learning_rate = opt_result$Best_Par["learning_rate"],
  max_depth = opt_result$Best_Par["max_depth"],
  min_child_weight = opt_result$Best_Par["min_child_weight"],
  subsample = opt_result$Best_Par["subsample"],
  colsample_bytree = opt_result$Best_Par["colsample_bytree"],
  num_iterations = opt_result$Best_Par["num_iterations"],
  lambda_l2 = opt_result$Best_Par["lambda_l2"]
)
gp_model <- GPModel(group_data = LM_matrix_train[, c("year")], likelihood = "gaussian", cov_function = "exponential")
gpboost_model <- gpboost(
  data = boost_data,
  gp_model = gp_model,
  params = best_params,
  verbose = 1
)

Upvotes: 0

Views: 52

Answers (1)

fabsig
fabsig

Reputation: 151

There is a recently added function called tune.pars.bayesian.optimization that allows for doing Bayesian optimization using the mlrMBO R package. Here is an example:

library(mlrMBO)
library(DiceKriging)
library(rgenoud)
source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R")# Load required function
# Define search space
# Note: if the best combination found below is close to the bounday for a paramter, you might want to extend the corresponding range
search_space <- list("learning_rate" = c(0.001, 10), 
                     "min_data_in_leaf" = c(1, 1000),
                     "max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
                     "num_leaves" = c(2, 2^10),
                     "lambda_l2" = c(0, 100),
                     "max_bin" = c(63, min(n,10000)),
                     "line_search_step_length" = c(TRUE, FALSE))
metric = "mse" # Define metric
if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
  metric = "binary_logloss"
}
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model <- GPModel(group_data = group, likelihood = likelihood)
data_train <- gpb.Dataset(data = X, label = y)
# Run parameter optimization using Bayesian optimization and k-fold CV 
crit = makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
                                              data = dataset, gp_model = gp_model,
                                              nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
                                              metric = metric, crit = crit,
                                              cv_seed = 4, verbose_eval = 1)
print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params), 
                                  function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params, 
                                  n=names(opt_params$best_params))), collapse=", ")))
print(paste0("Best number of iterations: ", opt_params$best_iter))
print(paste0("Best score: ", round(opt_params$best_score, digits=3)))

# Alternatively and faster: using manually defined validation data instead of cross-validation
valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
folds <- list(valid_tune_idx)
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
                                              data = dataset, gp_model = gp_model,
                                              folds = folds, nrounds = 1000, early_stopping_rounds = 20,
                                              metric = metric, crit = crit, 
                                              cv_seed = 4, verbose_eval = 1)

Upvotes: 0

Related Questions