I am trying to tune the parameters of a GPBoost model. Currently I am using grid search, but I have been reading about Bayesian optimization in the rBayesianOptimization package. However, there seems to be no simple way to connect a gpboost model with that package's tuning functions. I am able to get the parameter search to start, but the score is always returned as zero. I figured I would ask here in case anyone knows how to better connect these two processes.
Here is the code.
library(caret)                  # createDataPartition()
library(gpboost)                # GPModel(), gpb.Dataset(), gpb.cv(), gpboost()
library(rBayesianOptimization)  # BayesianOptimization()
# Split the data into training and test sets
Index <- createDataPartition(y = mean_tl_scale$fb_mean_tl_mm, p = 0.75, list = FALSE)
LM_train_tl_mean <- mean_tl_scale[Index, ]
LM_test_tl_mean <- mean_tl_scale[-Index, ]
# Convert data frames to matrices
LM_matrix_train <- data.matrix(LM_train_tl_mean)
LM_matrix_test <- data.matrix(LM_test_tl_mean)
# Extract feature data
features_train <- LM_matrix_train[, !colnames(LM_matrix_train) %in% c("year")]
features_test <- LM_matrix_test[, !colnames(LM_matrix_test) %in% c("year")]
# Use a distinct name so that base::colnames() is not masked
feature_cols <- colnames(features_test)[-3]
# Define the random effects model (grouped by year)
gp_model <- GPModel(likelihood = "gaussian", cov_function = "exponential", group_data = LM_matrix_train[, c("year")])
boost_data <- gpb.Dataset(data = features_train[, feature_cols], label = features_train[, "fb_mean_tl_mm"])
gpb_boost <- gpb.Dataset.construct(boost_data)
# Define parameter bounds for Bayesian optimization
bounds <- list(
  learning_rate = c(0.05, 0.15),
  max_depth = c(5L, 7L),
  min_child_weight = c(5L, 7L),
  subsample = c(0.3, 0.5),
  colsample_bytree = c(0.5, 0.9),
  num_iterations = c(800L, 1000L),
  lambda_l2 = c(0, 5)
)
# Define the optimization function
opt_func <- function(learning_rate, max_depth, min_child_weight, subsample, colsample_bytree, num_iterations, lambda_l2) {
  params <- list(
    objective = "regression",
    learning_rate = learning_rate,
    max_depth = max_depth,
    min_child_weight = min_child_weight,
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    num_iterations = num_iterations,
    lambda_l2 = lambda_l2
  )
  cv_result <- gpb.cv(
    params = params,
    data = gpb_boost,
    gp_model = gp_model,
    nrounds = 500,
    nfold = 5,
    verbose = 0,
    eval = "rmse"
  )
  return(list(Score = -min(cv_result$evaluation_log$test_rmse_mean), Pred = NULL))
}
# Run Bayesian optimization
set.seed(68)
opt_result <- BayesianOptimization(
  FUN = opt_func,
  bounds = bounds,
  init_points = 10,
  n_iter = 20,
  acq = "ei",
  kappa = 2.576,
  eps = 0.0,
  verbose = TRUE
)
# Print the optimum set of parameters
opt_result
# Test the model with the output from Bayesian optimization
best_params <- list(
  objective = "regression",
  learning_rate = opt_result$Best_Par["learning_rate"],
  max_depth = opt_result$Best_Par["max_depth"],
  min_child_weight = opt_result$Best_Par["min_child_weight"],
  subsample = opt_result$Best_Par["subsample"],
  colsample_bytree = opt_result$Best_Par["colsample_bytree"],
  num_iterations = opt_result$Best_Par["num_iterations"],
  lambda_l2 = opt_result$Best_Par["lambda_l2"]
)
gp_model <- GPModel(group_data = LM_matrix_train[, c("year")], likelihood = "gaussian", cov_function = "exponential")
gpboost_model <- gpboost(
  data = boost_data,
  gp_model = gp_model,
  params = best_params,
  verbose = 1
)
There is a recently added function called tune.pars.bayesian.optimization that performs Bayesian optimization using the mlrMBO R package. Here is an example:
library(mlrMBO)
library(DiceKriging)
library(rgenoud)
# Load the required helper function
source("https://raw.githubusercontent.com/fabsig/GPBoost/master/helpers/R_package_tune_pars_bayesian_optimization.R")
# The example below assumes that X (covariate matrix), y (response), group (grouping
# variable), likelihood (e.g., "gaussian"), and n (number of observations) are already defined
# Define search space
# Note: if the best combination found below is close to the boundary for a parameter, you might want to extend the corresponding range
search_space <- list("learning_rate" = c(0.001, 10),
                     "min_data_in_leaf" = c(1, 1000),
                     "max_depth" = c(-1, -1), # -1 means no depth limit as we tune 'num_leaves'. Can also additionally tune 'max_depth', e.g., "max_depth" = c(-1, 1, 2, 3, 5, 10)
                     "num_leaves" = c(2, 2^10),
                     "lambda_l2" = c(0, 100),
                     "max_bin" = c(63, min(n, 10000)), # n = number of training observations
                     "line_search_step_length" = c(TRUE, FALSE))
metric = "mse" # Define metric
if (likelihood %in% c("bernoulli_probit","bernoulli_logit")) {
metric = "binary_logloss"
}
# Note: can also use metric = "test_neg_log_likelihood". For more options, see https://github.com/fabsig/GPBoost/blob/master/docs/Parameters.rst#metric-parameters
gp_model <- GPModel(group_data = group, likelihood = likelihood)
data_train <- gpb.Dataset(data = X, label = y)
# Run parameter optimization using Bayesian optimization and k-fold CV
crit <- makeMBOInfillCritCB() # other criterion options: makeMBOInfillCritEI()
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
                                              data = data_train, gp_model = gp_model,
                                              nfold = 5, nrounds = 1000, early_stopping_rounds = 20,
                                              metric = metric, crit = crit,
                                              cv_seed = 4, verbose_eval = 1)
print(paste0("Best parameters: ", paste0(unlist(lapply(seq_along(opt_params$best_params),
function(y, n, i) { paste0(n[[i]],": ", y[[i]]) }, y=opt_params$best_params,
n=names(opt_params$best_params))), collapse=", ")))
print(paste0("Best number of iterations: ", opt_params$best_iter))
print(paste0("Best score: ", round(opt_params$best_score, digits=3)))
# Alternatively and faster: using manually defined validation data instead of cross-validation
valid_tune_idx <- sample.int(length(y), as.integer(0.2*length(y))) # use 20% of the data as validation data
folds <- list(valid_tune_idx)
opt_params <- tune.pars.bayesian.optimization(search_space = search_space, n_iter = 100,
                                              data = data_train, gp_model = gp_model,
                                              folds = folds, nrounds = 1000, early_stopping_rounds = 20,
                                              metric = metric, crit = crit,
                                              cv_seed = 4, verbose_eval = 1)
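To then train a final model with the tuned parameters, something along the following lines should work (a minimal sketch, not part of the original example; it reuses data_train, group, and likelihood from above, creates a fresh GPModel for the final fit, and X_test / group_test are hypothetical test objects):
# Minimal sketch: fit the final model with the tuned parameters
gp_model_final <- GPModel(group_data = group, likelihood = likelihood)
final_model <- gpboost(data = data_train,
                       gp_model = gp_model_final,
                       nrounds = opt_params$best_iter,
                       params = opt_params$best_params,
                       verbose = 1)
# Predict on new data; X_test and group_test are assumed to exist.
# The return value is a list; see the GPBoost documentation for its components
pred <- predict(final_model, data = X_test, group_data_pred = group_test)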