I am testing several regression models using parsnip from tidymodels. Initially the best-performing one is rand_forest(), but after I add null_model(), the latter turns out to be the best in terms of RMSE.
All results come from parameter tuning with cross-validated resampling.
Here is the result of null_model():
> show_best(null_model_grid_results, metric = "rmse")
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 0.421 10 0.0701 Preprocessor1_Model1
> collect_metrics(null_model_grid_results) %>%
+ filter(.metric == "rmse") %>%
+ pull(mean) %>% mean()
[1] 0.4209793
And this is the random forest:
> show_best(random_forest_grid_results, metric = "rmse")
# A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2971 28 rmse standard 0.420 10 0.0700 Preprocessor1_Model15
2 945 21 rmse standard 0.420 10 0.0703 Preprocessor1_Model16
3 1090 40 rmse standard 0.420 10 0.0701 Preprocessor1_Model25
4 2074 32 rmse standard 0.420 10 0.0702 Preprocessor1_Model13
5 1650 27 rmse standard 0.420 10 0.0698 Preprocessor1_Model10
> collect_metrics(random_forest_grid_results) %>%
+ filter(.metric == "rmse") %>%
+ pull(mean) %>% mean()
[1] 0.4369285
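To compare them head to head, I also put the lowest cross-validated RMSE of each model into a single tibble. This is only a minimal sketch, reusing the tuning objects shown above:

# Best (lowest) cross-validated RMSE per model, side by side
bind_rows(
  show_best(null_model_grid_results, metric = "rmse", n = 1) %>%
    mutate(model = "null_model"),
  show_best(random_forest_grid_results, metric = "rmse", n = 1) %>%
    mutate(model = "rand_forest")
) %>%
  select(model, mean, n, std_err, .config)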
The code snippet I used to run null_model() is this:
library(tidyverse)
library(tidymodels)
library(rules)
library(baguette)
tidymodels_prefer()
library(doParallel)
# Skip showing steps for getting:
# prolif_feat_outcome_dat_train
# prolif_feat_outcome_dat_folds
# Null model baseline: for regression it simply predicts the mean of the training outcome
null_model_spec <- null_model() %>%
  set_engine("parsnip") %>%
  set_mode("regression") %>%
  translate()

null_model_feature_preproc_rec <- recipe(prolif_outcome ~ ., data = prolif_feat_outcome_dat_train) %>%
  step_zv(all_predictors())

null_model_wflow <- workflow() %>%
  add_model(null_model_spec) %>%
  add_recipe(null_model_feature_preproc_rec)
null_model_set <- extract_parameter_set_dials(null_model_wflow)
grid_ctrl <- control_grid(
  verbose = TRUE,
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)
nof_grid <- 25
ptm <- proc.time()
cls <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
registerDoParallel(cls)
set.seed(999)
null_model_grid_results <- null_model_wflow %>%
  tune_grid(
    param_info = null_model_set,
    resamples = prolif_feat_outcome_dat_folds,
    grid = nof_grid,
    control = grid_ctrl
  )
stopCluster(cls)
proc.time() - ptm
show_best(null_model_grid_results, metric = "rmse")
collect_metrics(null_model_grid_results) %>%
  filter(.metric == "rmse") %>%
  pull(mean) %>%
  mean()
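As a sanity check on what null_model() actually does here (for regression it should just predict the mean of the training outcome), I can fit the workflow once and look at its predictions. A minimal sketch, assuming the training data from above:

# Fit the null-model workflow once on the training set
null_fit <- null_model_wflow %>%
  fit(data = prolif_feat_outcome_dat_train)

# A regression null model predicts a single constant value ...
predict(null_fit, new_data = prolif_feat_outcome_dat_train) %>%
  distinct()

# ... which should equal the mean of the outcome in the training set
mean(prolif_feat_outcome_dat_train$prolif_outcome)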
And this is the corresponding code for rand_forest():
random_forest_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression") %>%
  translate()

random_forest_feature_preproc_rec <- recipe(prolif_outcome ~ ., data = prolif_feat_outcome_dat_train) %>%
  step_zv(all_predictors())

random_forest_wflow <- workflow() %>%
  add_model(random_forest_spec) %>%
  add_recipe(random_forest_feature_preproc_rec)
random_forest_set <- extract_parameter_set_dials(random_forest_wflow)
grid_ctrl <- control_grid(
  verbose = TRUE,
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = TRUE
)
nof_grid <- 25
ptm <- proc.time()
cls <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
registerDoParallel(cls)
set.seed(999)
random_forest_grid_results <- random_forest_wflow %>%
  tune_grid(
    param_info = random_forest_set,
    resamples = prolif_feat_outcome_dat_folds,
    grid = nof_grid,
    control = grid_ctrl
  )
stopCluster(cls)
proc.time() - ptm
saveRDS(random_forest_grid_results, file = paste0("/home/ubuntu/storage1/find_best_model_for_prolif_predictions_tidymodels/data/", wanted_dose, ".random_forest_grid_results.rds" ))
show_best(random_forest_grid_results, metric = "rmse")
collect_metrics(random_forest_grid_results) %>%
  filter(.metric == "rmse") %>%
  pull(mean) %>%
  mean()
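For completeness, this is roughly how I take the single best random forest configuration forward afterwards (a sketch using select_best() and finalize_workflow(); it does not change the tuning results above):

# Pick the best hyperparameters by cross-validated RMSE
best_rf_params <- select_best(random_forest_grid_results, metric = "rmse")

# Plug them into the workflow and fit on the full training set
final_rf_wflow <- random_forest_wflow %>%
  finalize_workflow(best_rf_params)

final_rf_fit <- final_rf_wflow %>%
  fit(data = prolif_feat_outcome_dat_train)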
I expected null_model() to perform much worse than rand_forest(). So my question is: why does null_model() perform best?
Is my approach correct? If not, what is the correct way to implement it?