Reputation: 79
I am having problems with tidymodels resampling/tuning; it gives the following warning and error:
prediction from a rank-deficient fit may be misleading
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = ob...
Note 1: I am performing the tuning for both normal CV-fold and spatial-cv fold
Note 2: I wanted to include data but Stack Overflow gives me: Body is limited to 30000 characters; you entered 143552. I can send you the data if you wish!
Fitting the lm model
# Resample the lm workflow over the spatial CV folds
lm_fit_spatcv <- lm_wf %>%
  fit_resamples(
    resamples = spatial_cv_fold,
    metrics = multi.metric,
    control = model.control
  )
Fitting the glm model
# Resample the glmnet workflow over the spatial CV folds
glm_fit_spatcv <- glm_wf %>%
  fit_resamples(
    resamples = spatial_cv_fold,
    metrics = multi.metric,
    control = model.control
  )
I have looked into it a bit here, here and here, but I still do not understand which of my pre-processing steps might cause these issues.
Made a reprex
# Loading packages
library(tidyverse)
library(parallelMap)
library(parallelly)
library(parallel)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
library(treesnip)
#> Error in library(treesnip): there is no package called 'treesnip'
library(kknn)
library(kernlab)
#>
#> Attaching package: 'kernlab'
#> The following object is masked from 'package:scales':
#>
#> alpha
#> The following object is masked from 'package:purrr':
#>
#> cross
#> The following object is masked from 'package:ggplot2':
#>
#> alpha
library(ranger)
library(datapasta)
library(spatialsample)
library(stacks)
# DATA
# agrofor.biophys.modelling.data <- read.csv(file = here::here("DATA","agrofor.biophys.modelling.data.csv"))
# Creating sample data
# agrofor.biophys.modelling.data <- agrofor.biophys.modelling.data %>%
# dplyr::slice_sample(n = 100, replace = FALSE) %>%
# as_tibble()
# making a tibble::tribble dataset using dpasta() from the datapasta package
# datapasta::dpasta(agrofor.biophys.modelling.data)
# Here was a tibble::tribble dataset. I can send you the data if you wish!
# Removing observations with NAs from the data
# Work on a copy of the raw data and drop every row that contains an NA
ml.data.clean <- data.table::copy(agrofor.biophys.modelling.data) %>%
  drop_na()
# Per-column count of remaining NAs (one row, one column per variable).
# `summarise(across(...))` replaces the deprecated `summarise_all(funs(...))`
# idiom, so dplyr no longer emits its `funs()` deprecation warning here.
# To check only some columns, change the selection inside `across()`.
ml.data.clean.na.check <- ml.data.clean %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Checking for na
#sapply(ml.data.clean.na.check, function(x) sum(is.na(x)))
# STEP 1: Splitting data, defining resampling techniques and setting global model metrics
## Splitting data in training and testing sets
# Fix the RNG seed so the partition below is reproducible
set.seed(234)
# 80 % of the rows go to training; sampling is stratified on logRR
af.split <- ml.data.clean %>%
  initial_split(prop = 0.80, strata = logRR)
#> Warning: The number of observations in each quantile is below the recommended
#> threshold of 20. Stratification will be done with 3 breaks instead.
# Extract the two partitions from the split object
af.test <- testing(af.split)
af.train <- training(af.split)
## Defining resampling techniques
# Re-sample technique(s)
# Three resampling schemes, all drawn from the training set:
# 10 bootstrap resamples stratified on logRR
boostrap_df <- af.train %>%
  bootstraps(times = 10, strata = logRR)
#> Warning: The number of observations in each quantile is below the recommended
#> threshold of 20. Stratification will be done with 2 breaks instead.
# 10-fold cross-validation, repeated 10 times
cv_fold <- af.train %>%
  vfold_cv(v = 10, repeats = 10)
# 20 folds built by clustering the Longitude/Latitude coordinates
spatial_cv_fold <- af.train %>%
  spatial_clustering_cv(coords = c("Longitude", "Latitude"), v = 20)
## Setting global metrics
# Metrics
# Performance metrics computed for every resample
multi.metric <- metric_set(rmse, rsq, ccc, mae)
# Every model in this script is evaluated with fit_resamples(), so use the
# stacks control helper written for that function. control_stack_grid() is
# the counterpart intended for tune_grid().
model.control <- control_stack_resamples()
# STEP 2: Model recipes - pre-processing steps
# Linear model - lm recipe
# Pre-processing for the ordinary least-squares model.
# `logRR ~ .` makes logRR the outcome and every other column a predictor, so
# Site.Type needs no explicit predictor role.
lm_recipe <-
  recipe(formula = logRR ~ ., data = af.train) %>%
  # Keep the coordinates and Tree in the data but out of the model
  update_role(Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>%
  # Reserve a factor level for Site.Type values unseen during training
  step_novel(Site.Type) %>%
  # lm() fits an intercept, so create C - 1 dummy columns. A full one-hot set
  # sums to the intercept column and can leave the fit rank-deficient.
  step_dummy(Site.Type, one_hot = FALSE, naming = partial(dummy_names, sep = "_")) %>%
  step_zv(all_predictors()) %>%
  # step_normalize() already centres and scales, so the former trailing
  # step_center()/step_scale() pair was redundant and has been dropped
  step_normalize(all_predictors(), -all_nominal()) %>%
  step_nzv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_numeric_predictors())
# Generalised linear model recipe
# Pre-processing for the glmnet model.
# `logRR ~ .` makes logRR the outcome and every other column a predictor, so
# Site.Type needs no explicit predictor role.
glm_recipe <-
  recipe(formula = logRR ~ ., data = af.train) %>%
  # Keep the coordinates and Tree in the data but out of the model
  update_role(Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>%
  # Reserve a factor level for Site.Type values unseen during training
  step_novel(Site.Type) %>%
  step_dummy(Site.Type, one_hot = TRUE, naming = partial(dummy_names, sep = "_")) %>%
  step_zv(all_predictors()) %>%
  # step_normalize() already centres and scales, so the former
  # step_center()/step_scale() pair was redundant and has been dropped
  step_normalize(all_predictors(), -all_nominal()) %>%
  step_nzv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_numeric_predictors()) %>%
  # Drop columns that are exact linear combinations of other columns
  step_lincomb(all_numeric(), -all_outcomes())
# STEP 3: Setting model specifications
# Linear regression specification using the "lm" engine
lm_model <- set_engine(set_mode(linear_reg(), "regression"), "lm")
# Linear regression specification using the "glmnet" engine, with a fixed
# penalty of 0.1 and mixture = 0
glm_model <- linear_reg(penalty = 0.1, mixture = 0) %>%
  set_mode("regression") %>%
  set_engine("glmnet")
# STEP 4: Defining model workflows
# Bundle each recipe with its model specification into one workflow
lm_wf <- workflow() %>%
  add_recipe(lm_recipe) %>%
  add_model(lm_model)
glm_wf <- workflow() %>%
  add_recipe(glm_recipe) %>%
  add_model(glm_model)
# STEP 5: Model (hyper)-parameter tuning
# Initializing parallel processing
# Start a parallelMap socket cluster with as many workers as detectCores()
# reports.
# NOTE(review): nothing in this script shows fit_resamples() using a
# parallelMap cluster; tune documents foreach-based backends for parallel
# resampling. Confirm that this call actually parallelises the fits below.
parallelStartSocket(cpus = detectCores())
#> Starting parallelization in mode=socket with cpus=8.
##########################################################################
# Spatial k-fold cross validation
##########################################################################
# lm workflow resampled over the spatial CV folds
lm_fit_spatcv <- lm_wf %>%
  fit_resamples(
    resamples = spatial_cv_fold,
    metrics = multi.metric,
    control = model.control
  )
#> ! Fold01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold02: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# glmnet workflow resampled over the spatial CV folds
glm_fit_spatcv <- glm_wf %>%
  fit_resamples(
    resamples = spatial_cv_fold,
    metrics = multi.metric,
    control = model.control
  )
#> x Fold01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold02: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y,
#> Warning: All models failed. See the `.notes` column.
##########################################################################
# Normal/random k-fold cross validation (CV-fold)
##########################################################################
# lm workflow resampled over the repeated random CV folds
lm_fit_cv <- lm_wf %>%
  fit_resamples(
    resamples = cv_fold,
    metrics = multi.metric,
    control = model.control
  )
#> ! Fold01, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold02, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
#> ! Fold03, Repeat01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
# glmnet workflow resampled over the repeated random CV folds
glm_fit_cv <- glm_wf %>%
  fit_resamples(
    resamples = cv_fold,
    metrics = multi.metric,
    control = model.control
  )
#> x Fold01, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold02, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> x Fold03, Repeat01: preprocessor 1/1, model 1/1: Error in elnet(xd, is.sparse, ix, jx, y, we...
#> Warning: All models failed. See the `.notes` column.
# Stopping parallel session
# Shut down the parallelMap cluster started with parallelStartSocket() so no
# worker processes are left running after the script finishes.
parallelStop()
#> Stopped parallelization. All cleaned up.
Created on 2021-09-03 by the reprex package (v2.0.1)
Upvotes: 0
Views: 758
Reputation: 79
Solved!
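.. Waste of time. I forgot to deal with these extra columns: they were never dummified, so I now assign them the non-predictor role "sample ID":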
# Pre-processing for the ordinary least-squares model.
# `logRR ~ .` makes logRR the outcome and every other column a predictor, so
# Site.Type needs no explicit predictor role.
lm_recipe <-
  recipe(formula = logRR ~ ., data = af.train) %>%
  # Keep these columns in the data but out of the model
  update_role(PrName,
              Out.SubInd,
              Out.SubInd.Code,
              Product,
              Latitude,
              Longitude,
              Tree,
              new_role = "sample ID") %>%
  # Reserve a factor level for Site.Type values unseen during training
  step_novel(Site.Type) %>%
  # lm() fits an intercept, so create C - 1 dummy columns. A full one-hot set
  # sums to the intercept column and can leave the fit rank-deficient.
  step_dummy(Site.Type, one_hot = FALSE, naming = partial(dummy_names, sep = "_")) %>%
  step_zv(all_predictors()) %>%
  # step_normalize() already centres and scales, so the former trailing
  # step_center()/step_scale() pair was redundant and has been dropped
  step_normalize(all_predictors(), -all_nominal()) %>%
  step_nzv(all_numeric(), -all_outcomes()) %>%
  step_corr(all_numeric_predictors())
Upvotes: 1