Dat Vuong
Dat Vuong

Reputation: 27

How to fix this error: a recipe fails when passed to caret::train()?

I run into this problem when I pass a recipe to caret::train().

There is something wrong with the NA imputation, but I don't know how to fix it. If I remove the cross-validation, everything works fine.

Thanks in advance,

# Reproducible example: build a preprocessing recipe on the `airquality`
# training split and pass it directly to train() with repeated 5-fold CV.
# NOTE(review): no library() calls are shown here; initial_split(), recipe()
# and train() presumably come from rsample, recipes and caret — confirm.
data(airquality)
set.seed(33) # for reproducibility
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering - final recipe
# Ozone is the outcome, every other column is a predictor. The steps run in
# the order written: zero-variance and near-zero-variance filters, KNN
# imputation (6 neighbours) over all_numeric(), log of Ozone and Wind,
# pooling of infrequent Day levels into "other", then dummy encoding of the
# nominal non-outcome columns.
# NOTE(review): all_numeric() is not restricted to predictors, so it looks
# like the outcome Ozone is selected for imputation as well — confirm.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes())


# Validation
# Repeated cross-validation settings: 5 folds, 5 repeats.
cv5 <- trainControl( method = "repeatedcv", 
                     number = 5,
                     repeats = 5, allowParallel = TRUE)

# Fit an lm model
# The recipe (not a formula) is the first argument; air_train is the raw,
# not-yet-preprocessed training data.
set.seed(12) 
lm_fit <- train(
  air_recipe,
  data = air_train, 
  method = "lm", 
  trControl = cv5, 
  metric = "RMSE")

Error message

Error in quantile.default(y, probs = seq(0, 1, length = cuts)) : missing values and NaN's not allowed if 'na.rm' is FALSE

R.version _
platform x86_64-apple-darwin15.6.0
arch x86_64
os darwin15.6.0
system x86_64, darwin15.6.0
status
major 3
minor 6.1
year 2019
month 07
day 05
svn rev 76782
language R
version.string R version 3.6.1 (2019-07-05) nickname Action of the Toes

Upvotes: 0

Views: 165

Answers (1)

paqmo
paqmo

Reputation: 3739

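Looks like the resamples are made before the recipe is applied. The error is raised by a quantile() call on the outcome (`y`), so Ozone still contains its missing values at the point where the folds are created; the imputation step has not run yet.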

So you could prep and juice the recipe and use the formula method:

library(recipes)
library(caret)
library(rsample)

# Workaround 1: apply the recipe up front (prep + juice) and hand the
# already-preprocessed data to train() through the formula interface.
data(airquality)
set.seed(33) # for reproducibility
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering - final recipe
# Same steps as in the question, plus a final step_naomit() applied to the
# outcome and all predictors.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_naomit(all_outcomes(),all_predictors())

# Prep recipe
# retain = TRUE is passed so that juice() below can return the processed
# training data.
air_prep <- prep(air_recipe, retain = TRUE)

# Juice the prepared recipe 
# Overwrites air_train with the preprocessed training set; from here on the
# raw training split is no longer available under this name.
air_train <- juice(air_prep)

# Validation
# Repeated cross-validation settings: 5 folds, 5 repeats.
cv5 <- trainControl( method = "repeatedcv", 
                     number = 5,
                     repeats = 5, allowParallel = TRUE)


# Fit an lm model
# NOTE(review): preprocessing was done once, before resampling, so the
# imputation has already seen rows that are later held out in each fold —
# confirm this leakage is acceptable for the reported CV metrics.
set.seed(12) 
lm_fit <- train(
  Ozone ~ .,
  data = air_train, 
  method = "lm", 
  trControl = cv5, 
  metric = "RMSE")

lm_fit
#> Linear Regression 
#> 
#> 108 samples
#>   5 predictor
#> 
#> No pre-processing
#> Resampling: Cross-Validated (5 fold, repeated 5 times) 
#> Summary of sample sizes: 86, 88, 86, 86, 86, 86, ... 
#> Resampling results:
#> 
#>   RMSE       Rsquared   MAE      
#>   0.5091496  0.6568485  0.3793589
#> 
#> Tuning parameter 'intercept' was held constant at a value of TRUE

Alternatively, you could use {parsnip} and {tune} to keep everything in the tidymodels idiom:

library(recipes)
library(rsample)
library(parsnip)
library(tune)
library(yardstick)

# Workaround 2: stay within tidymodels — define the resamples with rsample,
# the model with parsnip, and let fit_resamples() receive the recipe, the
# model and the resamples together.
data(airquality)
set.seed(33) # for reproducibility
air_split <- initial_split(airquality, prop = 0.7) 
air_train <- training(air_split)
air_test <- testing(air_split)

# Same recipe as in the caret-based variant, ending with step_naomit() on
# the outcome and all predictors.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>% 
  step_zv(all_predictors()) %>% 
  step_nzv(all_predictors()) %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>% 
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_naomit(all_outcomes(),all_predictors())

# 5-fold cross-validation repeated 5 times on the raw training split.
# NOTE(review): no set.seed() precedes this call, so the fold assignment
# depends on the RNG state left by the code above — confirm if exact
# reproducibility of the folds matters.
air_cv <- vfold_cv(air_train, v = 5, repeats = 5)

# Linear regression specification using the "lm" engine.
lm_mod <- linear_reg() %>% set_engine("lm")

# NOTE(review): the argument order (recipe, model, resamples) and the
# `maximize` argument below match the package versions used when this was
# written; newer tune releases may expect a different signature — confirm.
lm_fits <- fit_resamples(air_recipe, lm_mod, air_cv)

show_best(lm_fits, metric = "rmse", maximize = FALSE)
#> # A tibble: 1 x 5
#>   .metric .estimator  mean     n std_err
#>   <chr>   <chr>      <dbl> <int>   <dbl>
#> 1 rmse    standard   0.526    25  0.0256

Created on 2020-04-05 by the reprex package (v0.3.0)

Upvotes: 1

Related Questions