nyk
nyk

Reputation: 680

How to extract GLMNET coefficients produced by Tidymodels

I estimated a glmnet logistic regression using tidymodels. But I couldn't figure out 2 things, which are closely related, in tidymodels:

Below are the codes for a pseudo model. I tried tidy(), coef(), and predict() but they all failed. Any help will be much appreciated. Thanks.

library(tidymodels)
#> -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.1.0 --
#> v broom     0.7.0      v recipes   0.1.13
#> v dials     0.0.8      v rsample   0.0.7 
#> v dplyr     1.0.0      v tibble    3.0.3 
#> v ggplot2   3.3.2      v tune      0.1.1 
#> v infer     0.5.2      v workflows 0.1.2 
#> v parsnip   0.1.2      v yardstick 0.0.7 
#> v purrr     0.3.4
#> -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()

set.seed(1234)

train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
                x1 = rnorm(1000),
                x2 = rnorm(1000)
                )

test <- tibble(y = factor(sample(c(0,1), 300, replace = TRUE)),
               x1 = rnorm(300),
               x2 = rnorm(300)
               )

lr_cv <- vfold_cv(train)

lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

lr_wf <- workflow() %>%
  add_model(lr_mod) %>%
  add_formula(y ~ .)

lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))

lr_tune <- tune_grid(lr_wf, resamples = lr_cv, grid = lr_grid, metrics = metric_set(roc_auc))

lr_best <- select_best(lr_tune)
lr_best
#> # A tibble: 1 x 2
#>   penalty .config
#>     <dbl> <chr>  
#> 1  0.0001 Model1

lr_wf_final <- lr_wf %>% finalize_workflow(lr_best)

final_mod <- lr_wf_final %>% fit(train)

# the followings were tried but didn't work
lambda <- lr_best[1]
lambda
#> # A tibble: 1 x 1
#>   penalty
#>     <dbl>
#> 1  0.0001

predict(final.mod , s = lambda, type ="coefficients")[1:3 ,]
#> Error in predict(final.mod, s = lambda, type = "coefficients"): object 'final.mod' not found

tidy(final_mod)
#> # A tibble: 35 x 5
#>    term         step estimate  lambda dev.ratio
#>    <chr>       <dbl>    <dbl>   <dbl>     <dbl>
#>  1 (Intercept)     1  -0.0560 0.0183   1.71e-14
#>  2 (Intercept)     2  -0.0561 0.0167   1.65e- 4
#>  3 (Intercept)     3  -0.0562 0.0152   3.02e- 4
#>  4 (Intercept)     4  -0.0562 0.0139   4.16e- 4
#>  5 (Intercept)     5  -0.0563 0.0126   5.10e- 4
#>  6 (Intercept)     6  -0.0564 0.0115   5.89e- 4
#>  7 (Intercept)     7  -0.0564 0.0105   6.54e- 4
#>  8 (Intercept)     8  -0.0565 0.00956  7.08e- 4
#>  9 (Intercept)     9  -0.0565 0.00871  7.53e- 4
#> 10 (Intercept)    10  -0.0566 0.00794  7.90e- 4
#> # ... with 25 more rows

coef(final_mod)
#> NULL

Created on 2020-07-25 by the reprex package (v0.3.0)

Upvotes: 5

Views: 3851

Answers (1)

Julia Silge
Julia Silge

Reputation: 11663

We recently added some updated support for getting out the estimated coefficients.

library(tidymodels)
#> ── Attaching packages ─────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom     0.7.0          ✓ recipes   0.1.13    
#> ✓ dials     0.0.8          ✓ rsample   0.0.7     
#> ✓ dplyr     1.0.0          ✓ tibble    3.0.3     
#> ✓ ggplot2   3.3.2          ✓ tidyr     1.1.0     
#> ✓ infer     0.5.3          ✓ tune      0.1.1.9000
#> ✓ modeldata 0.0.2          ✓ workflows 0.1.2     
#> ✓ parsnip   0.1.2.9000     ✓ yardstick 0.0.7     
#> ✓ purrr     0.3.4
#> ── Conflicts ────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter()  masks stats::filter()
#> x dplyr::lag()     masks stats::lag()
#> x recipes::step()  masks stats::step()

set.seed(1234)
train <- tibble(y = factor(sample(c(0,1), 1000, replace = TRUE)),
                x1 = as.numeric(y) + rnorm(1000),
                x2 = rnorm(1000),
                x3 = rnorm(1000)
)

lr_cv <- vfold_cv(train)
lr_grid <- tibble(penalty = 10^seq(-4,1, length.out = 5))
lr_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

lr_wf <- workflow() %>%
  add_model(lr_mod) %>%
  add_formula(y ~ .) 

lr_best <- lr_wf %>%
  tune_grid(
    resamples = lr_cv,
    grid = lr_grid
  ) %>%
  select_best("roc_auc")

lr_wf %>% 
  finalize_workflow(lr_best) %>%
  fit(train) %>%
  pull_workflow_fit() %>%
  tidy()
#> Loading required package: Matrix
#> 
#> Attaching package: 'Matrix'
#> The following objects are masked from 'package:tidyr':
#> 
#>     expand, pack, unpack
#> Loaded glmnet 4.0-2
#> # A tibble: 4 x 3
#>   term        estimate penalty
#>   <chr>          <dbl>   <dbl>
#> 1 (Intercept)   -1.30   0.0316
#> 2 x1             0.834  0.0316
#> 3 x2             0      0.0316
#> 4 x3             0      0.0316

Created on 2020-07-30 by the reprex package (v0.3.0.9001)

The steps to extract the coefficients are to pull out the fitted object and then to tidy it.

If you want to save this object for future use, you can either just save these coefficients (it is a linear model, after all), or you can save the finalized and fitted workflow as a binary object, which can be predicted from.

Upvotes: 7

Related Questions