Reputation: 199
In the context of variable selection, I'm trying to count the number of times a variable is selected over bootstrapped iterations. A simple version of the problem is provided below, along with my solution (answer
). But my solution quickly becomes unwieldy when dealing with 50 or 100 variables.
I have the set of variable names I would like to count over (pred
) so I thought it should be possible to create new columns based on those values and then detect the relevant string for each. But I can't figure out how without manually setting the column names and pasting the function. There must be a better way...
Any other solutions would be welcome, including tidyverse or purrr...
library(dplyr)
df <- mtcars
n <- nrow(df)
pred <- colnames(df)[2:length(df)]
target <- "mpg"
mpg_formula <- paste(target, "~", paste(pred, collapse = "+"))
steplm <- data.frame()
bootnum <- 10
for (i in 1:bootnum) {
message("Fitting model ", i, " out of ", bootnum)
data.id <- sample(1:dim(df)[1], replace = T)
fit.lms <- step(lm(mpg_formula, data=df[data.id, ]),
direction = "backward",
trace = 0)
selected.vars <- paste(sort(names(coef(fit.lms)[-1])), collapse = ", ")
step.result <- data.frame("model" = selected.vars,
"nvar" = length(names(coef(fit.lms)[-1])))
steplm <- dplyr::bind_rows(steplm, step.result)
}
steplm %>%
transmute(
steplm %>%
transmute(
cyl = grepl(pattern = "cyl", x = model),
disp = grepl(pattern = "disp", x = model),
hp = grepl(pattern = "hp", x = model),
drat = grepl(pattern = "drat", x = model),
wt = grepl(pattern = "wt", x = model),
qsec = grepl(pattern = "qsec", x = model),
vs = grepl(pattern = "vs", x = model),
am = grepl(pattern = "am", x = model),
gear = grepl(pattern = "gear", x = model),
carb = grepl(pattern = "carb", x = model)
) -> answer
This produces the following data.frame (or matrix), from which I can just sum the columns to get the values I want (or do matrix operations to get pairwise and joint dependencies between terms). This is just to point out the matrix format is needed for the next step...
cyl disp hp drat wt qsec vs am gear carb
TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE
FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE
FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
Upvotes: 1
Views: 157
Reputation: 11663
You can consider taking a different approach to your bootstrap resampling, more similar to this.
library(tidyverse)
library(rsample)
library(broom)
car_boot <- bootstraps(mtcars, times = 100)
car_boot
#> # Bootstrap sampling
#> # A tibble: 100 x 2
#> splits id
#> <list> <chr>
#> 1 <split [32/10]> Bootstrap001
#> 2 <split [32/12]> Bootstrap002
#> 3 <split [32/12]> Bootstrap003
#> 4 <split [32/12]> Bootstrap004
#> 5 <split [32/13]> Bootstrap005
#> 6 <split [32/9]> Bootstrap006
#> 7 <split [32/11]> Bootstrap007
#> 8 <split [32/10]> Bootstrap008
#> 9 <split [32/13]> Bootstrap009
#> 10 <split [32/14]> Bootstrap010
#> # … with 90 more rows
fit_mpg <- function(split) {
step(lm(mpg ~ ., data = analysis(split)), direction = "backward", trace = 0)
}
boot_models <- car_boot %>%
mutate(model = map(splits, fit_mpg),
coef_info = map(model, tidy))
boot_coefs <- boot_models %>%
unnest(coef_info)
boot_coefs
#> # A tibble: 645 x 8
#> splits id model term estimate std.error statistic p.value
#> <list> <chr> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 <split [32/… Bootstrap… <lm> (Interc… 6.16 7.09 0.868 3.93e-1
#> 2 <split [32/… Bootstrap… <lm> drat 1.89 1.44 1.31 2.01e-1
#> 3 <split [32/… Bootstrap… <lm> wt -1.92 0.738 -2.61 1.50e-2
#> 4 <split [32/… Bootstrap… <lm> qsec 0.754 0.307 2.45 2.12e-2
#> 5 <split [32/… Bootstrap… <lm> am 3.80 1.49 2.55 1.70e-2
#> 6 <split [32/… Bootstrap… <lm> carb -0.859 0.407 -2.11 4.47e-2
#> 7 <split [32/… Bootstrap… <lm> (Interc… 8.51 4.05 2.10 4.48e-2
#> 8 <split [32/… Bootstrap… <lm> disp 0.00752 0.00474 1.59 1.25e-1
#> 9 <split [32/… Bootstrap… <lm> wt -1.17 0.708 -1.66 1.09e-1
#> 10 <split [32/… Bootstrap… <lm> gear 5.37 0.757 7.10 1.25e-7
#> # … with 635 more rows
boot_coefs %>%
select(id, term) %>%
filter(term != "(Intercept)") %>%
mutate(value = TRUE) %>%
complete(id, term, fill = list(value = FALSE)) %>%
pivot_wider(names_from = term, values_from = value)
#> # A tibble: 100 x 11
#> id am carb cyl disp drat gear hp qsec vs wt
#> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Bootstrap001 TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
#> 2 Bootstrap002 FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE
#> 3 Bootstrap003 TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 4 Bootstrap004 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
#> 5 Bootstrap005 FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
#> 6 Bootstrap006 FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
#> 7 Bootstrap007 FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
#> 8 Bootstrap008 FALSE FALSE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
#> 9 Bootstrap009 TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
#> 10 Bootstrap010 FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
#> # … with 90 more rows
Created on 2021-04-07 by the reprex package (v2.0.0)
Upvotes: 2
Reputation: 389275
You can use sapply
:
sapply(pred, grepl, steplm$model)
# cyl disp hp drat wt qsec vs am gear carb
# [1,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE
# [2,] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
# [3,] FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
# [4,] FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE
# [5,] FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
# [6,] TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
# [7,] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
# [8,] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
# [9,] FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE
#[10,] TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
sapply
returns a matrix. You can wrap data.frame
to sapply
output if you need dataframe.
identical(data.frame(sapply(pred, grepl, steplm$model)), answer)
#[1] TRUE
Upvotes: 2