Reputation: 67
I'm trying to predict real estate prices in R with Tidymodels. I'm following this tutorial. All goes well until the very end, when I try to run predictions on my test data.
Please see the below code example and the error at the very end.
I looked at two similar questions (here and here) but it seems that I have defined variable roles and provided an unprepared recipe to my workflow.
# libraries ---------------------------------------------------------------
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(data.table)
library(purrr)
# data --------------------------------------------------------------------
# 're' means real estate
# I'm using data.table in general. Using tribble below for cleaner data definition.
# Example listings, one row each: ID (character), price per square metre,
# district and number of rooms.
real_estate_data <- tibble::tribble(
~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
"30876343", 0.534722222222222, 1, 3,
"31914489", 0.476119402985075, 1, 1,
"30972289", 0.507352941176471, 1, 2,
"31739730", 0.472972972972973, 1, 3,
"31783137", 0.49875, 2, 3,
"31809435", 0.439705882352941, 2, 2,
"31943408", 0.469117647058824, 2, 3,
"31944348", 0.56231884057971, 2, 1,
"31961146", 0.472972972972973, 3, 3,
"24314388", 0.649550561797753, 3, 2,
"29840270", 0.719178082191781, 3, 3,
"29840429", 0.719178082191781, 3, 3,
"30873484", 0.822857142857143, 4, 3,
"30969673", 0.533802816901408, 4, 3,
"31333120", 0.741511627906977, 4, 3,
"31788730", 0.527142857142857, 4, 2,
"31948441", 0.734848484848485, 5, 2,
"31962350", 0.8, 5, 3,
"31962779", 0.670454545454545, 5, 3,
"31979128", 0.689054054054054, 5, 1
)
# Convert to a data.table and recode `district` as a factor so that it is
# treated as a categorical (nominal) variable rather than a number.
real_estate_data <- as.data.table(real_estate_data) %>% .[, district := factor(district)]
# train/test split --------------------------------------------------------
# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# Split with the default settings (no `prop` or `strata` supplied), then
# extract the training and testing partitions.
re_split <- initial_split(real_estate_data)
re_train <- training(re_split)
re_test <- testing(re_split)
# workflow (w/ recipe) ----------------------------------------------------
# Recipe: model price per square metre on all remaining columns, with `re_id`
# demoted to an "ID" role so it is not used as a predictor.
# NOTE: `all_numeric()` selects every numeric column, including the outcome
# `price_per_sqm_huf_mil` (numeric / outcome in the summary below). The
# centering step therefore transforms the outcome, which is not available at
# prediction time -- this is what makes predict() fail further down.
# NOTE(review): in step_scale() the comma-separated selectors appear to be
# combined as a union, so the outcome is presumably selected there too --
# confirm against the recipes selector documentation.
re_rec <- recipe(re_train,
formula = price_per_sqm_huf_mil ~ .) %>%
update_role(re_id, new_role = "ID") %>%
step_center(all_numeric(), - district) %>%
step_scale(all_predictors(), all_numeric(), - district) %>%
step_dummy(district) %>%
step_zv(all_predictors())
# Inspect the type and role assigned to each column.
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression model specification, fitted with the "lm" engine.
lr_model <-
linear_reg() %>%
set_engine("lm")
# Bundle the model specification and the (unprepared) recipe into a workflow.
re_wflow <-
workflow() %>%
add_model(lr_model) %>%
add_recipe(re_rec)
# model training and prediction -------------------------------------------
# Fit the workflow (recipe + model) on the training data.
re_fit <-
re_wflow %>%
fit(data = re_train)
# Predicting on the test set fails: the recipe contains a step that
# transforms the outcome column, and the outcome is not available when the
# recipe is applied at prediction time.
re_pred <- predict(re_fit, re_test)
#> Error: Can't subset columns that don't exist.
#> x Column `price_per_sqm_huf_mil` doesn't exist.
Created on 2021-01-25 by the reprex package (v0.3.0)
Many thanks!
Upvotes: 1
Views: 1393
Reputation: 11663
The issue here is that you used `step_center()` to transform the outcome (`price_per_sqm_huf_mil`), and at prediction time there is no outcome available. `all_numeric()` selects every numeric column, including the outcome. You can instead specify that you want to center (and scale) only `all_predictors() & all_numeric()`, like this:
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.2 ──
#> ✓ broom 0.7.3 ✓ recipes 0.1.15
#> ✓ dials 0.0.9 ✓ rsample 0.0.8
#> ✓ dplyr 1.0.3 ✓ tibble 3.0.5
#> ✓ ggplot2 3.3.3 ✓ tidyr 1.1.2
#> ✓ infer 0.5.4 ✓ tune 0.1.2
#> ✓ modeldata 0.1.0 ✓ workflows 0.2.1
#> ✓ parsnip 0.1.5 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
# Same example listings as in the question: ID (character), price per square
# metre, district and number of rooms.
real_estate_data <- tibble::tribble(
~re_id, ~price_per_sqm_huf_mil, ~district, ~num_room,
"30876343", 0.534722222222222, 1, 3,
"31914489", 0.476119402985075, 1, 1,
"30972289", 0.507352941176471, 1, 2,
"31739730", 0.472972972972973, 1, 3,
"31783137", 0.49875, 2, 3,
"31809435", 0.439705882352941, 2, 2,
"31943408", 0.469117647058824, 2, 3,
"31944348", 0.56231884057971, 2, 1,
"31961146", 0.472972972972973, 3, 3,
"24314388", 0.649550561797753, 3, 2,
"29840270", 0.719178082191781, 3, 3,
"29840429", 0.719178082191781, 3, 3,
"30873484", 0.822857142857143, 4, 3,
"30969673", 0.533802816901408, 4, 3,
"31333120", 0.741511627906977, 4, 3,
"31788730", 0.527142857142857, 4, 2,
"31948441", 0.734848484848485, 5, 2,
"31962350", 0.8, 5, 3,
"31962779", 0.670454545454545, 5, 3,
"31979128", 0.689054054054054, 5, 1
) %>%
# Recode `district` as a factor so it is treated as categorical.
mutate(district = factor(district))
# Seed the RNG so the random train/test partition is reproducible.
set.seed(123)
re_split <- real_estate_data %>% initial_split()
# Pull the two partitions out of the split object.
re_train <- re_split %>% training()
re_test <- re_split %>% testing()
# Preprocessing recipe: price per square metre modelled on all other columns.
re_rec <- recipe(price_per_sqm_huf_mil ~ ., data = re_train)
# Keep the listing ID in the data but out of the predictor set.
re_rec <- update_role(re_rec, re_id, new_role = "ID")
# Center and scale only columns that are both predictors and numeric, so the
# outcome is never touched by the recipe.
re_rec <- step_center(re_rec, all_predictors() & all_numeric())
re_rec <- step_scale(re_rec, all_predictors() & all_numeric())
# Dummy-encode the district factor, then drop zero-variance predictors.
re_rec <- step_dummy(re_rec, district)
re_rec <- step_zv(re_rec, all_predictors())
# Inspect the type and role assigned to each column.
summary(re_rec)
#> # A tibble: 4 x 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 re_id nominal ID original
#> 2 district nominal predictor original
#> 3 num_room numeric predictor original
#> 4 price_per_sqm_huf_mil numeric outcome original
# Linear regression specification using the "lm" engine.
lr_model <- set_engine(linear_reg(), "lm")
# Assemble the workflow step by step: start empty, attach the model
# specification, then attach the unprepared recipe.
re_wflow <- workflow()
re_wflow <- add_model(re_wflow, lr_model)
re_wflow <- add_recipe(re_wflow, re_rec)
# Fit the workflow (recipe + model) on the training partition.
re_fit <- fit(re_wflow, data = re_train)
# Predict on the held-out test partition; this now succeeds because the
# recipe no longer transforms the outcome.
predict(re_fit, new_data = re_test)
#> # A tibble: 5 x 1
#> .pred
#> <dbl>
#> 1 0.486
#> 2 0.611
#> 3 0.688
#> 4 0.688
#> 5 0.768
Created on 2021-01-25 by the reprex package (v0.3.0)
This has tripped up more folks than you, so we are working on adding a new set of selectors that will be merged in soon. The other option to think about, if you really do want to try transforming an outcome, is to look into using `skip = TRUE`.
Upvotes: 4