1 Preparations

1.1 Libraries

library(tidymodels)
library(tidyverse)
library(magrittr)
library(skimr)
library(knitr)
theme_set(theme_bw())

1.2 Data load

file_path <- "../competition/House Prices - Advanced Regression Techniques/input/house-prices-advanced-regression-techniques/"
files <- list.files(file_path)
files
## [1] "data_description.txt"  "sample_submission.csv" "test.csv"             
## [4] "train.csv"
train <- read_csv(file.path(file_path, "train.csv")) %>% 
  janitor::clean_names()
test <- read_csv(file.path(file_path, "test.csv")) %>% 
  janitor::clean_names()

2 Data overview

2.1 Basic info.

dim(train)
## [1] 1460   81
dim(test)
## [1] 1459   80

2.2 Distribution of sale_price

If we check out the distribution of the house price, it is little bit skewed to the right.

train %>% 
  ggplot(aes(x = sale_price)) +
  geom_histogram()

train %>% 
  ggplot(aes(x = log(sale_price))) +
  geom_histogram()

3 Preprecessing with recipe

3.1 all_data combine and name cleaning with janitor

all_data <- bind_rows(train, test)
names(all_data)[1:10]
##  [1] "id"           "ms_sub_class" "ms_zoning"    "lot_frontage" "lot_area"    
##  [6] "street"       "alley"        "lot_shape"    "land_contour" "utilities"

3.2 Make recipe

housing_recipe <- all_data %>% 
  recipe(sale_price ~ .) %>%
  step_rm(id) %>% 
  step_log(sale_price) %>%
  step_modeimpute(all_nominal()) %>% 
  step_dummy(all_nominal()) %>% 
  step_meanimpute(all_predictors()) %>%
  step_normalize(all_predictors()) %>% 
  prep(training = all_data)

print(housing_recipe)
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         80
## 
## Training data contained 2919 data points and 2919 incomplete rows. 
## 
## Operations:
## 
## Variables removed id [trained]
## Log transformation on sale_price [trained]
## Mode Imputation for ms_zoning, street, alley, ... [trained]
## Dummy variables from ms_zoning, street, alley, lot_shape, ... [trained]
## Mean Imputation for ms_sub_class, lot_frontage, ... [trained]
## Centering and scaling for ms_sub_class, lot_frontage, ... [trained]

3.3 juice the all_data2 and split

all_data2 <- juice(housing_recipe)

We are done for preprocessing. Let’s split the data set.

train_index <- seq_len(nrow(train))
train2 <- all_data2[train_index,]
test2 <- all_data2[-train_index,]

4 Split the train into validation and train

set.seed(2021)

validation_split <- validation_split(train2, prop = 0.7)

# actual split id stored in the following
# validation_split$splits[[1]]$in_id
# the whole point is that it's there and trust tidymodels :)
head(validation_split$splits[[1]]$in_id)
## [1]  903  166 1454  442  743  908

5 Set the tuning spec

tune_spec <- linear_reg(penalty = tune(),
                        mixture = 0) %>%
  set_engine("glmnet")

lambda_grid <- grid_regular(penalty(), 
                            levels = 100)

6 Set workflow()

workflow <- workflow() %>%
  add_model(tune_spec) %>% 
  add_formula(sale_price ~ .)

7 Tuning the lambda

doParallel::registerDoParallel()

tune_result <- workflow %>% 
  tune_grid(validation_split,
            grid = lambda_grid,
            metrics = metric_set(rmse))
tune_result %>% 
  collect_metrics()
## # A tibble: 100 x 7
##     penalty .metric .estimator  mean     n std_err .config               
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 
##  1 1.00e-10 rmse    standard   0.137     1      NA Preprocessor1_Model001
##  2 1.26e-10 rmse    standard   0.137     1      NA Preprocessor1_Model002
##  3 1.59e-10 rmse    standard   0.137     1      NA Preprocessor1_Model003
##  4 2.01e-10 rmse    standard   0.137     1      NA Preprocessor1_Model004
##  5 2.54e-10 rmse    standard   0.137     1      NA Preprocessor1_Model005
##  6 3.20e-10 rmse    standard   0.137     1      NA Preprocessor1_Model006
##  7 4.04e-10 rmse    standard   0.137     1      NA Preprocessor1_Model007
##  8 5.09e-10 rmse    standard   0.137     1      NA Preprocessor1_Model008
##  9 6.43e-10 rmse    standard   0.137     1      NA Preprocessor1_Model009
## 10 8.11e-10 rmse    standard   0.137     1      NA Preprocessor1_Model010
## # ... with 90 more rows

8 Visualization of the tunning result

tune_result %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(title = "RMSE")

tune_result %>% show_best()
## # A tibble: 5 x 7
##    penalty .metric .estimator  mean     n std_err .config               
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 
## 1 4.86e- 2 rmse    standard   0.136     1      NA Preprocessor1_Model087
## 2 6.14e- 2 rmse    standard   0.136     1      NA Preprocessor1_Model088
## 3 3.85e- 2 rmse    standard   0.136     1      NA Preprocessor1_Model086
## 4 1.00e-10 rmse    standard   0.137     1      NA Preprocessor1_Model001
## 5 1.26e-10 rmse    standard   0.137     1      NA Preprocessor1_Model002
tune_best <- tune_result %>% select_best(metric = "rmse")
tune_best$penalty
## [1] 0.04862602

9 Set Lasso regression model and fitting

Set mixture is equal to zero refering the Lasso regression in glmnet since the

lasso_model <- 
    linear_reg(penalty = tune_best$penalty, # tuned penalty
               mixture = 1) %>% # lasso: 1, ridge: 0
    set_engine("glmnet")

lasso_fit <- 
    lasso_model %>% 
    fit(sale_price ~ ., data = train2)

options(max.print = 10)
lasso_fit %>% 
    tidy() %>% 
    filter(estimate > 0.001)
## # A tibble: 13 x 3
##    term           estimate penalty
##    <chr>             <dbl>   <dbl>
##  1 (Intercept)    12.0      0.0486
##  2 overall_qual    0.143    0.0486
##  3 year_built      0.0198   0.0486
##  4 year_remod_add  0.0237   0.0486
##  5 bsmt_fin_sf1    0.00450  0.0486
##  6 total_bsmt_sf   0.0264   0.0486
##  7 x1st_flr_sf     0.00399  0.0486
##  8 gr_liv_area     0.0840   0.0486
##  9 fireplaces      0.0125   0.0486
## 10 garage_cars     0.0509   0.0486
## 11 garage_area     0.00203  0.0486
## 12 ms_zoning_RL    0.00159  0.0486
## 13 central_air_Y   0.00499  0.0486

10 Prediction and submit

result <- predict(lasso_fit, test2)
result %>% head()
## # A tibble: 6 x 1
##   .pred
##   <dbl>
## 1  11.7
## 2  11.9
## 3  12.0
## 4  12.1
## 5  12.2
## 6  12.1
submission <- read_csv(file.path(file_path, "sample_submission.csv"))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Id = col_double(),
##   SalePrice = col_double()
## )
submission$SalePrice <- exp(result$.pred)
write.csv(submission, row.names = FALSE,
          "lecture5.csv")