library(tidymodels)
library(tidyverse)
library(magrittr)
library(skimr)
library(knitr)
theme_set(theme_bw())

file_path <- "../competition/House Prices - Advanced Regression Techniques/input/house-prices-advanced-regression-techniques/"
files <- list.files(file_path)
files
## [1] "data_description.txt" "sample_submission.csv" "test.csv"
## [4] "train.csv"
train <- read_csv(file.path(file_path, "train.csv")) %>%
  janitor::clean_names()
test <- read_csv(file.path(file_path, "test.csv")) %>%
  janitor::clean_names()

dim(train)
## [1] 1460 81
dim(test)
## [1] 1459 80
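Since skimr is loaded above but otherwise unused, a quick missing-value overview can be handy before building the recipe. A minimal sketch, not part of the original workflow:

# optional: summarize the training data and keep only variables with missing values
skim(train) %>%
  filter(n_missing > 0)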
sale_price

If we check out the distribution of the house price, it is a little bit skewed to the right.
train %>%
  ggplot(aes(x = sale_price)) +
  geom_histogram()

Taking the log of sale_price pulls the distribution much closer to symmetric, which is why the recipe below log-transforms the outcome.

train %>%
  ggplot(aes(x = log(sale_price))) +
  geom_histogram()

recipe

all_data combine and name cleaning with janitor

all_data <- bind_rows(train, test)
names(all_data)[1:10]
## [1] "id" "ms_sub_class" "ms_zoning" "lot_frontage" "lot_area"
## [6] "street" "alley" "lot_shape" "land_contour" "utilities"
housing_recipe <- all_data %>%
  recipe(sale_price ~ .) %>%
  step_rm(id) %>%
  step_log(sale_price) %>%
  step_impute_mode(all_nominal()) %>%   # renamed from step_modeimpute() in recipes 0.1.16
  step_dummy(all_nominal()) %>%
  step_impute_mean(all_predictors()) %>% # renamed from step_meanimpute()
  step_normalize(all_predictors()) %>%
  prep(training = all_data)
print(housing_recipe)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 80
##
## Training data contained 2919 data points and 2919 incomplete rows.
##
## Operations:
##
## Variables removed id [trained]
## Log transformation on sale_price [trained]
## Mode Imputation for ms_zoning, street, alley, ... [trained]
## Dummy variables from ms_zoning, street, alley, lot_shape, ... [trained]
## Mean Imputation for ms_sub_class, lot_frontage, ... [trained]
## Centering and scaling for ms_sub_class, lot_frontage, ... [trained]
juice the all_data2 and split

all_data2 <- juice(housing_recipe)

We are done with preprocessing. Let's split the data set back into the original train and test rows.

train_index <- seq_len(nrow(train))
train2 <- all_data2[train_index, ]
test2 <- all_data2[-train_index, ]
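As an aside, juice() has since been superseded in recipes; bake() with new_data = NULL returns the same processed training data. A minimal equivalent sketch, assuming a reasonably recent recipes version:

# equivalent to juice(housing_recipe) in newer recipes releases
all_data2 <- bake(housing_recipe, new_data = NULL)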
set.seed(2021)
validation_split <- validation_split(train2, prop = 0.7)
# actual split id stored in the following
# validation_split$splits[[1]]$in_id
# the point is simply that it is stored there; trust tidymodels :)
head(validation_split$splits[[1]]$in_id)
## [1] 903 166 1454 442 743 908
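Rather than only peeking at the first few ids, one can check that the analysis set really holds about 70% of train2. A small sanity-check sketch on the same rsample object:

# share of rows in the analysis (training) part of the validation split;
# should be close to the prop = 0.7 used above
length(validation_split$splits[[1]]$in_id) / nrow(train2)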
tune_spec <- linear_reg(penalty = tune(),
                        mixture = 0) %>% # note: the tuning spec uses the ridge penalty (mixture = 0)
  set_engine("glmnet")

lambda_grid <- grid_regular(penalty(),
                            levels = 100)
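By default, dials::penalty() searches on a log10 scale from 1e-10 to 1, so the 100 levels above cover that whole range (the collect_metrics() output below starts at 1.00e-10). A quick check of the grid bounds, not in the original:

# confirm the search range implied by penalty()'s defaults
range(lambda_grid$penalty)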
workflow <- workflow() %>%
  add_model(tune_spec) %>%
  add_formula(sale_price ~ .)

doParallel::registerDoParallel()
tune_result <- workflow %>%
  tune_grid(validation_split,
            grid = lambda_grid,
            metrics = metric_set(rmse))

tune_result %>%
  collect_metrics()
## # A tibble: 100 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.00e-10 rmse standard 0.137 1 NA Preprocessor1_Model001
## 2 1.26e-10 rmse standard 0.137 1 NA Preprocessor1_Model002
## 3 1.59e-10 rmse standard 0.137 1 NA Preprocessor1_Model003
## 4 2.01e-10 rmse standard 0.137 1 NA Preprocessor1_Model004
## 5 2.54e-10 rmse standard 0.137 1 NA Preprocessor1_Model005
## 6 3.20e-10 rmse standard 0.137 1 NA Preprocessor1_Model006
## 7 4.04e-10 rmse standard 0.137 1 NA Preprocessor1_Model007
## 8 5.09e-10 rmse standard 0.137 1 NA Preprocessor1_Model008
## 9 6.43e-10 rmse standard 0.137 1 NA Preprocessor1_Model009
## 10 8.11e-10 rmse standard 0.137 1 NA Preprocessor1_Model010
## # ... with 90 more rows
tune_result %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(title = "RMSE")

tune_result %>% show_best()
## # A tibble: 5 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 4.86e- 2 rmse standard 0.136 1 NA Preprocessor1_Model087
## 2 6.14e- 2 rmse standard 0.136 1 NA Preprocessor1_Model088
## 3 3.85e- 2 rmse standard 0.136 1 NA Preprocessor1_Model086
## 4 1.00e-10 rmse standard 0.137 1 NA Preprocessor1_Model001
## 5 1.26e-10 rmse standard 0.137 1 NA Preprocessor1_Model002
tune_best <- tune_result %>% select_best(metric = "rmse")
tune_best$penalty
## [1] 0.04862602
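As a side note, tune also provides finalize_workflow() to plug selected parameters back into a workflow instead of re-specifying the model by hand. A sketch of that route; keep in mind it would retain mixture = 0 from the tuning spec, while the author deliberately switches to the lasso (mixture = 1) below:

# hypothetical alternative: finalize and refit the tuned (ridge) workflow
final_workflow <- workflow %>% finalize_workflow(tune_best)
final_fit <- fit(final_workflow, data = train2)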
Set mixture equal to one, which selects the lasso penalty in glmnet (mixture = 0 would give ridge regression), and plug in the penalty value tuned above.
lasso_model <-
  linear_reg(penalty = tune_best$penalty, # tuned penalty
             mixture = 1) %>%             # lasso: 1, ridge: 0
  set_engine("glmnet")
lasso_fit <-
  lasso_model %>%
  fit(sale_price ~ ., data = train2)
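Before predicting on the test set, a quick in-sample check can confirm the fit is in the same ballpark as the validation RMSE above. A minimal sketch using yardstick (loaded with tidymodels); this was not part of the original post:

# optional: in-sample RMSE of the lasso fit on the processed training data
lasso_fit %>%
  predict(train2) %>%
  bind_cols(train2 %>% select(sale_price)) %>%
  rmse(truth = sale_price, estimate = .pred)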
options(max.print = 10)
lasso_fit %>%
  tidy() %>%
  filter(estimate > 0.001)
## # A tibble: 13 x 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 12.0 0.0486
## 2 overall_qual 0.143 0.0486
## 3 year_built 0.0198 0.0486
## 4 year_remod_add 0.0237 0.0486
## 5 bsmt_fin_sf1 0.00450 0.0486
## 6 total_bsmt_sf 0.0264 0.0486
## 7 x1st_flr_sf 0.00399 0.0486
## 8 gr_liv_area 0.0840 0.0486
## 9 fireplaces 0.0125 0.0486
## 10 garage_cars 0.0509 0.0486
## 11 garage_area 0.00203 0.0486
## 12 ms_zoning_RL 0.00159 0.0486
## 13 central_air_Y 0.00499 0.0486
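Since the lasso shrinks many coefficients exactly to zero, it is also worth counting how many predictors actually survive at this penalty. A small sketch on the same fitted model:

# count nonzero coefficients at the fitted penalty
lasso_fit %>%
  tidy() %>%
  summarise(nonzero = sum(estimate != 0), total = n())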
result <- predict(lasso_fit, test2)

result %>% head()
## # A tibble: 6 x 1
## .pred
## <dbl>
## 1 11.7
## 2 11.9
## 3 12.0
## 4 12.1
## 5 12.2
## 6 12.1
submission <- read_csv(file.path(file_path, "sample_submission.csv"))
##
## -- Column specification --------------------------------------------------------
## cols(
## Id = col_double(),
## SalePrice = col_double()
## )
submission$SalePrice <- exp(result$.pred) # undo the log transform applied in the recipe

write.csv(submission, "lecture5.csv",
          row.names = FALSE)