library(tidymodels)
library(tidyverse)
library(magrittr)
library(skimr)
library(knitr)
theme_set(theme_bw())
<- "../competition/House Prices - Advanced Regression Techniques/input/house-prices-advanced-regression-techniques/"
file_path <- list.files(file_path)
files files
## [1] "data_description.txt" "sample_submission.csv" "test.csv"
## [4] "train.csv"
# clean_names() converts the CamelCase headers (e.g. SalePrice) to snake_case (sale_price)
train <- read_csv(file.path(file_path, "train.csv")) %>%
  janitor::clean_names()
test <- read_csv(file.path(file_path, "test.csv")) %>%
  janitor::clean_names()
dim(train)
## [1] 1460 81
dim(test)
## [1] 1459 80
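skimr is loaded above but not used so far; if you want a quick overview of column types and missingness before modeling, one line is enough (an optional sketch, not required for the pipeline):

skim(train)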
sale_price

Note that test has one fewer column than train: the target, sale_price, is missing and is what we need to predict. If we check the distribution of the house price, it is a little bit skewed to the right, so we also plot it on the log scale, where it looks much more symmetric.
train %>%
  ggplot(aes(x = sale_price)) +
  geom_histogram()

train %>%
  ggplot(aes(x = log(sale_price))) +
  geom_histogram()
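To put a number on the skew, here is a minimal sketch using base R only (the skewness() helper below is defined here, not taken from a package):

# sample skewness: the mean cubed z-score; positive means right-skewed
skewness <- function(x) mean(((x - mean(x)) / sd(x))^3)
skewness(train$sale_price)      # clearly positive on this data
skewness(log(train$sale_price)) # much closer to zero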
recipe

all_data combine and name cleaning with janitor

all_data <- bind_rows(train, test) # sale_price is NA for the test rows
names(all_data)[1:10]
## [1] "id" "ms_sub_class" "ms_zoning" "lot_frontage" "lot_area"
## [6] "street" "alley" "lot_shape" "land_contour" "utilities"
housing_recipe <- all_data %>%
  recipe(sale_price ~ .) %>%
  step_rm(id) %>%                       # drop the id column
  step_log(sale_price) %>%              # log-transform the target
  step_modeimpute(all_nominal()) %>%    # impute categorical NAs with the mode
  step_dummy(all_nominal()) %>%         # dummy-code the categorical variables
  step_meanimpute(all_predictors()) %>% # impute numeric NAs with the mean
  step_normalize(all_predictors()) %>%  # center and scale
  prep(training = all_data)
print(housing_recipe)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 80
##
## Training data contained 2919 data points and 2919 incomplete rows.
##
## Operations:
##
## Variables removed id [trained]
## Log transformation on sale_price [trained]
## Mode Imputation for ms_zoning, street, alley, ... [trained]
## Dummy variables from ms_zoning, street, alley, lot_shape, ... [trained]
## Mean Imputation for ms_sub_class, lot_frontage, ... [trained]
## Centering and scaling for ms_sub_class, lot_frontage, ... [trained]
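One caveat if you run this with a current version of recipes: the two imputation steps were renamed in later releases to step_impute_mode() and step_impute_mean(). A sketch of the same recipe with the newer step names (housing_recipe_new is just an illustrative name):

housing_recipe_new <- all_data %>%
  recipe(sale_price ~ .) %>%
  step_rm(id) %>%
  step_log(sale_price) %>%
  step_impute_mode(all_nominal()) %>%   # was step_modeimpute()
  step_dummy(all_nominal()) %>%
  step_impute_mean(all_predictors()) %>% # was step_meanimpute()
  step_normalize(all_predictors()) %>%
  prep(training = all_data)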
juice the all_data2 and split

all_data2 <- juice(housing_recipe)
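In later versions of recipes, juice() is superseded; baking the prepped recipe with new_data = NULL returns the same processed training data:

all_data2 <- bake(housing_recipe, new_data = NULL) # equivalent to juice(housing_recipe)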
We are done with preprocessing. Let's split the data set back into its training and test portions.
train_index <- seq_len(nrow(train))
train2 <- all_data2[train_index,]
test2 <- all_data2[-train_index,]
set.seed(2021)
# note: this object shadows rsample::validation_split(), but R still resolves the call
validation_split <- validation_split(train2, prop = 0.7)
# the actual split ids are stored in the following:
# validation_split$splits[[1]]$in_id
# the point is simply that they are recorded; trust tidymodels :)
head(validation_split$splits[[1]]$in_id)
## [1] 903 166 1454 442 743 908
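The in_id vector holds the row indices of the analysis (fitting) portion of the split; a quick sketch to confirm it matches prop = 0.7:

# in_id indexes the analysis (fitting) rows, so this should be about 0.7:
length(validation_split$splits[[1]]$in_id) / nrow(train2)
# rsample also provides accessors, so in_id is rarely needed directly:
# analysis(validation_split$splits[[1]])   # the 70% used for fitting
# assessment(validation_split$splits[[1]]) # the 30% held out for validation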
tune_spec <- linear_reg(penalty = tune(),
                        mixture = 0) %>% # mixture = 0 means the ridge penalty
  set_engine("glmnet")
lambda_grid <- grid_regular(penalty(),
                            levels = 100)
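By default, penalty() from dials spans 10^-10 to 10^0 on a log10 scale, which is why the smallest candidate in the results below is 1e-10. A quick check:

range(lambda_grid$penalty) # roughly 1e-10 to 1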
workflow <- workflow() %>%
  add_model(tune_spec) %>%
  add_formula(sale_price ~ .)
doParallel::registerDoParallel()
tune_result <- workflow %>%
  tune_grid(validation_split,
            grid = lambda_grid,
            metrics = metric_set(rmse))
tune_result %>%
  collect_metrics()
## # A tibble: 100 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.00e-10 rmse standard 0.137 1 NA Preprocessor1_Model001
## 2 1.26e-10 rmse standard 0.137 1 NA Preprocessor1_Model002
## 3 1.59e-10 rmse standard 0.137 1 NA Preprocessor1_Model003
## 4 2.01e-10 rmse standard 0.137 1 NA Preprocessor1_Model004
## 5 2.54e-10 rmse standard 0.137 1 NA Preprocessor1_Model005
## 6 3.20e-10 rmse standard 0.137 1 NA Preprocessor1_Model006
## 7 4.04e-10 rmse standard 0.137 1 NA Preprocessor1_Model007
## 8 5.09e-10 rmse standard 0.137 1 NA Preprocessor1_Model008
## 9 6.43e-10 rmse standard 0.137 1 NA Preprocessor1_Model009
## 10 8.11e-10 rmse standard 0.137 1 NA Preprocessor1_Model010
## # ... with 90 more rows
tune_result %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  geom_line(size = 1.5) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(title = "RMSE")
tune_result %>% show_best()
## # A tibble: 5 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 4.86e- 2 rmse standard 0.136 1 NA Preprocessor1_Model087
## 2 6.14e- 2 rmse standard 0.136 1 NA Preprocessor1_Model088
## 3 3.85e- 2 rmse standard 0.136 1 NA Preprocessor1_Model086
## 4 1.00e-10 rmse standard 0.137 1 NA Preprocessor1_Model001
## 5 1.26e-10 rmse standard 0.137 1 NA Preprocessor1_Model002
tune_best <- tune_result %>% select_best(metric = "rmse")
tune_best$penalty
## [1] 0.04862602
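As an aside, for the specification that was actually tuned (mixture = 0, i.e. ridge), the idiomatic tidymodels route would be to finalize the workflow with the selected parameters. A sketch under that assumption (ridge_fit is an illustrative name); below, the tuned penalty is instead reused for a lasso fit:

ridge_fit <- workflow %>%
  finalize_workflow(tune_best) %>% # plugs tune_best$penalty into tune_spec
  fit(data = train2)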
Set mixture equal to one, which selects the lasso penalty in glmnet (mixture = 0 would give ridge regression), and plug in the penalty value tuned above.
lasso_model <-
  linear_reg(penalty = tune_best$penalty, # tuned penalty
             mixture = 1) %>%             # lasso: 1, ridge: 0
  set_engine("glmnet")
lasso_fit <-
  lasso_model %>%
  fit(sale_price ~ ., data = train2)
options(max.print = 10)
lasso_fit %>%
  tidy() %>%
  filter(estimate > 0.001)
## # A tibble: 13 x 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 12.0 0.0486
## 2 overall_qual 0.143 0.0486
## 3 year_built 0.0198 0.0486
## 4 year_remod_add 0.0237 0.0486
## 5 bsmt_fin_sf1 0.00450 0.0486
## 6 total_bsmt_sf 0.0264 0.0486
## 7 x1st_flr_sf 0.00399 0.0486
## 8 gr_liv_area 0.0840 0.0486
## 9 fireplaces 0.0125 0.0486
## 10 garage_cars 0.0509 0.0486
## 11 garage_area 0.00203 0.0486
## 12 ms_zoning_RL 0.00159 0.0486
## 13 central_air_Y 0.00499 0.0486
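Note that filter(estimate > 0.001) also hides predictors with negative coefficients. To list everything the lasso kept, filter on the absolute value instead:

lasso_fit %>%
  tidy() %>%
  filter(abs(estimate) > 0.001)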
result <- predict(lasso_fit, test2)
result %>% head()
## # A tibble: 6 x 1
## .pred
## <dbl>
## 1 11.7
## 2 11.9
## 3 12.0
## 4 12.1
## 5 12.2
## 6 12.1
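The test set has no sale_price, so the leaderboard is the only true scorecard; still, we can sketch a holdout RMSE by refitting the lasso on the 70% analysis rows and scoring the untouched 30% (val_split and lasso_val_fit are names introduced here):

val_split <- validation_split$splits[[1]]
lasso_val_fit <- lasso_model %>%
  fit(sale_price ~ ., data = analysis(val_split)) # fit on the 70% only
predict(lasso_val_fit, assessment(val_split)) %>%
  bind_cols(assessment(val_split) %>% select(sale_price)) %>%
  rmse(truth = sale_price, estimate = .pred)      # RMSE on the held-out 30%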
<- read_csv(file.path(file_path, "sample_submission.csv")) submission
##
## -- Column specification --------------------------------------------------------
## cols(
## Id = col_double(),
## SalePrice = col_double()
## )
submission$SalePrice <- exp(result$.pred) # undo the log transform on the target
write.csv(submission, row.names = FALSE,
          "lecture5.csv")