The notebook aims to:

- read in and clean the data, then split it into train and test sets
- preprocess the training data with a recipes pipeline
- fit a linear regression model with tidymodels
- evaluate the model on both the train and test sets, plot actual vs. predicted values, and save the model
library(skimr) # summary stats
library(DataExplorer) # variable profiling, EDA
library(tidyverse) # data manipulation
library(tidymodels) # modelling
library(highcharter) # interactive visualization
library(janitor) # clean column names, convert to snake case
library(countrycode) # feature engineering - from country to continent
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>% # params$target_name holds the name of the outcome column
  clean_names()
dim(dat)
## [1] 506 14
set.seed(42) # reproducible split
sss <- initial_split(dat, prop = 3/4) # 75/25 train/test split
dat_train <- training(sss)
dat_test <- testing(sss)
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
Create a recipe of preprocessing steps, estimate (prep) it on the training data, and then apply it to the train and test sets separately.
# Example recipe
rec <- recipe(target ~ ., data = dat_train) %>%
  step_discretize(age, num_breaks = 3) %>%
  #step_discretize(b, num_breaks = 2, min_unique = 1) %>%
  step_discretize(crim, num_breaks = 3) %>%
  step_num2factor(chas) %>% # note: recent recipes versions also require a levels argument here
  step_log(lstat) %>%
  step_log(indus) %>%
  step_log(b) %>%
  #step_normalize(all_numeric(), -target) %>% # no difference
  #step_log(crim) %>% # worse than step_discretize
  #step_mutate() %>%
  #step_num2factor() %>%
  #update_role() %>%
  #step_date() %>% # new feature - convert date data into one or more factor or numeric variables
  #step_naomit() %>% # remove rows with NAs
  #step_unknown() %>% # replace NA with a factor level "unknown"
  #step_medianimpute(all_numeric()) %>% # impute missing values with the training-set median
  step_nzv(all_predictors()) %>% # near-zero variance filter
  check_missing(all_predictors()) # fail if any predictor still has missing values
prepped <- prep(rec) # estimate the recipe steps on the training data

train <- prepped %>%
  juice() # the processed training set (equivalently bake(prepped, new_data = NULL) in newer recipes)

test <- prepped %>%
  bake(new_data = dat_test) # apply the same trained steps to the test set
fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train)
summary(fit$fit)
##
## Call:
## stats::lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6308 -2.6785 -0.2934 1.9844 25.0845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.694900 6.270750 8.563 0.000000000000000315 ***
## crimbin2 1.704331 0.654945 2.602 0.009640 **
## crimbin3 1.725139 1.178744 1.464 0.144182
## zn 0.023256 0.016129 1.442 0.150207
## indus -1.064224 0.594312 -1.791 0.074175 .
## chas1 3.180118 0.920442 3.455 0.000615 ***
## nox -15.506436 4.296957 -3.609 0.000351 ***
## rm 2.314867 0.472419 4.900 0.000001444960425653 ***
## agebin2 0.551932 0.753163 0.733 0.464140
## agebin3 1.107391 0.961285 1.152 0.250081
## dis -1.306967 0.212133 -6.161 0.000000001915085965 ***
## rad 0.217731 0.069174 3.148 0.001782 **
## tax -0.010466 0.003615 -2.895 0.004015 **
## ptratio -0.786107 0.137934 -5.699 0.000000024894379571 ***
## b 1.136481 0.380741 2.985 0.003028 **
## lstat -9.177931 0.687780 -13.344 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.409 on 364 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.769
## F-statistic: 85.1 on 15 and 364 DF, p-value: < 0.00000000000000022
We calculate evaluation metrics on both the train and test sets.
NB: if the model performs much better on train than on test, it is likely overfitting. GnT::one_metrics() is a helper from a personal package; a plain-yardstick equivalent is sketched after its output below.
GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
## .metric .estimate
## <chr> <dbl>
## 1 rmse 3.88
## 2 rsq 0.826
## 3 mae 2.85
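Assuming GnT::one_metrics() is just a thin wrapper around yardstick, the same three metrics can be obtained directly (a sketch, not GnT's actual implementation):

# sketch: yardstick-only equivalent of the helper call above
dat_test %>%
  bind_cols(predict(fit, new_data = test)) %>% # adds a .pred column
  yardstick::metrics(truth = target, estimate = .pred) # rmse, rsq and mae for a numeric outcome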
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train)) # adds a .pred column

# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics into one table
scores <- list(results_train, results_test) %>%
  map(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  reduce(left_join, by = c('.metric', '.estimator')) %>%
  select(-.estimator) %>%
  set_names('Metric', 'Train Score', 'Test Score')

knitr::kable(scores)
| Metric | Train Score | Test Score |
|--------|-------------|------------|
| rmse   | 4.3149490   | 3.8807636  |
| rsq    | 0.7781203   | 0.8263787  |
| mae    | 3.1019493   | 2.8462026  |
GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
GnT::save_mod(fit, scores, params$outpath, params$name)
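plot_actual_vs_pred() and save_mod() also come from the personal GnT package. Assuming they amount to a scatter plot of actual vs. predicted values and an RDS dump of the model and its scores, hypothetical equivalents with plain ggplot2 and base R might look like:

# sketch: assumed equivalents of the GnT helpers, not their actual code
# actual vs predicted on the test set, with a 45-degree reference line (ggplot2 is attached via tidyverse)
ggplot(results_test, aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
  labs(x = 'Actual', y = 'Predicted')

# persist the fitted model and its scores together
saveRDS(list(model = fit, scores = scores),
        file = file.path(params$outpath, paste0(params$name, '.rds')))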