Overview

The notebook aims to:

  1. Train models with various feature sets and recipes
  2. Compare model performance

Load libraries

library(skimr)          # summary stats
library(DataExplorer)   # variable profiling, EDA
library(tidyverse)      # data manipulation
library(tidymodels)     # modelling
library(highcharter)    # interactive visualization
library(janitor)        # clean column names, convert to snake case
library(countrycode)    # feature engineering - from country to continent
library(knitr)          # kable() tables (used in the evaluation section)

Import data

dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>%  # all_of() lets rename() take the column name from a string
  clean_names()
dim(dat)
## [1] 506  14
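
The notebook is parameterized; for reference, its YAML header would carry entries along these lines (all values below are hypothetical, not the ones actually used):

params:
  dpath: "data/input.csv"    # hypothetical path to the input CSV
  target_name: "medv"        # hypothetical name of the outcome column
  outpath: "models"          # hypothetical output directory (used when saving)
  name: "lm_baseline"        # hypothetical model name (used when saving)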

Preprocessing

Train and test split

set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)

dim(dat_train)
## [1] 380  14
dim(dat_test)
## [1] 126  14
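
If the target is skewed, rsample's initial_split() also accepts a strata argument, so train and test share a similar outcome distribution (an optional sketch, not used above; numeric targets are binned into quartiles by default):

# Optional: stratified split on the target
sss <- initial_split(dat, prop = 3/4, strata = target)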

Recipe

Create a recipe of preprocessing steps, estimate (prep) them on the training set, then apply the recipe to the train and test sets separately.

# Example recipe
rec <- recipe(target ~ ., data = dat_train) %>%
    step_discretize(age, num_breaks = 3) %>%
    #step_discretize(b, num_breaks=2, min_unique=1) %>%
    step_discretize(crim, num_breaks = 3) %>%
    step_num2factor(chas) %>%            # NB: newer recipes versions also require a `levels` argument here
    step_log(lstat) %>%
    step_log(indus) %>%
    step_log(b) %>%
    #step_normalize(all_numeric(), -target) %>% # no difference
    #step_log(crim) %>% # worse than step_discretize
    #step_mutate() %>%
    #step_num2factor() %>%
    #update_role() %>%
    #step_date() %>%                      # new feature - convert date data into one or more factor or numeric variables
    #step_naomit() %>%                    # remove NA obs
    #step_unknown() %>%                   # replace NA with a factor level "unknown"
    #step_medianimpute(all_numeric()) %>% # impute missing values with train medians (step_impute_median() in newer recipes)
    step_nzv(all_predictors()) %>%        # near-zero variance filter
    check_missing(all_predictors())       # fail if any predictor still has missing values

prepped <- prep(rec)

train <- prepped %>%
  juice()

test <- prepped %>%
  bake(new_data = dat_test)
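
Note that juice() only returns the training data retained inside the prepped recipe; in newer recipes versions the same result is obtained with bake() and new_data = NULL:

# Equivalent to juice() in newer recipes versions
train <- bake(prepped, new_data = NULL)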

Linear regression

fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train)
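
As an aside, the same model can be fitted through the workflows interface (part of tidymodels), which bundles the recipe and model spec and handles prep/bake internally. A sketch under the same spec; wf_fit is a hypothetical name:

# Bundle recipe + model; fit() preps the recipe and trains in one go
wf_fit <- workflow() %>%
  add_recipe(rec) %>%
  add_model(linear_reg() %>% set_engine('lm')) %>%
  fit(data = dat_train)

predict(wf_fit, new_data = dat_test) would then apply the recipe to the raw test rows automatically.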

summary(fit$fit)
## 
## Call:
## stats::lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6308  -2.6785  -0.2934   1.9844  25.0845 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  53.694900   6.270750   8.563 0.000000000000000315 ***
## crimbin2      1.704331   0.654945   2.602             0.009640 ** 
## crimbin3      1.725139   1.178744   1.464             0.144182    
## zn            0.023256   0.016129   1.442             0.150207    
## indus        -1.064224   0.594312  -1.791             0.074175 .  
## chas1         3.180118   0.920442   3.455             0.000615 ***
## nox         -15.506436   4.296957  -3.609             0.000351 ***
## rm            2.314867   0.472419   4.900 0.000001444960425653 ***
## agebin2       0.551932   0.753163   0.733             0.464140    
## agebin3       1.107391   0.961285   1.152             0.250081    
## dis          -1.306967   0.212133  -6.161 0.000000001915085965 ***
## rad           0.217731   0.069174   3.148             0.001782 ** 
## tax          -0.010466   0.003615  -2.895             0.004015 ** 
## ptratio      -0.786107   0.137934  -5.699 0.000000024894379571 ***
## b             1.136481   0.380741   2.985             0.003028 ** 
## lstat        -9.177931   0.687780 -13.344 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.409 on 364 degrees of freedom
## Multiple R-squared:  0.7781, Adjusted R-squared:  0.769 
## F-statistic:  85.1 on 15 and 364 DF,  p-value: < 0.00000000000000022
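
The same coefficient table can be pulled into a tibble with broom (loaded via tidymodels), which is easier to filter and plot than the printed summary:

# Coefficients as a tibble rather than printed summary output
tidy(fit)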

Evaluation

We calculate evaluation metrics on both train and test sets.

NB: if the model performs much better on the train set than on the test set, it is likely overfitting.

Metrics

GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
##   .metric .estimate
##   <chr>       <dbl>
## 1 rmse        3.88 
## 2 rsq         0.826
## 3 mae         2.85
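GnT is a personal helper package; with yardstick alone, the same three metrics can be computed via metric_set() (a sketch; mets is a hypothetical name):

# rmse/rsq/mae in one call
mets <- metric_set(rmse, rsq, mae)
dat_test %>%
  bind_cols(predict(fit, new_data = test)) %>%
  mets(truth = target, estimate = .pred)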
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine: metrics() returns .metric/.estimator/.estimate per set;
# drop the duplicated columns and relabel
scores <- list(results_train, results_test) %>%
  map_dfc(~ yardstick::metrics(truth = target, estimate = .pred, data = .x)) %>%
  select(-contains('.estimator')) %>%
  select(-.metric1) %>%
  set_names('Metric', 'Train Score', 'Test Score')
kable(scores)
Metric   Train Score   Test Score
rmse       4.3149490    3.8807636
rsq        0.7781203    0.8263787
mae        3.1019493    2.8462026

Comparison of actuals vs. predictions

GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
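
GnT::plot_actual_vs_pred() is another personal helper; a plain ggplot2 equivalent would look roughly like this:

# Actuals vs. predictions on the test set; points on the dashed line are perfect
results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, linetype = 'dashed') +
  labs(x = 'Actual', y = 'Predicted')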

Save model object

GnT::save_mod(fit, scores, params$outpath, params$name)
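
GnT::save_mod() is not a public package function; with base R, the same idea is a single saveRDS() call (paths built from the hypothetical params shown earlier):

# Persist the fitted model and its scores together
saveRDS(list(model = fit, scores = scores),
        file = file.path(params$outpath, paste0(params$name, '.rds')))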