Overview

The notebook aims to:

  1. Train a baseline model
  2. Evaluate model performance
  3. Save the model object as an asset

Load libraries

library(skimr)          # summary stats
library(DataExplorer)   # variable profiling, EDA
library(tidyverse)      # data manipulation
library(tidymodels)     # modelling
library(highcharter)    # interactive visualization
library(janitor)        # clean column names, convert to snake case
library(countrycode)    # feature engineering - from country to continent
library(knitr)          # render tables with kable()

Import data

dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>%  # target column set via params
  clean_names()
dim(dat)
## [1] 506  14
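
The params values come from the notebook's YAML header. For an interactive run they can be stubbed with a plain list; the values below are illustrative assumptions, not the ones used to render this notebook.

# Hypothetical stand-in for the YAML params block (interactive runs only).
# Every value here is an assumption for illustration.
params <- list(
  dpath       = 'data/input.csv',   # path to the input CSV (assumed)
  target_name = 'medv',             # column to rename to `target` (assumed)
  outpath     = 'models',           # directory for the saved model (assumed)
  name        = 'baseline_lm'       # name for the saved asset (assumed)
)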

Preprocessing

Train and test split

set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)

dim(dat_train)
## [1] 380  14
dim(dat_test)
## [1] 126  14
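
initial_split() samples rows completely at random. For a regression target it can help to stratify on the outcome so that the train and test distributions match; a minimal variant is sketched below (rsample bins a numeric strata variable into quartiles by default).

# Optional: outcome-stratified split (not used for the results below).
sss_strat <- initial_split(dat, prop = 3/4, strata = target)
dat_train_strat <- training(sss_strat)
dat_test_strat  <- testing(sss_strat)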

Recipe

Create a recipe of preprocessing steps, then apply it to the train and test sets separately.

# Example recipe
rec <- dat_train %>%
  recipe(target ~ ., data = .) %>%
  #update_role() %>%
  #step_date() %>%                        # new feature - convert date data into factor or numeric variables
  #step_naomit() %>%                      # remove observations with NAs
  #step_unknown() %>%                     # replace NA with a factor level "unknown"
  #step_impute_median(all_numeric()) %>%  # impute missing values with the training-set median
  #step_nzv(all_predictors()) %>%         # near-zero variance filter
  check_missing(all_predictors())         # fail if any predictor has missing values

prepped <- prep(rec)

train <- prepped %>%
  juice()

test <- prepped %>%
  bake(new_data = dat_test)
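
prep() estimates all preprocessing parameters from the training data only, and bake() applies those frozen estimates to new data, so no information from the test set leaks into the preprocessing. In recent releases of recipes, juice() is superseded and the processed training set can equivalently be retrieved with bake():

# Equivalent to juice() in recent recipes releases.
train <- prepped %>%
  bake(new_data = NULL)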

Linear regression

# Fit a plain lm through parsnip, using the formula stored in the prepped recipe
fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train)

summary(fit$fit)
## 
## Call:
## stats::lm(formula = formula, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.0902  -2.8351  -0.5495   1.8994  25.1352 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  36.4595341   5.8819560   6.199   0.0000000015372045 ***
## crim         -0.1195332   0.0370736  -3.224             0.001377 ** 
## zn            0.0503486   0.0159338   3.160             0.001710 ** 
## indus         0.0250089   0.0713607   0.350             0.726197    
## chas          3.2822586   1.0027446   3.273             0.001164 ** 
## nox         -18.0882809   4.4724074  -4.044   0.0000639962635882 ***
## rm            3.8163814   0.4840351   7.885   0.0000000000000367 ***
## age           0.0006868   0.0156074   0.044             0.964926    
## dis          -1.4945325   0.2323994  -6.431   0.0000000003967085 ***
## rad           0.3330457   0.0747465   4.456   0.0000111299357289 ***
## tax          -0.0139224   0.0041778  -3.332             0.000949 ***
## ptratio      -0.9239854   0.1497387  -6.171   0.0000000018039734 ***
## b             0.0087842   0.0033096   2.654             0.008297 ** 
## lstat        -0.5085857   0.0579310  -8.779 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.82 on 366 degrees of freedom
## Multiple R-squared:  0.7334, Adjusted R-squared:  0.7239 
## F-statistic: 77.43 on 13 and 366 DF,  p-value: < 0.00000000000000022
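
summary() prints the full lm output; the broom accessors re-exported by tidymodels return the same information as tibbles, which is easier to use downstream:

# Tidy views of the same model (broom is loaded with tidymodels).
tidy(fit)        # coefficient table for the parsnip fit
glance(fit$fit)  # one-row fit summary of the underlying lm object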

Evaluation

We calculate evaluation metrics on both the train and test sets.

NB: if the model performs much better on train than on test, it is likely overfitting.

Metrics

GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
##   .metric .estimate
##   <chr>       <dbl>
## 1 rmse        4.54 
## 2 rsq         0.760
## 3 mae         3.10
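
GnT::one_metrics() is an internal helper; the same three test-set metrics can be reproduced with yardstick alone, where metric_set() bundles them into one callable (a sketch, assuming this is roughly what the helper wraps):

# yardstick-only equivalent of the three test-set metrics above.
reg_metrics <- metric_set(rmse, rsq, mae)
test %>%
  bind_cols(predict(fit, new_data = test)) %>%
  reg_metrics(truth = target, estimate = .pred)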
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))   # adds a .pred column
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine; map_dfc() suffixes the duplicated column names (".metric1" here)
scores <- list(results_train, results_test) %>%
  map_dfc(~ yardstick::metrics(data = .x, truth = target, estimate = .pred)) %>%
  select(-contains('.estimator')) %>%
  select(-.metric1) %>%
  set_names('Metric', 'Train Score', 'Test Score')
kable(scores)
Metric    Train Score   Test Score
rmse        4.7302427    4.5425316
rsq         0.7333552    0.7601197
mae         3.3471832    3.1049412

Comparison of actuals vs. predictions

GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
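
GnT::plot_actual_vs_pred() is also internal; a minimal ggplot2 equivalent (assuming results_test carries the target and .pred columns created above):

# Actual-vs-predicted scatter; the dashed identity line marks perfect predictions.
results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(linetype = 'dashed') +
  labs(x = 'Actual', y = 'Predicted')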

Save model object

GnT::save_mod(fit, scores, params$outpath, params$name)
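
GnT::save_mod() is internal as well; a base-R fallback that bundles the fit and its scores into one .rds file (assuming params$outpath is an existing directory and params$name a file stem) could look like:

# Fallback without GnT: serialize model and evaluation scores together.
saveRDS(
  list(model = fit, scores = scores),
  file = file.path(params$outpath, paste0(params$name, '.rds'))
)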