This notebook aims to provide a reusable, parameterised regression workflow: read and clean the data, split it into train and test sets, preprocess with a recipe, fit a linear model, and evaluate and save the result.
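Everything below is driven by the notebook parameters: params$dpath (input data), params$target_name (outcome column), and params$outpath / params$name (where and under what name to save the model). A minimal params block in the R Markdown YAML header might look like the following; the values are illustrative assumptions, not the actual configuration:

params:
  dpath: "data/boston.csv"   # assumed path to the input CSV
  target_name: "medv"        # assumed name of the outcome column
  outpath: "output"          # assumed folder for saved artefacts
  name: "lm_baseline"        # assumed model name

With the parameters defined, load the required packages: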
library(skimr) # summary stats
library(DataExplorer) # variable profiling, EDA
library(tidyverse) # data manipulation
library(tidymodels) # modelling
library(highcharter) # interactive visualization
library(janitor) # clean column names, convert to snake case
library(countrycode) # feature engineering - from country to continent
library(knitr) # tables with kable()
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>% # rename the outcome column named in params to 'target'
  clean_names()
dim(dat)
## [1] 506 14
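skimr and DataExplorer are loaded above for quick profiling; an optional EDA pass before modelling could look like this sketch (output not shown):

# quick exploratory summaries of the raw data
skim(dat)         # per-variable summary statistics
plot_intro(dat)   # variable types, completeness, memory footprint
plot_missing(dat) # share of missing values per column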
set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
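For a numeric target it can help to stratify the split, and resamples of the training data are useful if the model is tuned later. Neither is used for the results below; this is just a sketch of the options:

# stratified alternative to the split above
sss <- initial_split(dat, prop = 3/4, strata = target)

# 5-fold cross-validation on the training set (e.g. for tune_grid())
folds <- vfold_cv(dat_train, v = 5)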
Create a recipe of preprocessing steps. The recipe is prepped on the training data only and then applied to the train and test sets separately, so no information from the test set leaks into the preprocessing.
# Example recipe
rec <- dat_train %>%
  recipe(target ~ .) %>%
  #update_role() %>%
  #step_date() %>% # new feature - convert date data into one or more factor or numeric variables
  #step_naomit() %>% # remove rows with NA
  #step_unknown() %>% # replace NA with a factor level "unknown"
  #step_impute_median(all_numeric_predictors()) %>% # impute missing values with the training-set median
  #step_nzv(all_predictors()) %>% # near-zero variance filter
  check_missing(all_predictors()) # fail at prep/bake if any predictor still has missing values
prepped <- prep(rec) # estimate the recipe steps on the training data
train <- prepped %>%
  juice() # processed training data (equivalent to bake(prepped, new_data = NULL))
test <- prepped %>%
  bake(new_data = dat_test) # apply the training-estimated steps to the test set
fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train) # formula(prepped) recovers target ~ <predictors> from the recipe
summary(fit$fit)
##
## Call:
## stats::lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0902 -2.8351 -0.5495 1.8994 25.1352
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.4595341 5.8819560 6.199 0.0000000015372045 ***
## crim -0.1195332 0.0370736 -3.224 0.001377 **
## zn 0.0503486 0.0159338 3.160 0.001710 **
## indus 0.0250089 0.0713607 0.350 0.726197
## chas 3.2822586 1.0027446 3.273 0.001164 **
## nox -18.0882809 4.4724074 -4.044 0.0000639962635882 ***
## rm 3.8163814 0.4840351 7.885 0.0000000000000367 ***
## age 0.0006868 0.0156074 0.044 0.964926
## dis -1.4945325 0.2323994 -6.431 0.0000000003967085 ***
## rad 0.3330457 0.0747465 4.456 0.0000111299357289 ***
## tax -0.0139224 0.0041778 -3.332 0.000949 ***
## ptratio -0.9239854 0.1497387 -6.171 0.0000000018039734 ***
## b 0.0087842 0.0033096 2.654 0.008297 **
## lstat -0.5085857 0.0579310 -8.779 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.82 on 366 degrees of freedom
## Multiple R-squared: 0.7334, Adjusted R-squared: 0.7239
## F-statistic: 77.43 on 13 and 366 DF, p-value: < 0.00000000000000022
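The parsnip fit object also works with broom (attached via tidymodels), which gives tidier summaries than summary(fit$fit):

tidy(fit)   # coefficient table as a tibble
glance(fit) # model-level statistics (R-squared, sigma, AIC, ...)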
We calculate evaluation metrics on both train and test sets.
NB: If the model performs much better on train than on test, it is likely to be overfitting.
GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
## .metric .estimate
## <chr> <dbl>
## 1 rmse 4.54
## 2 rsq 0.760
## 3 mae 3.10
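one_metrics comes from a personal helper package (GnT); judging by the output it reports RMSE, R-squared and MAE. A rough yardstick equivalent, assuming that is all it does, would be:

reg_metrics <- metric_set(rmse, rsq, mae)
dat_test %>%
  bind_cols(predict(fit, new_data = test)) %>%
  reg_metrics(truth = target, estimate = .pred)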
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics side by side
scores <- list(results_train, results_test) %>%
  map(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  reduce(left_join, by = c('.metric', '.estimator')) %>%
  select(-.estimator) %>%
  set_names('Metric', 'Train Score', 'Test Score')
kable(scores)
| Metric | Train Score | Test Score |
|--------|-------------|------------|
| rmse   | 4.7302427   | 4.5425316  |
| rsq    | 0.7333552   | 0.7601197  |
| mae    | 3.3471832   | 3.1049412  |
GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
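plot_actual_vs_pred is also from the personal GnT package; a minimal ggplot2 sketch of the same idea (predicted vs. actual with a 45-degree reference line) could be:

results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
  labs(x = 'Actual target', y = 'Predicted')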
GnT::save_mod(fit, scores, params$outpath, params$name)
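save_mod is likewise a GnT helper; a simple stand-in that writes the fitted model and the scores table to params$outpath might be (the file naming is an assumption):

# save model and scores as a single RDS file, e.g. output/lm_baseline.rds
saveRDS(
  list(model = fit, scores = scores),
  file = file.path(params$outpath, paste0(params$name, '.rds'))
)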