Overview

The notebook aims to:

  1. Train a random forest model
  2. Evaluate model performance
  3. Save the model object as an asset

Load Libraries

library(skimr)          # summary stats
library(tidyverse)      # data manipulation
library(tidymodels)     # modelling
library(highcharter)    # interactive visualization
library(janitor)        # clean column names, convert to snake case

Import Data

dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>%   # rename the response column to `target`
  clean_names()                                      # convert column names to snake case
dim(dat)
## [1] 506  14
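
The `dpath` and `target_name` values come from the document's `params`, supplied through the R Markdown YAML header. For running the chunks interactively, an equivalent list can be defined by hand; the values below are illustrative placeholders only, not the actual parameters of this run.

# hypothetical stand-in for the YAML `params` when running interactively
params <- list(
  dpath       = "data/input.csv",   # placeholder path to the input CSV
  target_name = "medv",             # placeholder name of the response column
  outpath     = "models/",          # placeholder output directory
  name        = "rf_model"          # placeholder asset name
)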

Preprocessing

Train and Test Split

set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)

dim(dat_train)
## [1] 380  14
dim(dat_test)
## [1] 126  14
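
If the target is skewed, the split can optionally be stratified on it so that train and test cover a similar range of values. A minimal variation using rsample's `strata` argument (not used in the rest of this notebook):

# optional: stratify the split on the response
set.seed(42)
sss_strat       <- initial_split(dat, prop = 3/4, strata = target)
dat_train_strat <- training(sss_strat)
dat_test_strat  <- testing(sss_strat)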

Recipe

Create a recipe of preprocessing steps, prep it on the training data, and then apply it to the train and test sets separately.

rec <- dat_train %>%
    recipe(target ~ ., data = .) %>%
    step_log(lstat) %>%                  # log-transform lstat
    step_nzv(all_predictors()) %>%       # near-zero variance filter
    check_missing(all_predictors())      # check for missing values

prepped <- prep(rec)

train <- prepped %>%
  juice()

test <- prepped %>%
  bake(new_data = dat_test)
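
Before modelling, it is worth confirming that the recipe behaved as expected (lstat logged, no predictors unexpectedly dropped by the near-zero-variance filter). A quick check using packages already loaded above:

tidy(prepped)    # list the trained preprocessing steps
skim(train)      # summary statistics of the preprocessed training data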

Random Forest

fit <- rand_forest(mode = 'regression') %>%
  parsnip::set_engine(engine = 'randomForest') %>%   # ranger is an alternative engine
  fit(formula = formula(prepped), data = train)

fit$fit
## 
## Call:
##  randomForest(x = as.data.frame(x), y = y) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.33013
##                     % Var explained: 86.5
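
The chunk above notes ranger as an alternative engine. A sketch of that swap, with the hyperparameters set explicitly to the values reported in the randomForest output (500 trees, 4 variables per split); this block is illustrative and not run here:

# alternative engine: ranger with explicit hyperparameters and permutation importance
fit_ranger <- rand_forest(mode = 'regression', mtry = 4, trees = 500) %>%
  parsnip::set_engine(engine = 'ranger', importance = 'permutation') %>%
  fit(formula = formula(prepped), data = train)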

Evaluation

We calculate evaluation metrics on both train and test sets.

NB: If the model performs much better on train than on test, it is likely to be overfitting.

Metrics

# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics into one table
scores <- list(results_train, results_test) %>%
  map_dfc(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  select(-contains('.estimator')) %>%   # drop the estimator columns
  select(-.metric1) %>%                 # drop the duplicated metric-name column
  set_names('Metric', 'Train Score', 'Test Score')
knitr::kable(scores)
Metric    Train Score   Test Score
rmse        1.4917752    3.0489169
rsq         0.9780070    0.8950499
mae         0.9559566    2.2214201
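
The same three metrics can also be computed with an explicit yardstick metric set, which makes it easier to add or remove metrics later. A small equivalent sketch for the test set:

# equivalent computation via an explicit metric set
reg_metrics <- yardstick::metric_set(rmse, rsq, mae)
reg_metrics(results_test, truth = target, estimate = .pred)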

Comparison of Actuals vs Predictions

GnT::plot_actual_vs_pred(data = results_test, xvar = "target", yvar = ".pred")
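
GnT::plot_actual_vs_pred is an internal helper; where it is not available, a plain ggplot2 equivalent (ggplot2 is attached via tidyverse) could look roughly like this:

results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +                                     # one point per test observation
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +  # perfect-prediction line
  labs(x = 'Actual', y = 'Predicted', title = 'Actuals vs predictions (test set)')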

Variable Importance
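
No code survives in this section. As a hedged sketch, importance scores can be pulled straight from the underlying randomForest object stored in fit$fit:

randomForest::importance(fit$fit)    # impurity-based importance per predictor
randomForest::varImpPlot(fit$fit)    # dot plot of the same scores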

Save Model Object

GnT::save_mod(fit, scores, params$outpath, params$name)
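
GnT::save_mod is an internal helper; if it is unavailable, a minimal base-R fallback could save the same objects (the file naming below is illustrative):

# fallback: serialise the fitted model and its scores together
saveRDS(
  list(model = fit, scores = scores),
  file = file.path(params$outpath, paste0(params$name, '.rds'))
)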