Overview

The notebook aims to:

  1. Train a random forest model
  2. Evaluate model performance
  3. Save the model object as an asset

Load Libraries

library(skimr)          # summary stats
library(tidyverse)      # data manipulation
library(tidymodels)     # modelling
library(highcharter)    # interactive visualization
library(janitor)        # clean column names, convert to snake case

Import Data

dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>%   # rename the response column to `target`
  clean_names()                                      # convert column names to snake case
dim(dat)
## [1] 506  14
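
The `dpath` and `target_name` values come from the document's `params`, supplied through the R Markdown YAML header. For running the chunks interactively, an equivalent list can be defined by hand; the values below are illustrative placeholders only, not the actual parameters of this run.

# hypothetical stand-in for the YAML `params` when running interactively
params <- list(
  dpath       = "data/input.csv",   # placeholder path to the input CSV
  target_name = "medv",             # placeholder name of the response column
  outpath     = "models/",          # placeholder output directory
  name        = "rf_model"          # placeholder asset name
)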

Preprocessing

Train and Test Split

set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)

dim(dat_train)
## [1] 380  14
dim(dat_test)
## [1] 126  14
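
If the target is skewed, the split can optionally be stratified on it so that train and test cover a similar range of values. A minimal variation using rsample's `strata` argument (not used in the rest of this notebook):

# optional: stratify the split on the response
set.seed(42)
sss_strat       <- initial_split(dat, prop = 3/4, strata = target)
dat_train_strat <- training(sss_strat)
dat_test_strat  <- testing(sss_strat)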

Recipe

Create a recipe of preprocessing steps, prep it on the training data, and then apply it to the train and test sets separately.

rec <- dat_train %>%
    recipe(target ~ ., data = .) %>%
    step_log(lstat) %>%                  # log-transform lstat
    step_nzv(all_predictors()) %>%       # near-zero variance filter
    check_missing(all_predictors())      # check for missing values

prepped <- prep(rec)

train <- prepped %>%
  juice()

test <- prepped %>%
  bake(new_data = dat_test)
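
Before modelling, it is worth confirming that the recipe behaved as expected (lstat logged, no predictors unexpectedly dropped by the near-zero-variance filter). A quick check using packages already loaded above:

tidy(prepped)    # list the trained preprocessing steps
skim(train)      # summary statistics of the preprocessed training data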

Random Forest

fit <- rand_forest(mode = 'regression') %>%
  parsnip::set_engine(engine = 'randomForest') %>%   # ranger is an alternative engine
  fit(formula = formula(prepped), data = train)

fit$fit
## 
## Call:
##  randomForest(x = as.data.frame(x), y = y) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 11.33013
##                     % Var explained: 86.5
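
The chunk above notes ranger as an alternative engine. A sketch of that swap, with the hyperparameters set explicitly to the values reported in the randomForest output (500 trees, 4 variables per split); this block is illustrative and not run here:

# alternative engine: ranger with explicit hyperparameters and permutation importance
fit_ranger <- rand_forest(mode = 'regression', mtry = 4, trees = 500) %>%
  parsnip::set_engine(engine = 'ranger', importance = 'permutation') %>%
  fit(formula = formula(prepped), data = train)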

Evaluation

We calculate evaluation metrics on both train and test sets.

NB: If the model performs much better on train than on test, it is likely to be overfitting.

Metrics

# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics into one table
scores <- list(results_train, results_test) %>%
  map_dfc(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  select(-contains('.estimator')) %>%   # drop the estimator columns
  select(-.metric1) %>%                 # drop the duplicated metric-name column
  set_names('Metric', 'Train Score', 'Test Score')
knitr::kable(scores)
Metric    Train Score   Test Score
rmse        1.4917752    3.0489169
rsq         0.9780070    0.8950499
mae         0.9559566    2.2214201
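
The same three metrics can also be computed with an explicit yardstick metric set, which makes it easier to add or remove metrics later. A small equivalent sketch for the test set:

# equivalent computation via an explicit metric set
reg_metrics <- yardstick::metric_set(rmse, rsq, mae)
reg_metrics(results_test, truth = target, estimate = .pred)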

Comparison of Actuals vs Predictions

GnT::plot_actual_vs_pred(data = results_test, xvar = "target", yvar = ".pred")
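
GnT::plot_actual_vs_pred is an internal helper; where it is not available, a plain ggplot2 equivalent (ggplot2 is attached via tidyverse) could look roughly like this:

results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +                                     # one point per test observation
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +  # perfect-prediction line
  labs(x = 'Actual', y = 'Predicted', title = 'Actuals vs predictions (test set)')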

Variable Importance
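
No code survives in this section. As a hedged sketch, importance scores can be pulled straight from the underlying randomForest object stored in fit$fit:

randomForest::importance(fit$fit)    # impurity-based importance per predictor
randomForest::varImpPlot(fit$fit)    # dot plot of the same scores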

Save Model Object

GnT::save_mod(fit, scores, params$outpath, params$name)
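
GnT::save_mod is an internal helper; if it is unavailable, a minimal base-R fallback could save the same objects (the file naming below is illustrative):

# fallback: serialise the fitted model and its scores together
saveRDS(
  list(model = fit, scores = scores),
  file = file.path(params$outpath, paste0(params$name, '.rds'))
)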