The notebook aims to:

- read in and clean the data, then split it into train and test sets
- preprocess the training data with a recipes pipeline
- fit a linear regression model with tidymodels
- evaluate the model on both the train and test sets, plot actual vs. predicted values, and save the model
library(skimr) # summary stats
library(DataExplorer) # variable profiling, EDA
library(tidyverse) # data manipulation
library(tidymodels) # modelling
library(highcharter) # interactive visualization
library(janitor) # clean column names, convert to snake case
library(countrycode) # feature engineering - from country to continent
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>% # params$target_name holds the name of the outcome column
  clean_names()
dim(dat)
## [1] 506 14
set.seed(42) # reproducible split
sss <- initial_split(dat, prop = 3/4) # 75/25 train/test split
dat_train <- training(sss)
dat_test <- testing(sss)
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
Create a recipe of preprocessing steps, estimate (prep) it on the training data, and then apply it to the train and test sets separately.
# Example recipe
rec <- recipe(target ~ ., data = dat_train) %>%
  step_discretize(age, num_breaks = 3) %>%
  #step_discretize(b, num_breaks = 2, min_unique = 1) %>%
  step_discretize(crim, num_breaks = 3) %>%
  step_num2factor(chas) %>% # note: recent recipes versions also require a levels argument here
  step_log(lstat) %>%
  step_log(indus) %>%
  step_log(b) %>%
  #step_normalize(all_numeric(), -target) %>% # no difference
  #step_log(crim) %>% # worse than step_discretize
  #step_mutate() %>%
  #step_num2factor() %>%
  #update_role() %>%
  #step_date() %>% # new feature - convert date data into one or more factor or numeric variables
  #step_naomit() %>% # remove rows with NAs
  #step_unknown() %>% # replace NA with a factor level "unknown"
  #step_medianimpute(all_numeric()) %>% # impute missing values with the training-set median
  step_nzv(all_predictors()) %>% # near-zero variance filter
  check_missing(all_predictors()) # fail if any predictor still has missing values
prepped <- prep(rec) # estimate the recipe steps on the training data

train <- prepped %>%
  juice() # the processed training set (equivalently bake(prepped, new_data = NULL) in newer recipes)

test <- prepped %>%
  bake(new_data = dat_test) # apply the same trained steps to the test set
fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train)
summary(fit$fit)
##
## Call:
## stats::lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6308 -2.6785 -0.2934 1.9844 25.0845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.694900 6.270750 8.563 0.000000000000000315 ***
## crimbin2 1.704331 0.654945 2.602 0.009640 **
## crimbin3 1.725139 1.178744 1.464 0.144182
## zn 0.023256 0.016129 1.442 0.150207
## indus -1.064224 0.594312 -1.791 0.074175 .
## chas1 3.180118 0.920442 3.455 0.000615 ***
## nox -15.506436 4.296957 -3.609 0.000351 ***
## rm 2.314867 0.472419 4.900 0.000001444960425653 ***
## agebin2 0.551932 0.753163 0.733 0.464140
## agebin3 1.107391 0.961285 1.152 0.250081
## dis -1.306967 0.212133 -6.161 0.000000001915085965 ***
## rad 0.217731 0.069174 3.148 0.001782 **
## tax -0.010466 0.003615 -2.895 0.004015 **
## ptratio -0.786107 0.137934 -5.699 0.000000024894379571 ***
## b 1.136481 0.380741 2.985 0.003028 **
## lstat -9.177931 0.687780 -13.344 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.409 on 364 degrees of freedom
## Multiple R-squared: 0.7781, Adjusted R-squared: 0.769
## F-statistic: 85.1 on 15 and 364 DF, p-value: < 0.00000000000000022
We calculate evaluation metrics on both the train and test sets.
NB: if the model performs much better on train than on test, it is likely overfitting. GnT::one_metrics() is a helper from a personal package; a plain-yardstick equivalent is sketched after its output below.
GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
## .metric .estimate
## <chr> <dbl>
## 1 rmse 3.88
## 2 rsq 0.826
## 3 mae 2.85
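Assuming GnT::one_metrics() is just a thin wrapper around yardstick, the same three metrics can be obtained directly (a sketch, not GnT's actual implementation):

# sketch: yardstick-only equivalent of the helper call above
dat_test %>%
  bind_cols(predict(fit, new_data = test)) %>% # adds a .pred column
  yardstick::metrics(truth = target, estimate = .pred) # rmse, rsq and mae for a numeric outcome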
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train)) # adds a .pred column

# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics into one table
scores <- list(results_train, results_test) %>%
  map(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  reduce(left_join, by = c('.metric', '.estimator')) %>%
  select(-.estimator) %>%
  set_names('Metric', 'Train Score', 'Test Score')

knitr::kable(scores)
| Metric | Train Score | Test Score |
|--------|-------------|------------|
| rmse   | 4.3149490   | 3.8807636  |
| rsq    | 0.7781203   | 0.8263787  |
| mae    | 3.1019493   | 2.8462026  |
GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
GnT::save_mod(fit, scores, params$outpath, params$name)
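plot_actual_vs_pred() and save_mod() also come from the personal GnT package. Assuming they amount to a scatter plot of actual vs. predicted values and an RDS dump of the model and its scores, hypothetical equivalents with plain ggplot2 and base R might look like:

# sketch: assumed equivalents of the GnT helpers, not their actual code
# actual vs predicted on the test set, with a 45-degree reference line (ggplot2 is attached via tidyverse)
ggplot(results_test, aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
  labs(x = 'Actual', y = 'Predicted')

# persist the fitted model and its scores together
saveRDS(list(model = fit, scores = scores),
        file = file.path(params$outpath, paste0(params$name, '.rds')))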