This notebook trains and evaluates a random forest regression model on the dataset supplied via `params`, then saves the fitted model and its evaluation scores.
library(skimr) # summary stats
library(tidyverse) # data manipulation
library(tidymodels) # modelling
library(highcharter) # interactive visualization
library(janitor) # clean column names, convert to snake case
# Load the raw data, expose the modelling target under the fixed name
# `target`, and normalise all column names to snake_case.
# NOTE: `params$target_name` is a character string held in an external
# variable, so it must be wrapped in all_of() — passing it bare to rename()
# is deprecated tidyselect behaviour and emits a warning.
# (rename runs before clean_names(), so `params$target_name` must match the
# raw CSV header, not its snake_case form.)
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>%
  clean_names()
dim(dat)
## [1] 506 14
## Preprocessing
# Reproducible 75/25 train/test split.
set.seed(42)
data_split <- initial_split(dat, prop = 3/4)
dat_train <- training(data_split)
dat_test <- testing(data_split)
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
Create a recipe of steps to be taken. Apply the recipe to train and test separately.
# Preprocessing recipe, defined on the training data only so that no
# information leaks from the test set.
rec <- recipe(target ~ ., data = dat_train) %>%
  step_log(lstat) %>%                # log-transform lstat
  step_nzv(all_predictors()) %>%     # drop near-zero-variance predictors
  check_missing(all_predictors())    # error if any predictor contains NA
# Estimate the preprocessing parameters on the training data, then apply
# the recipe to each split.
# bake(new_data = NULL) returns the processed training set and supersedes
# juice() in current recipes releases.
prepped <- prep(rec)
train <- bake(prepped, new_data = NULL)
test <- bake(prepped, new_data = dat_test)
# Fit a random forest regression model on the preprocessed training data.
# NOTE(review): the variable name `fit` shadows the parsnip generic fit()
# called at the end of this pipeline; it works, but a name like `rf_fit`
# would be clearer. Kept as-is because `fit` is referenced later in the file.
fit <- rand_forest(mode = 'regression') %>%
parsnip::set_engine(engine = 'randomForest') %>% # alternative engine: 'ranger'
fit(formula=formula(prepped), data = train)
# Print the underlying randomForest object (trees, mtry, OOB error).
fit$fit
##
## Call:
## randomForest(x = as.data.frame(x), y = y)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.33013
## % Var explained: 86.5
We calculate evaluation metrics on both train and test sets.
NB: If the model performs much better on train than on test, it is likely to be overfitting.
# Attach model predictions to the raw train and test frames.
# Predictions are made on the *preprocessed* data (`train`/`test`) and then
# column-bound to the original rows — assumes the recipe preserves row order
# and drops no rows (check_missing errors rather than dropping; confirm if
# row-filtering steps are ever added).
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# Combine train and test metrics into one table: one row per metric, one
# column per split. Joining on .metric is robust, unlike the previous
# map_dfc() approach, which relied on auto-generated duplicate column names
# (`.metric1`) that vary across dplyr versions.
scores <- list(results_train, results_test) %>%
  map(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  map(~ select(.x, .metric, .estimate)) %>%
  reduce(left_join, by = ".metric") %>%
  set_names("Metric", "Train Score", "Test Score")
# Render the scores table. knitr is never attached with library() in this
# notebook, so call kable() via its namespace to avoid a
# "could not find function" error.
knitr::kable(scores)
Metric | Train Score | Test Score |
---|---|---|
rmse | 1.4917752 | 3.0489169 |
rsq | 0.9780070 | 0.8950499 |
mae | 0.9559566 | 2.2214201 |
# Diagnostic plot: actual vs. predicted values on the held-out test set.
# NOTE(review): GnT appears to be an internal/project package not visible
# from this file — confirm it is installed wherever the notebook runs.
GnT::plot_actual_vs_pred(data = results_test, xvar = "target", yvar = ".pred")
# Persist the fitted model and its scores to params$outpath under params$name.
GnT::save_mod(fit, scores, params$outpath, params$name)