The notebook aims to load a dataset, fit models with H2O AutoML, and evaluate the leading model on train and test sets. First, attach the required packages:
library(skimr)       # summary stats
library(tidyverse)   # data manipulation
library(tidymodels)  # modelling (rsample for the train/test split)
library(highcharter) # interactive visualization
library(janitor)     # clean column names, convert to snake case
library(knitr)       # kable() for rendered tables
library(h2o)         # AutoML
Start by initialising the H2O cluster, then wipe it clean with h2o.removeAll(), just in case a cluster was already running.
h2o.init(nthreads = -1, max_mem_size = "2G")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## C:\Users\ayang\AppData\Local\Temp\RtmpYhgqFN/h2o_ayang_started_from_r.out
## C:\Users\ayang\AppData\Local\Temp\RtmpYhgqFN/h2o_ayang_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 seconds 896 milliseconds
## H2O cluster timezone: Europe/London
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 2 months and 27 days
## H2O cluster name: H2O_started_from_R_ayang_geo020
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.78 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
h2o.removeAll()
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>% # target name arrives as a string, so wrap it in all_of()
  clean_names()
dim(dat)
## [1] 506 14
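skimr is loaded for summary statistics but not used above; a quick first pass over the raw data could look like this (a minimal sketch, output omitted):

skim(dat) # column-level summaries: type, completeness, distribution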
## Preprocessing
# Alternative: split the frame directly in H2O (kept for reference, not run)
# set.seed(42)
# dat_ls = h2o.splitFrame(dat, ratios = .75, destination_frames = c("train", "test"))
#
# dim(dat_ls[[1]])
# dim(dat_ls[[2]])
set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss) %>% as.h2o()
dat_test <- testing(sss) %>% as.h2o()
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
# Run AutoML for 10 base models (limited to 1 hour max runtime by default)
fit <- h2o.automl(y = "target",
                  training_frame = dat_train,
                  max_models = 10,
                  seed = 1)
fit@leader
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model ID: GBM_1_AutoML_20191024_082514
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 54 54 32536 6
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 6 6.00000 26 56 43.25926
##
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.3378408
## RMSE: 0.5812408
## MAE: 0.4687235
## RMSLE: 0.03117013
## Mean Residual Deviance : 0.3378408
## R^2 : 0.995974
##
##
##
## H2ORegressionMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 10.92923
## RMSE: 3.305938
## MAE: 2.058922
## RMSLE: 0.141614
## Mean Residual Deviance : 10.92923
## R^2 : 0.8697566
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## mae 2.0589216 0.15716508 1.9332296 1.8641936
## mean_residual_deviance 10.929227 2.6161783 9.102078 6.848985
## mse 10.929227 2.6161783 9.102078 6.848985
## r2 0.8695487 0.028616937 0.8733862 0.9234232
## residual_deviance 10.929227 2.6161783 9.102078 6.848985
## rmse 3.2589953 0.39254087 3.0169652 2.6170566
## rmsle 0.14046426 0.012733889 0.123100765 0.116575375
## cv_3_valid cv_4_valid cv_5_valid
## mae 1.8574331 2.4107032 2.229049
## mean_residual_deviance 7.950678 15.60169 15.142703
## mse 7.950678 15.60169 15.142703
## r2 0.9029419 0.8258562 0.822136
## residual_deviance 7.950678 15.60169 15.142703
## rmse 2.8196945 3.9498975 3.8913627
## rmsle 0.14330356 0.16099463 0.15834698
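Beyond the leader, the AutoML object carries a leaderboard ranking every model it trained. A minimal sketch for inspecting it (fit@leaderboard is an H2OFrame, so we pull it into R first):

lb <- as.data.frame(fit@leaderboard) # convert the H2OFrame leaderboard to a data.frame
head(lb)                             # top-ranked models with their cross-validation metrics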
We calculate evaluation metrics on both the train and test sets.
NB: if the model performs much better on train than on test, it is likely overfitting.
# train
results_train <- bind_cols(
  as.data.frame(dat_train),
  as.data.frame(h2o.predict(fit, dat_train))
)
# test
results_test <- bind_cols(
  as.data.frame(dat_test),
  as.data.frame(h2o.predict(fit, dat_test))
)
# combine train and test metrics into one table
scores <- list(results_train, results_test) %>%
  map_dfc(~ yardstick::metrics(.x, truth = target, estimate = predict)) %>%
  select(-contains(".estimator")) %>% # drop the .estimator columns from both frames
  select(-.metric1) %>%               # drop the duplicated metric-name column
  set_names("Metric", "Train Score", "Test Score")
kable(scores)
| Metric | Train Score | Test Score |
|:-------|------------:|-----------:|
| rmse   |   0.5812407 |  3.4536875 |
| rsq    |   0.9962012 |  0.8625395 |
| mae    |   0.4687236 |  2.3537990 |
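The map_dfc() column-suffix trick above works but is brittle. An equivalent sketch that builds the same table with map_dfr() and tidyr::pivot_wider() (available in tidyr >= 1.0, part of the loaded tidyverse):

# Sketch: compute metrics per split, then spread the splits into columns
scores_alt <- list(`Train Score` = results_train, `Test Score` = results_test) %>%
  map_dfr(~ yardstick::metrics(.x, truth = target, estimate = predict), .id = "split") %>%
  select(-.estimator) %>%
  tidyr::pivot_wider(names_from = split, values_from = .estimate) %>%
  rename(Metric = .metric)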
# Actual vs predicted on the test set (GnT is a project-specific helper package)
GnT::plot_actual_vs_pred(data = results_test, 'target', yvar = 'predict')
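For readers without the GnT package, a rough ggplot2 equivalent (ggplot2 is attached via the tidyverse) might be:

results_test %>%
  ggplot(aes(x = target, y = predict)) +
  geom_point(alpha = 0.6) +                                    # one point per test observation
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") + # perfect-prediction line
  labs(x = "Actual", y = "Predicted")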
# Persist the fitted AutoML object and the score table (project-specific helper)
GnT::save_mod(fit, scores, params$outpath, params$name)
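With plain H2O, persisting the leading model could instead look like this sketch (h2o.saveModel is part of the h2o package; reusing params$outpath here is an assumption):

h2o.saveModel(fit@leader, path = params$outpath) # writes the model to disk and returns the file path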