This notebook aims to provide a reusable, parameterised regression workflow: read and clean the data, split it into train and test sets, preprocess with a recipe, fit a linear model, and evaluate and save the result.
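Everything below is driven by the notebook parameters: params$dpath (input data), params$target_name (outcome column), and params$outpath / params$name (where and under what name to save the model). A minimal params block in the R Markdown YAML header might look like the following; the values are illustrative assumptions, not the actual configuration:

params:
  dpath: "data/boston.csv"   # assumed path to the input CSV
  target_name: "medv"        # assumed name of the outcome column
  outpath: "output"          # assumed folder for saved artefacts
  name: "lm_baseline"        # assumed model name

With the parameters defined, load the required packages: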
library(skimr) # summary stats
library(DataExplorer) # variable profiling, EDA
library(tidyverse) # data manipulation
library(tidymodels) # modelling
library(highcharter) # interactive visualization
library(janitor) # clean column names, convert to snake case
library(countrycode) # feature engineering - from country to continent
library(knitr) # tables with kable()
dat <- read_csv(params$dpath) %>%
  rename(target = all_of(params$target_name)) %>% # rename the outcome column named in params to 'target'
  clean_names()
dim(dat)
## [1] 506 14
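skimr and DataExplorer are loaded above for quick profiling; an optional EDA pass before modelling could look like this sketch (output not shown):

# quick exploratory summaries of the raw data
skim(dat)         # per-variable summary statistics
plot_intro(dat)   # variable types, completeness, memory footprint
plot_missing(dat) # share of missing values per column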
set.seed(42)
sss <- initial_split(dat, prop = 3/4)
dat_train <- training(sss)
dat_test <- testing(sss)
dim(dat_train)
## [1] 380 14
dim(dat_test)
## [1] 126 14
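For a numeric target it can help to stratify the split, and resamples of the training data are useful if the model is tuned later. Neither is used for the results below; this is just a sketch of the options:

# stratified alternative to the split above
sss <- initial_split(dat, prop = 3/4, strata = target)

# 5-fold cross-validation on the training set (e.g. for tune_grid())
folds <- vfold_cv(dat_train, v = 5)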
Create a recipe of preprocessing steps. The recipe is prepped on the training data only and then applied to the train and test sets separately, so no information from the test set leaks into the preprocessing.
# Example recipe
rec <- dat_train %>%
  recipe(target ~ .) %>%
  #update_role() %>%
  #step_date() %>% # new feature - convert date data into one or more factor or numeric variables
  #step_naomit() %>% # remove rows with NA
  #step_unknown() %>% # replace NA with a factor level "unknown"
  #step_impute_median(all_numeric_predictors()) %>% # impute missing values with the training-set median
  #step_nzv(all_predictors()) %>% # near-zero variance filter
  check_missing(all_predictors()) # fail at prep/bake if any predictor still has missing values
prepped <- prep(rec) # estimate the recipe steps on the training data
train <- prepped %>%
  juice() # processed training data (equivalent to bake(prepped, new_data = NULL))
test <- prepped %>%
  bake(new_data = dat_test) # apply the training-estimated steps to the test set
fit <- linear_reg(mode = 'regression') %>%
  parsnip::set_engine(engine = 'lm') %>%
  fit(formula = formula(prepped), data = train) # formula(prepped) recovers target ~ <predictors> from the recipe
summary(fit$fit)
##
## Call:
## stats::lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.0902 -2.8351 -0.5495 1.8994 25.1352
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.4595341 5.8819560 6.199 0.0000000015372045 ***
## crim -0.1195332 0.0370736 -3.224 0.001377 **
## zn 0.0503486 0.0159338 3.160 0.001710 **
## indus 0.0250089 0.0713607 0.350 0.726197
## chas 3.2822586 1.0027446 3.273 0.001164 **
## nox -18.0882809 4.4724074 -4.044 0.0000639962635882 ***
## rm 3.8163814 0.4840351 7.885 0.0000000000000367 ***
## age 0.0006868 0.0156074 0.044 0.964926
## dis -1.4945325 0.2323994 -6.431 0.0000000003967085 ***
## rad 0.3330457 0.0747465 4.456 0.0000111299357289 ***
## tax -0.0139224 0.0041778 -3.332 0.000949 ***
## ptratio -0.9239854 0.1497387 -6.171 0.0000000018039734 ***
## b 0.0087842 0.0033096 2.654 0.008297 **
## lstat -0.5085857 0.0579310 -8.779 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.82 on 366 degrees of freedom
## Multiple R-squared: 0.7334, Adjusted R-squared: 0.7239
## F-statistic: 77.43 on 13 and 366 DF, p-value: < 0.00000000000000022
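The parsnip fit object also works with broom (attached via tidymodels), which gives tidier summaries than summary(fit$fit):

tidy(fit)   # coefficient table as a tibble
glance(fit) # model-level statistics (R-squared, sigma, AIC, ...)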
We calculate evaluation metrics on both train and test sets.
NB: If the model performs much better on train than on test, it is likely to be overfitting.
GnT::one_metrics(vec_actual = select(dat_test, target), vec_pred = predict(fit, new_data = test))
## # A tibble: 3 x 2
## .metric .estimate
## <chr> <dbl>
## 1 rmse 4.54
## 2 rsq 0.760
## 3 mae 3.10
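one_metrics comes from a personal helper package (GnT); judging by the output it reports RMSE, R-squared and MAE. A rough yardstick equivalent, assuming that is all it does, would be:

reg_metrics <- metric_set(rmse, rsq, mae)
dat_test %>%
  bind_cols(predict(fit, new_data = test)) %>%
  reg_metrics(truth = target, estimate = .pred)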
# train
results_train <- dat_train %>%
  bind_cols(predict(fit, new_data = train))
# test
results_test <- dat_test %>%
  bind_cols(predict(fit, new_data = test))
# combine train and test metrics side by side
scores <- list(results_train, results_test) %>%
  map(~ yardstick::metrics(.x, truth = target, estimate = .pred)) %>%
  reduce(left_join, by = c('.metric', '.estimator')) %>%
  select(-.estimator) %>%
  set_names('Metric', 'Train Score', 'Test Score')
kable(scores)
| Metric | Train Score | Test Score |
|--------|-------------|------------|
| rmse   | 4.7302427   | 4.5425316  |
| rsq    | 0.7333552   | 0.7601197  |
| mae    | 3.3471832   | 3.1049412  |
GnT::plot_actual_vs_pred(data = results_test, xvar = 'target', yvar = '.pred')
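plot_actual_vs_pred is also from the personal GnT package; a minimal ggplot2 sketch of the same idea (predicted vs. actual with a 45-degree reference line) could be:

results_test %>%
  ggplot(aes(x = target, y = .pred)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
  labs(x = 'Actual target', y = 'Predicted')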
GnT::save_mod(fit, scores, params$outpath, params$name)
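save_mod is likewise a GnT helper; a simple stand-in that writes the fitted model and the scores table to params$outpath might be (the file naming is an assumption):

# save model and scores as a single RDS file, e.g. output/lm_baseline.rds
saveRDS(
  list(model = fit, scores = scores),
  file = file.path(params$outpath, paste0(params$name, '.rds'))
)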