Getting Started

First, we will import the needed packages. We will use the tidyverse set of packages for code readability and simplicity, along with tidymodels for building our classification models and ROCR for evaluating them.

library(tidyverse)
library(tidymodels)
library(ROCR)

Next, we need to import the dataset for this session's exercises. We will process it just like we did in Session 1.

df = read_csv('../../Data/S1_data.csv.gz')

Column specification:
cols(
  .default = col_double(),
  Filing = col_character(),
  `date filed_x` = col_character(),
  FYE_x = col_character(),
  restate_filing = col_character(),
  Form = col_character(),
  Date = col_character(),
  loc = col_character()
)
train <- df %>% filter(year < 2004)
test  <- df %>% filter(year == 2004)
print(c(nrow(df), nrow(train), nrow(test)))
[1] 14301 11478  2823
BD_eq <- as.formula(paste("sdvol1 ~ ", paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))
BCE_eq <- as.formula(paste("Restate_Int ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))

SVM

In computer science and many other disciplines, a simple and commonly used type of classifier is the SVM. It is particularly common for algorithms being used as supervised methods to predict some variable in a model.

SVM: Support Vector Machine for Classification

This is the most common use of these algorithms. SVM lets us approach a classification problem, and it can either report the most likely class for each observation or, like logistic regression, report the probability of belonging to a given class.

We will use the implementation from kernlab (https://cran.r-project.org/web/packages/kernlab/index.html), though we will call it through tidymodels via parsnip and tune.

This implementation is rather slow. The example code below uses 10-fold CV; you can lower the v parameter in vfold_cv() (e.g., to 3) if you want it to run faster.

recipe_svm <-
  recipe(BCE_eq, data = train)  %>%
  step_zv(all_predictors()) %>% # remove any zero variance predictors
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept() %>% # Add an intercept to the model
  step_num2factor(all_outcomes(), ordered = T, levels=c("0","1"),
                  transform = function(x) x + 1, skip = TRUE)  # Convert DV to factor
model_svm <-
  svm_linear(cost = tune()) %>%
  set_mode("classification") %>%
  set_engine("kernlab")  # kernlab provides the class probabilities needed for ROC AUC

workflow_svm <- workflow() %>%
  add_model(model_svm) %>%
  add_recipe(recipe_svm)

folds_svm <- vfold_cv(train, v=10)  # from rsample
metrics_svm = metric_set(roc_auc)  # from yardstick
grid_svm <- expand_grid(cost = exp(seq(-10,0, length.out=10)))

svm_fit_tuned <- tune_grid(workflow_svm,
                             grid = grid_svm,
                             resamples = folds_svm,
                             metrics=metrics_svm)

To see which model was best, we can use the show_best() function.
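
show_best(svm_fit_tuned, metric = "roc_auc")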

svm_final <- workflow_svm %>%
  finalize_workflow(
  select_best(svm_fit_tuned, "roc_auc")
) %>%
  fit(train)
 Setting default kernel parameters  
Warning in .local(x, ...) : Variable(s) `' constant. Cannot scale data.
maximum number of iterations reached 0.001801685 0.001785087

Next we can output a prediction just like we did for linear regression, and then use that prediction to obtain our ROC AUC and ROC curve.

# SVM, in-sample
Y_hat_train <- predict(svm_final, train, type="prob")
ROCpred <- prediction(Y_hat_train$.pred_1, train$Restate_Int)
auc <- performance(ROCpred, measure = "auc")
print(paste0('ROC AUC: ', auc@y.values[[1]]))
[1] "ROC AUC: 0.695682643896551"

# SVM, out-of-sample
Y_hat_test <- predict(svm_final, test, type="prob")
ROCpred_out <- prediction(Y_hat_test$.pred_1, test$Restate_Int)
auc_out <- performance(ROCpred_out, measure = "auc")
print(paste0('ROC AUC: ', auc_out@y.values[[1]]))
[1] "ROC AUC: 0.554117586010075"

In this case, we see that out-of-sample performance is much lower.

We can also plot out an ROC curve by calculating the True Positive rate and False Positive Rate. We can use ROCR with ggplot to get a nice visualization of this.
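
ROCperf <- performance(ROCpred, 'tpr','fpr')
ROCperf_out <- performance(ROCpred_out, 'tpr','fpr')

df_roc_in <- data.frame(FalsePositive=c(ROCperf@x.values[[1]]),
                 TruePositive=c(ROCperf@y.values[[1]]))
df_roc_out <- data.frame(FalsePositive=c(ROCperf_out@x.values[[1]]),
                 TruePositive=c(ROCperf_out@y.values[[1]]))

ggplot() +
  geom_line(data=df_roc_in, aes(x=FalsePositive, y=TruePositive, color="In Sample")) +
  geom_line(data=df_roc_out, aes(x=FalsePositive, y=TruePositive, color="Out of Sample")) + 
  geom_abline(slope=1)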

Using XGBoost

Note: The general workflow using parsnip with xgboost is the same as with the SVM. The model name is boost_tree() in this case, however, and of course the parameter list is longer (reference: https://parsnip.tidymodels.org/reference/boost_tree.html).

For simplicity, the workflow below builds only a single XGBoost model using the package itself, rather than the tidymodels interface. As such, we will use the juice() function to extract the data in the format needed by XGBoost.

rec <- recipe(BCE_eq, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept()  # Add an intercept to the model

# Juice our data
prepped <- rec %>% prep(training=train)
train_x <- juice(prepped, all_predictors(), composition = "dgCMatrix")
train_y <- juice(prepped, all_outcomes(), composition = "matrix")
test_prepped <- rec %>% prep(training=test)
test_x <- juice(test_prepped, all_predictors(), composition = "dgCMatrix")
test_y <- juice(test_prepped, all_outcomes(), composition = "matrix")

# Cross validation
set.seed(482342)  #for reproducibility
library(xgboost)

# model setup
params <- list(max_depth=10,
               eta=0.2,
               gamma=10,
               min_child_weight = 5,
               objective =
                 "binary:logistic")

# run the model
xgbCV <- xgb.cv(params=params,
                data=train_x,
                label=train_y,
                nrounds=100,
                eval_metric="auc",
                nfold=10,
                stratified=TRUE)
[1] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[2] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[3] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[4] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[5] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[6] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[7] train-auc:0.500000+0.000000 test-auc:0.500000+0.000000 
[8] train-auc:0.512896+0.038689 test-auc:0.496177+0.011467 
[9] train-auc:0.560039+0.060273 test-auc:0.516633+0.044405 
[10]    train-auc:0.596276+0.048602 test-auc:0.562167+0.076553 
[11]    train-auc:0.618009+0.011237 test-auc:0.601066+0.067260 
[12]    train-auc:0.643567+0.019062 test-auc:0.608147+0.060943 
[13]    train-auc:0.646723+0.021138 test-auc:0.613355+0.065557 
[14]    train-auc:0.669839+0.014928 test-auc:0.633844+0.070390 
[15]    train-auc:0.683102+0.018207 test-auc:0.637254+0.069544 
[16]    train-auc:0.708800+0.025621 test-auc:0.639027+0.082525 
[17]    train-auc:0.726156+0.018551 test-auc:0.644384+0.083992 
[18]    train-auc:0.749755+0.031893 test-auc:0.653366+0.090486 
[19]    train-auc:0.767492+0.033702 test-auc:0.661573+0.085642 
[20]    train-auc:0.774949+0.030019 test-auc:0.664315+0.088128 
[21]    train-auc:0.788037+0.021650 test-auc:0.663974+0.092046 
[22]    train-auc:0.801918+0.023933 test-auc:0.665885+0.098105 
[23]    train-auc:0.813597+0.022498 test-auc:0.673366+0.102972 
[24]    train-auc:0.826128+0.017047 test-auc:0.680494+0.105595 
[25]    train-auc:0.835805+0.018088 test-auc:0.674278+0.103718 
[26]    train-auc:0.843025+0.018110 test-auc:0.683315+0.073458 
[27]    train-auc:0.851160+0.016997 test-auc:0.681916+0.076628 
[28]    train-auc:0.855128+0.018839 test-auc:0.679028+0.064542 
[29]    train-auc:0.861913+0.021880 test-auc:0.678984+0.070448 
[30]    train-auc:0.866917+0.021857 test-auc:0.674863+0.077988 
[31]    train-auc:0.876251+0.019530 test-auc:0.679354+0.067319 
[32]    train-auc:0.881639+0.019588 test-auc:0.680695+0.065898 
[33]    train-auc:0.885488+0.018508 test-auc:0.683177+0.061797 
[34]    train-auc:0.888280+0.019101 test-auc:0.682871+0.059875 
[35]    train-auc:0.892049+0.020314 test-auc:0.680254+0.061654 
[36]    train-auc:0.894742+0.022124 test-auc:0.680570+0.061335 
[37]    train-auc:0.898266+0.021981 test-auc:0.682069+0.065854 
[38]    train-auc:0.898684+0.021750 test-auc:0.682828+0.065218 
[39]    train-auc:0.901598+0.021755 test-auc:0.680384+0.069263 
[40]    train-auc:0.902870+0.022647 test-auc:0.679353+0.067703 
[41]    train-auc:0.905980+0.024111 test-auc:0.680171+0.070536 
[42]    train-auc:0.908842+0.025752 test-auc:0.677667+0.074860 
[43]    train-auc:0.910068+0.026359 test-auc:0.681808+0.070568 
[44]    train-auc:0.913474+0.028733 test-auc:0.681250+0.066556 
[45]    train-auc:0.914379+0.029492 test-auc:0.678519+0.070892 
[46]    train-auc:0.914952+0.030008 test-auc:0.679641+0.069695 
[47]    train-auc:0.915599+0.030600 test-auc:0.680817+0.069873 
[48]    train-auc:0.915834+0.030842 test-auc:0.681183+0.069732 
[49]    train-auc:0.916752+0.031916 test-auc:0.682152+0.069441 
[50]-[100] train-auc:0.916752+0.031916 test-auc:0.682152+0.069441  (unchanged from round 49 onward)

Next, we re-run the model with the optimal number of rounds, i.e., the first round at which the cross-validated test AUC reaches its maximum.

numTrees <- min(
 which(
  xgbCV$evaluation_log$test_auc_mean == 
  max(xgbCV$evaluation_log$test_auc_mean)
 )
)

fit4 <- xgboost(params=params,
                data = train_x,
                label = train_y,
                nrounds = numTrees,
                eval_metric="auc")
[1] train-auc:0.500000 
[2] train-auc:0.500000 
[3] train-auc:0.500000 
[4] train-auc:0.500000 
[5] train-auc:0.500000 
[6] train-auc:0.500000 
[7] train-auc:0.500000 
[8] train-auc:0.500000 
[9] train-auc:0.617282 
[10]    train-auc:0.617282 
[11]    train-auc:0.623066 
[12]    train-auc:0.657007 
[13]    train-auc:0.657007 
[14]    train-auc:0.658862 
[15]    train-auc:0.663206 
[16]    train-auc:0.731354 
[17]    train-auc:0.731541 
[18]    train-auc:0.739063 
[19]    train-auc:0.754038 
[20]    train-auc:0.760430 
[21]    train-auc:0.771918 
[22]    train-auc:0.807310 
[23]    train-auc:0.809272 
[24]    train-auc:0.829148 
[25]    train-auc:0.837454 
[26]    train-auc:0.855574 

We can output an importance plot easily.

xgb.train.data = xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names = attr(xgb.train.data, ".Dimnames")[[2]]
imp = xgb.importance(col_names, fit4)
# Variable importance
xgb.plot.importance(imp)

We can also calculate out-of-sample AUC.

pred.xgb <- predict(fit4, test_x, type="response")
ROCpred.xgb <- prediction(as.numeric(pred.xgb), as.numeric(test_y))
ROCperf.xgb <- performance(ROCpred.xgb, 'tpr','fpr')
#plot(ROCperf.xgb)
df_ROC.xgb <- data.frame(FalsePositive=c(ROCperf.xgb@x.values[[1]]),
                 TruePositive=c(ROCperf.xgb@y.values[[1]]))

ggplot() +
  geom_line(data=df_ROC.xgb, aes(x=FalsePositive, y=TruePositive, color="Out of Sample")) + 
  geom_abline(slope=1)


auc.xgb <- performance(ROCpred.xgb, measure = "auc")
auc <- auc.xgb@y.values[[1]]
names(auc) <- c("XGBoost AUC")
auc
XGBoost AUC 
  0.6326631 
---
title: "Session 2: Classification"
author: "Dr. Richard M. Crowley"
date: ""
output:
  html_notebook
---

# Getting Started

First, we will import the needed packages.  We will use the `tidyverse` set of packages for code readability and simplicity, along with `tidymodels` for building our classification models and `ROCR` for evaluating them.

```{r}
library(tidyverse)
library(tidymodels)
library(ROCR)
```

Next, we need to import the dataset for this session's exercises.  We will process it just like we did in Session 1.

```{r}
df = read_csv('../../Data/S1_data.csv.gz')
train <- df %>% filter(year < 2004)
test  <- df %>% filter(year == 2004)
print(c(nrow(df), nrow(train), nrow(test)))
BD_eq <- as.formula(paste("sdvol1 ~ ", paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))
BCE_eq <- as.formula(paste("Restate_Int ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))
```

## SVM

In computer science and many other disciplines, a simple and commonly used type of classifier is the SVM.  It is particularly common for algorithms being used as supervised methods to predict some variable in a model.

### SVM: Support Vector Machine for Classification

This is the most common use of these algorithms.  SVM lets us approach a classification problem, and it can either report the most likely class for each observation or, like logistic regression, report the probability of belonging to a given class.
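
As a quick preview (not evaluated here, since the fitted workflow `svm_final` is only created below), the two prediction modes look like this with a fitted `parsnip`/`workflows` model:

```{r, eval=FALSE}
# Most likely class for each observation (returns a .pred_class column)
predict(svm_final, test, type = "class")

# Probability of belonging to each class (returns .pred_0 and .pred_1 columns)
predict(svm_final, test, type = "prob")
```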

We will use the implementation from [kernlab](https://cran.r-project.org/web/packages/kernlab/index.html), though we will call it through [parsnip](https://parsnip.tidymodels.org/) from `tidymodels`.

This implementation is rather slow.  The example code below uses 10-fold CV; you can lower the `v` parameter in `vfold_cv()` (e.g., to 3) if you want it to run faster.

```{r}
# For parallel processing
doParallel::registerDoParallel()

recipe_svm <-
  recipe(BCE_eq, data = train)  %>%
  step_zv(all_predictors()) %>% # remove any zero variance predictors
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept() %>% # Add an intercept to the model
  step_num2factor(all_outcomes(), ordered = T, levels=c("0","1"),
                  transform = function(x) x + 1, skip = TRUE)  # Convert DV to factor
model_svm <-
  svm_linear(cost = tune()) %>%
  set_mode("classification") %>%
  set_engine("kernlab")

workflow_svm <- workflow() %>%
  add_model(model_svm) %>%
  add_recipe(recipe_svm)

folds_svm <- vfold_cv(train, v=10)  # from rsample
metrics_svm = metric_set(roc_auc)  # from yardstick
grid_svm <- expand_grid(cost = exp(seq(-10,0, length.out=10)))

svm_fit_tuned <- tune_grid(workflow_svm,
                           grid = grid_svm,
                           resamples = folds_svm,
                           metrics=metrics_svm)
```

To see which model was best, we can use the `show_best()` function.

```{r}
show_best(svm_fit_tuned, metric = "roc_auc")
```

```{r}
svm_final <- workflow_svm %>%
  finalize_workflow(
  select_best(svm_fit_tuned, "roc_auc")
) %>%
  fit(train)
```




Next we can output a prediction just like we did for linear regression, and then use that prediction to obtain our ROC AUC and ROC curve.

```{r}
# SVM, in-sample
Y_hat_train <- predict(svm_final, train, type="prob")
ROCpred <- prediction(Y_hat_train$.pred_1, train$Restate_Int)

auc <- performance(ROCpred, measure = "auc")

print(paste0('ROC AUC: ', auc@y.values[[1]]))
```

```{r}
# SVM, out-of-sample
Y_hat_test <- predict(svm_final, test, type="prob")
ROCpred_out <- prediction(Y_hat_test$.pred_1, test$Restate_Int)

auc_out <- performance(ROCpred_out, measure = "auc")

print(paste0('ROC AUC: ', auc_out@y.values[[1]]))
```

In this case, we see that out-of-sample performance is much lower.

We can also plot out an ROC curve by calculating the True Positive rate and False Positive Rate.  We can use `ROCR` with `ggplot` to get a nice visualization of this.

```{r}
ROCperf <- performance(ROCpred, 'tpr','fpr')
ROCperf_out <- performance(ROCpred_out, 'tpr','fpr')

df_roc_in <- data.frame(FalsePositive=c(ROCperf@x.values[[1]]),
                 TruePositive=c(ROCperf@y.values[[1]]))
df_roc_out <- data.frame(FalsePositive=c(ROCperf_out@x.values[[1]]),
                 TruePositive=c(ROCperf_out@y.values[[1]]))

ggplot() +
  geom_line(data=df_roc_in, aes(x=FalsePositive, y=TruePositive, color="In Sample")) +
  geom_line(data=df_roc_out, aes(x=FalsePositive, y=TruePositive, color="Out of Sample")) + 
  geom_abline(slope=1)
```

# Using XGBoost

Note: The general workflow using parsnip with xgboost is the same as with the SVM.  The model name is `boost_tree()` in this case, however, and of course the parameter list is longer ([reference](https://parsnip.tidymodels.org/reference/boost_tree.html)).
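
As a rough sketch of that approach (not evaluated here), the pieces would mirror the SVM workflow above, reusing `recipe_svm`, `folds_svm`, and `metrics_svm` from that section; the tuning parameters shown (`trees`, `tree_depth`, `learn_rate`) are only a subset of what `boost_tree()` accepts:

```{r, eval=FALSE}
# A minimal parsnip sketch for a tuned XGBoost classifier (assumptions: reuses the
# recipe, folds, and metric set defined in the SVM section above)
model_xgb <-
  boost_tree(trees = tune(), tree_depth = tune(), learn_rate = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

workflow_xgb <- workflow() %>%
  add_model(model_xgb) %>%
  add_recipe(recipe_svm)  # reuse the recipe from the SVM section

grid_xgb <- expand_grid(trees = c(25, 50, 100),
                        tree_depth = c(5, 10),
                        learn_rate = c(0.1, 0.2))

xgb_fit_tuned <- tune_grid(workflow_xgb,
                           grid = grid_xgb,
                           resamples = folds_svm,
                           metrics = metrics_svm)
```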

For simplicity, the workflow below builds only a single XGBoost model using the package itself, rather than the `tidymodels` interface.  As such, we will use the `juice()` function to extract the data in the format needed by XGBoost.

```{r}
rec <- recipe(BCE_eq, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept()  # Add an intercept to the model

# Juice our data
prepped <- rec %>% prep(training=train)
train_x <- juice(prepped, all_predictors(), composition = "dgCMatrix")
train_y <- juice(prepped, all_outcomes(), composition = "matrix")
test_prepped <- rec %>% prep(training=test)
test_x <- juice(test_prepped, all_predictors(), composition = "dgCMatrix")
test_y <- juice(test_prepped, all_outcomes(), composition = "matrix")

# Cross validation
set.seed(482342)  #for reproducibility
library(xgboost)

# model setup
params <- list(max_depth=10,
               eta=0.2,
               gamma=10,
               min_child_weight = 5,
               objective =
                 "binary:logistic")

# run the model
xgbCV <- xgb.cv(params=params,
                data=train_x,
                label=train_y,
                nrounds=100,
                eval_metric="auc",
                nfold=10,
                stratified=TRUE)
```

Next, we re-run the model with the optimal number of rounds, i.e., the first round at which the cross-validated test AUC reaches its maximum.

```{r}
numTrees <- min(
 which(
  xgbCV$evaluation_log$test_auc_mean == 
  max(xgbCV$evaluation_log$test_auc_mean)
 )
)

fit4 <- xgboost(params=params,
                data = train_x,
                label = train_y,
                nrounds = numTrees,
                eval_metric="auc")
```

We can output an importance plot easily.

```{r}
xgb.train.data = xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names = attr(xgb.train.data, ".Dimnames")[[2]]
imp = xgb.importance(col_names, fit4)
# Variable importance
xgb.plot.importance(imp)
```

We can also calculate out-of-sample AUC.

```{r}
pred.xgb <- predict(fit4, test_x, type="response")
ROCpred.xgb <- prediction(as.numeric(pred.xgb), as.numeric(test_y))
ROCperf.xgb <- performance(ROCpred.xgb, 'tpr','fpr')
#plot(ROCperf.xgb)
df_ROC.xgb <- data.frame(FalsePositive=c(ROCperf.xgb@x.values[[1]]),
                 TruePositive=c(ROCperf.xgb@y.values[[1]]))

ggplot() +
  geom_line(data=df_ROC.xgb, aes(x=FalsePositive, y=TruePositive, color="Out of Sample")) + 
  geom_abline(slope=1)

auc.xgb <- performance(ROCpred.xgb, measure = "auc")
auc <- auc.xgb@y.values[[1]]
names(auc) <- c("XGBoost AUC")
auc
```








