Code for Session 6

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}

library(tidyverse)

library(tidyverse)
library(coefplot)
library(DT)

df <- read.csv("../../Data/Session_6.csv") %>%
  mutate(AAER = factor(AAER, levels=c(0,1)))

ex <- data.frame(year=c(1999,2001,2003), year_found=c(2001,2003,2006), aaer=c(1,1,1), aaer_as_of_2004=c(1,1,0))
html_df(ex)

year	year_found	aaer	aaer_as_of_2004
1999	2001	1	1
2001	2003	1	1
2003	2006	1	0

df %>%
  group_by(year) %>%
  mutate(total_AAERS = sum(AAER==1), total_observations=n()) %>%
  slice(1) %>%
  ungroup() %>%
  select(year, total_AAERS, total_observations) %>%
  html_df

year	total_AAERS	total_observations
1999	46	2195
2000	50	2041
2001	43	2021
2002	50	2391
2003	57	2936
2004	49	2843

fit_1990s <- glm(AAER ~ ebit + ni_revt + ni_at + log_lt + ltl_at + lt_seq +
                   lt_at + act_lct + aq_lct + wcap_at + invt_revt + invt_at +
                   ni_ppent + rect_revt + revt_at + d_revt + b_rect + b_rect +
                   r_gp + b_gp + gp_at + revt_m_gp + ch_at + log_at +
                   ppent_at + wcap,
                 data=df[df$Test==0,],
                 family=binomial)
summary(fit_1990s)


Call:
glm(formula = AAER ~ ebit + ni_revt + ni_at + log_lt + ltl_at + 
    lt_seq + lt_at + act_lct + aq_lct + wcap_at + invt_revt + 
    invt_at + ni_ppent + rect_revt + revt_at + d_revt + b_rect + 
    b_rect + r_gp + b_gp + gp_at + revt_m_gp + ch_at + log_at + 
    ppent_at + wcap, family = binomial, data = df[df$Test == 
    0, ])

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -4.660e+00  8.336e-01  -5.591 2.26e-08 ***
ebit        -3.564e-04  1.094e-04  -3.257  0.00112 ** 
ni_revt      3.664e-02  3.058e-02   1.198  0.23084    
ni_at       -3.196e-01  2.325e-01  -1.374  0.16932    
log_lt       1.494e-01  3.409e-01   0.438  0.66118    
ltl_at      -2.306e-01  7.072e-01  -0.326  0.74438    
lt_seq      -2.826e-05  4.567e-04  -0.062  0.95067    
lt_at       -8.559e-01  9.270e-01  -0.923  0.35586    
act_lct      1.401e-01  7.005e-02   2.000  0.04546 *  
aq_lct      -1.751e-01  9.156e-02  -1.912  0.05588 .  
wcap_at     -8.488e-01  6.487e-01  -1.308  0.19075    
invt_revt   -9.165e-01  9.679e-01  -0.947  0.34371    
invt_at      2.350e+00  1.033e+00   2.276  0.02286 *  
ni_ppent     2.703e-03  1.203e-02   0.225  0.82223    
rect_revt    6.587e-02  7.427e-02   0.887  0.37511    
revt_at     -4.790e-01  1.734e-01  -2.763  0.00572 ** 
d_revt      -7.848e-04  1.586e-03  -0.495  0.62079    
b_rect       8.212e-02  1.569e-01   0.524  0.60060    
r_gp        -1.055e-04  6.623e-04  -0.159  0.87345    
b_gp         1.266e-01  1.556e-01   0.813  0.41605    
gp_at        1.031e-01  3.512e-01   0.293  0.76921    
revt_m_gp    2.621e-05  1.094e-05   2.396  0.01655 *  
ch_at       -5.356e-01  7.909e-01  -0.677  0.49829    
log_at       2.687e-01  3.483e-01   0.771  0.44045    
ppent_at    -2.956e+00  4.798e-01  -6.161 7.23e-10 ***
wcap         7.009e-05  5.187e-05   1.351  0.17658    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2118.6  on 10268  degrees of freedom
Residual deviance: 1928.8  on 10243  degrees of freedom
  (1315 observations deleted due to missingness)
AIC: 1980.8

Number of Fisher Scoring iterations: 12

library(yardstick)

df$pred_1990s <- predict(fit_1990s, df, type="response")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_1990s, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_1990s, event_level='second')
curve_in_1990s <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_1990s, event_level='second')
curve_out_1990s <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_1990s, event_level='second')

ggplot() +
  geom_line(data=curve_in_1990s, aes(y=sensitivity, x=1-specificity, color="1990s, In Sample")) + 
  geom_line(data=curve_out_1990s, aes(y=sensitivity, x=1-specificity, color="1990s, Out of Sample")) +
  geom_abline(slope=1)

aucs_1990s <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_1990s) <- c("In sample AUC", "Out of sample AUC")
aucs_1990s

    In sample AUC Out of sample AUC 
        0.7483132         0.7292981

fit_2011 <- glm(AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
                  soft_assets + pct_chg_cashsales + chg_roa + issuance +
                  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
                  midNaudit + cffin + exfin + restruct,
                 data=df[df$Test==0,],
                 family=binomial)
summary(fit_2011)


Call:
glm(formula = AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv + 
    soft_assets + pct_chg_cashsales + chg_roa + issuance + oplease_dum + 
    book_mkt + lag_sdvol + merger + bigNaudit + midNaudit + cffin + 
    exfin + restruct, family = binomial, data = df[df$Test == 
    0, ])

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -7.1474558  0.5337491 -13.391  < 2e-16 ***
logtotasset        0.3214322  0.0355467   9.043  < 2e-16 ***
rsst_acc          -0.2190095  0.3009287  -0.728   0.4667    
chg_recv           1.1020740  1.0590837   1.041   0.2981    
chg_inv            0.0389504  1.2507142   0.031   0.9752    
soft_assets        2.3094551  0.3325731   6.944 3.81e-12 ***
pct_chg_cashsales -0.0006912  0.0108771  -0.064   0.9493    
chg_roa           -0.2697984  0.2554262  -1.056   0.2908    
issuance           0.1443841  0.3187606   0.453   0.6506    
oplease_dum       -0.2029022  0.1970597  -1.030   0.3032    
book_mkt           0.0150113  0.0110652   1.357   0.1749    
lag_sdvol          0.0517368  0.0555338   0.932   0.3515    
merger             0.3529102  0.1511729   2.334   0.0196 *  
bigNaudit         -0.1998454  0.3598283  -0.555   0.5786    
midNaudit         -0.4894029  0.5118388  -0.956   0.3390    
cffin              0.4557560  0.3435253   1.327   0.1846    
exfin             -0.0053608  0.0393354  -0.136   0.8916    
restruct           0.3827721  0.1471729   2.601   0.0093 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2381.9  on 11583  degrees of freedom
Residual deviance: 2189.7  on 11566  degrees of freedom
AIC: 2225.7

Number of Fisher Scoring iterations: 7

df$pred_2011 <- predict(fit_2011, df, type="response")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_2011, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_2011, event_level='second')
curve_in_2011 <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_2011, event_level='second')
curve_out_2011 <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_2011, event_level='second')

ggplot() +
  geom_line(data=curve_in_2011, aes(y=sensitivity, x=1-specificity, color="2011, In Sample")) + 
  geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011, Out of Sample")) +
  geom_abline(slope=1)

aucs_2011 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2011) <- c("In sample AUC", "Out of sample AUC")
aucs_2011

    In sample AUC Out of sample AUC 
        0.7445378         0.6849225

df_save <- data.frame(Test=df$Test, AAER=df$AAER, pred_F=df$pred_2011)

fit_2000s <- glm(AAER ~ bullets + headerlen + newlines + alltags +
                   processedsize + sentlen_u + wordlen_s + paralen_s +
                   repetitious_p + sentlen_s + typetoken + clindex + fog +
                   active_p + passive_p + lm_negative_p + lm_positive_p +
                   allcaps + exclamationpoints + questionmarks,
                 data=df[df$Test==0,],
                 family=binomial)
summary(fit_2000s)


Call:
glm(formula = AAER ~ bullets + headerlen + newlines + alltags + 
    processedsize + sentlen_u + wordlen_s + paralen_s + repetitious_p + 
    sentlen_s + typetoken + clindex + fog + active_p + passive_p + 
    lm_negative_p + lm_positive_p + allcaps + exclamationpoints + 
    questionmarks, family = binomial, data = df[df$Test == 0, 
    ])

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -5.662e+00  3.143e+00  -1.801  0.07165 .  
bullets           -2.635e-05  2.625e-05  -1.004  0.31558    
headerlen         -2.943e-04  3.477e-04  -0.846  0.39733    
newlines          -4.821e-05  1.220e-04  -0.395  0.69271    
alltags            5.060e-08  2.567e-07   0.197  0.84376    
processedsize      5.709e-06  1.287e-06   4.435 9.19e-06 ***
sentlen_u         -3.790e-02  6.897e-02  -0.550  0.58259    
wordlen_s          1.278e-01  1.199e+00   0.107  0.91510    
paralen_s         -4.808e-02  3.052e-02  -1.576  0.11514    
repetitious_p     -1.673e+00  1.665e+00  -1.005  0.31508    
sentlen_s         -2.098e-02  2.222e-02  -0.944  0.34518    
typetoken         -3.609e-01  1.729e+00  -0.209  0.83469    
clindex            2.744e-01  1.519e-01   1.806  0.07085 .  
fog               -1.839e-02  1.322e-01  -0.139  0.88935    
active_p          -4.321e-01  1.459e+00  -0.296  0.76711    
passive_p          7.321e-02  3.204e+00   0.023  0.98177    
lm_negative_p      1.356e+01  1.304e+01   1.039  0.29875    
lm_positive_p     -9.911e+01  3.842e+01  -2.580  0.00989 ** 
allcaps           -4.646e-04  1.835e-04  -2.532  0.01136 *  
exclamationpoints -3.520e-01  1.944e-01  -1.811  0.07010 .  
questionmarks      3.371e-03  2.816e-02   0.120  0.90471    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2381.9  on 11583  degrees of freedom
Residual deviance: 2329.2  on 11563  degrees of freedom
AIC: 2371.2

Number of Fisher Scoring iterations: 10

df$pred_2000s <- predict(fit_2000s, df, type="response")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_2000s, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_2000s, event_level='second')
curve_in_2000s <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_2000s, event_level='second')
curve_out_2000s <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_2000s, event_level='second')

ggplot() +
  geom_line(data=curve_in_2000s, aes(y=sensitivity, x=1-specificity, color="2000s, In Sample")) + 
  geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s, Out of Sample")) +
  geom_abline(slope=1)

aucs_2000s <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2000s) <- c("In sample AUC", "Out of sample AUC")
aucs_2000s

    In sample AUC Out of sample AUC 
        0.6377783         0.6295414

df_save$pred_S <- df$pred_2000s

fit_2000f <- glm(AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
                   soft_assets + pct_chg_cashsales + chg_roa + issuance +
                   oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
                   midNaudit + cffin + exfin + restruct + bullets + headerlen +
                   newlines + alltags + processedsize + sentlen_u + wordlen_s +
                   paralen_s + repetitious_p + sentlen_s + typetoken +
                   clindex + fog + active_p + passive_p + lm_negative_p +
                   lm_positive_p + allcaps + exclamationpoints + questionmarks,
                 data=df[df$Test==0,],
                 family=binomial)
summary(fit_2000f)


Call:
glm(formula = AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv + 
    soft_assets + pct_chg_cashsales + chg_roa + issuance + oplease_dum + 
    book_mkt + lag_sdvol + merger + bigNaudit + midNaudit + cffin + 
    exfin + restruct + bullets + headerlen + newlines + alltags + 
    processedsize + sentlen_u + wordlen_s + paralen_s + repetitious_p + 
    sentlen_s + typetoken + clindex + fog + active_p + passive_p + 
    lm_negative_p + lm_positive_p + allcaps + exclamationpoints + 
    questionmarks, family = binomial, data = df[df$Test == 0, 
    ])

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -1.634e+00  3.415e+00  -0.479  0.63223    
logtotasset        3.437e-01  3.921e-02   8.766  < 2e-16 ***
rsst_acc          -2.123e-01  2.995e-01  -0.709  0.47844    
chg_recv           1.022e+00  1.055e+00   0.969  0.33274    
chg_inv           -9.464e-02  1.226e+00  -0.077  0.93845    
soft_assets        2.573e+00  3.387e-01   7.598 3.01e-14 ***
pct_chg_cashsales -1.134e-03  1.103e-02  -0.103  0.91811    
chg_roa           -2.696e-01  2.470e-01  -1.092  0.27504    
issuance           1.471e-01  3.220e-01   0.457  0.64777    
oplease_dum       -2.859e-01  2.020e-01  -1.416  0.15691    
book_mkt           1.531e-02  1.115e-02   1.373  0.16967    
lag_sdvol          6.147e-02  5.418e-02   1.135  0.25655    
merger             3.699e-01  1.536e-01   2.408  0.01604 *  
bigNaudit         -1.992e-01  3.638e-01  -0.548  0.58392    
midNaudit         -4.839e-01  5.139e-01  -0.942  0.34635    
cffin              4.608e-01  3.599e-01   1.280  0.20044    
exfin             -1.235e-03  5.907e-02  -0.021  0.98332    
restruct           3.035e-01  1.555e-01   1.952  0.05099 .  
bullets           -1.958e-05  2.519e-05  -0.777  0.43692    
headerlen         -5.792e-04  4.038e-04  -1.434  0.15151    
newlines          -7.423e-05  1.294e-04  -0.574  0.56615    
alltags           -1.338e-07  2.701e-07  -0.495  0.62036    
processedsize      4.111e-06  1.446e-06   2.844  0.00446 ** 
sentlen_u         -5.086e-02  6.950e-02  -0.732  0.46433    
wordlen_s         -1.618e+00  1.303e+00  -1.241  0.21450    
paralen_s         -3.934e-02  2.923e-02  -1.346  0.17834    
repetitious_p     -1.433e+00  1.658e+00  -0.864  0.38740    
sentlen_s         -1.690e-02  2.212e-02  -0.764  0.44502    
typetoken         -1.216e+00  1.788e+00  -0.680  0.49643    
clindex            9.620e-02  1.521e-01   0.632  0.52718    
fog                1.063e-02  1.339e-01   0.079  0.93675    
active_p           6.259e-01  1.435e+00   0.436  0.66271    
passive_p         -5.848e+00  3.796e+00  -1.541  0.12337    
lm_negative_p      2.012e+01  1.220e+01   1.649  0.09906 .  
lm_positive_p     -3.700e+01  3.864e+01  -0.958  0.33825    
allcaps           -4.721e-04  1.912e-04  -2.469  0.01354 *  
exclamationpoints -4.510e-01  2.024e-01  -2.228  0.02586 *  
questionmarks      4.096e-03  2.853e-02   0.144  0.88586    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2381.9  on 11583  degrees of freedom
Residual deviance: 2143.0  on 11546  degrees of freedom
AIC: 2219

Number of Fisher Scoring iterations: 10

df$pred_2000f <- predict(fit_2000f, df, type="response")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_2000f, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_2000f, event_level='second')
curve_in_2000f <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_2000f, event_level='second')
curve_out_2000f <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_2000f, event_level='second')

ggplot() +
  geom_line(data=curve_in_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011, In Sample")) + 
  geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011, Out of Sample")) +
  geom_abline(slope=1)

aucs_2000f <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2000f) <- c("In sample AUC", "Out of sample AUC")
aucs_2000f

    In sample AUC Out of sample AUC 
        0.7664115         0.7147021

df_save$pred_FS <- df$pred_2000f

BCE_eq = as.formula(paste("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))
fit_BCE <- glm(BCE_eq,
               data=df[df$Test==0,],
               family=binomial)
summary(fit_BCE)


Call:
glm(formula = BCE_eq, family = binomial, data = df[df$Test == 
    0, ])

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -8.032e+00  3.872e+00  -2.074  0.03806 *  
logtotasset        3.879e-01  4.554e-02   8.519  < 2e-16 ***
rsst_acc          -1.938e-01  3.055e-01  -0.634  0.52593    
chg_recv           8.581e-01  1.071e+00   0.801  0.42296    
chg_inv           -2.607e-01  1.223e+00  -0.213  0.83119    
soft_assets        2.555e+00  3.796e-01   6.730  1.7e-11 ***
pct_chg_cashsales -1.976e-03  6.997e-03  -0.282  0.77767    
chg_roa           -2.532e-01  2.786e-01  -0.909  0.36354    
issuance           9.692e-02  3.269e-01   0.296  0.76687    
oplease_dum       -3.451e-01  2.097e-01  -1.645  0.09989 .  
book_mkt           1.361e-02  1.151e-02   1.183  0.23692    
lag_sdvol          4.546e-02  5.709e-02   0.796  0.42589    
merger             3.224e-01  1.572e-01   2.051  0.04027 *  
bigNaudit         -2.010e-01  3.711e-01  -0.542  0.58804    
midNaudit         -4.641e-01  5.195e-01  -0.893  0.37169    
cffin              6.024e-01  3.769e-01   1.598  0.10998    
exfin             -6.738e-03  4.063e-02  -0.166  0.86831    
restruct           1.915e-01  1.593e-01   1.202  0.22920    
bullets           -1.612e-05  2.526e-05  -0.638  0.52334    
headerlen         -3.956e-04  3.722e-04  -1.063  0.28776    
newlines          -1.038e-04  1.333e-04  -0.779  0.43623    
alltags           -1.338e-07  2.772e-07  -0.483  0.62934    
processedsize      4.178e-06  1.477e-06   2.828  0.00468 ** 
sentlen_u         -7.131e-02  7.881e-02  -0.905  0.36553    
wordlen_s          4.413e-01  1.430e+00   0.309  0.75761    
paralen_s         -4.584e-02  2.976e-02  -1.540  0.12356    
repetitious_p     -1.525e+00  1.780e+00  -0.857  0.39152    
sentlen_s         -7.700e-04  2.238e-02  -0.034  0.97255    
typetoken          5.313e-02  1.934e+00   0.027  0.97809    
clindex            6.406e-02  1.669e-01   0.384  0.70116    
fog                7.432e-02  1.590e-01   0.467  0.64018    
active_p          -3.159e-01  1.628e+00  -0.194  0.84620    
passive_p         -6.714e+00  4.192e+00  -1.602  0.10926    
lm_negative_p      3.558e+00  1.422e+01   0.250  0.80240    
lm_positive_p     -8.906e+01  4.641e+01  -1.919  0.05497 .  
allcaps           -4.189e-04  1.916e-04  -2.186  0.02878 *  
exclamationpoints -4.685e-01  2.143e-01  -2.187  0.02878 *  
questionmarks      4.424e-03  2.876e-02   0.154  0.87774    
Topic_1_n_oI       3.157e+01  1.137e+02   0.278  0.78136    
Topic_2_n_oI       6.382e+01  6.784e+01   0.941  0.34684    
Topic_3_n_oI      -1.853e+02  1.457e+02  -1.272  0.20335    
Topic_4_n_oI      -4.247e+01  7.460e+01  -0.569  0.56919    
Topic_5_n_oI      -8.240e+00  8.061e+01  -0.102  0.91858    
Topic_6_n_oI      -5.072e+02  1.793e+02  -2.829  0.00468 ** 
Topic_7_n_oI       1.994e+01  8.192e+01   0.243  0.80772    
Topic_8_n_oI      -9.620e+01  8.419e+01  -1.143  0.25321    
Topic_9_n_oI       5.963e+01  7.064e+01   0.844  0.39863    
Topic_10_n_oI     -4.641e+01  6.705e+01  -0.692  0.48885    
Topic_11_n_oI     -1.415e+02  7.384e+01  -1.916  0.05536 .  
Topic_12_n_oI      2.147e-01  8.441e+01   0.003  0.99797    
Topic_13_n_oI     -1.420e+02  2.612e+02  -0.543  0.58686    
Topic_14_n_oI      2.086e+01  7.840e+01   0.266  0.79019    
Topic_15_n_oI     -3.519e+01  6.332e+01  -0.556  0.57840    
Topic_16_n_oI      1.629e+01  1.156e+02   0.141  0.88793    
Topic_17_n_oI     -5.188e+01  8.901e+01  -0.583  0.56000    
Topic_18_n_oI      2.239e+01  9.340e+01   0.240  0.81058    
Topic_19_n_oI     -1.071e+02  8.063e+01  -1.328  0.18422    
Topic_20_n_oI     -4.885e+01  9.701e+01  -0.504  0.61455    
Topic_21_n_oI     -1.515e+02  9.116e+01  -1.662  0.09654 .  
Topic_22_n_oI     -6.818e+00  6.472e+01  -0.105  0.91610    
Topic_23_n_oI     -1.226e+01  6.965e+01  -0.176  0.86031    
Topic_24_n_oI     -6.545e+01  9.523e+01  -0.687  0.49189    
Topic_25_n_oI     -6.863e+01  7.094e+01  -0.967  0.33336    
Topic_26_n_oI     -2.182e+00  6.580e+01  -0.033  0.97354    
Topic_27_n_oI      9.374e+00  1.418e+02   0.066  0.94729    
Topic_28_n_oI      5.882e+00  7.937e+01   0.074  0.94092    
Topic_29_n_oI     -4.209e+01  6.670e+01  -0.631  0.52798    
Topic_30_n_oI     -4.605e+02  2.672e+02  -1.724  0.08477 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2381.9  on 11583  degrees of freedom
Residual deviance: 2069.5  on 11516  degrees of freedom
AIC: 2205.5

Number of Fisher Scoring iterations: 11

df$pred_BCE <- predict(fit_BCE, df, type="response")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_BCE, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_BCE, event_level='second')
curve_in_BCE <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_BCE, event_level='second')
curve_out_BCE <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_BCE, event_level='second')

ggplot() +
  geom_line(data=curve_in_BCE, aes(y=sensitivity, x=1-specificity, color="BCE, In Sample")) + 
  geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE, Out of Sample")) +
  geom_abline(slope=1)

aucs_BCE <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_BCE) <- c("In sample AUC", "Out of sample AUC")
aucs_BCE

    In sample AUC Out of sample AUC 
        0.7941841         0.7599594

df_save$pred_BCE <- df$pred_BCE

ggplot() +
  geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) +
  geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011")) +
  geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s")) +
  geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011")) +
  geom_line(data=curve_out_1990s, aes(y=sensitivity, x=1-specificity, color="1990s")) +
  geom_abline(slope=1) + 
  ggtitle("Out of Sample ROC Curves")

oos_aucs <- c(aucs_1990s[2], aucs_2011[2], aucs_2000s[2], aucs_2000f[2], aucs_BCE[2])
names(oos_aucs) <- c("1990s", "2011", "2000s", "2000s + 2011", "BCE")
oos_aucs

       1990s         2011        2000s 2000s + 2011          BCE 
   0.7292981    0.6849225    0.6295414    0.7147021    0.7599594

library(glmnet)
x <- model.matrix(BCE_eq, data=df[df$Test==0,])[,-1]  # [,-1] to remove intercept
y <- model.frame(BCE_eq, data=df[df$Test==0,])[,"AAER"]
fit_LASSO <- glmnet(x=x, y=y,
                    family = "binomial",
                    alpha = 1  # Specifies LASSO.  alpha = 0 is ridge
                    )

plot(fit_LASSO)

print(fit_LASSO)


Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1) 

   Df  %Dev    Lambda
1   0  0.00 0.0143300
2   1  0.81 0.0130500
3   1  1.46 0.0118900
4   1  2.00 0.0108400
5   2  2.47 0.0098740
6   2  3.22 0.0089970
7   2  3.85 0.0081970
8   2  4.37 0.0074690
9   2  4.81 0.0068060
10  3  5.22 0.0062010
11  3  5.59 0.0056500
12  4  5.91 0.0051480
13  4  6.25 0.0046910
14  5  6.57 0.0042740
15  7  6.89 0.0038940
16  8  7.22 0.0035480
17 10  7.52 0.0032330
18 12  7.83 0.0029460
19 15  8.16 0.0026840
20 15  8.49 0.0024460
21 15  8.78 0.0022290
22 15  9.03 0.0020310
23 18  9.26 0.0018500
24 20  9.48 0.0016860
25 22  9.69 0.0015360
26 25  9.89 0.0014000
27 28 10.10 0.0012750
28 30 10.33 0.0011620
29 31 10.54 0.0010590
30 33 10.74 0.0009647
31 36 10.93 0.0008790
32 37 11.11 0.0008009
33 39 11.26 0.0007297
34 42 11.42 0.0006649
35 43 11.57 0.0006058
36 44 11.71 0.0005520
37 47 11.85 0.0005030
38 48 11.97 0.0004583
39 48 12.08 0.0004176
40 48 12.18 0.0003805
41 51 12.27 0.0003467
42 52 12.35 0.0003159
43 54 12.43 0.0002878
44 54 12.49 0.0002623
45 54 12.55 0.0002390
46 55 12.60 0.0002177
47 56 12.65 0.0001984
48 58 12.70 0.0001808
49 60 12.74 0.0001647
50 61 12.79 0.0001501
51 62 12.82 0.0001367
52 63 12.86 0.0001246
53 65 12.89 0.0001135
54 66 12.92 0.0001034
55 66 12.94 0.0000942
56 66 12.97 0.0000859
57 66 12.98 0.0000782
58 66 13.00 0.0000713
59 66 13.02 0.0000650
60 66 13.03 0.0000592
61 67 13.04 0.0000539
62 67 13.05 0.0000491
63 67 13.06 0.0000448
64 67 13.06 0.0000408
65 67 13.07 0.0000372
66 67 13.08 0.0000339
67 67 13.08 0.0000309
68 67 13.09 0.0000281
69 67 13.09 0.0000256
70 67 13.09 0.0000233
71 67 13.10 0.0000213
72 67 13.10 0.0000194
73 67 13.10 0.0000177
74 67 13.10 0.0000161
75 67 13.10 0.0000147

#coef(fit_LASSO, s=0.002031)
coefplot(fit_LASSO, lambda=0.002031, sort='magnitude')

# na.pass has model.matrix retain NA values (so the # of rows is constant)
xp <- model.matrix(BCE_eq, data=df, na.action='na.pass')[,-1]
# s= specifies the version of the model to use
df$pred_L1 <- c(predict(fit_LASSO, xp, type="response", s = 0.002031))

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_L1, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_L1, event_level='second')
curve_in_L1 <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_L1, event_level='second')
curve_out_L1 <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_L1, event_level='second')

ggplot() +
  geom_line(data=curve_in_L1, aes(y=sensitivity, x=1-specificity, color="BCE, LASSO, In Sample")) + 
  geom_line(data=curve_out_L1, aes(y=sensitivity, x=1-specificity, color="BCE, LASSO, Out of Sample")) +
  geom_abline(slope=1)

aucs_L1 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_L1) <- c("In sample AUC", "Out of sample AUC")
aucs_L1

    In sample AUC Out of sample AUC 
        0.7593828         0.7239785

# Cross validation
set.seed(697435)  #for reproducibility
cvfit = cv.glmnet(x=x, y=y,family = "binomial", alpha = 1, type.measure="auc")

plot(cvfit)

cvfit$lambda.min

[1] 0.001685798

cvfit$lambda.1se

[1] 0.002684268

#coef(cvfit, s = "lambda.min")
coefplot(cvfit, lambda='lambda.min', sort='magnitude') + theme(axis.text.y = element_text(size=15))

#coef(cvfit, s = "lambda.1se")
coefplot(cvfit, lambda='lambda.1se', sort='magnitude') + theme(axis.text.y = element_text(size=15))

# s= specifies the version of the model to use
df$pred_L1.min <- c(predict(cvfit, xp, type="response", s = "lambda.min"))
df$pred_L1.1se <- c(predict(cvfit, xp, type="response", s = "lambda.1se"))

df_save$pred_lmin <- df$pred_L1.min
df_save$pred_l1se <- df$pred_L1.1se

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_L1.min, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_L1.min, event_level='second')
curve_in_CV.min <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_L1.min, event_level='second')
curve_out_CV.min <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_L1.min, event_level='second')

aucs_CV.min <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.min) <- c("In sample, lambda.min", "Out of sample, lambda.min")

auc_in <- df %>% filter(Test==0) %>% roc_auc(AAER, pred_L1.1se, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(AAER, pred_L1.1se, event_level='second')
curve_in_CV.1se <- df %>% filter(Test==0) %>% roc_curve(AAER, pred_L1.1se, event_level='second')
curve_out_CV.1se <- df %>% filter(Test==1) %>% roc_curve(AAER, pred_L1.1se, event_level='second')

aucs_CV.1se <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.1se) <- c("In sample, lambda.1se", "Out of sample, lambda.1se")

ggplot() +
  geom_line(data=curve_in_CV.min, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.min")) + 
  geom_line(data=curve_out_CV.min, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.min")) +
  geom_line(data=curve_in_CV.1se, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.1se")) + 
  geom_line(data=curve_out_CV.1se, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.1se")) +
  geom_abline(slope=1)

c(aucs_CV.min, aucs_CV.1se)

    In sample, lambda.min Out of sample, lambda.min     In sample, lambda.1se 
                0.7631710                 0.7290185                 0.7509946 
Out of sample, lambda.1se 
                0.7124231

BCE_eq <- as.formula(paste("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))

library(recipes)
library(parsnip)

df <- read_csv("../../Data/Session_6.csv")
BCEformula <- BCE_eq

train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)

rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept() %>%  # Add an intercept to the model
  step_num2factor(all_outcomes(), ordered = T, levels=c("0","1"),
                  transform = function(x) x + 1)  # Convert DV to factor

prepped <- rec %>% prep(training=train)

# "bake" your recipe to get data ready
train_baked  <- bake(prepped, new_data = train)
test_baked  <- bake(prepped, new_data = test)

# Run the model with parsnip
train_model <- logistic_reg(mixture=1, penalty=1) %>%  # mixture = 1 sets LASSO
  set_engine('glmnet') %>%
  fit(BCEformula, data = train_baked)

# train_model$fit is the same as fit_LASSO earlier in the slides
coefplot(train_model$fit, lambda=0.002031, sort='magnitude')

rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept()  # Add an intercept to the model

prepped <- rec %>% prep(training=train)
test_prepped <- rec %>% prep(training=test)

# "Juice" your recipe to get data for other packages
train_x <- juice(prepped, all_predictors(), composition = "dgCMatrix")
train_y <- juice(prepped, all_outcomes(), composition = "matrix")
test_x <- juice(test_prepped, all_predictors(), composition = "dgCMatrix")
test_y <- juice(test_prepped, all_outcomes(), composition = "matrix")

# Cross validation
set.seed(75347)  #for reproducibility
cvfit = cv.glmnet(x=train_x, y=train_y, family = "binomial", alpha = 1,
                  type.measure="auc")

plot(cvfit)

cvfit$lambda.min

[1] 0.00139958

cvfit$lambda.1se

[1] 0.003548444

#coef(cvfit, s = "lambda.min")
coefplot(cvfit, lambda='lambda.min', sort='magnitude') + theme(axis.text.y = element_text(size=15))

#coef(cvfit, s = "lambda.1se")
coefplot(cvfit, lambda='lambda.1se', sort='magnitude') + theme(axis.text.y = element_text(size=15))

train.pred_L1.min2 <- c(predict(cvfit, train_x, type="response", s = "lambda.min"))
test.pred_L1.min2 <- c(predict(cvfit, test_x, type="response", s = "lambda.min"))
train.pred_L1.1se2 <- c(predict(cvfit, train_x, type="response", s = "lambda.1se"))
test.pred_L1.1se2 <- c(predict(cvfit, test_x, type="response", s = "lambda.1se"))

df_train <- data.frame(train.pred_L1.min2=train.pred_L1.min2, train.pred_L1.1se2=train.pred_L1.1se2, AAER=factor(train_y, levels=c(0,1)))
df_test <- data.frame(test.pred_L1.min2=test.pred_L1.min2, test.pred_L1.1se2=test.pred_L1.1se2, AAER=factor(test_y, levels=c(0,1)))

auc_in <- df_train %>% roc_auc(AAER, train.pred_L1.min2, event_level='second')
auc_out <- df_test %>% roc_auc(AAER, test.pred_L1.min2, event_level='second')
curve_in_CV.min2 <- df_train %>% roc_curve(AAER, train.pred_L1.min2, event_level='second')
curve_out_CV.min2 <- df_test %>% roc_curve(AAER, test.pred_L1.min2, event_level='second')

aucs_CV.min2 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.min2) <- c("In sample, lambda.min", "Out of sample, lambda.min")

auc_in <- df_train %>% roc_auc(AAER, train.pred_L1.1se2, event_level='second')
auc_out <- df_test %>% roc_auc(AAER, test.pred_L1.1se2, event_level='second')
curve_in_CV.1se2 <- df_train %>% roc_curve(AAER, train.pred_L1.1se2, event_level='second')
curve_out_CV.1se2 <- df_test %>% roc_curve(AAER, test.pred_L1.1se2, event_level='second')

aucs_CV.1se2 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.1se2) <- c("In sample, lambda.1se", "Out of sample, lambda.1se")

ggplot() +
  geom_line(data=curve_in_CV.min2, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.min")) + 
  geom_line(data=curve_out_CV.min2, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.min")) +
  geom_line(data=curve_in_CV.1se2, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.1se")) + 
  geom_line(data=curve_out_CV.1se2, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.1se")) +
  geom_abline(slope=1)

c(aucs_CV.min2, aucs_CV.1se2)

    In sample, lambda.min Out of sample, lambda.min     In sample, lambda.1se 
                0.7665463                 0.7364834                 0.7417082 
Out of sample, lambda.1se 
                0.7028034

BCE_eq <- as.formula(paste("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))

library(tidyr)
library(tidymodels)
library(tidyverse)

df <- read_csv("../../Data/Session_6.csv")
BCEformula <- BCE_eq

train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)

LASSO_rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept() %>%  # Add an intercept to the model
  step_num2factor(all_outcomes(), ordered = T, levels=c("0","1"),
                  transform = function(x) x + 1)  # Convert DV to factor

LASSO_mod <- logistic_reg(penalty=tune(), mixture=1) %>%  # mixture = 1 sets LASSO
  set_engine('glmnet')

# Define a grid to tune over
grid <- expand_grid(penalty = exp(seq(-11,-4, length.out=100)))

LASSO_wfl <- workflow() %>%
  add_model(LASSO_mod) %>%
  add_recipe(LASSO_rec)

set.seed(354351)
folds <- vfold_cv(train, v=10)  # from rsample
metrics = metric_set(roc_auc)  # from yardstick

LASSO_fit_tuned <- tune_grid(LASSO_wfl,
                             grid = grid,
                             resamples = folds,
                             metrics=metrics)

LASSO_fit_tuned %>%
  collect_metrics()

# A tibble: 100 × 7
     penalty .metric .estimator  mean     n std_err .config               
       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 
 1 0.0000167 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model001
 2 0.0000179 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model002
 3 0.0000192 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model003
 4 0.0000206 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model004
 5 0.0000222 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model005
 6 0.0000238 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model006
 7 0.0000255 roc_auc binary     0.727    10  0.0257 Preprocessor1_Model007
 8 0.0000274 roc_auc binary     0.727    10  0.0256 Preprocessor1_Model008
 9 0.0000294 roc_auc binary     0.727    10  0.0256 Preprocessor1_Model009
10 0.0000316 roc_auc binary     0.727    10  0.0256 Preprocessor1_Model010
# ℹ 90 more rows

lambda.min <- LASSO_fit_tuned %>%
  collect_metrics() %>%
  arrange(-mean) %>%
  slice(1) %>%
  pull(penalty) %>%
  log()

LASSO_fit_tuned %>%
  collect_metrics() %>%
  ggplot(aes(x=log(penalty), y=mean)) +
  geom_point() +
  xlab("Log(lambda)") + 
  geom_vline(xintercept = lambda.min)

BCE_eq <- as.formula(paste("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
  soft_assets + pct_chg_cashsales + chg_roa + issuance +
  oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
  midNaudit + cffin + exfin + restruct + bullets + headerlen +
  newlines + alltags + processedsize + sentlen_u + wordlen_s +
  paralen_s + repetitious_p + sentlen_s + typetoken +
  clindex + fog + active_p + passive_p + lm_negative_p +
  lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))

library(recipes)
library(parsnip)

df <- read_csv("../../Data/Session_6.csv")
BCEformula <- BCE_eq

train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)

rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%  # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%  # Scale all prediction variables
  step_intercept()  # Add an intercept to the model

# Juice our data
prepped <- rec %>% prep(training=train)
train_x <- juice(prepped, all_predictors(), composition = "dgCMatrix")
train_y <- juice(prepped, all_outcomes(), composition = "matrix")
test_prepped <- rec %>% prep(training=test)
test_x <- juice(test_prepped, all_predictors(), composition = "dgCMatrix")
test_y <- juice(test_prepped, all_outcomes(), composition = "matrix")

# Cross validation
set.seed(482342)  #for reproducibility
library(xgboost)
# model setup
params <- list(max_depth=10, eta=0.2,
               gamma=10, min_child_weight=5,
               objective="binary:logistic")
# run the model
xgbCV <- xgb.cv(params=params, data=train_x,
                label=train_y, nrounds=100,
                eval_metric="auc", nfold=10,
                stratified=TRUE)

[1] train-auc:0.552507+0.080499 test-auc:0.538707+0.062529 
[2] train-auc:0.586947+0.087237 test-auc:0.563604+0.068172 
[3] train-auc:0.603035+0.084511 test-auc:0.583011+0.074621 
[4] train-auc:0.663903+0.057212 test-auc:0.631184+0.055907 
[5] train-auc:0.677173+0.064281 test-auc:0.639249+0.055184 
[6] train-auc:0.707156+0.026578 test-auc:0.663628+0.038438 
[7] train-auc:0.716727+0.025892 test-auc:0.666075+0.037700 
[8] train-auc:0.728506+0.026368 test-auc:0.671749+0.041744 
[9] train-auc:0.768085+0.025756 test-auc:0.682083+0.041544 
[10]    train-auc:0.783654+0.030705 test-auc:0.687617+0.046750 
[11]    train-auc:0.796643+0.027157 test-auc:0.701862+0.046887 
[12]    train-auc:0.814196+0.019522 test-auc:0.707956+0.051442 
[13]    train-auc:0.834534+0.023090 test-auc:0.718937+0.051517 
[14]    train-auc:0.855445+0.020539 test-auc:0.738984+0.046730 
[15]    train-auc:0.865581+0.014472 test-auc:0.746203+0.053149 
[16]    train-auc:0.879177+0.015412 test-auc:0.755713+0.047733 
[17]    train-auc:0.885384+0.010695 test-auc:0.756954+0.049152 
[18]    train-auc:0.893771+0.010416 test-auc:0.754607+0.049381 
[19]    train-auc:0.899295+0.011640 test-auc:0.755961+0.048730 
[20]    train-auc:0.904153+0.009617 test-auc:0.757726+0.049454 
[21]    train-auc:0.912452+0.011732 test-auc:0.767517+0.049317 
[22]    train-auc:0.916374+0.010769 test-auc:0.771695+0.049749 
[23]    train-auc:0.923077+0.008942 test-auc:0.776505+0.044740 
[24]    train-auc:0.931861+0.007251 test-auc:0.775831+0.047694 
[25]    train-auc:0.939725+0.007186 test-auc:0.780205+0.050416 
[26]    train-auc:0.946987+0.007851 test-auc:0.781855+0.049097 
[27]    train-auc:0.953082+0.007199 test-auc:0.790830+0.049520 
[28]    train-auc:0.956905+0.006655 test-auc:0.790994+0.048041 
[29]    train-auc:0.959474+0.007135 test-auc:0.790498+0.049088 
[30]    train-auc:0.962160+0.007075 test-auc:0.789384+0.049979 
[31]    train-auc:0.964746+0.007114 test-auc:0.792300+0.051115 
[32]    train-auc:0.966856+0.007151 test-auc:0.792673+0.052020 
[33]    train-auc:0.968700+0.007220 test-auc:0.795034+0.054237 
[34]    train-auc:0.970152+0.007450 test-auc:0.794372+0.055125 
[35]    train-auc:0.971111+0.007797 test-auc:0.794512+0.056575 
[36]    train-auc:0.971236+0.007781 test-auc:0.794393+0.056170 
[37]    train-auc:0.971682+0.007156 test-auc:0.794569+0.055670 
[38]    train-auc:0.972597+0.005744 test-auc:0.795533+0.054641 
[39]    train-auc:0.972833+0.005444 test-auc:0.797224+0.055205 
[40]    train-auc:0.973173+0.004823 test-auc:0.797334+0.055032 
[41]    train-auc:0.973331+0.004548 test-auc:0.796973+0.055180 
[42]    train-auc:0.973664+0.004087 test-auc:0.797062+0.055142 
[43]    train-auc:0.973977+0.004077 test-auc:0.796690+0.055335 
[44]    train-auc:0.974349+0.003928 test-auc:0.796797+0.055291 
[45]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[46]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[47]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[48]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[49]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[50]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[51]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[52]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[53]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[54]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[55]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[56]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[57]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[58]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[59]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[60]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[61]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[62]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[63]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[64]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[65]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[66]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[67]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[68]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[69]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[70]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[71]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[72]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[73]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[74]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[75]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[76]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[77]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[78]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[79]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[80]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[81]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[82]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[83]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[84]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[85]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[86]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[87]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[88]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[89]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[90]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[91]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[92]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[93]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[94]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[95]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[96]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[97]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[98]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[99]    train-auc:0.974441+0.003939 test-auc:0.796609+0.055369 
[100]   train-auc:0.974441+0.003939 test-auc:0.796609+0.055369

numTrees <- min(
 which(
  xgbCV$evaluation_log$test_auc_mean == 
  max(xgbCV$evaluation_log$test_auc_mean)
 )
)

fit4 <- xgboost(params=params,
                data = train_x,
                label = train_y,
                nrounds = numTrees,
                eval_metric="auc")

[1] train-auc:0.500000 
[2] train-auc:0.663489 
[3] train-auc:0.663489 
[4] train-auc:0.703386 
[5] train-auc:0.703386 
[6] train-auc:0.704123 
[7] train-auc:0.727505 
[8] train-auc:0.727505 
[9] train-auc:0.727505 
[10]    train-auc:0.784639 
[11]    train-auc:0.818359 
[12]    train-auc:0.816647 
[13]    train-auc:0.851022 
[14]    train-auc:0.864434 
[15]    train-auc:0.877787 
[16]    train-auc:0.883615 
[17]    train-auc:0.885182 
[18]    train-auc:0.899875 
[19]    train-auc:0.902216 
[20]    train-auc:0.912799 
[21]    train-auc:0.917703 
[22]    train-auc:0.918794 
[23]    train-auc:0.920644 
[24]    train-auc:0.926631 
[25]    train-auc:0.933874 
[26]    train-auc:0.947723 
[27]    train-auc:0.958398 
[28]    train-auc:0.961457 
[29]    train-auc:0.963316 
[30]    train-auc:0.966204 
[31]    train-auc:0.968502 
[32]    train-auc:0.971049 
[33]    train-auc:0.972695 
[34]    train-auc:0.975750 
[35]    train-auc:0.977323 
[36]    train-auc:0.978427 
[37]    train-auc:0.979179 
[38]    train-auc:0.979179 
[39]    train-auc:0.979179 
[40]    train-auc:0.979179

xgb.train.data = xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names = attr(xgb.train.data, ".Dimnames")[[2]]
imp = xgb.importance(col_names, fit4)
# Variable importance
xgb.plot.importance(imp)

pred.xgb <- c(predict(fit4, test_x, type="response"))
df_test <- data.frame(pred.xgb=pred.xgb, AAER=factor(test_y, levels=c(0,1)))

pred.xgb <- c(predict(fit4, train_x, type="response"))
df_train <- data.frame(pred.xgb=pred.xgb, AAER=factor(train_y, levels=c(0,1)))

auc_in <- df_train %>% roc_auc(AAER, pred.xgb, event_level='second')
auc_out <- df_test %>% roc_auc(AAER, pred.xgb, event_level='second')
curve_in_xgb <- df_train %>% roc_curve(AAER, pred.xgb, event_level='second')
curve_out_xgb <- df_test %>% roc_curve(AAER, pred.xgb, event_level='second')

aucs_xgb <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_xgb) <- c("In sample, XGBoost", "Out of sample, XGBoost")

ggplot() +
  geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) + 
  geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011")) + 
  geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s")) + 
  geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011")) + 
  geom_line(data=curve_out_1990s, aes(y=sensitivity, x=1-specificity, color="1990s")) +
  geom_line(data=curve_out_CV.min, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.min")) + 
  geom_line(data=curve_out_CV.1se, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.1se")) + 
  geom_line(data=curve_out_xgb, aes(y=sensitivity, x=1-specificity, color="XGBoost")) +
  geom_abline(slope=1)

oos_aucs <- c(aucs_1990s[2], aucs_2000s[2], aucs_2000f[2], aucs_2011[2], aucs_BCE[2], aucs_CV.1se[2], aucs_CV.min[2], auc_out$.estimate)
names(oos_aucs) <- c('1990s', '2000s', '2000s + 2011', '2011', 'BC', 'LASSO, lambda.1se', 'LASSO, lambda.min', 'XGBoost')
oos_aucs

            1990s             2000s      2000s + 2011              2011 
        0.7292981         0.6295414         0.7147021         0.6849225 
               BC LASSO, lambda.1se LASSO, lambda.min           XGBoost 
        0.7599594         0.7124231         0.7290185         0.8083503

df_save_0 <- df_save %>% filter(Test==0)
df_save_1 <- df_save %>% filter(Test==1)
df_save_0$pred_xgb <- predict(fit4, train_x, type="response")
df_save_1$pred_xgb <- predict(fit4, test_x, type="response")
df_save <- bind_rows(df_save_0, df_save_1)
saveRDS(df_save, '../../Data/Session_6_models.rds')