Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
library(knitr)
library(kableExtra)
# Render a data frame as a striped, hoverable Bootstrap HTML table.
#
# Args:
#   text: data frame (or matrix) to render.
#   cols: optional character vector of column headers; defaults to
#         colnames(text) when NULL/empty.
#   col1: if TRUE, render the first column in bold.
#   full: passed to kable_styling(full_width=); FALSE keeps the table compact.
#
# Returns: a kableExtra HTML table object (prints as HTML in the notebook).
html_df <- function(text, cols = NULL, col1 = FALSE, full = FALSE) {
  if (length(cols) == 0) {  # covers both NULL and zero-length input
    cols <- colnames(text)
  }
  # Left-align the first column, center the rest; build the styled table once
  # instead of duplicating the kable() call in both branches.
  out <- kable(text, "html", col.names = cols,
               align = c("l", rep("c", length(cols) - 1))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = full)
  if (col1) {
    out <- out %>% column_spec(1, bold = TRUE)
  }
  out
}
```r
library(tidyverse)
<!-- rnb-source-end -->
<!-- rnb-output-begin eyJkYXRhIjoiUmVnaXN0ZXJlZCBTMyBtZXRob2Qgb3ZlcndyaXR0ZW4gYnkgJ2RwbHlyJzpcbiAgbWV0aG9kICAgICAgICAgICBmcm9tXG4gIHByaW50LnJvd3dpc2VfZGYgICAgIFxuLS0gQXR0YWNoaW5nIHBhY2thZ2VzIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLSB0aWR5dmVyc2UgMS4yLjEgLS1cbnYgZ2dwbG90MiAzLjIuMSAgICAgdiBwdXJyciAgIDAuMy4yXG52IHRpYmJsZSAgMi4xLjMgICAgIHYgZHBseXIgICAwLjguM1xudiB0aWR5ciAgIDEuMC4wICAgICB2IHN0cmluZ3IgMS40LjBcbnYgcmVhZHIgICAxLjMuMSAgICAgdiBmb3JjYXRzIDAuNC4wXG4tLSBDb25mbGljdHMgLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tIHRpZHl2ZXJzZV9jb25mbGljdHMoKSAtLVxueCBkcGx5cjo6ZmlsdGVyKCkgICAgIG1hc2tzIHN0YXRzOjpmaWx0ZXIoKVxueCBkcGx5cjo6Z3JvdXBfcm93cygpIG1hc2tzIGthYmxlRXh0cmE6Omdyb3VwX3Jvd3MoKVxueCBkcGx5cjo6bGFnKCkgICAgICAgIG1hc2tzIHN0YXRzOjpsYWcoKVxuIn0= -->
Registered S3 method overwritten by ‘dplyr’: method from print.rowwise_df
– Attaching packages ————————————— tidyverse 1.2.1 – v ggplot2 3.2.1 v purrr 0.3.2 v tibble 2.1.3 v dplyr 0.8.3 v tidyr 1.0.0 v stringr 1.4.0 v readr 1.3.1 v forcats 0.4.0 – Conflicts —————————————— tidyverse_conflicts() – x dplyr::filter() masks stats::filter() x dplyr::group_rows() masks kableExtra::group_rows() x dplyr::lag() masks stats::lag()
<!-- rnb-output-end -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxubGlicmFyeShjb2VmcGxvdClcbmBgYFxuYGBgIn0= -->
```r
```r
library(coefplot)
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuZGYgPC0gcmVhZC5jc3YoXCIuLi8uLi9EYXRhL1Nlc3Npb25fNi5jc3ZcIikgJT4lXG4gIG11dGF0ZShBQUVSID0gZmFjdG9yKEFBRVIsIGxldmVscz1jKDAsMSkpKVxuYGBgIn0= -->
```r
# Load the fraud-detection panel; AAER (SEC enforcement action, 0/1) is the
# dependent variable, coded as a factor for the classifiers below.
df <- read.csv("../../Data/Session_6.csv") %>%
mutate(AAER = factor(AAER, levels=c(0,1)))
# Toy example: AAERs are detected with a lag, so a dataset built "as of 2004"
# would miss frauds not yet caught (third row: committed 2003, found 2006).
ex <- data.frame(year=c(1999,2001,2003), year_found=c(2001,2003,2006), aaer=c(1,1,1), aaer_as_of_2004=c(1,1,0))
html_df(ex)
year | year_found | aaer | aaer_as_of_2004 |
---|---|---|---|
1999 | 2001 | 1 | 1 |
2001 | 2003 | 1 | 1 |
2003 | 2006 | 1 | 0 |
# Per-year counts of AAER cases and total firm-year observations.
# summarise() replaces the original mutate() + slice(1) + ungroup() + select()
# chain: one row per year with the same three columns, and no grouping left
# behind (summarise drops the single grouping level).
df %>%
  group_by(year) %>%
  summarise(total_AAERS = sum(AAER == 1), total_observations = n()) %>%
  html_df
year | total_AAERS | total_observations |
---|---|---|
1999 | 46 | 2195 |
2000 | 50 | 2041 |
2001 | 43 | 2021 |
2002 | 50 | 2391 |
2003 | 57 | 2936 |
2004 | 49 | 2843 |
# 1990s-era model: logit of AAER on financial-statement ratios, fit on the
# training subsample (Test == 0).
# FIX: the original formula listed b_rect twice; duplicate formula terms are
# silently de-duplicated by model.matrix() (the fitted summary shows a single
# b_rect coefficient), so removing the repeat does not change the fit.
fit_1990s <- glm(AAER ~ ebit + ni_revt + ni_at + log_lt + ltl_at + lt_seq +
                   lt_at + act_lct + aq_lct + wcap_at + invt_revt + invt_at +
                   ni_ppent + rect_revt + revt_at + d_revt + b_rect +
                   r_gp + b_gp + gp_at + revt_m_gp + ch_at + log_at +
                   ppent_at + wcap,
                 data = df[df$Test == 0, ],
                 family = binomial)
summary(fit_1990s)
Call:
glm(formula = AAER ~ ebit + ni_revt + ni_at + log_lt + ltl_at +
lt_seq + lt_at + act_lct + aq_lct + wcap_at + invt_revt +
invt_at + ni_ppent + rect_revt + revt_at + d_revt + b_rect +
b_rect + r_gp + b_gp + gp_at + revt_m_gp + ch_at + log_at +
ppent_at + wcap, family = binomial, data = df[df$Test ==
0, ])
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1391 -0.2275 -0.1661 -0.1190 3.6236
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.660e+00 8.336e-01 -5.591 2.26e-08 ***
ebit -3.564e-04 1.094e-04 -3.257 0.00112 **
ni_revt 3.664e-02 3.058e-02 1.198 0.23084
ni_at -3.196e-01 2.325e-01 -1.374 0.16932
log_lt 1.494e-01 3.409e-01 0.438 0.66118
ltl_at -2.306e-01 7.072e-01 -0.326 0.74438
lt_seq -2.826e-05 4.567e-04 -0.062 0.95067
lt_at -8.559e-01 9.270e-01 -0.923 0.35586
act_lct 1.401e-01 7.005e-02 2.000 0.04546 *
aq_lct -1.751e-01 9.156e-02 -1.912 0.05588 .
wcap_at -8.488e-01 6.487e-01 -1.308 0.19075
invt_revt -9.165e-01 9.679e-01 -0.947 0.34371
invt_at 2.350e+00 1.033e+00 2.276 0.02286 *
ni_ppent 2.703e-03 1.203e-02 0.225 0.82223
rect_revt 6.587e-02 7.427e-02 0.887 0.37511
revt_at -4.790e-01 1.734e-01 -2.763 0.00572 **
d_revt -7.848e-04 1.586e-03 -0.495 0.62079
b_rect 8.212e-02 1.569e-01 0.524 0.60060
r_gp -1.055e-04 6.623e-04 -0.159 0.87345
b_gp 1.266e-01 1.556e-01 0.813 0.41605
gp_at 1.031e-01 3.512e-01 0.293 0.76921
revt_m_gp 2.621e-05 1.094e-05 2.396 0.01655 *
ch_at -5.356e-01 7.909e-01 -0.677 0.49829
log_at 2.687e-01 3.483e-01 0.771 0.44045
ppent_at -2.956e+00 4.798e-01 -6.161 7.23e-10 ***
wcap 7.009e-05 5.187e-05 1.351 0.17658
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2118.6 on 10268 degrees of freedom
Residual deviance: 1928.8 on 10243 degrees of freedom
(1315 observations deleted due to missingness)
AIC: 1980.8
Number of Fisher Scoring iterations: 12
library(yardstick)
For binary classification, the first factor level is assumed to be the event.
Use the argument `event_level = "second"` to alter this as needed.
Attaching package: ‘yardstick’
The following object is masked from ‘package:readr’:
spec
# Score every firm-year with the 1990s model, then evaluate ROC/AUC on the
# training (Test == 0) and holdout (Test == 1) subsamples.
# event_level = 'second' marks AAER == 1 (the second factor level) as the
# positive class.
df$pred_1990s <- predict(fit_1990s, df, type = "response")
auc_in <- roc_auc(filter(df, Test == 0), truth = AAER,
                  estimate = pred_1990s, event_level = 'second')
auc_out <- roc_auc(filter(df, Test == 1), truth = AAER,
                   estimate = pred_1990s, event_level = 'second')
curve_in_1990s <- roc_curve(filter(df, Test == 0), truth = AAER,
                            estimate = pred_1990s, event_level = 'second')
curve_out_1990s <- roc_curve(filter(df, Test == 1), truth = AAER,
                             estimate = pred_1990s, event_level = 'second')
# Both ROC curves against the 45-degree no-discrimination line
ggplot() +
  geom_line(data = curve_in_1990s,
            aes(y = sensitivity, x = 1 - specificity,
                color = "1990s, In Sample")) +
  geom_line(data = curve_out_1990s,
            aes(y = sensitivity, x = 1 - specificity,
                color = "1990s, Out of Sample")) +
  geom_abline(slope = 1)
# Named AUC vector, printed for the notebook
aucs_1990s <- c("In sample AUC" = auc_in$.estimate,
                "Out of sample AUC" = auc_out$.estimate)
aucs_1990s
In sample AUC Out of sample AUC
0.7483132 0.7292981
# 2011-era predictor set: firm size, accrual/receivable/inventory changes,
# soft assets, financing activity, auditor-size indicators, merger and
# restructuring flags. Fit on the training subsample (Test == 0).
fit_2011 <- glm(AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct,
data=df[df$Test==0,],
family=binomial)
summary(fit_2011)
Call:
glm(formula = AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance + oplease_dum +
book_mkt + lag_sdvol + merger + bigNaudit + midNaudit + cffin +
exfin + restruct, family = binomial, data = df[df$Test ==
0, ])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.8434 -0.2291 -0.1658 -0.1196 3.2614
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.1474558 0.5337491 -13.391 < 2e-16 ***
logtotasset 0.3214322 0.0355467 9.043 < 2e-16 ***
rsst_acc -0.2190095 0.3009287 -0.728 0.4667
chg_recv 1.1020740 1.0590837 1.041 0.2981
chg_inv 0.0389504 1.2507142 0.031 0.9752
soft_assets 2.3094551 0.3325731 6.944 3.81e-12 ***
pct_chg_cashsales -0.0006912 0.0108771 -0.064 0.9493
chg_roa -0.2697984 0.2554262 -1.056 0.2908
issuance 0.1443841 0.3187606 0.453 0.6506
oplease_dum -0.2029022 0.1970597 -1.030 0.3032
book_mkt 0.0150113 0.0110652 1.357 0.1749
lag_sdvol 0.0517368 0.0555338 0.932 0.3515
merger 0.3529102 0.1511729 2.334 0.0196 *
bigNaudit -0.1998454 0.3598283 -0.555 0.5786
midNaudit -0.4894029 0.5118388 -0.956 0.3390
cffin 0.4557560 0.3435253 1.327 0.1846
exfin -0.0053608 0.0393354 -0.136 0.8916
restruct 0.3827721 0.1471729 2.601 0.0093 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2381.9 on 11583 degrees of freedom
Residual deviance: 2189.7 on 11566 degrees of freedom
AIC: 2225.7
Number of Fisher Scoring iterations: 7
# Score the 2011 model and compute in/out-of-sample ROC and AUC.
# event_level='second': the positive class is AAER == 1 (second factor level).
df$pred_2011 <- predict(fit_2011, df, type="response")
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_2011, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_2011, event_level='second')
curve_in_2011 <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_2011, event_level='second')
curve_out_2011 <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_2011, event_level='second')
ggplot() +
geom_line(data=curve_in_2011, aes(y=sensitivity, x=1-specificity, color="2011, In Sample")) +
geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011, Out of Sample")) +
geom_abline(slope=1)
aucs_2011 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2011) <- c("In sample AUC", "Out of sample AUC")
aucs_2011
In sample AUC Out of sample AUC
0.7445378 0.6849225
# Keep predictions for later model comparison.
# NOTE(review): "pred_F" presumably = financial-variable model -- confirm.
df_save <- data.frame(Test=df$Test, AAER=df$AAER, pred_F=df$pred_2011)
# 2000s-era model based purely on 10-K text characteristics: document length
# and markup measures, readability (fog, clindex, type-token), sentiment
# (lm_negative_p / lm_positive_p), and punctuation. Training subsample only.
fit_2000s <- glm(AAER ~ bullets + headerlen + newlines + alltags +
processedsize + sentlen_u + wordlen_s + paralen_s +
repetitious_p + sentlen_s + typetoken + clindex + fog +
active_p + passive_p + lm_negative_p + lm_positive_p +
allcaps + exclamationpoints + questionmarks,
data=df[df$Test==0,],
family=binomial)
summary(fit_2000s)
Call:
glm(formula = AAER ~ bullets + headerlen + newlines + alltags +
processedsize + sentlen_u + wordlen_s + paralen_s + repetitious_p +
sentlen_s + typetoken + clindex + fog + active_p + passive_p +
lm_negative_p + lm_positive_p + allcaps + exclamationpoints +
questionmarks, family = binomial, data = df[df$Test == 0,
])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9604 -0.2244 -0.1984 -0.1749 3.2318
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.662e+00 3.143e+00 -1.801 0.07165 .
bullets -2.635e-05 2.625e-05 -1.004 0.31558
headerlen -2.943e-04 3.477e-04 -0.846 0.39733
newlines -4.821e-05 1.220e-04 -0.395 0.69271
alltags 5.060e-08 2.567e-07 0.197 0.84376
processedsize 5.709e-06 1.287e-06 4.435 9.19e-06 ***
sentlen_u -3.790e-02 6.897e-02 -0.550 0.58259
wordlen_s 1.278e-01 1.199e+00 0.107 0.91510
paralen_s -4.808e-02 3.052e-02 -1.576 0.11514
repetitious_p -1.673e+00 1.665e+00 -1.005 0.31508
sentlen_s -2.098e-02 2.222e-02 -0.944 0.34518
typetoken -3.609e-01 1.729e+00 -0.209 0.83469
clindex 2.744e-01 1.519e-01 1.806 0.07085 .
fog -1.839e-02 1.322e-01 -0.139 0.88935
active_p -4.321e-01 1.459e+00 -0.296 0.76711
passive_p 7.321e-02 3.204e+00 0.023 0.98177
lm_negative_p 1.356e+01 1.304e+01 1.039 0.29875
lm_positive_p -9.911e+01 3.842e+01 -2.580 0.00989 **
allcaps -4.646e-04 1.835e-04 -2.532 0.01136 *
exclamationpoints -3.520e-01 1.944e-01 -1.811 0.07010 .
questionmarks 3.371e-03 2.816e-02 0.120 0.90471
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2381.9 on 11583 degrees of freedom
Residual deviance: 2329.2 on 11563 degrees of freedom
AIC: 2371.2
Number of Fisher Scoring iterations: 10
# Score the text-style (2000s) model and compute in/out-of-sample ROC/AUC.
df$pred_2000s <- predict(fit_2000s, df, type="response")
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_2000s, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_2000s, event_level='second')
curve_in_2000s <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_2000s, event_level='second')
curve_out_2000s <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_2000s, event_level='second')
ggplot() +
geom_line(data=curve_in_2000s, aes(y=sensitivity, x=1-specificity, color="2000s, In Sample")) +
geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s, Out of Sample")) +
geom_abline(slope=1)
aucs_2000s <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2000s) <- c("In sample AUC", "Out of sample AUC")
aucs_2000s
In sample AUC Out of sample AUC
0.6377783 0.6295414
# NOTE(review): "pred_S" presumably = style/text model -- confirm naming.
df_save$pred_S <- df$pred_2000s
# Combined model: the 2011 financial predictors plus the 2000s text-style
# measures, fit on the training subsample.
fit_2000f <- glm(AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks,
data=df[df$Test==0,],
family=binomial)
summary(fit_2000f)
Call:
glm(formula = AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance + oplease_dum +
book_mkt + lag_sdvol + merger + bigNaudit + midNaudit + cffin +
exfin + restruct + bullets + headerlen + newlines + alltags +
processedsize + sentlen_u + wordlen_s + paralen_s + repetitious_p +
sentlen_s + typetoken + clindex + fog + active_p + passive_p +
lm_negative_p + lm_positive_p + allcaps + exclamationpoints +
questionmarks, family = binomial, data = df[df$Test == 0,
])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.9514 -0.2237 -0.1596 -0.1110 3.3882
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.634e+00 3.415e+00 -0.479 0.63223
logtotasset 3.437e-01 3.921e-02 8.766 < 2e-16 ***
rsst_acc -2.123e-01 2.995e-01 -0.709 0.47844
chg_recv 1.022e+00 1.055e+00 0.969 0.33274
chg_inv -9.464e-02 1.226e+00 -0.077 0.93845
soft_assets 2.573e+00 3.387e-01 7.598 3.01e-14 ***
pct_chg_cashsales -1.134e-03 1.103e-02 -0.103 0.91811
chg_roa -2.696e-01 2.470e-01 -1.092 0.27504
issuance 1.471e-01 3.220e-01 0.457 0.64777
oplease_dum -2.859e-01 2.020e-01 -1.416 0.15691
book_mkt 1.531e-02 1.115e-02 1.373 0.16967
lag_sdvol 6.147e-02 5.418e-02 1.135 0.25655
merger 3.699e-01 1.536e-01 2.408 0.01604 *
bigNaudit -1.992e-01 3.638e-01 -0.548 0.58392
midNaudit -4.839e-01 5.139e-01 -0.942 0.34635
cffin 4.608e-01 3.599e-01 1.280 0.20044
exfin -1.235e-03 5.907e-02 -0.021 0.98332
restruct 3.035e-01 1.555e-01 1.952 0.05099 .
bullets -1.958e-05 2.519e-05 -0.777 0.43692
headerlen -5.792e-04 4.038e-04 -1.434 0.15151
newlines -7.423e-05 1.294e-04 -0.574 0.56615
alltags -1.338e-07 2.701e-07 -0.495 0.62036
processedsize 4.111e-06 1.446e-06 2.844 0.00446 **
sentlen_u -5.086e-02 6.950e-02 -0.732 0.46433
wordlen_s -1.618e+00 1.303e+00 -1.241 0.21450
paralen_s -3.934e-02 2.923e-02 -1.346 0.17834
repetitious_p -1.433e+00 1.658e+00 -0.864 0.38740
sentlen_s -1.690e-02 2.212e-02 -0.764 0.44502
typetoken -1.216e+00 1.788e+00 -0.680 0.49643
clindex 9.620e-02 1.521e-01 0.632 0.52718
fog 1.063e-02 1.339e-01 0.079 0.93675
active_p 6.259e-01 1.435e+00 0.436 0.66271
passive_p -5.848e+00 3.796e+00 -1.541 0.12337
lm_negative_p 2.012e+01 1.220e+01 1.649 0.09906 .
lm_positive_p -3.700e+01 3.864e+01 -0.958 0.33825
allcaps -4.721e-04 1.912e-04 -2.469 0.01354 *
exclamationpoints -4.510e-01 2.024e-01 -2.228 0.02586 *
questionmarks 4.096e-03 2.853e-02 0.144 0.88586
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2381.9 on 11583 degrees of freedom
Residual deviance: 2143.0 on 11546 degrees of freedom
AIC: 2219
Number of Fisher Scoring iterations: 10
# Score the combined (financial + text) model and compute ROC/AUC.
df$pred_2000f <- predict(fit_2000f, df, type="response")
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_2000f, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_2000f, event_level='second')
curve_in_2000f <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_2000f, event_level='second')
curve_out_2000f <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_2000f, event_level='second')
ggplot() +
geom_line(data=curve_in_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011, In Sample")) +
geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011, Out of Sample")) +
geom_abline(slope=1)
aucs_2000f <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_2000f) <- c("In sample AUC", "Out of sample AUC")
aucs_2000f
In sample AUC Out of sample AUC
0.7664115 0.7147021
# NOTE(review): "pred_FS" presumably = financial + style combined -- confirm.
df_save$pred_FS <- df$pred_2000f
# Build the BCE formula: all combined (financial + text-style) predictors plus
# the 30 topic-model shares (Topic_1_n_oI ... Topic_30_n_oI), assembled as a
# string and parsed with as.formula() (the parser ignores internal whitespace).
# FIX: use <- for top-level assignment (the re-definition of BCE_eq later in
# this file already does), and drop the redundant collapse="" -- both paste()
# inputs are length-1, so collapsing is a no-op.
BCE_eq <- as.formula(paste(
  "AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
   soft_assets + pct_chg_cashsales + chg_roa + issuance +
   oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
   midNaudit + cffin + exfin + restruct + bullets + headerlen +
   newlines + alltags + processedsize + sentlen_u + wordlen_s +
   paralen_s + repetitious_p + sentlen_s + typetoken +
   clindex + fog + active_p + passive_p + lm_negative_p +
   lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_", 1:30, "_n_oI"), collapse = " + ")))
# Fit the full BCE logit on the training subsample
fit_BCE <- glm(BCE_eq,
               data = df[df$Test == 0, ],
               family = binomial)
summary(fit_BCE)
Call:
glm(formula = BCE_eq, family = binomial, data = df[df$Test ==
0, ])
Deviance Residuals:
Min 1Q Median 3Q Max
-1.0887 -0.2212 -0.1478 -0.0940 3.5401
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.032e+00 3.872e+00 -2.074 0.03806 *
logtotasset 3.879e-01 4.554e-02 8.519 < 2e-16 ***
rsst_acc -1.938e-01 3.055e-01 -0.634 0.52593
chg_recv 8.581e-01 1.071e+00 0.801 0.42296
chg_inv -2.607e-01 1.223e+00 -0.213 0.83119
soft_assets 2.555e+00 3.796e-01 6.730 1.7e-11 ***
pct_chg_cashsales -1.976e-03 6.997e-03 -0.282 0.77767
chg_roa -2.532e-01 2.786e-01 -0.909 0.36354
issuance 9.692e-02 3.269e-01 0.296 0.76687
oplease_dum -3.451e-01 2.097e-01 -1.645 0.09989 .
book_mkt 1.361e-02 1.151e-02 1.183 0.23692
lag_sdvol 4.546e-02 5.709e-02 0.796 0.42589
merger 3.224e-01 1.572e-01 2.051 0.04027 *
bigNaudit -2.010e-01 3.711e-01 -0.542 0.58804
midNaudit -4.641e-01 5.195e-01 -0.893 0.37169
cffin 6.024e-01 3.769e-01 1.598 0.10998
exfin -6.738e-03 4.063e-02 -0.166 0.86831
restruct 1.915e-01 1.593e-01 1.202 0.22920
bullets -1.612e-05 2.526e-05 -0.638 0.52334
headerlen -3.956e-04 3.722e-04 -1.063 0.28776
newlines -1.038e-04 1.333e-04 -0.779 0.43623
alltags -1.338e-07 2.772e-07 -0.483 0.62934
processedsize 4.178e-06 1.477e-06 2.828 0.00468 **
sentlen_u -7.131e-02 7.881e-02 -0.905 0.36553
wordlen_s 4.413e-01 1.430e+00 0.309 0.75761
paralen_s -4.584e-02 2.976e-02 -1.540 0.12356
repetitious_p -1.525e+00 1.780e+00 -0.857 0.39152
sentlen_s -7.700e-04 2.238e-02 -0.034 0.97255
typetoken 5.313e-02 1.934e+00 0.027 0.97809
clindex 6.406e-02 1.669e-01 0.384 0.70116
fog 7.432e-02 1.590e-01 0.467 0.64018
active_p -3.159e-01 1.628e+00 -0.194 0.84620
passive_p -6.714e+00 4.192e+00 -1.602 0.10926
lm_negative_p 3.558e+00 1.422e+01 0.250 0.80240
lm_positive_p -8.906e+01 4.641e+01 -1.919 0.05497 .
allcaps -4.189e-04 1.916e-04 -2.186 0.02878 *
exclamationpoints -4.685e-01 2.143e-01 -2.187 0.02878 *
questionmarks 4.424e-03 2.876e-02 0.154 0.87774
Topic_1_n_oI 3.157e+01 1.137e+02 0.278 0.78136
Topic_2_n_oI 6.382e+01 6.784e+01 0.941 0.34684
Topic_3_n_oI -1.853e+02 1.457e+02 -1.272 0.20335
Topic_4_n_oI -4.247e+01 7.460e+01 -0.569 0.56919
Topic_5_n_oI -8.240e+00 8.061e+01 -0.102 0.91858
Topic_6_n_oI -5.072e+02 1.793e+02 -2.829 0.00468 **
Topic_7_n_oI 1.994e+01 8.192e+01 0.243 0.80772
Topic_8_n_oI -9.620e+01 8.419e+01 -1.143 0.25321
Topic_9_n_oI 5.963e+01 7.064e+01 0.844 0.39863
Topic_10_n_oI -4.641e+01 6.705e+01 -0.692 0.48885
Topic_11_n_oI -1.415e+02 7.384e+01 -1.916 0.05536 .
Topic_12_n_oI 2.147e-01 8.441e+01 0.003 0.99797
Topic_13_n_oI -1.420e+02 2.612e+02 -0.543 0.58686
Topic_14_n_oI 2.086e+01 7.840e+01 0.266 0.79019
Topic_15_n_oI -3.519e+01 6.332e+01 -0.556 0.57840
Topic_16_n_oI 1.629e+01 1.156e+02 0.141 0.88793
Topic_17_n_oI -5.188e+01 8.901e+01 -0.583 0.56000
Topic_18_n_oI 2.239e+01 9.340e+01 0.240 0.81058
Topic_19_n_oI -1.071e+02 8.063e+01 -1.328 0.18422
Topic_20_n_oI -4.885e+01 9.701e+01 -0.504 0.61455
Topic_21_n_oI -1.515e+02 9.116e+01 -1.662 0.09654 .
Topic_22_n_oI -6.818e+00 6.472e+01 -0.105 0.91610
Topic_23_n_oI -1.226e+01 6.965e+01 -0.176 0.86031
Topic_24_n_oI -6.545e+01 9.523e+01 -0.687 0.49189
Topic_25_n_oI -6.863e+01 7.094e+01 -0.967 0.33336
Topic_26_n_oI -2.182e+00 6.580e+01 -0.033 0.97354
Topic_27_n_oI 9.374e+00 1.418e+02 0.066 0.94729
Topic_28_n_oI 5.882e+00 7.937e+01 0.074 0.94092
Topic_29_n_oI -4.209e+01 6.670e+01 -0.631 0.52798
Topic_30_n_oI -4.605e+02 2.672e+02 -1.724 0.08477 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2381.9 on 11583 degrees of freedom
Residual deviance: 2069.5 on 11516 degrees of freedom
AIC: 2205.5
Number of Fisher Scoring iterations: 11
# Score the BCE model over all observations, then compute in-sample
# (Test == 0) and out-of-sample (Test == 1) ROC curves and AUCs.
# event_level = 'second' marks AAER == 1 as the positive class.
df$pred_BCE <- predict(fit_BCE, df, type = "response")
auc_in <- roc_auc(filter(df, Test == 0), truth = AAER,
                  estimate = pred_BCE, event_level = 'second')
auc_out <- roc_auc(filter(df, Test == 1), truth = AAER,
                   estimate = pred_BCE, event_level = 'second')
curve_in_BCE <- roc_curve(filter(df, Test == 0), truth = AAER,
                          estimate = pred_BCE, event_level = 'second')
curve_out_BCE <- roc_curve(filter(df, Test == 1), truth = AAER,
                           estimate = pred_BCE, event_level = 'second')
# ROC curves against the 45-degree no-discrimination benchmark
ggplot() +
  geom_line(data = curve_in_BCE,
            aes(y = sensitivity, x = 1 - specificity,
                color = "BCE, In Sample")) +
  geom_line(data = curve_out_BCE,
            aes(y = sensitivity, x = 1 - specificity,
                color = "BCE, Out of Sample")) +
  geom_abline(slope = 1)
# Named AUC vector, printed for the notebook
aucs_BCE <- c("In sample AUC" = auc_in$.estimate,
              "Out of sample AUC" = auc_out$.estimate)
aucs_BCE
In sample AUC Out of sample AUC
0.7941841 0.7599594
# BUG FIX: the original read df$pred_aucs_BCE, a column that is never created
# (assigning NULL silently drops the target column). The BCE predictions were
# stored in df$pred_BCE above.
df_save$pred_BCE <- df$pred_BCE
# Overlay all five models' out-of-sample ROC curves for direct comparison.
ggplot() +
geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) +
geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011")) +
geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s")) +
geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011")) +
geom_line(data=curve_out_1990s, aes(y=sensitivity, x=1-specificity, color="1990s")) +
geom_abline(slope=1) +
ggtitle("Out of Sample ROC Curves")
# Collect each model's out-of-sample AUC (the second element of each pair)
oos_aucs <- c(aucs_1990s[2], aucs_2011[2], aucs_2000s[2], aucs_2000f[2], aucs_BCE[2])
names(oos_aucs) <- c("1990s", "2011", "2000s", "2000s + 2011", "BCE")
oos_aucs
1990s 2011 2000s 2000s + 2011 BCE
0.7292981 0.6849225 0.6295414 0.7147021 0.7599594
library(glmnet)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loaded glmnet 4.1-2
# glmnet takes matrices, not formulas: build the design matrix (predictors)
# and response vector from the BCE formula on the training subsample.
x <- model.matrix(BCE_eq, data=df[df$Test==0,])[,-1] # [,-1] to remove intercept
y <- model.frame(BCE_eq, data=df[df$Test==0,])[,"AAER"]
# Fit the L1-penalized logit over the full lambda path
fit_LASSO <- glmnet(x=x, y=y,
family = "binomial",
alpha = 1 # Specifies LASSO. alpha = 0 is ridge
)
plot(fit_LASSO)
print(fit_LASSO)
Call: glmnet(x = x, y = y, family = "binomial", alpha = 1)
Df %Dev Lambda
1 0 0.00 0.0143300
2 1 0.81 0.0130500
3 1 1.46 0.0118900
4 1 2.00 0.0108400
5 2 2.47 0.0098740
6 2 3.22 0.0089970
7 2 3.85 0.0081970
8 2 4.37 0.0074690
9 2 4.81 0.0068060
10 3 5.22 0.0062010
11 3 5.59 0.0056500
12 4 5.91 0.0051480
13 4 6.25 0.0046910
14 5 6.57 0.0042740
15 7 6.89 0.0038940
16 8 7.22 0.0035480
17 10 7.52 0.0032330
18 12 7.83 0.0029460
19 15 8.16 0.0026840
20 15 8.49 0.0024460
21 15 8.78 0.0022290
22 15 9.03 0.0020310
23 18 9.26 0.0018500
24 20 9.48 0.0016860
25 22 9.69 0.0015360
26 25 9.89 0.0014000
27 28 10.10 0.0012750
28 30 10.33 0.0011620
29 31 10.54 0.0010590
30 33 10.74 0.0009647
31 36 10.93 0.0008790
32 37 11.11 0.0008009
33 39 11.26 0.0007297
34 42 11.42 0.0006649
35 43 11.57 0.0006058
36 44 11.71 0.0005520
37 47 11.85 0.0005030
38 48 11.97 0.0004583
39 48 12.08 0.0004176
40 48 12.18 0.0003805
41 51 12.27 0.0003467
42 52 12.35 0.0003159
43 54 12.43 0.0002878
44 54 12.49 0.0002623
45 54 12.55 0.0002390
46 55 12.60 0.0002177
47 56 12.65 0.0001984
48 58 12.70 0.0001808
49 60 12.74 0.0001647
50 61 12.79 0.0001501
51 62 12.82 0.0001367
52 63 12.86 0.0001246
53 65 12.89 0.0001135
54 66 12.92 0.0001034
55 66 12.94 0.0000942
56 66 12.97 0.0000859
57 66 12.98 0.0000782
58 66 13.00 0.0000713
59 66 13.02 0.0000650
60 66 13.03 0.0000592
61 67 13.04 0.0000539
62 67 13.05 0.0000491
63 67 13.06 0.0000448
64 67 13.06 0.0000408
65 67 13.07 0.0000372
66 67 13.08 0.0000339
67 67 13.08 0.0000309
68 67 13.09 0.0000281
69 67 13.09 0.0000256
70 67 13.09 0.0000233
71 67 13.10 0.0000213
72 67 13.10 0.0000194
73 67 13.10 0.0000177
74 67 13.10 0.0000161
75 67 13.10 0.0000147
# Inspect the LASSO coefficients at a hand-picked lambda (0.002031, one of
# the path values printed above), then score and evaluate the model.
#coef(fit_LASSO, s=0.002031)
coefplot(fit_LASSO, lambda=0.002031, sort='magnitude')
# na.pass has model.matrix retain NA values (so the # of rows is constant)
xp <- model.matrix(BCE_eq, data=df, na.action='na.pass')[,-1]
# s= specifies the version of the model to use
df$pred_L1 <- c(predict(fit_LASSO, xp, type="response", s = 0.002031))
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_L1, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_L1, event_level='second')
curve_in_L1 <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_L1, event_level='second')
curve_out_L1 <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_L1, event_level='second')
ggplot() +
geom_line(data=curve_in_L1, aes(y=sensitivity, x=1-specificity, color="BCE, LASSO, In Sample")) +
geom_line(data=curve_out_L1, aes(y=sensitivity, x=1-specificity, color="BCE, LASSO, Out of Sample")) +
geom_abline(slope=1)
aucs_L1 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_L1) <- c("In sample AUC", "Out of sample AUC")
aucs_L1
In sample AUC Out of sample AUC
0.7593828 0.7239785
# Cross validation
# Pick lambda by CV instead of by hand: cv.glmnet maximizes AUC across folds.
set.seed(697435) #for reproducibility
cvfit = cv.glmnet(x=x, y=y,family = "binomial", alpha = 1, type.measure="auc")
plot(cvfit)
# lambda.min = best CV performance; lambda.1se = sparser model within 1 SE
cvfit$lambda.min
[1] 0.001685798
cvfit$lambda.1se
[1] 0.002684268
#coef(cvfit, s = "lambda.min")
coefplot(cvfit, lambda='lambda.min', sort='magnitude') + theme(axis.text.y = element_text(size=15))
#coef(cvfit, s = "lambda.1se")
coefplot(cvfit, lambda='lambda.1se', sort='magnitude') + theme(axis.text.y = element_text(size=15))
# s= specifies the version of the model to use
df$pred_L1.min <- c(predict(cvfit, xp, type="response", s = "lambda.min"))
df$pred_L1.1se <- c(predict(cvfit, xp, type="response", s = "lambda.1se"))
df_save$pred_lmin <- df$pred_L1.min
df_save$pred_l1se <- df$pred_L1.1se
# ROC/AUC for both CV-selected lambdas, in and out of sample
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_L1.min, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_L1.min, event_level='second')
curve_in_CV.min <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_L1.min, event_level='second')
curve_out_CV.min <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_L1.min, event_level='second')
aucs_CV.min <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.min) <- c("In sample, lambda.min", "Out of sample, lambda.min")
auc_in <- df %>% filter(Test==0) %>% roc_auc(truth=AAER, estimate=pred_L1.1se, event_level='second')
auc_out <- df %>% filter(Test==1) %>% roc_auc(truth=AAER, estimate=pred_L1.1se, event_level='second')
curve_in_CV.1se <- df %>% filter(Test==0) %>% roc_curve(truth=AAER, estimate=pred_L1.1se, event_level='second')
curve_out_CV.1se <- df %>% filter(Test==1) %>% roc_curve(truth=AAER, estimate=pred_L1.1se, event_level='second')
aucs_CV.1se <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.1se) <- c("In sample, lambda.1se", "Out of sample, lambda.1se")
ggplot() +
geom_line(data=curve_in_CV.min, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.min")) +
geom_line(data=curve_out_CV.min, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.min")) +
geom_line(data=curve_in_CV.1se, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.1se")) +
geom_line(data=curve_out_CV.1se, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.1se")) +
geom_abline(slope=1)
c(aucs_CV.min, aucs_CV.1se)
In sample, lambda.min Out of sample, lambda.min In sample, lambda.1se Out of sample, lambda.1se
0.7631710 0.7290185 0.7509946 0.7124231
# Re-create the BCE formula (same predictor set as earlier in the file) for
# the tidymodels section below.
BCE_eq <- as.formula(paste("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
paste(paste0("Topic_",1:30,"_n_oI"), collapse=" + "), collapse=""))
library(recipes)
Attaching package: ‘recipes’
The following object is masked from ‘package:Matrix’:
update
The following object is masked from ‘package:stringr’:
fixed
The following object is masked from ‘package:stats’:
step
library(parsnip)
# Re-load the raw data with readr::read_csv (tibble; AAER stays numeric here
# because the recipe's step_num2factor handles the factor conversion below)
df <- read_csv("../../Data/Session_6.csv")
-- Column specification ---------------------------------------------------------------------------------------------------------------------------------------
cols(
.default = col_double(),
Filing = col_character(),
date.filed_x = col_character(),
FYE_x = col_character(),
restate_filing = col_character(),
Form = col_character(),
Date = col_character(),
loc = col_character(),
date.filed = col_character(),
FYE = col_character()
)
i Use `spec()` for the full column specifications.
# Reuse the BCE formula for the tidymodels (recipes + parsnip) workflow
BCEformula <- BCE_eq
# Train/test split on the pre-defined Test indicator
train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)
# Preprocessing recipe: drop zero-variance columns, center and scale the
# predictors, add an explicit intercept, and recode the numeric 0/1 outcome
# as an ordered factor ("0" < "1"; x + 1 maps 0/1 onto the level indices).
rec <- recipe(BCEformula, data = train) %>%
step_zv(all_predictors()) %>% # Drop any variables with zero variance
step_center(all_predictors()) %>% # Center all prediction variables
step_scale(all_predictors()) %>% # Scale all prediction variables
step_intercept() %>% # Add an intercept to the model
step_num2factor(all_outcomes(), ordered = T, levels=c("0","1"),
transform = function(x) x + 1) # Convert DV to factor
# Estimate preprocessing parameters (means/sds) on the training data only
prepped <- rec %>% prep(training=train)
# "bake" your recipe to get data ready
train_baked <- bake(prepped, new_data = train)
test_baked <- bake(prepped, new_data = test)
# Run the model with parsnip
# NOTE(review): penalty=1 fixes lambda at 1 -- a very heavy L1 penalty; the
# duplicate chunk later in this file omits it. Confirm which is intended.
train_model <- logistic_reg(mixture=1, penalty=1) %>% # mixture = 1 sets LASSO
set_engine('glmnet') %>%
fit(BCEformula, data = train_baked)
# train_model$fit is the same as fit_LASSO earlier in the slides
coefplot(train_model$fit, lambda=0.002031, sort='magnitude')
```r
# \bake\ your recipe to get data ready
train_baked <- bake(prepped, new_data = train)
test_baked <- bake(prepped, new_data = test)
# Run the model with parsnip
train_model <- logistic_reg(mixture=1) %>% # mixture = 1 sets LASSO
set_engine('glmnet') %>%
fit(BCEformula, data = train_baked)
<!-- rnb-source-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuIyBDcm9zcyB2YWxpZGF0aW9uXG5zZXQuc2VlZCg3NTM0NykgICNmb3IgcmVwcm9kdWNpYmlsaXR5XG5jdmZpdCA9IGN2LmdsbW5ldCh4PXRyYWluX3gsIHk9dHJhaW5feSwgZmFtaWx5ID0gXCJiaW5vbWlhbFwiLCBhbHBoYSA9IDEsXG4gICAgICAgICAgICAgICAgICB0eXBlLm1lYXN1cmU9XCJhdWNcIilcbmBgYCJ9 -->
```r
# Cross validation
# NOTE(review): train_x/train_y are built in a chunk not shown in this view --
# presumably matrices derived from the baked training data; confirm upstream.
set.seed(75347) #for reproducibility
cvfit = cv.glmnet(x=train_x, y=train_y, family = "binomial", alpha = 1,
type.measure="auc")
plot(cvfit)
# lambda.min = best CV performance; lambda.1se = sparser model within 1 SE
cvfit$lambda.min
[1] 0.00139958
cvfit$lambda.1se
[1] 0.003548444
#coef(cvfit, s = "lambda.min")
coefplot(cvfit, lambda='lambda.min', sort='magnitude') + theme(axis.text.y = element_text(size=15))
#coef(cvfit, s = "lambda.1se")
coefplot(cvfit, lambda='lambda.1se', sort='magnitude') + theme(axis.text.y = element_text(size=15))
train.pred_L1.min2 <- c(predict(cvfit, train_x, type="response", s = "lambda.min"))
test.pred_L1.min2 <- c(predict(cvfit, test_x, type="response", s = "lambda.min"))
train.pred_L1.1se2 <- c(predict(cvfit, train_x, type="response", s = "lambda.1se"))
test.pred_L1.1se2 <- c(predict(cvfit, test_x, type="response", s = "lambda.1se"))
df_train <- data.frame(train.pred_L1.min2=train.pred_L1.min2, train.pred_L1.1se2=train.pred_L1.1se2, AAER=factor(train_y, levels=c(0,1)))
df_test <- data.frame(test.pred_L1.min2=test.pred_L1.min2, test.pred_L1.1se2=test.pred_L1.1se2, AAER=factor(test_y, levels=c(0,1)))
auc_in <- df_train %>% roc_auc(truth=AAER, estimate=train.pred_L1.min2, event_level='second')
auc_out <- df_test %>% roc_auc(truth=AAER, estimate=test.pred_L1.min2, event_level='second')
curve_in_CV.min2 <- df_train %>% roc_curve(truth=AAER, estimate=train.pred_L1.min2, event_level='second')
curve_out_CV.min2 <- df_test %>% roc_curve(truth=AAER, estimate=test.pred_L1.min2, event_level='second')
aucs_CV.min2 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.min2) <- c("In sample, lambda.min", "Out of sample, lambda.min")
auc_in <- df_train %>% roc_auc(truth=AAER, estimate=train.pred_L1.1se2, event_level='second')
auc_out <- df_test %>% roc_auc(truth=AAER, estimate=test.pred_L1.1se2, event_level='second')
curve_in_CV.1se2 <- df_train %>% roc_curve(truth=AAER, estimate=train.pred_L1.1se2, event_level='second')
curve_out_CV.1se2 <- df_test %>% roc_curve(truth=AAER, estimate=test.pred_L1.1se2, event_level='second')
aucs_CV.1se2 <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_CV.1se2) <- c("In sample, lambda.1se", "Out of sample, lambda.1se")
ggplot() +
geom_line(data=curve_in_CV.min2, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.min")) +
geom_line(data=curve_out_CV.min2, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.min")) +
geom_line(data=curve_in_CV.1se2, aes(y=sensitivity, x=1-specificity, color="In Sample, lambda.1se")) +
geom_line(data=curve_out_CV.1se2, aes(y=sensitivity, x=1-specificity, color="Out of Sample, lambda.1se")) +
geom_abline(slope=1)
c(aucs_CV.min2, aucs_CV.1se2)
In sample, lambda.min Out of sample, lambda.min In sample, lambda.1se Out of sample, lambda.1se
0.7665463 0.7364834 0.7417082 0.7028034
# Fraud-prediction formula: AAER flag regressed on financial ratios plus
# document-based text measures and 30 LDA topic loadings.
# The Topic_1_n_oI ... Topic_30_n_oI terms are generated programmatically.
# Note: the original used paste(..., collapse="") -- `collapse` is inert for
# length-1 inputs and `sep` (defaulting to " ") is what actually joined the
# pieces; paste0() makes the intended concatenation explicit.
BCE_eq <- as.formula(paste0(
  "AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
          soft_assets + pct_chg_cashsales + chg_roa + issuance +
          oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
          midNaudit + cffin + exfin + restruct + bullets + headerlen +
          newlines + alltags + processedsize + sentlen_u + wordlen_s +
          paralen_s + repetitious_p + sentlen_s + typetoken +
          clindex + fog + active_p + passive_p + lm_negative_p +
          lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_", 1:30, "_n_oI"), collapse = " + ")))
library(tidyr)
library(tidymodels)
Registered S3 method overwritten by 'tune':
method from
required_pkgs.model_spec parsnip
-- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidymodels 0.1.3 --
v broom 0.7.8 v rsample 0.1.0
v dials 0.0.9 v tune 0.1.6
v infer 1.0.0 v workflows 0.2.3
v modeldata 0.1.1 v workflowsets 0.1.0
Warning: package ‘infer’ was built under R version 4.1.1
-- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
x scales::discard() masks purrr::discard()
x Matrix::expand() masks tidyr::expand()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::group_rows() masks kableExtra::group_rows()
x dplyr::lag() masks stats::lag()
x Matrix::pack() masks tidyr::pack()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
x Matrix::unpack() masks tidyr::unpack()
x recipes::update() masks Matrix::update(), stats::update()
* Use tidymodels_prefer() to resolve common conflicts.
library(tidyverse)
df <- read_csv("../../Data/Session_6.csv")
-- Column specification ---------------------------------------------------------------------------------------------------------------------------------------
cols(
.default = col_double(),
Filing = col_character(),
date.filed_x = col_character(),
FYE_x = col_character(),
restate_filing = col_character(),
Form = col_character(),
Date = col_character(),
loc = col_character(),
date.filed = col_character(),
FYE = col_character()
)
i Use `spec()` for the full column specifications.
BCEformula <- BCE_eq
# Train/holdout split flagged in the data itself
train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)
# Preprocessing recipe shared by every resample
LASSO_rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%      # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%   # Scale all prediction variables
  step_intercept() %>%               # Add an intercept to the model
  step_num2factor(all_outcomes(), ordered = TRUE, levels = c("0", "1"),
                  transform = function(x) x + 1)  # Convert DV to factor
# LASSO (mixture = 1) with the penalty left free for tuning
LASSO_mod <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine('glmnet')
# Define a grid to tune over: 100 penalties evenly spaced on the log scale
grid <- expand_grid(penalty = exp(seq(-11, -4, length.out = 100)))
LASSO_wfl <- workflow() %>%
  add_model(LASSO_mod) %>%
  add_recipe(LASSO_rec)
set.seed(354351)  # for reproducible fold assignment
folds <- vfold_cv(train, v = 10)  # from rsample
metrics <- metric_set(roc_auc)    # from yardstick
# Tune the penalty across all folds and grid points
LASSO_fit_tuned <- tune_grid(LASSO_wfl,
                             grid = grid,
                             resamples = folds,
                             metrics = metrics)
LASSO_fit_tuned %>%
  collect_metrics()
# Log of the penalty with the best mean cross-validated AUC
lambda.min <- LASSO_fit_tuned %>%
  collect_metrics() %>%
  arrange(-mean) %>%
  slice(1) %>%
  pull(penalty) %>%
  log()
# CV AUC as a function of log(lambda), with the best penalty marked
LASSO_fit_tuned %>%
  collect_metrics() %>%
  ggplot(aes(x = log(penalty), y = mean)) +
  geom_point() +
  xlab("Log(lambda)") +
  geom_vline(xintercept = lambda.min)
# Fraud-prediction formula: AAER flag regressed on financial ratios plus
# document-based text measures and 30 LDA topic loadings.
# The Topic_1_n_oI ... Topic_30_n_oI terms are generated programmatically.
# Note: the original used paste(..., collapse="") -- `collapse` is inert for
# length-1 inputs and `sep` (defaulting to " ") is what actually joined the
# pieces; paste0() makes the intended concatenation explicit.
BCE_eq <- as.formula(paste0(
  "AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
          soft_assets + pct_chg_cashsales + chg_roa + issuance +
          oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
          midNaudit + cffin + exfin + restruct + bullets + headerlen +
          newlines + alltags + processedsize + sentlen_u + wordlen_s +
          paralen_s + repetitious_p + sentlen_s + typetoken +
          clindex + fog + active_p + passive_p + lm_negative_p +
          lm_positive_p + allcaps + exclamationpoints + questionmarks + ",
  paste(paste0("Topic_", 1:30, "_n_oI"), collapse = " + ")))
library(recipes)
library(parsnip)
df <- read_csv("../../Data/Session_6.csv")
-- Column specification ---------------------------------------------------------------------------------------------------------------------------------------
cols(
.default = col_double(),
Filing = col_character(),
date.filed_x = col_character(),
FYE_x = col_character(),
restate_filing = col_character(),
Form = col_character(),
Date = col_character(),
loc = col_character(),
date.filed = col_character(),
FYE = col_character()
)
i Use `spec()` for the full column specifications.
BCEformula <- BCE_eq
train <- df %>% filter(Test == 0)
test <- df %>% filter(Test == 1)
rec <- recipe(BCEformula, data = train) %>%
  step_zv(all_predictors()) %>%      # Drop any variables with zero variance
  step_center(all_predictors()) %>%  # Center all prediction variables
  step_scale(all_predictors()) %>%   # Scale all prediction variables
  step_intercept()                   # Add an intercept to the model
# Estimate the preprocessing statistics on the training sample only
prepped <- rec %>% prep(training = train)
# Juice our data: sparse predictor matrix + plain outcome matrix for glmnet/xgboost
train_x <- juice(prepped, all_predictors(), composition = "dgCMatrix")
train_y <- juice(prepped, all_outcomes(), composition = "matrix")
# Apply the TRAINING-fitted recipe to the test sample via bake().
# The original re-ran prep(training = test), which centers/scales using
# test-set statistics -- that leaks holdout information and makes the two
# samples incomparable. bake() reuses the training means/SDs instead.
test_x <- bake(prepped, new_data = test, all_predictors(), composition = "dgCMatrix")
test_y <- bake(prepped, new_data = test, all_outcomes(), composition = "matrix")
# Cross validation
set.seed(482342)  #for reproducibility
library(xgboost)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘xgboost’
The following object is masked from ‘package:dplyr’:
slice
# model setup
# Booster hyperparameters:
#   max_depth        -- maximum tree depth
#   eta              -- learning rate (shrinkage per boosting round)
#   gamma            -- minimum loss reduction required to make a split
#   min_child_weight -- minimum instance weight needed in a child node
#   objective        -- binary classification, probability output
params <- list(max_depth=10,
eta=0.2,
gamma=10,
min_child_weight = 5,
objective =
"binary:logistic")
# run the model
# 10-fold stratified CV tracking AUC each round, so the best number of
# boosting rounds can be selected from the evaluation log afterwards
xgbCV <- xgb.cv(params=params,
data=train_x,
label=train_y,
nrounds=100,
eval_metric="auc",
nfold=10,
stratified=TRUE)
[1] train-auc:0.552507+0.080499 test-auc:0.538707+0.062529
[2] train-auc:0.586947+0.087237 test-auc:0.563604+0.068172
[3] train-auc:0.603035+0.084511 test-auc:0.583011+0.074621
[4] train-auc:0.663903+0.057212 test-auc:0.631184+0.055907
[5] train-auc:0.677173+0.064281 test-auc:0.639249+0.055183
[6] train-auc:0.707156+0.026578 test-auc:0.663628+0.038438
[7] train-auc:0.716727+0.025892 test-auc:0.666075+0.037700
[8] train-auc:0.728506+0.026368 test-auc:0.671749+0.041745
[9] train-auc:0.768085+0.025756 test-auc:0.682083+0.041544
[10] train-auc:0.783654+0.030705 test-auc:0.687617+0.046750
[11] train-auc:0.796643+0.027157 test-auc:0.701862+0.046887
[12] train-auc:0.814196+0.019522 test-auc:0.707957+0.051442
[13] train-auc:0.834534+0.023090 test-auc:0.718937+0.051517
[14] train-auc:0.855445+0.020539 test-auc:0.738984+0.046730
[15] train-auc:0.865581+0.014472 test-auc:0.746202+0.053148
[16] train-auc:0.879178+0.015412 test-auc:0.755713+0.047733
[17] train-auc:0.885384+0.010695 test-auc:0.756954+0.049152
[18] train-auc:0.893771+0.010416 test-auc:0.754607+0.049381
[19] train-auc:0.899295+0.011640 test-auc:0.755961+0.048730
[20] train-auc:0.904153+0.009617 test-auc:0.757726+0.049454
[21] train-auc:0.912452+0.011732 test-auc:0.767517+0.049317
[22] train-auc:0.916374+0.010769 test-auc:0.771695+0.049749
[23] train-auc:0.923077+0.008942 test-auc:0.776505+0.044740
[24] train-auc:0.931861+0.007251 test-auc:0.775831+0.047694
[25] train-auc:0.939725+0.007186 test-auc:0.780205+0.050416
[26] train-auc:0.946987+0.007851 test-auc:0.781856+0.049097
[27] train-auc:0.953082+0.007199 test-auc:0.790830+0.049520
[28] train-auc:0.956905+0.006655 test-auc:0.790994+0.048041
[29] train-auc:0.959474+0.007135 test-auc:0.790498+0.049087
[30] train-auc:0.962160+0.007075 test-auc:0.789384+0.049979
[31] train-auc:0.964746+0.007114 test-auc:0.792300+0.051115
[32] train-auc:0.966856+0.007152 test-auc:0.792673+0.052020
[33] train-auc:0.968699+0.007220 test-auc:0.795034+0.054238
[34] train-auc:0.970151+0.007450 test-auc:0.794372+0.055125
[35] train-auc:0.971111+0.007797 test-auc:0.794512+0.056575
[36] train-auc:0.971236+0.007781 test-auc:0.794393+0.056170
[37] train-auc:0.971681+0.007156 test-auc:0.794569+0.055670
[38] train-auc:0.972597+0.005744 test-auc:0.795534+0.054641
[39] train-auc:0.972832+0.005444 test-auc:0.797224+0.055205
[40] train-auc:0.973173+0.004823 test-auc:0.797334+0.055032
[41] train-auc:0.973331+0.004548 test-auc:0.796974+0.055180
[42] train-auc:0.973664+0.004087 test-auc:0.797062+0.055142
[43] train-auc:0.973977+0.004077 test-auc:0.796690+0.055335
[44] train-auc:0.974349+0.003928 test-auc:0.796797+0.055291
[45] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[46] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[47] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[48] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[49] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[50] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[51] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[52] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[53] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[54] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[55] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[56] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[57] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[58] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[59] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[60] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[61] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[62] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[63] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[64] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[65] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[66] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[67] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[68] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[69] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[70] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[71] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[72] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[73] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[74] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[75] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[76] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[77] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[78] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[79] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[80] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[81] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[82] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[83] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[84] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[85] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[86] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[87] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[88] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[89] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[90] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[91] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[92] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[93] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[94] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[95] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[96] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[97] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[98] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[99] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
[100] train-auc:0.974441+0.003939 test-auc:0.796609+0.055369
# First boosting round achieving the best mean CV AUC.
# which.max() returns the index of the FIRST maximum, which is exactly
# min(which(x == max(x))) from the original, in one vectorized call.
numTrees <- which.max(xgbCV$evaluation_log$test_auc_mean)
# Refit on the full training sample using the CV-chosen round count
fit4 <- xgboost(params=params,
                data = train_x,
                label = train_y,
                nrounds = numTrees,
                eval_metric="auc")
[1] train-auc:0.500000
[2] train-auc:0.663489
[3] train-auc:0.663489
[4] train-auc:0.703386
[5] train-auc:0.703386
[6] train-auc:0.704123
[7] train-auc:0.727506
[8] train-auc:0.727506
[9] train-auc:0.727506
[10] train-auc:0.784639
[11] train-auc:0.818359
[12] train-auc:0.816647
[13] train-auc:0.851022
[14] train-auc:0.864434
[15] train-auc:0.877787
[16] train-auc:0.883615
[17] train-auc:0.885182
[18] train-auc:0.899875
[19] train-auc:0.902216
[20] train-auc:0.912799
[21] train-auc:0.917703
[22] train-auc:0.918794
[23] train-auc:0.920644
[24] train-auc:0.926631
[25] train-auc:0.933874
[26] train-auc:0.947723
[27] train-auc:0.958398
[28] train-auc:0.961457
[29] train-auc:0.963316
[30] train-auc:0.966204
[31] train-auc:0.968502
[32] train-auc:0.971049
[33] train-auc:0.972695
[34] train-auc:0.975750
[35] train-auc:0.977323
[36] train-auc:0.978427
[37] train-auc:0.979179
[38] train-auc:0.979179
[39] train-auc:0.979179
[40] train-auc:0.979179
# Rebuild the training data as an xgboost DMatrix so the feature names
# can be recovered for the importance calculation
xgb.train.data <- xgb.DMatrix(train_x, label = train_y, missing = NA)
col_names <- attr(xgb.train.data, ".Dimnames")[[2]]
imp <- xgb.importance(col_names, fit4)
# Variable importance
xgb.plot.importance(imp)
# Out-of-sample then in-sample predicted probabilities;
# c() drops the vector's matrix structure
pred.xgb <- c(predict(fit4, test_x, type="response"))
df_test <- data.frame(pred.xgb=pred.xgb, AAER=factor(test_y, levels=c(0,1)))
pred.xgb <- c(predict(fit4, train_x, type="response"))
df_train <- data.frame(pred.xgb=pred.xgb, AAER=factor(train_y, levels=c(0,1)))
# ROC AUCs and curves; event_level='second' treats AAER == 1 as the event
auc_in <- df_train %>% roc_auc(truth=AAER, estimate=pred.xgb, event_level='second')
auc_out <- df_test %>% roc_auc(truth=AAER, estimate=pred.xgb, event_level='second')
curve_in_xgb <- df_train %>% roc_curve(truth=AAER, estimate=pred.xgb, event_level='second')
curve_out_xgb <- df_test %>% roc_curve(truth=AAER, estimate=pred.xgb, event_level='second')
aucs_xgb <- c(auc_in$.estimate, auc_out$.estimate)
names(aucs_xgb) <- c("In sample, XGBoost", "Out of sample, XGBoost")
# Overlay the out-of-sample ROC curves for every model in the session;
# the curve_out_* and aucs_* objects come from earlier chunks in this file
ggplot() +
geom_line(data=curve_out_BCE, aes(y=sensitivity, x=1-specificity, color="BCE")) +
geom_line(data=curve_out_2000f, aes(y=sensitivity, x=1-specificity, color="2000s + 2011")) +
geom_line(data=curve_out_2000s, aes(y=sensitivity, x=1-specificity, color="2000s")) +
geom_line(data=curve_out_2011, aes(y=sensitivity, x=1-specificity, color="2011")) +
geom_line(data=curve_out_1990s, aes(y=sensitivity, x=1-specificity, color="1990s")) +
geom_line(data=curve_out_CV.min, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.min")) +
geom_line(data=curve_out_CV.1se, aes(y=sensitivity, x=1-specificity, color="LASSO, lambda.1se")) +
geom_line(data=curve_out_xgb, aes(y=sensitivity, x=1-specificity, color="XGBoost")) +
geom_abline(slope=1)
# Collect every out-of-sample AUC into one named vector for comparison;
# auc_out$.estimate is the XGBoost holdout AUC from the previous chunk
oos_aucs <- c(aucs_1990s[2], aucs_2000s[2], aucs_2000f[2], aucs_2011[2], aucs_BCE[2], aucs_CV.1se[2], aucs_CV.min[2], auc_out$.estimate)
names(oos_aucs) <- c('1990s', '2000s', '2000s + 2011', '2011', 'BC', 'LASSO, lambda.1se', 'LASSO, lambda.min', 'XGBoost')
oos_aucs
1990s 2000s 2000s + 2011 2011 BC LASSO, lambda.1se LASSO, lambda.min XGBoost
0.7292981 0.6295414 0.7147021 0.6849225 0.7599594 0.7124231 0.7290185 0.8083503
# Attach the XGBoost predicted probabilities to the accumulated model-output
# frame (split by the Test flag so predictions line up with the right rows),
# then write the combined data out for reuse in later sessions.
in_sample <- df_save %>% filter(Test == 0)
out_sample <- df_save %>% filter(Test == 1)
in_sample$pred_xgb <- predict(fit4, train_x, type = "response")
out_sample$pred_xgb <- predict(fit4, test_x, type = "response")
df_save <- bind_rows(in_sample, out_sample)
saveRDS(df_save, '../../Data/Session_6_models.rds')