---
title: "Code for Session 6"
author: "Dr. Richard M. Crowley"
format:
html:
code-tools: true
code-link: true
embed-resources: true
---
Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
```{r helpers, warning=FALSE, message=F}
library (knitr)
library (kableExtra)
#' Pretty formatted HTML tables
#'
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function (df, names= NULL , highlight_cols= c (1 ), highlight_rows= c (), color_rows= c (), highlight_end_rows= c (), full= F, fixed_header= T) {
if (! length (names)) {
names= colnames (df)
}
kbl (df,"html" , col.names = names, align = c ("l" ,rep ('c' ,length (cols)- 1 ))) %>%
kable_paper (c ("striped" ,"hover" ), full_width= full, fixed_thead= F, html_font = " \" Source Sans Pro \" , Helvetica, sans-serif" ) %>%
{if (length (highlight_cols)) column_spec (., highlight_cols,bold= T, background = "#ffffff99" ) else .} %>%
{if (length (highlight_rows)) row_spec (., highlight_rows,bold= T, background = "#ffffff99" ) else .} %>%
{if (length (color_rows)) row_spec (., color_rows,bold= T, background = "#aaaaff99" ) else .} %>%
{if (length (highlight_end_rows)) row_spec (., nrow (df) + 1 - highlight_end_rows,bold= T, background = "#ffffff99" ) else .}
}
library (tidyverse)
```
```{r}
#| include: false
options (downlit.attached = c ("knitr" , "kableExtra" , "tidyverse" ))
```
```{r, warning=FALSE}
library (tidyverse)
library (coefplot)
library (DT)
```
```{r}
df <- read.csv ("../../Data/Session_6.csv" ) %>%
mutate (AAER = factor (AAER, levels= c (0 ,1 )))
```
```{r}
ex <- data.frame (year= c (1999 ,2001 ,2003 ), year_found= c (2001 ,2003 ,2006 ), aaer= c (1 ,1 ,1 ), aaer_as_of_2004= c (1 ,1 ,0 ))
html_df (ex)
```
```{r}
df %>%
group_by (year) %>%
mutate (total_AAERS = sum (AAER== 1 ), total_observations= n ()) %>%
slice (1 ) %>%
ungroup () %>%
select (year, total_AAERS, total_observations) %>%
html_df
```
```{r, warning=F, }
fit_1990s <- glm (AAER ~ ebit + ni_revt + ni_at + log_lt + ltl_at + lt_seq +
lt_at + act_lct + aq_lct + wcap_at + invt_revt + invt_at +
ni_ppent + rect_revt + revt_at + d_revt + b_rect + b_rect +
r_gp + b_gp + gp_at + revt_m_gp + ch_at + log_at +
ppent_at + wcap,
data= df[df$ Test== 0 ,],
family= binomial)
summary (fit_1990s)
```
```{r, warning=F, message=F, fig.height=5}
library (yardstick)
df$ pred_1990s <- predict (fit_1990s, df, type= "response" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_1990s, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_1990s, event_level= 'second' )
curve_in_1990s <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_1990s, event_level= 'second' )
curve_out_1990s <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_1990s, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_1990s, aes (y= sensitivity, x= 1 - specificity, color= "1990s, In Sample" )) +
geom_line (data= curve_out_1990s, aes (y= sensitivity, x= 1 - specificity, color= "1990s, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_1990s <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_1990s) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_1990s
```
```{r, warning=F}
fit_2011 <- glm (AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct,
data= df[df$ Test== 0 ,],
family= binomial)
summary (fit_2011)
```
```{r, warning=F, message=F, fig.height=5}
df$ pred_2011 <- predict (fit_2011, df, type= "response" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_2011, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_2011, event_level= 'second' )
curve_in_2011 <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_2011, event_level= 'second' )
curve_out_2011 <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_2011, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_2011, aes (y= sensitivity, x= 1 - specificity, color= "2011, In Sample" )) +
geom_line (data= curve_out_2011, aes (y= sensitivity, x= 1 - specificity, color= "2011, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_2011 <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_2011) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_2011
```
```{r, warning=F, message=F}
df_save <- data.frame (Test= df$ Test, AAER= df$ AAER, pred_F= df$ pred_2011)
```
```{r, warning=F}
fit_2000s <- glm (AAER ~ bullets + headerlen + newlines + alltags +
processedsize + sentlen_u + wordlen_s + paralen_s +
repetitious_p + sentlen_s + typetoken + clindex + fog +
active_p + passive_p + lm_negative_p + lm_positive_p +
allcaps + exclamationpoints + questionmarks,
data= df[df$ Test== 0 ,],
family= binomial)
summary (fit_2000s)
```
```{r, warning=F, message=F, fig.height=5}
df$ pred_2000s <- predict (fit_2000s, df, type= "response" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_2000s, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_2000s, event_level= 'second' )
curve_in_2000s <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_2000s, event_level= 'second' )
curve_out_2000s <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_2000s, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_2000s, aes (y= sensitivity, x= 1 - specificity, color= "2000s, In Sample" )) +
geom_line (data= curve_out_2000s, aes (y= sensitivity, x= 1 - specificity, color= "2000s, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_2000s <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_2000s) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_2000s
```
```{r, warning=F, message=F}
df_save$ pred_S <- df$ pred_2000s
```
```{r, warning=F}
fit_2000f <- glm (AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks,
data= df[df$ Test== 0 ,],
family= binomial)
summary (fit_2000f)
```
```{r, warning=F, message=F, fig.height=5}
df$ pred_2000f <- predict (fit_2000f, df, type= "response" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_2000f, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_2000f, event_level= 'second' )
curve_in_2000f <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_2000f, event_level= 'second' )
curve_out_2000f <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_2000f, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_2000f, aes (y= sensitivity, x= 1 - specificity, color= "2000s + 2011, In Sample" )) +
geom_line (data= curve_out_2000f, aes (y= sensitivity, x= 1 - specificity, color= "2000s + 2011, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_2000f <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_2000f) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_2000f
```
```{r, warning=F, message=F}
df_save$ pred_FS <- df$ pred_2000f
```
```{r, warning=F}
BCE_eq = as.formula (paste ("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks + " ,
paste (paste0 ("Topic_" ,1 : 30 ,"_n_oI" ), collapse= " + " ), collapse= "" ))
fit_BCE <- glm (BCE_eq,
data= df[df$ Test== 0 ,],
family= binomial)
summary (fit_BCE)
```
```{r, warning=F, message=F, fig.height=5}
df$ pred_BCE <- predict (fit_BCE, df, type= "response" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_BCE, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_BCE, event_level= 'second' )
curve_in_BCE <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_BCE, event_level= 'second' )
curve_out_BCE <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_BCE, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_BCE, aes (y= sensitivity, x= 1 - specificity, color= "BCE, In Sample" )) +
geom_line (data= curve_out_BCE, aes (y= sensitivity, x= 1 - specificity, color= "BCE, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_BCE <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_BCE) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_BCE
```
```{r, warning=F, message=F}
df_save$ pred_BCE <- df$ pred_BCE
```
```{r, warning=F, message=F, fig.height=5}
ggplot () +
geom_line (data= curve_out_BCE, aes (y= sensitivity, x= 1 - specificity, color= "BCE" )) +
geom_line (data= curve_out_2000f, aes (y= sensitivity, x= 1 - specificity, color= "2000s + 2011" )) +
geom_line (data= curve_out_2000s, aes (y= sensitivity, x= 1 - specificity, color= "2000s" )) +
geom_line (data= curve_out_2011, aes (y= sensitivity, x= 1 - specificity, color= "2011" )) +
geom_line (data= curve_out_1990s, aes (y= sensitivity, x= 1 - specificity, color= "1990s" )) +
geom_abline (slope= 1 ) +
ggtitle ("Out of Sample ROC Curves" )
oos_aucs <- c (aucs_1990s[2 ], aucs_2011[2 ], aucs_2000s[2 ], aucs_2000f[2 ], aucs_BCE[2 ])
names (oos_aucs) <- c ("1990s" , "2011" , "2000s" , "2000s + 2011" , "BCE" )
oos_aucs
```
```{r, message=F}
library (glmnet)
x <- model.matrix (BCE_eq, data= df[df$ Test== 0 ,])[,- 1 ] # [,-1] to remove intercept
y <- model.frame (BCE_eq, data= df[df$ Test== 0 ,])[,"AAER" ]
fit_LASSO <- glmnet (x= x, y= y,
family = "binomial" ,
alpha = 1 # Specifies LASSO. alpha = 0 is ridge
)
```
```{r, fig.height=4}
plot (fit_LASSO)
```
```{r}
print (fit_LASSO)
```
```{r, warning=F, fig.height=5}
#coef(fit_LASSO, s=0.002031)
coefplot (fit_LASSO, lambda= 0.002031 , sort= 'magnitude' )
```
```{r}
# na.pass has model.matrix retain NA values (so the # of rows is constant)
xp <- model.matrix (BCE_eq, data= df, na.action= 'na.pass' )[,- 1 ]
# s= specifies the version of the model to use
df$ pred_L1 <- c (predict (fit_LASSO, xp, type= "response" , s = 0.002031 ))
```
```{r, fig.height=4}
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_L1, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_L1, event_level= 'second' )
curve_in_L1 <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_L1, event_level= 'second' )
curve_out_L1 <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_L1, event_level= 'second' )
ggplot () +
geom_line (data= curve_in_L1, aes (y= sensitivity, x= 1 - specificity, color= "BCE, LASSO, In Sample" )) +
geom_line (data= curve_out_L1, aes (y= sensitivity, x= 1 - specificity, color= "BCE, LASSO, Out of Sample" )) +
geom_abline (slope= 1 )
aucs_L1 <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_L1) <- c ("In sample AUC" , "Out of sample AUC" )
aucs_L1
```
```{r}
# Cross validation
set.seed (697435 ) #for reproducibility
cvfit = cv.glmnet (x= x, y= y,family = "binomial" , alpha = 1 , type.measure= "auc" )
```
```{r}
plot (cvfit)
```
```{r}
cvfit$ lambda.min
cvfit$ lambda.1 se
```
```{r, warning=F, fig.height=5.2, fig.width=4}
#coef(cvfit, s = "lambda.min")
coefplot (cvfit, lambda= 'lambda.min' , sort= 'magnitude' ) + theme (axis.text.y = element_text (size= 15 ))
```
```{r, warning=F, fig.height=5.2, fig.width=4}
#coef(cvfit, s = "lambda.1se")
coefplot (cvfit, lambda= 'lambda.1se' , sort= 'magnitude' ) + theme (axis.text.y = element_text (size= 15 ))
```
```{r}
# s= specifies the version of the model to use
df$ pred_L1.min <- c (predict (cvfit, xp, type= "response" , s = "lambda.min" ))
df$ pred_L1.1 se <- c (predict (cvfit, xp, type= "response" , s = "lambda.1se" ))
```
```{r, warning=F, message=F}
df_save$ pred_lmin <- df$ pred_L1.min
df_save$ pred_l1se <- df$ pred_L1.1 se
```
```{r, fig.height=3.5}
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_L1.min, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_L1.min, event_level= 'second' )
curve_in_CV.min <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_L1.min, event_level= 'second' )
curve_out_CV.min <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_L1.min, event_level= 'second' )
aucs_CV.min <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_CV.min) <- c ("In sample, lambda.min" , "Out of sample, lambda.min" )
auc_in <- df %>% filter (Test== 0 ) %>% roc_auc (AAER, pred_L1.1 se, event_level= 'second' )
auc_out <- df %>% filter (Test== 1 ) %>% roc_auc (AAER, pred_L1.1 se, event_level= 'second' )
curve_in_CV.1 se <- df %>% filter (Test== 0 ) %>% roc_curve (AAER, pred_L1.1 se, event_level= 'second' )
curve_out_CV.1 se <- df %>% filter (Test== 1 ) %>% roc_curve (AAER, pred_L1.1 se, event_level= 'second' )
aucs_CV.1 se <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_CV.1 se) <- c ("In sample, lambda.1se" , "Out of sample, lambda.1se" )
ggplot () +
geom_line (data= curve_in_CV.min, aes (y= sensitivity, x= 1 - specificity, color= "In Sample, lambda.min" )) +
geom_line (data= curve_out_CV.min, aes (y= sensitivity, x= 1 - specificity, color= "Out of Sample, lambda.min" )) +
geom_line (data= curve_in_CV.1 se, aes (y= sensitivity, x= 1 - specificity, color= "In Sample, lambda.1se" )) +
geom_line (data= curve_out_CV.1 se, aes (y= sensitivity, x= 1 - specificity, color= "Out of Sample, lambda.1se" )) +
geom_abline (slope= 1 )
c (aucs_CV.min, aucs_CV.1 se)
```
```{r}
BCE_eq <- as.formula (paste ("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks + " ,
paste (paste0 ("Topic_" ,1 : 30 ,"_n_oI" ), collapse= " + " ), collapse= "" ))
```
```{r, message=F}
library (recipes)
library (parsnip)
df <- read_csv ("../../Data/Session_6.csv" )
BCEformula <- BCE_eq
train <- df %>% filter (Test == 0 )
test <- df %>% filter (Test == 1 )
rec <- recipe (BCEformula, data = train) %>%
step_zv (all_predictors ()) %>% # Drop any variables with zero variance
step_center (all_predictors ()) %>% # Center all prediction variables
step_scale (all_predictors ()) %>% # Scale all prediction variables
step_intercept () %>% # Add an intercept to the model
step_num2factor (all_outcomes (), ordered = T, levels= c ("0" ,"1" ),
transform = function (x) x + 1 ) # Convert DV to factor
prepped <- rec %>% prep (training= train)
```
```{r}
# "bake" your recipe to get data ready
train_baked <- bake (prepped, new_data = train)
test_baked <- bake (prepped, new_data = test)
# Run the model with parsnip
train_model <- logistic_reg (mixture= 1 , penalty= 1 ) %>% # mixture = 1 sets LASSO
set_engine ('glmnet' ) %>%
fit (BCEformula, data = train_baked)
```
```{r, warning=F, message=F, fig.height=5}
# train_model$fit is the same as fit_LASSO earlier in the slides
coefplot (train_model$ fit, lambda= 0.002031 , sort= 'magnitude' )
```
```{r}
rec <- recipe (BCEformula, data = train) %>%
step_zv (all_predictors ()) %>% # Drop any variables with zero variance
step_center (all_predictors ()) %>% # Center all prediction variables
step_scale (all_predictors ()) %>% # Scale all prediction variables
step_intercept () # Add an intercept to the model
prepped <- rec %>% prep (training= train)
test_prepped <- rec %>% prep (training= test)
# "Juice" your recipe to get data for other packages
train_x <- juice (prepped, all_predictors (), composition = "dgCMatrix" )
train_y <- juice (prepped, all_outcomes (), composition = "matrix" )
test_x <- juice (test_prepped, all_predictors (), composition = "dgCMatrix" )
test_y <- juice (test_prepped, all_outcomes (), composition = "matrix" )
```
```{r, message=F}
# Cross validation
set.seed (75347 ) #for reproducibility
cvfit = cv.glmnet (x= train_x, y= train_y, family = "binomial" , alpha = 1 ,
type.measure= "auc" )
```
```{r}
plot (cvfit)
```
```{r}
cvfit$ lambda.min
cvfit$ lambda.1 se
```
```{r, warning=F, fig.height=5.2, fig.width=4}
#coef(cvfit, s = "lambda.min")
coefplot (cvfit, lambda= 'lambda.min' , sort= 'magnitude' ) + theme (axis.text.y = element_text (size= 15 ))
```
```{r, warning=F, fig.height=5.2, fig.width=4}
#coef(cvfit, s = "lambda.1se")
coefplot (cvfit, lambda= 'lambda.1se' , sort= 'magnitude' ) + theme (axis.text.y = element_text (size= 15 ))
```
```{r, fig.height=3.5}
train.pred_L1.min2 <- c (predict (cvfit, train_x, type= "response" , s = "lambda.min" ))
test.pred_L1.min2 <- c (predict (cvfit, test_x, type= "response" , s = "lambda.min" ))
train.pred_L1.1 se2 <- c (predict (cvfit, train_x, type= "response" , s = "lambda.1se" ))
test.pred_L1.1 se2 <- c (predict (cvfit, test_x, type= "response" , s = "lambda.1se" ))
df_train <- data.frame (train.pred_L1.min2= train.pred_L1.min2, train.pred_L1.1se2= train.pred_L1.1 se2, AAER= factor (train_y, levels= c (0 ,1 )))
df_test <- data.frame (test.pred_L1.min2= test.pred_L1.min2, test.pred_L1.1se2= test.pred_L1.1 se2, AAER= factor (test_y, levels= c (0 ,1 )))
auc_in <- df_train %>% roc_auc (AAER, train.pred_L1.min2, event_level= 'second' )
auc_out <- df_test %>% roc_auc (AAER, test.pred_L1.min2, event_level= 'second' )
curve_in_CV.min2 <- df_train %>% roc_curve (AAER, train.pred_L1.min2, event_level= 'second' )
curve_out_CV.min2 <- df_test %>% roc_curve (AAER, test.pred_L1.min2, event_level= 'second' )
aucs_CV.min2 <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_CV.min2) <- c ("In sample, lambda.min" , "Out of sample, lambda.min" )
auc_in <- df_train %>% roc_auc (AAER, train.pred_L1.1 se2, event_level= 'second' )
auc_out <- df_test %>% roc_auc (AAER, test.pred_L1.1 se2, event_level= 'second' )
curve_in_CV.1 se2 <- df_train %>% roc_curve (AAER, train.pred_L1.1 se2, event_level= 'second' )
curve_out_CV.1 se2 <- df_test %>% roc_curve (AAER, test.pred_L1.1 se2, event_level= 'second' )
aucs_CV.1 se2 <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_CV.1 se2) <- c ("In sample, lambda.1se" , "Out of sample, lambda.1se" )
ggplot () +
geom_line (data= curve_in_CV.min2, aes (y= sensitivity, x= 1 - specificity, color= "In Sample, lambda.min" )) +
geom_line (data= curve_out_CV.min2, aes (y= sensitivity, x= 1 - specificity, color= "Out of Sample, lambda.min" )) +
geom_line (data= curve_in_CV.1 se2, aes (y= sensitivity, x= 1 - specificity, color= "In Sample, lambda.1se" )) +
geom_line (data= curve_out_CV.1 se2, aes (y= sensitivity, x= 1 - specificity, color= "Out of Sample, lambda.1se" )) +
geom_abline (slope= 1 )
c (aucs_CV.min2, aucs_CV.1 se2)
```
```{r}
BCE_eq <- as.formula (paste ("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks + " ,
paste (paste0 ("Topic_" ,1 : 30 ,"_n_oI" ), collapse= " + " ), collapse= "" ))
```
```{r, message=F}
library (tidyr)
library (tidymodels)
library (tidyverse)
df <- read_csv ("../../Data/Session_6.csv" )
BCEformula <- BCE_eq
train <- df %>% filter (Test == 0 )
test <- df %>% filter (Test == 1 )
LASSO_rec <- recipe (BCEformula, data = train) %>%
step_zv (all_predictors ()) %>% # Drop any variables with zero variance
step_center (all_predictors ()) %>% # Center all prediction variables
step_scale (all_predictors ()) %>% # Scale all prediction variables
step_intercept () %>% # Add an intercept to the model
step_num2factor (all_outcomes (), ordered = T, levels= c ("0" ,"1" ),
transform = function (x) x + 1 ) # Convert DV to factor
```
```{r}
LASSO_mod <- logistic_reg (penalty= tune (), mixture= 1 ) %>% # mixture = 1 sets LASSO
set_engine ('glmnet' )
# Define a grid to tune over
grid <- expand_grid (penalty = exp (seq (- 11 ,- 4 , length.out= 100 )))
```
```{r}
LASSO_wfl <- workflow () %>%
add_model (LASSO_mod) %>%
add_recipe (LASSO_rec)
```
```{r}
set.seed (354351 )
folds <- vfold_cv (train, v= 10 ) # from rsample
metrics = metric_set (roc_auc) # from yardstick
LASSO_fit_tuned <- tune_grid (LASSO_wfl,
grid = grid,
resamples = folds,
metrics= metrics)
```
```{r}
LASSO_fit_tuned %>%
collect_metrics ()
```
```{r, fig.height=3.0}
lambda.min <- LASSO_fit_tuned %>%
collect_metrics () %>%
arrange (- mean) %>%
slice (1 ) %>%
pull (penalty) %>%
log ()
LASSO_fit_tuned %>%
collect_metrics () %>%
ggplot (aes (x= log (penalty), y= mean)) +
geom_point () +
xlab ("Log(lambda)" ) +
geom_vline (xintercept = lambda.min)
```
```{r}
BCE_eq <- as.formula (paste ("AAER ~ logtotasset + rsst_acc + chg_recv + chg_inv +
soft_assets + pct_chg_cashsales + chg_roa + issuance +
oplease_dum + book_mkt + lag_sdvol + merger + bigNaudit +
midNaudit + cffin + exfin + restruct + bullets + headerlen +
newlines + alltags + processedsize + sentlen_u + wordlen_s +
paralen_s + repetitious_p + sentlen_s + typetoken +
clindex + fog + active_p + passive_p + lm_negative_p +
lm_positive_p + allcaps + exclamationpoints + questionmarks + " ,
paste (paste0 ("Topic_" ,1 : 30 ,"_n_oI" ), collapse= " + " ), collapse= "" ))
```
```{r, message=F}
library (recipes)
library (parsnip)
df <- read_csv ("../../Data/Session_6.csv" )
BCEformula <- BCE_eq
train <- df %>% filter (Test == 0 )
test <- df %>% filter (Test == 1 )
rec <- recipe (BCEformula, data = train) %>%
step_zv (all_predictors ()) %>% # Drop any variables with zero variance
step_center (all_predictors ()) %>% # Center all prediction variables
step_scale (all_predictors ()) %>% # Scale all prediction variables
step_intercept () # Add an intercept to the model
```
```{r, message=F}
# Juice our data
prepped <- rec %>% prep (training= train)
train_x <- juice (prepped, all_predictors (), composition = "dgCMatrix" )
train_y <- juice (prepped, all_outcomes (), composition = "matrix" )
test_prepped <- rec %>% prep (training= test)
test_x <- juice (test_prepped, all_predictors (), composition = "dgCMatrix" )
test_y <- juice (test_prepped, all_outcomes (), composition = "matrix" )
```
```{r, message=F, warning=F, error=F}
# Cross validation
set.seed (482342 ) #for reproducibility
library (xgboost)
# model setup
params <- list (max_depth= 10 , eta= 0.2 ,
gamma= 10 , min_child_weight= 5 ,
objective= "binary:logistic" )
# run the model
xgbCV <- xgb.cv (params= params, data= train_x,
label= train_y, nrounds= 100 ,
eval_metric= "auc" , nfold= 10 ,
stratified= TRUE )
```
```{r}
numTrees <- min (
which (
xgbCV$ evaluation_log$ test_auc_mean ==
max (xgbCV$ evaluation_log$ test_auc_mean)
)
)
fit4 <- xgboost (params= params,
data = train_x,
label = train_y,
nrounds = numTrees,
eval_metric= "auc" )
```
```{r, fig.height=4.5}
xgb.train.data = xgb.DMatrix (train_x, label = train_y, missing = NA )
col_names = attr (xgb.train.data, ".Dimnames" )[[2 ]]
imp = xgb.importance (col_names, fit4)
# Variable importance
xgb.plot.importance (imp)
```
```{r, fig.height=3.5}
pred.xgb <- c (predict (fit4, test_x, type= "response" ))
df_test <- data.frame (pred.xgb= pred.xgb, AAER= factor (test_y, levels= c (0 ,1 )))
pred.xgb <- c (predict (fit4, train_x, type= "response" ))
df_train <- data.frame (pred.xgb= pred.xgb, AAER= factor (train_y, levels= c (0 ,1 )))
auc_in <- df_train %>% roc_auc (AAER, pred.xgb, event_level= 'second' )
auc_out <- df_test %>% roc_auc (AAER, pred.xgb, event_level= 'second' )
curve_in_xgb <- df_train %>% roc_curve (AAER, pred.xgb, event_level= 'second' )
curve_out_xgb <- df_test %>% roc_curve (AAER, pred.xgb, event_level= 'second' )
aucs_xgb <- c (auc_in$ .estimate, auc_out$ .estimate)
names (aucs_xgb) <- c ("In sample, XGBoost" , "Out of sample, XGBoost" )
ggplot () +
geom_line (data= curve_out_BCE, aes (y= sensitivity, x= 1 - specificity, color= "BCE" )) +
geom_line (data= curve_out_2000f, aes (y= sensitivity, x= 1 - specificity, color= "2000s + 2011" )) +
geom_line (data= curve_out_2000s, aes (y= sensitivity, x= 1 - specificity, color= "2000s" )) +
geom_line (data= curve_out_2011, aes (y= sensitivity, x= 1 - specificity, color= "2011" )) +
geom_line (data= curve_out_1990s, aes (y= sensitivity, x= 1 - specificity, color= "1990s" )) +
geom_line (data= curve_out_CV.min, aes (y= sensitivity, x= 1 - specificity, color= "LASSO, lambda.min" )) +
geom_line (data= curve_out_CV.1 se, aes (y= sensitivity, x= 1 - specificity, color= "LASSO, lambda.1se" )) +
geom_line (data= curve_out_xgb, aes (y= sensitivity, x= 1 - specificity, color= "XGBoost" )) +
geom_abline (slope= 1 )
oos_aucs <- c (aucs_1990s[2 ], aucs_2000s[2 ], aucs_2000f[2 ], aucs_2011[2 ], aucs_BCE[2 ], aucs_CV.1 se[2 ], aucs_CV.min[2 ], auc_out$ .estimate)
names (oos_aucs) <- c ('1990s' , '2000s' , '2000s + 2011' , '2011' , 'BC' , 'LASSO, lambda.1se' , 'LASSO, lambda.min' , 'XGBoost' )
oos_aucs
```
```{r, warning=F, message=F}
df_save_0 <- df_save %>% filter (Test== 0 )
df_save_1 <- df_save %>% filter (Test== 1 )
df_save_0$ pred_xgb <- predict (fit4, train_x, type= "response" )
df_save_1$ pred_xgb <- predict (fit4, test_x, type= "response" )
df_save <- bind_rows (df_save_0, df_save_1)
saveRDS (df_save, '../../Data/Session_6_models.rds' )
```