Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
library(knitr)library(kableExtra)#' Pretty formatted HTML tables#' #' @param df The dataframe to format as a table#' @param names Names to use for columns if not using the df column names. Vector#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.#' @param full Use to span the full width of a slide. FALSE by default.#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.html_df<-function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T){if(!length(names)){names=colnames(df)}kbl(df,"html", col.names =names, align =c("l",rep('c',length(cols)-1)))%>%kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font ="\"Source Sans Pro\", Helvetica, sans-serif")%>%{if(length(highlight_cols))column_spec(., highlight_cols,bold=T, background ="#ffffff99")else.}%>%{if(length(highlight_rows))row_spec(., highlight_rows,bold=T, background ="#ffffff99")else.}%>%{if(length(color_rows))row_spec(., color_rows,bold=T, background ="#aaaaff99")else.}%>%{if(length(highlight_end_rows))row_spec(., nrow(df)+1-highlight_end_rows,bold=T, background ="#ffffff99")else.}}library(tidyverse)
# initial cleaning; 100338 is an outlier in the bonds distributiondf<-df%>%filter(at>=1, revt>=1, gvkey!=100338)## Merge in stock valuedf$date<-ymd(df$datadate)df_mve<-df_mve%>%mutate(date =ymd(datadate), mve =csho*prcc_f)%>%rename(gvkey=GVKEY)df<-left_join(df, df_mve[,c("gvkey","date","mve")])
Call:
glm(formula = bankrupt_lead ~ Z, family = binomial, data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.87769 0.11741 -50.060 < 2e-16 ***
Z -0.05494 0.01235 -4.449 8.61e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1101.0 on 33372 degrees of freedom
Residual deviance: 1088.8 on 33371 degrees of freedom
(14245 observations deleted due to missingness)
AIC: 1092.8
Number of Fisher Scoring iterations: 9
# df_stock is an already prepped csv from CRSP datadf_stock$date<-as.Date(df_stock$date)df<-left_join(df, df_stock[,c("gvkey", "date", "ret", "ret.sd")])
Joining with `by = join_by(gvkey, date)`
Warning in left_join(df, df_stock[, c("gvkey", "date", "ret", "ret.sd")]): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 13537 of `x` matches multiple rows in `y`.
ℹ Row 20513 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
# Calculating a 253 day rolling standard deviation in Rcrsp<-crsp%>%group_by(gvkey)%>%mutate(ret.sd =rollapply(data=ret, width=253, FUN=sd, align="right", fill=NA, na.rm=T))%>%ungroup()
Source Code
---title: "Code for Session 5"author: "Dr. Richard M. Crowley"format: html: code-tools: true code-link: true embed-resources: true---Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.```{r helpers, warning=FALSE, message=F}library(knitr)library(kableExtra)#' Pretty formatted HTML tables#' #' @param df The dataframe to format as a table#' @param names Names to use for columns if not using the df column names. Vector#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.#' @param full Use to span the full width of a slide. FALSE by default.#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.html_df <-function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {if(!length(names)) { names=colnames(df) }kbl(df,"html", col.names = names, align =c("l",rep('c',length(cols)-1))) %>%kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font ="\"Source Sans Pro\", Helvetica, sans-serif") %>% {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background ="#ffffff99") else .} %>% {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background ="#ffffff99") else .} %>% {if(length(color_rows)) row_spec(., color_rows,bold=T, background ="#aaaaff99") else .} %>% {if(length(highlight_end_rows)) row_spec(., nrow(df) +1- highlight_end_rows,bold=T, background ="#ffffff99") else .}}library(tidyverse)``````{r}library(tidyverse)library(plotly)library(lubridate)df <-read.csv("../../Data/Session_5-1.csv", stringsAsFactors=FALSE)df_ratings <-read.csv("../../Data/Session_5-2.csv", stringsAsFactors=FALSE)df_mve <-read.csv("../../Data/Session_5-3.csv", stringsAsFactors=FALSE)df_rf <-read.csv("../../Data/Session_5-4.csv", stringsAsFactors=FALSE)df_stock <-read.csv("../../Data/Session_5-5.csv", stringsAsFactors=FALSE)``````{r}#| include: falseoptions(downlit.attached =c("knitr", "kableExtra", "tidyverse"))``````{r}# initial cleaning; 100338 is an outlier in the bonds distributiondf <- df %>%filter(at >=1, revt >=1, gvkey !=100338)## Merge in stock valuedf$date <-ymd(df$datadate)df_mve <- df_mve %>%mutate(date =ymd(datadate),mve = csho * prcc_f) %>%rename(gvkey=GVKEY)df <-left_join(df, df_mve[,c("gvkey","date","mve")])``````{r}# Compute the bankruptcy indicatordf <- df %>%group_by(gvkey) %>%arrange(datadate) %>%mutate(bankrupt =ifelse(row_number() ==n() & dlrsn ==2&!is.na(dlrsn), 1, 0),bankrupt_lead =lead(bankrupt)) %>%ungroup() %>%filter(!is.na(bankrupt_lead)) %>%mutate(bankrupt_lead =factor(bankrupt_lead, levels=c(0,1)))``````{r}# Calculate the measures neededdf <- df %>%mutate(wcap_at = wcap / at, # x1re_at = re / at, # x2ebit_at = ebit / at, # x3mve_lt = mve / lt, # x4revt_at = revt / at) # x5# cleanupdf <- df %>%mutate(across(where(is.numeric), ~replace(., !is.finite(.), NA)))# Calculate the scoredf <- df %>%mutate(Z =1.2* wcap_at +1.4* re_at +3.3* ebit_at +0.6* mve_lt +0.999* revt_at)# Calculate date info for mergingdf$date <-ymd(df$datadate)df$year <-year(df$date)df$month <-month(df$date)``````{r}# df_ratings has ratings data in it# Ratings, in order from worst to bestratings <-c("D", "C", "CC", "CCC-", "CCC","CCC+", "B-", "B", "B+", "BB-","BB", "BB+", "BBB-", "BBB", "BBB+", "A-", "A", "A+", "AA-", "AA","AA+", "AAA-", "AAA", "AAA+")# Convert string ratings (splticrm) to numeric ratingsdf_ratings$rating <-factor(df_ratings$splticrm, levels=ratings, ordered=T)df_ratings$date <-as.Date(df_ratings$datadate)df_ratings$year <-year(df_ratings$date)df_ratings$month <-month(df_ratings$date)# Merge together datadf <-left_join(df, df_ratings[,c("gvkey", "year", "month", "rating")])``````{r, fig.height=5, fig.width=6}plot <- df %>%filter(!is.na(Z), !is.na(rating)) %>%group_by(rating) %>%mutate(mean_Z=mean(Z,na.rm=T)) %>%slice(1) %>%ungroup() %>%select(rating, mean_Z) %>%ggplot(aes(y=mean_Z, x=rating)) +geom_col() +ylab('Mean Altman Z') +xlab('Credit rating') +theme(axis.text.x =element_text(angle =90))ggplotly(plot)``````{r}df %>%filter(!is.na(Z),!is.na(bankrupt)) %>%group_by(bankrupt_lead) %>%mutate(mean_Z=mean(Z,na.rm=T)) %>%slice(1) %>%ungroup() %>%select(bankrupt_lead, mean_Z) %>%html_df()``````{r, fig.height=5, fig.width=6}plot <- df %>%filter(!is.na(Z), !is.na(rating), year >=2000) %>%group_by(rating) %>%mutate(mean_Z=mean(Z,na.rm=T)) %>%slice(1) %>%ungroup() %>%select(rating, mean_Z) %>%ggplot(aes(y=mean_Z, x=rating)) +geom_col() +ylab('Mean Altman Z') +xlab('Credit rating') +theme(axis.text.x =element_text(angle =90))ggplotly(plot)``````{r}df %>%filter(!is.na(Z),!is.na(bankrupt_lead), year >=2000) %>%group_by(bankrupt_lead) %>%mutate(mean_Z=mean(Z,na.rm=T)) %>%slice(1) %>%ungroup() %>%select(bankrupt_lead, mean_Z) %>%html_df()``````{r, warning=F}fit_Z <-glm(bankrupt_lead ~ Z, data=df, family=binomial)summary(fit_Z)``````{r}lz <- df %>%filter(!is.na(bankrupt_lead), !is.na(Z)) %>%filter(Z <1) %>%pull(bankrupt_lead) %>%table()hz <- df %>%filter(!is.na(bankrupt_lead), !is.na(Z)) %>%filter(Z >=1) %>%pull(bankrupt_lead) %>%table()x <-matrix(c(lz, hz), nrow=2)rownames(x) <-c('No bankruptcy', 'Bankruptcy')colnames(x) <-c('Z < 1', 'Z >= 1')x``````{r, message=F, error=F, warning=F, fig.height=4.8, fig.width=4.8}library(yardstick)df_Z <- df %>%filter(!is.na(Z), !is.na(bankrupt_lead))df_Z$pred <-predict(fit_Z, df_Z, type="response")df_Z %>%roc_curve(bankrupt_lead, pred, event_level='second') %>%autoplot()``````{r, fig.width=4.8, fig.height=4.8}df_Z %>%roc_curve(bankrupt_lead, pred,event_level='second') %>%autoplot()``````{r}auc_Z <- df_Z %>%roc_auc(bankrupt_lead, pred, event_level='second')auc_Z``````{r}score =1m =0std =1funcShaded <-function(x, lower_bound) { y =dnorm(x, mean = m, sd = std) y[x < lower_bound] <-NAreturn(y)}ggplot(data.frame(x =c(-3, 3)), aes(x = x)) +stat_function(fun = dnorm, args =list(mean = m, sd = std)) +stat_function(fun = funcShaded, args =list(lower_bound = score), geom ="area", fill ='black', alpha = .2) +scale_x_continuous(name ="Score", breaks =seq(-3, 3, std)) +geom_text(data =data.frame(x=c(1.5), y=c(0.05)), aes(x=x, y=y, label="Prob(default)", size=30)) +geom_line(data =data.frame(x=c(1,1), y=c(0,0.4)), aes(x=x,y=y)) +geom_text(data =data.frame(x=c(1.3), y=c(0.4)), aes(x=x, y=y, label="DD", size=30)) +theme(legend.position="none")``````{r}# df_stock is an already prepped csv from CRSP datadf_stock$date <-as.Date(df_stock$date)df <-left_join(df, df_stock[,c("gvkey", "date", "ret", "ret.sd")])``````{r}df_rf$date <-as.Date(df_rf$dateff)df_rf$year <-year(df_rf$date)df_rf$month <-month(df_rf$date)df <-left_join(df, df_rf[,c("year", "month", "rf")])df <- df %>%mutate(DD = (log(mve / lt) + (rf - (ret.sd*sqrt(253))^2/2)) / (ret.sd*sqrt(253)))# Clean the measuredf <- df %>%mutate(across(where(is.numeric), ~replace(., !is.finite(.), NA)))``````{r, fig.height=5, fig.width=6}plot <- df %>%filter(!is.na(DD),!is.na(rating)) %>%group_by(rating) %>%mutate(mean_DD=mean(DD,na.rm=T),prob_default =pnorm(-1* mean_DD)) %>%slice(1) %>%ungroup() %>%select(rating, prob_default) %>%ggplot(aes(y=prob_default, x=rating)) +geom_col() +ylab('Probability of default') +xlab('Credit rating') +theme(axis.text.x =element_text(angle =90))ggplotly(plot)``````{r}df %>%filter(!is.na(DD),!is.na(bankrupt_lead)) %>%group_by(bankrupt_lead) %>%mutate(mean_DD=mean(DD, na.rm=T),prob_default =pnorm(-1* mean_DD)) %>%slice(1) %>%ungroup() %>%select(bankrupt_lead, mean_DD, prob_default) %>%html_df()``````{r, fig.height=5, fig.width=6}plot <- df %>%filter(!is.na(DD),!is.na(rating), year >=2000) %>%group_by(rating) %>%mutate(mean_DD=mean(DD,na.rm=T),prob_default =pnorm(-1* mean_DD)) %>%slice(1) %>%ungroup() %>%select(rating, prob_default) %>%ggplot(aes(y=prob_default, x=rating)) +geom_col() +ylab('Probability of default') +xlab('Credit rating') +theme(axis.text.x =element_text(angle =90))ggplotly(plot)``````{r}df %>%filter(!is.na(DD),!is.na(bankrupt_lead), year >=2000) %>%group_by(bankrupt_lead) %>%mutate(mean_DD=mean(DD, na.rm=T),prob_default =pnorm(-1* mean_DD)) %>%slice(1) %>%ungroup() %>%select(bankrupt_lead, mean_DD, prob_default) %>%html_df()``````{r, warning=FALSE}fit_DD <-glm(bankrupt_lead ~ DD, data=df, family=binomial)summary(fit_DD)``````{r, fig.height=3.5, fig.width=3.5}df_DD <- df %>%filter(!is.na(Z), !is.na(bankrupt_lead))df_DD$pred <-predict(fit_DD, df_DD, type="response")df_DD %>%roc_curve(bankrupt_lead, pred, event_level='second') %>%autoplot()df_DD %>%roc_auc(bankrupt_lead, pred, event_level='second')``````{r}#AUCauc_DD <- df_DD %>%roc_auc(bankrupt_lead, pred, event_level='second')AUCs <-c(auc_Z$.estimate, auc_DD$.estimate)names(AUCs) <-c("Z", "DD")AUCs``````{r, warning=FALSE}# calculate downgradedf <- df %>%group_by(gvkey) %>%arrange(date) %>%mutate(downgrade =factor(ifelse(lead(rating) < rating,1,0), levels=c(0,1)),diff_Z = Z -lag(Z),diff_DD = DD -lag(DD)) %>%ungroup()# training sampletrain <- df %>%filter(year <2014, !is.na(diff_Z), !is.na(diff_DD), !is.na(downgrade), year >1985)test <- df %>%filter(year >=2014, !is.na(diff_Z), !is.na(diff_DD), !is.na(downgrade))# glmsfit_Z2 <-glm(downgrade ~ diff_Z, data=train, family=binomial)fit_DD2 <-glm(downgrade ~ diff_DD, data=train, family=binomial)``````{r}summary(fit_Z2)``````{r}summary(fit_DD2)``````{r, fig.height=6, fig.width=8}df_Z2 <- train %>%filter(!is.na(diff_Z), !is.na(downgrade))df_Z2$pred <-predict(fit_Z2, df_Z2, type="response")curve1 <- df_Z2 %>%roc_curve(downgrade, pred, event_level='second')auc_Z2 <- df_Z2 %>%roc_auc(downgrade, pred, event_level='second')df_DD2 <- train %>%filter(!is.na(diff_DD), !is.na(downgrade))df_DD2$pred <-predict(fit_DD2, df_DD2, type="response")curve2 <- df_DD2 %>%roc_curve(downgrade, pred, event_level='second')auc_DD2 <- df_DD2 %>%roc_auc(downgrade, pred, event_level='second')curve1 <- curve1 %>%group_by(sensitivity) %>%slice(c(1, n())) %>%ungroup()curve2 <- curve2 %>%group_by(sensitivity) %>%slice(c(1, n())) %>%ungroup()ggplot() +geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) +geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) +geom_abline(slope=1)``````{r}AUCs <-c(auc_Z2$.estimate, auc_DD2$.estimate)names(AUCs) <-c("Z", "DD")AUCs``````{r, fig.height=5}df_Z2 <- test %>%filter(!is.na(diff_Z), !is.na(downgrade))df_Z2$pred <-predict(fit_Z2, df_Z2, type="response")curve1 <- df_Z2 %>%roc_curve(downgrade, pred, event_level='second')auc_Z2 <- df_Z2 %>%roc_auc(downgrade, pred, event_level='second')df_DD2 <- test %>%filter(!is.na(diff_DD), !is.na(downgrade))df_DD2$pred <-predict(fit_DD2, df_DD2, type="response")curve2 <- df_DD2 %>%roc_curve(downgrade, pred, event_level='second')auc_DD2 <- df_DD2 %>%roc_auc(downgrade, pred, event_level='second')curve1 <- curve1 %>%group_by(sensitivity) %>%slice(c(1, n())) %>%ungroup()curve2 <- curve2 %>%group_by(sensitivity) %>%slice(c(1, n())) %>%ungroup()ggplot() +geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) +geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) +geom_abline(slope=1)``````{r}AUCs <-c(auc_Z2$.estimate, auc_DD2$.estimate)names(AUCs) <-c("Z", "DD")AUCs``````{r, warning=F}fit_comb <-glm(downgrade ~ diff_Z + diff_DD, data=train, family=binomial)summary(fit_comb)``````{r}library(margins)fit_comb %>%margins() %>%summary()``````{r, fig.height=5}df_comb <- test %>%filter(!is.na(diff_DD), !is.na(diff_Z), !is.na(downgrade))df_comb$pred <-predict(fit_comb, df_comb, type="response")curve3 <- df_comb %>%roc_curve(downgrade, pred, event_level='second')auc_comb <- df_comb %>%roc_auc(downgrade, pred, event_level='second')curve3 <- curve3 %>%group_by(sensitivity) %>%slice(c(1, n())) %>%ungroup()ggplot() +geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) +geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) +geom_line(data=curve3, aes(y=sensitivity, x=1-specificity, color="Combined")) +geom_abline(slope=1)AUCs <-c(auc_Z2$.estimate, auc_DD2$.estimate, auc_comb$.estimate)names(AUCs) <-c("Z", "DD", "Combined")AUCs``````{r}#| eval: false#| echo: true# Calculating a 253 day rolling standard deviation in Rcrsp <- crsp %>%group_by(gvkey) %>%mutate(ret.sd =rollapply(data=ret, width=253, FUN=sd, align="right", fill=NA, na.rm=T)) %>%ungroup()```