Code for Session 5

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer, so those file path references will need to be changed before running this code.
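For example, the file references below can be pointed at your own folder. A minimal sketch, where data_dir is a placeholder for wherever you keep the session data:

# Hypothetical: set a single data directory and build paths from it
data_dir <- "path/to/your/data"
df <- read.csv(file.path(data_dir, "Session_5-1.csv"), stringsAsFactors = FALSE)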

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1), highlighting the first column. Use c() to highlight none.
#' @param highlight_rows Rows to highlight. Default is c(), which highlights no rows.
#' @param color_rows Rows to highlight in color. Default is c(), which highlights no rows.
#' @param highlight_end_rows Rows to highlight, counted from the end: 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the header row should stay fixed when scrolling. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
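A quick usage sketch for html_df, using a small hypothetical data frame named tab (not part of the session data):

# Hypothetical usage: custom column names, no column highlighting
tab <- data.frame(group = c("A", "B"), mean_value = c(1.2, 3.4))
html_df(tab, names = c("Group", "Mean"), highlight_cols = c())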

library(tidyverse)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(lubridate)
df <- read.csv("../../Data/Session_5-1.csv", stringsAsFactors=FALSE)
df_ratings <- read.csv("../../Data/Session_5-2.csv", stringsAsFactors=FALSE)
df_mve <- read.csv("../../Data/Session_5-3.csv", stringsAsFactors=FALSE)
df_rf <- read.csv("../../Data/Session_5-4.csv", stringsAsFactors=FALSE)
df_stock <- read.csv("../../Data/Session_5-5.csv", stringsAsFactors=FALSE)
# initial cleaning; 100338 is an outlier in the bonds distribution
df <- df %>% filter(at >= 1, revt >= 1, gvkey != 100338)

## Merge in stock value
df$date <- ymd(df$datadate)
df_mve <- df_mve %>%
  mutate(date = ymd(datadate),
         mve = csho * prcc_f) %>%
  rename(gvkey=GVKEY)

df <- left_join(df, df_mve[,c("gvkey","date","mve")])
Joining with `by = join_by(gvkey, date)`
# Compute the bankruptcy indicator
df <- df %>%
  group_by(gvkey) %>%
  arrange(datadate) %>%
  mutate(bankrupt = ifelse(row_number() == n() & dlrsn == 2 &
                           !is.na(dlrsn), 1, 0),
         bankrupt_lead = lead(bankrupt)) %>%
  ungroup() %>%
  filter(!is.na(bankrupt_lead)) %>%
  mutate(bankrupt_lead = factor(bankrupt_lead, levels=c(0,1)))
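A quick check of the class balance can be useful here, since bankruptcies are rare; a minimal sketch (output not shown):

# How many firm-years are flagged as bankrupt next period?
table(df$bankrupt_lead)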
# Calculate the measures needed
df <- df %>%
  mutate(wcap_at = wcap / at,  # x1
         re_at = re / at,  # x2
         ebit_at = ebit / at,  # x3
         mve_lt = mve / lt,  # x4
         revt_at = revt / at)  # x5
# cleanup
df <- df %>% mutate(across(where(is.numeric), ~replace(., !is.finite(.), NA)))

# Calculate the score
df <- df %>% mutate(Z = 1.2 * wcap_at + 1.4 * re_at + 3.3 * ebit_at + 0.6 * mve_lt + 0.999 * revt_at)
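A quick sanity check on the computed score (a sketch; output not shown):

# Distribution of Altman Z across firm-years
summary(df$Z)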

# Calculate date info for merging
df$date <- ymd(df$datadate)
df$year <- year(df$date)
df$month <- month(df$date)
# df_ratings has ratings data in it

# Ratings, in order from worst to best
ratings <- c("D", "C", "CC", "CCC-", "CCC","CCC+", "B-", "B", "B+", "BB-",
             "BB", "BB+", "BBB-", "BBB", "BBB+", "A-", "A", "A+", "AA-", "AA",
             "AA+", "AAA-", "AAA", "AAA+")
# Convert string ratings (splticrm) to numeric ratings
df_ratings$rating <- factor(df_ratings$splticrm, levels=ratings, ordered=T)
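Because rating is an ordered factor over the ratings vector, ratings can be compared directly with < and >, which is what the downgrade calculation later relies on. A minimal sketch with hypothetical values:

# Ordered factors compare by rating order
factor("BBB", levels=ratings, ordered=TRUE) < factor("A", levels=ratings, ordered=TRUE)  # TRUE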

df_ratings$date <- as.Date(df_ratings$datadate)
df_ratings$year <- year(df_ratings$date)
df_ratings$month <- month(df_ratings$date)

# Merge together data
df <- left_join(df, df_ratings[,c("gvkey", "year", "month", "rating")])
Joining with `by = join_by(gvkey, year, month)`
plot <- df %>%
  filter(!is.na(Z), !is.na(rating)) %>%
  group_by(rating) %>%
  mutate(mean_Z=mean(Z,na.rm=T)) %>%
  slice(1) %>%
  ungroup() %>%
  select(rating, mean_Z) %>%
  ggplot(aes(y=mean_Z, x=rating)) + 
  geom_col() + 
  ylab('Mean Altman Z') + xlab('Credit rating') + 
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)
df %>%
  filter(!is.na(Z),
         !is.na(bankrupt)) %>%
  group_by(bankrupt_lead) %>%
  mutate(mean_Z=mean(Z,na.rm=T)) %>%
  slice(1) %>%
  ungroup() %>%
  select(bankrupt_lead, mean_Z) %>%
  html_df()
 bankrupt_lead   mean_Z
             0 3.993796
             1 1.739039
plot <- df %>%
  filter(!is.na(Z), !is.na(rating), year >= 2000) %>%
  group_by(rating) %>%
  mutate(mean_Z=mean(Z,na.rm=T)) %>%
  slice(1) %>%
  ungroup() %>%
  select(rating, mean_Z) %>%
  ggplot(aes(y=mean_Z, x=rating)) + 
  geom_col() + 
  ylab('Mean Altman Z') + xlab('Credit rating') + 
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)
df %>%
  filter(!is.na(Z),
         !is.na(bankrupt_lead),
         year >= 2000) %>%
  group_by(bankrupt_lead) %>%
  mutate(mean_Z=mean(Z,na.rm=T)) %>%
  slice(1) %>%
  ungroup() %>%
  select(bankrupt_lead, mean_Z) %>%
  html_df()
 bankrupt_lead   mean_Z
             0 3.897392
             1 1.670656
fit_Z <- glm(bankrupt_lead ~ Z, data=df, family=binomial)
summary(fit_Z)

Call:
glm(formula = bankrupt_lead ~ Z, family = binomial, data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.87769    0.11741 -50.060  < 2e-16 ***
Z           -0.05494    0.01235  -4.449 8.61e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1101.0  on 33372  degrees of freedom
Residual deviance: 1088.8  on 33371  degrees of freedom
  (14245 observations deleted due to missingness)
AIC: 1092.8

Number of Fisher Scoring iterations: 9
lz <- df %>% filter(!is.na(bankrupt_lead), !is.na(Z)) %>% filter(Z < 1) %>% pull(bankrupt_lead) %>% table()
hz <- df %>% filter(!is.na(bankrupt_lead), !is.na(Z)) %>% filter(Z >= 1) %>% pull(bankrupt_lead) %>% table()
x <- matrix(c(lz, hz), nrow=2)
rownames(x) <- c('No bankruptcy', 'Bankruptcy')
colnames(x) <- c('Z < 1', 'Z >= 1')
x
              Z < 1 Z >= 1
No bankruptcy  2654  30641
Bankruptcy       29     49
library(yardstick)
df_Z <- df %>% filter(!is.na(Z), !is.na(bankrupt_lead))
df_Z$pred <- predict(fit_Z, df_Z, type="response")
df_Z %>% roc_curve(bankrupt_lead, pred, event_level='second') %>% autoplot()

auc_Z <- df_Z %>% roc_auc(bankrupt_lead, pred, event_level='second')
auc_Z
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.750
score = 1
m = 0
std = 1

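# funcShaded: normal density (standard normal here, with m = 0 and std = 1), set to NA below lower_bound so only the right tail is shaded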
funcShaded <- function(x, lower_bound) {
    y = dnorm(x, mean = m, sd = std)
    y[x < lower_bound] <- NA
    return(y)
}

ggplot(data.frame(x = c(-3, 3)), aes(x = x)) + 
  stat_function(fun = dnorm, args = list(mean = m, sd = std)) + 
  stat_function(fun = funcShaded, args = list(lower_bound = score), 
                geom = "area", fill = 'black', alpha = .2) +
  scale_x_continuous(name = "Score", breaks = seq(-3, 3, std)) + 
  geom_text(data = data.frame(x=c(1.5), y=c(0.05)), aes(x=x, y=y, label="Prob(default)", size=30)) + 
  geom_line(data = data.frame(x=c(1,1), y=c(0,0.4)), aes(x=x,y=y)) + 
  geom_text(data = data.frame(x=c(1.3), y=c(0.4)), aes(x=x, y=y, label="DD", size=30)) +
  theme(legend.position="none")
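The shaded tail area is the implied probability of default, pnorm(-DD); a quick illustrative check for the plotted score:

# Implied default probability at DD = score
pnorm(-score)  # about 0.159 when score = 1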

# df_stock is an already prepped csv from CRSP data
df_stock$date <- as.Date(df_stock$date)
df <- left_join(df, df_stock[,c("gvkey", "date", "ret", "ret.sd")])
Joining with `by = join_by(gvkey, date)`
Warning in left_join(df, df_stock[, c("gvkey", "date", "ret", "ret.sd")]): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 13537 of `x` matches multiple rows in `y`.
ℹ Row 20513 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
df_rf$date <- as.Date(df_rf$dateff)
df_rf$year <- year(df_rf$date)
df_rf$month <- month(df_rf$date)

df <- left_join(df, df_rf[,c("year", "month", "rf")])
Joining with `by = join_by(year, month)`
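# Simplified distance to default: DD = [ln(mve/lt) + (rf - sigma^2/2)] / sigma,
# where sigma = ret.sd * sqrt(253) is the annualized daily return volatility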
df <- df %>%
  mutate(DD = (log(mve / lt) + (rf - (ret.sd*sqrt(253))^2 / 2)) /
              (ret.sd*sqrt(253)))
# Clean the measure
df <- df %>%
  mutate(across(where(is.numeric), ~replace(., !is.finite(.), NA)))
plot <- df %>%
  filter(!is.na(DD),
         !is.na(rating)) %>%
  group_by(rating) %>%
  mutate(mean_DD=mean(DD,na.rm=T),
         prob_default = pnorm(-1 * mean_DD)) %>%
  slice(1) %>%
  ungroup() %>%
  select(rating, prob_default) %>%
  ggplot(aes(y=prob_default, x=rating)) + 
  geom_col() + 
  ylab('Probability of default') + xlab('Credit rating') + 
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)
df %>%
  filter(!is.na(DD),
         !is.na(bankrupt_lead)) %>%
  group_by(bankrupt_lead) %>%
  mutate(mean_DD=mean(DD, na.rm=T),
         prob_default =
           pnorm(-1 * mean_DD)) %>%
  slice(1) %>%
  ungroup() %>%
  select(bankrupt_lead, mean_DD,
         prob_default) %>%
  html_df()
 bankrupt_lead    mean_DD prob_default
             0  0.6427281    0.2602003
             1 -3.1423863    0.9991621
plot <- df %>%
  filter(!is.na(DD),
         !is.na(rating),
         year >= 2000) %>%
  group_by(rating) %>%
  mutate(mean_DD=mean(DD,na.rm=T),
         prob_default = pnorm(-1 * mean_DD)) %>%
  slice(1) %>%
  ungroup() %>%
  select(rating, prob_default) %>%
  ggplot(aes(y=prob_default, x=rating)) + 
  geom_col() + 
  ylab('Probability of default') + xlab('Credit rating') + 
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)
df %>%
  filter(!is.na(DD),
         !is.na(bankrupt_lead),
         year >= 2000) %>%
  group_by(bankrupt_lead) %>%
  mutate(mean_DD=mean(DD, na.rm=T),
         prob_default =
           pnorm(-1 * mean_DD)) %>%
  slice(1) %>%
  ungroup() %>%
  select(bankrupt_lead, mean_DD,
         prob_default) %>%
  html_df()
 bankrupt_lead    mean_DD prob_default
             0  0.8878013    0.1873238
             1 -4.4289487    0.9999953
fit_DD <- glm(bankrupt_lead ~ DD, data=df, family=binomial)
summary(fit_DD)

Call:
glm(formula = bankrupt_lead ~ DD, family = binomial, data = df)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.27408    0.16653 -37.676  < 2e-16 ***
DD          -0.29783    0.03877  -7.682 1.57e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 665.03  on 20455  degrees of freedom
Residual deviance: 608.65  on 20454  degrees of freedom
  (31380 observations deleted due to missingness)
AIC: 612.65

Number of Fisher Scoring iterations: 9
df_DD <- df %>% filter(!is.na(DD), !is.na(Z), !is.na(bankrupt_lead))
df_DD$pred <- predict(fit_DD, df_DD, type="response")
df_DD %>% roc_curve(bankrupt_lead, pred, event_level='second') %>%
  autoplot()

df_DD %>% roc_auc(bankrupt_lead, pred, event_level='second')
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.843
#AUC
auc_DD <- df_DD %>% roc_auc(bankrupt_lead, pred, event_level='second')
AUCs <- c(auc_Z$.estimate, auc_DD$.estimate)
names(AUCs) <- c("Z", "DD")
AUCs
        Z        DD 
0.7498970 0.8434707 
# Calculate downgrade: 1 if the next period's rating is lower than the current one (rating is an ordered factor)
df <- df %>%
  group_by(gvkey) %>%
  arrange(date) %>%
  mutate(downgrade = factor(ifelse(lead(rating) < rating,1,0), levels=c(0,1)),
         diff_Z = Z - lag(Z),
         diff_DD = DD - lag(DD)) %>%
  ungroup()


# training sample
train <- df %>% filter(year < 2014, !is.na(diff_Z), !is.na(diff_DD), !is.na(downgrade),
                       year > 1985)
test <- df %>% filter(year >= 2014, !is.na(diff_Z), !is.na(diff_DD), !is.na(downgrade))

# glms
fit_Z2 <- glm(downgrade ~ diff_Z, data=train, family=binomial)
fit_DD2 <- glm(downgrade ~ diff_DD, data=train, family=binomial)
summary(fit_Z2)

Call:
glm(formula = downgrade ~ diff_Z, family = binomial, data = train)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.32925    0.06246 -37.294   <2e-16 ***
diff_Z      -0.09426    0.04860  -1.939   0.0525 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1913.6  on 3177  degrees of freedom
Residual deviance: 1908.7  on 3176  degrees of freedom
AIC: 1912.7

Number of Fisher Scoring iterations: 5
summary(fit_DD2)

Call:
glm(formula = downgrade ~ diff_DD, family = binomial, data = train)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.36904    0.06452 -36.719  < 2e-16 ***
diff_DD     -0.25536    0.03883  -6.576 4.82e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1913.6  on 3177  degrees of freedom
Residual deviance: 1871.4  on 3176  degrees of freedom
AIC: 1875.4

Number of Fisher Scoring iterations: 5
df_Z2 <- train %>% filter(!is.na(diff_Z), !is.na(downgrade))
df_Z2$pred <- predict(fit_Z2, df_Z2, type="response")
curve1 <- df_Z2 %>% roc_curve(downgrade, pred, event_level='second')
auc_Z2 <- df_Z2 %>% roc_auc(downgrade, pred, event_level='second')

df_DD2 <- train %>% filter(!is.na(diff_DD), !is.na(downgrade))
df_DD2$pred <- predict(fit_DD2, df_DD2, type="response")
curve2 <- df_DD2 %>% roc_curve(downgrade, pred, event_level='second')
auc_DD2 <- df_DD2 %>% roc_auc(downgrade, pred, event_level='second')

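# Thin each ROC curve to the first and last point at each sensitivity level to keep the plot light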
curve1 <- curve1 %>% group_by(sensitivity) %>% slice(c(1, n())) %>% ungroup()
curve2 <- curve2 %>% group_by(sensitivity) %>% slice(c(1, n())) %>% ungroup()

ggplot() +
  geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) + 
  geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) + 
  geom_abline(slope=1)

AUCs <- c(auc_Z2$.estimate, auc_DD2$.estimate)
names(AUCs) <- c("Z", "DD")
AUCs
        Z        DD 
0.6672852 0.6440596 
df_Z2 <- test %>% filter(!is.na(diff_Z), !is.na(downgrade))
df_Z2$pred <- predict(fit_Z2, df_Z2, type="response")
curve1 <- df_Z2 %>% roc_curve(downgrade, pred, event_level='second')
auc_Z2 <- df_Z2 %>% roc_auc(downgrade, pred, event_level='second')

df_DD2 <- test %>% filter(!is.na(diff_DD), !is.na(downgrade))
df_DD2$pred <- predict(fit_DD2, df_DD2, type="response")
curve2 <- df_DD2 %>% roc_curve(downgrade, pred, event_level='second')
auc_DD2 <- df_DD2 %>% roc_auc(downgrade, pred, event_level='second')

curve1 <- curve1 %>% group_by(sensitivity) %>% slice(c(1, n())) %>% ungroup()
curve2 <- curve2 %>% group_by(sensitivity) %>% slice(c(1, n())) %>% ungroup()

ggplot() +
  geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) + 
  geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) + 
  geom_abline(slope=1)

AUCs <- c(auc_Z2$.estimate, auc_DD2$.estimate)
names(AUCs) <- c("Z", "DD")
AUCs
     Z     DD 
0.6464 0.5904 
fit_comb <- glm(downgrade ~ diff_Z + diff_DD, data=train, family=binomial)
summary(fit_comb)

Call:
glm(formula = downgrade ~ diff_Z + diff_DD, family = binomial, 
    data = train)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.36899    0.06457 -36.689  < 2e-16 ***
diff_Z       0.02886    0.04289   0.673    0.501    
diff_DD     -0.26787    0.04306  -6.220 4.97e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1913.6  on 3177  degrees of freedom
Residual deviance: 1871.0  on 3175  degrees of freedom
AIC: 1877

Number of Fisher Scoring iterations: 5
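The average marginal effects below appear to come from the margins package; a sketch of the call that would produce a table like this, assuming margins is installed:

# Average marginal effects for the combined model (sketch)
library(margins)
margins(fit_comb) %>% summary()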
  factor     AME     SE       z      p   lower   upper
 diff_DD -0.0214 0.0035 -6.1316 0.0000 -0.0283 -0.0146
  diff_Z  0.0023 0.0034  0.6726 0.5012 -0.0044  0.0090
df_comb <- test %>% filter(!is.na(diff_DD), !is.na(diff_Z), !is.na(downgrade))
df_comb$pred <- predict(fit_comb, df_comb, type="response")
curve3 <- df_comb %>% roc_curve(downgrade, pred, event_level='second')
auc_comb <- df_comb %>% roc_auc(downgrade, pred, event_level='second')

curve3 <- curve3 %>% group_by(sensitivity) %>% slice(c(1, n())) %>% ungroup()

ggplot() +
  geom_line(data=curve1, aes(y=sensitivity, x=1-specificity, color="Altman Z")) + 
  geom_line(data=curve2, aes(y=sensitivity, x=1-specificity, color="DD")) +
  geom_line(data=curve3, aes(y=sensitivity, x=1-specificity, color="Combined")) + 
  geom_abline(slope=1)

AUCs <- c(auc_Z2$.estimate, auc_DD2$.estimate, auc_comb$.estimate)
names(AUCs) <- c("Z", "DD", "Combined")
AUCs
        Z        DD  Combined 
0.6464000 0.5904000 0.5959385 
# Calculating a 253-day rolling standard deviation in R
# (crsp is a daily returns data frame with gvkey and ret columns)
library(zoo)  # provides rollapply()
crsp <- crsp %>%
  group_by(gvkey) %>%
  mutate(ret.sd = rollapply(data=ret, width=253, FUN=sd, align="right", fill=NA, na.rm=T)) %>%
  ungroup()