Note: the data directories referenced in this code will likely differ on your computer; update those file paths before running any of it.

library(knitr)
library(kableExtra)
# Render a table-like object as a striped, hoverable HTML table.
#
# Arguments:
#   text  Object handed to kable() as the table contents.
#   cols  Optional column headers; when empty, colnames(text) is used.
#   col1  If TRUE, the first column is rendered in bold.
#   full  Forwarded to kable_styling(full_width = ...).
#
# Returns the styled kable object. The first column is left-aligned and every
# remaining column is centred.
html_df <- function(text, cols=NULL, col1=FALSE, full=FALSE) {
  if (!length(cols)) {
    cols <- colnames(text)
  }
  styled <- kable(text, "html", col.names = cols,
                  align = c("l", rep("c", length(cols) - 1))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"),
                  full_width = full)
  # Bolding the first column is the only difference between the two display
  # modes, so it is an optional last step rather than a duplicated pipeline.
  if (col1) {
    styled <- column_spec(styled, 1, bold = TRUE)
  }
  styled
}
library(tidyverse)
library(plotly)
library(lubridate)
# Load the main panel and apply the initial cleaning in one step:
# keep rows with at >= 1 and revt >= 1, and drop gvkey 100338, which is an
# outlier in the bonds distribution.
df <- read.csv("../../Data/Session_5-1.csv", stringsAsFactors = FALSE) %>%
  filter(at >= 1, revt >= 1, gvkey != 100338)

# Supporting tables, merged into df further down
df_ratings <- read.csv("../../Data/Session_5-2.csv", stringsAsFactors = FALSE)
df_mve     <- read.csv("../../Data/Session_5-3.csv", stringsAsFactors = FALSE)
df_rf      <- read.csv("../../Data/Session_5-4.csv", stringsAsFactors = FALSE)
df_stock   <- read.csv("../../Data/Session_5-5.csv", stringsAsFactors = FALSE)

## Merge in stock value
# Parse datadate into a Date column that serves as a join key
df <- df %>%
  mutate(date = as.Date(datadate))

# Align the firm identifier's name with df, then derive the join date and
# the market value of equity as csho * prcc_f
df_mve <- df_mve %>%
  rename(gvkey = GVKEY) %>%
  mutate(date = as.Date(datadate),
         mve = csho * prcc_f)

# No `by` is supplied, so the join uses the column names the tables share
df <- df %>%
  left_join(select(df_mve, gvkey, date, mve))
Joining, by = c("gvkey", "date")
# Flag a firm's final row as bankrupt (1) when dlrsn is present and equals 2;
# every other row gets 0.
# NOTE(review): "final row" is positional within gvkey -- this presumably
# relies on the input already being in chronological order; confirm.
df <- df %>%
  group_by(gvkey) %>%
  mutate(
    bankrupt = as.numeric(!is.na(dlrsn) & dlrsn == 2 & row_number() == n())
  ) %>%
  ungroup()

# Build the Altman Z-score in a single pipeline: ratios, clean-up, score
df <- df %>%
  # The five component ratios
  mutate(
    wcap_at = wcap / at,   # x1
    re_at   = re / at,     # x2
    ebit_at = ebit / at,   # x3
    mve_lt  = mve / lt,    # x4
    revt_at = revt / at    # x5
  ) %>%
  # Turn every non-finite value (Inf, -Inf, NaN) in numeric columns into NA
  mutate_if(is.numeric, function(col) replace(col, !is.finite(col), NA)) %>%
  # Weighted sum of the five ratios
  mutate(
    Z = 1.2 * wcap_at + 1.4 * re_at + 3.3 * ebit_at + 0.6 * mve_lt +
      0.999 * revt_at
  )

# Derive the calendar year and month used as merge keys below
df <- df %>%
  mutate(date = as.Date(datadate),
         year = year(date),
         month = month(date))
# df_ratings holds the credit ratings

# Rating labels ordered from worst to best; this order defines the levels of
# the ordered factor built below
ratings <- c(
  "D", "C", "CC",
  "CCC-", "CCC", "CCC+",
  "B-", "B", "B+",
  "BB-", "BB", "BB+",
  "BBB-", "BBB", "BBB+",
  "A-", "A", "A+",
  "AA-", "AA", "AA+",
  "AAA-", "AAA", "AAA+"
)

# Encode the string ratings (splticrm) as an ordered factor and add the
# year/month merge keys
df_ratings <- df_ratings %>%
  mutate(rating = factor(splticrm, levels = ratings, ordered = TRUE),
         date = as.Date(datadate),
         year = year(date),
         month = month(date))

# Merge together data; with no `by`, the shared column names are the keys
df <- df %>%
  left_join(select(df_ratings, gvkey, year, month, rating))
Joining, by = c("gvkey", "year", "month")
# Bar chart of the average Z-score within each credit rating, over rows
# where both Z and rating are observed
plot <- df %>%
  filter(!is.na(Z), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  ggplot(aes(x = rating, y = mean_Z)) +
  geom_col() +
  labs(x = "Credit rating", y = "Mean Altman Z") +
  theme(axis.text.x = element_text(angle = 90))
# Interactive version of the chart
ggplotly(plot)
# Average Z-score for non-bankrupt (0) versus bankrupt (1) observations
df %>%
  filter(!is.na(Z), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  html_df()
bankrupt mean_Z
0 3.939223
1 0.927843
# Average Z-score per credit rating, restricted to year >= 2000
plot <- df %>%
  filter(year >= 2000, !is.na(Z), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  ggplot(aes(x = rating, y = mean_Z)) +
  geom_col() +
  labs(x = "Credit rating", y = "Mean Altman Z") +
  theme(axis.text.x = element_text(angle = 90))
# Interactive version of the chart
ggplotly(plot)
# Average Z-score by bankruptcy flag, restricted to year >= 2000
df %>%
  filter(year >= 2000, !is.na(Z), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  html_df()
bankrupt mean_Z
0 3.822281
1 1.417683
# Logistic regression of the bankruptcy flag on the Altman Z-score
fit_Z <- glm(
  formula = bankrupt ~ Z,
  family = binomial,
  data = df
)
summary(fit_Z)

Call:
glm(formula = bankrupt ~ Z, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8297  -0.0676  -0.0654  -0.0624   3.7794  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -5.94354    0.11829 -50.245  < 2e-16 ***
Z           -0.06383    0.01239  -5.151 2.59e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1085.2  on 35296  degrees of freedom
Residual deviance: 1066.5  on 35295  degrees of freedom
  (15577 observations deleted due to missingness)
AIC: 1070.5

Number of Fisher Scoring iterations: 9
library(ROCR)
# Score only the rows where both the predictor and the outcome are observed
dfZ <- df %>% filter(complete.cases(Z, bankrupt))
pred_Z <- predict(fit_Z, newdata = dfZ, type = "response")

# ROCR objects: predictions vs. labels, then the TPR/FPR trade-off
ROCpred_Z <- prediction(predictions = as.numeric(pred_Z),
                        labels = as.numeric(dfZ$bankrupt))
ROCperf_Z <- performance(ROCpred_Z, measure = "tpr", x.measure = "fpr")

# Flatten the curve into a data frame so ggplot can draw it
df_ROC_Z <- data.frame(FP = ROCperf_Z@x.values[[1]],
                       TP = ROCperf_Z@y.values[[1]])

# Bare ROC curve with the 45-degree reference line
ggplot(df_ROC_Z, aes(x = FP, y = TP)) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1)

# The same curve drawn by ROCR's own plot method
plot(ROCperf_Z)

# Labelled version of the ggplot curve
ggplot(df_ROC_Z, aes(x = FP, y = TP)) +
  geom_line() +
  geom_abline(intercept = 0, slope = 1) +
  labs(title = "ROC Curve",
       x = "False positive rate (1 - Specificity)",
       y = "True positive rate (Sensitivity)")

# Area under the ROC curve
auc_Z <- performance(ROCpred_Z, measure = "auc")
auc_Z@y.values[[1]]
[1] 0.8280943
# Parameters of the illustrative normal distribution and the cut-off score
score <- 1
m <- 0
std <- 1

# Normal density (mean m, sd std, read from the enclosing environment) at x,
# blanked out to NA wherever x lies below lower_bound.
funcShaded <- function(x, lower_bound) {
  replace(dnorm(x, mean = m, sd = std), x < lower_bound, NA)
}

# Illustration: normal density over [-3, 3] with the tail from `score`
# upward shaded, plus text labels and a vertical marker at x = 1
ggplot(data.frame(x = c(-3, 3)), aes(x = x)) +
  # Full density curve
  stat_function(fun = dnorm, args = list(mean = m, sd = std)) +
  # Shaded region: funcShaded() returns NA below the cut-off
  stat_function(fun = funcShaded, args = list(lower_bound = score),
                geom = "area", fill = "black", alpha = 0.2) +
  scale_x_continuous(name = "Score", breaks = seq(-3, 3, by = std)) +
  # size sits inside aes(), so it is a mapped aesthetic; the legend this
  # would produce is switched off by theme() below
  geom_text(data = data.frame(x = 1.5, y = 0.05),
            aes(x = x, y = y, label = "Prob(default)", size = 30)) +
  geom_line(data = data.frame(x = c(1, 1), y = c(0, 0.4)),
            aes(x = x, y = y)) +
  geom_text(data = data.frame(x = 1.3, y = 0.4),
            aes(x = x, y = y, label = "DD", size = 30)) +
  theme(legend.position = "none")

# df_stock is an already prepped csv from CRSP data
# Parse its date column so it matches df's Date-typed join key
df_stock <- df_stock %>%
  mutate(date = as.Date(date))
# With no `by`, the join uses the column names the two tables share
df <- df %>%
  left_join(select(df_stock, gvkey, date, ret, ret.sd))
Joining, by = c("gvkey", "date")

# Give the risk-free rate table year/month keys derived from dateff
df_rf <- df_rf %>%
  mutate(date = as.Date(dateff),
         year = year(date),
         month = month(date))

# Attach rf to each row; with no `by`, the shared column names are the keys
df <- df %>%
  left_join(select(df_rf, year, month, rf))
Joining, by = c("year", "month")
# Distance to default:
#   (log(mve / lt) + (rf - sigma^2 / 2)) / sigma,
# where sigma = ret.sd * sqrt(253).
# NOTE(review): the sqrt(253) factor presumably annualises a daily return
# standard deviation -- confirm against how ret.sd was prepared.
df <- df %>%
  mutate(
    DD = (log(mve / lt) + (rf - (ret.sd * sqrt(253))^2 / 2)) /
      (ret.sd * sqrt(253))
  ) %>%
  # Clean the measure: non-finite values in numeric columns become NA
  mutate_if(is.numeric, function(col) replace(col, !is.finite(col), NA))
# Implied default probability per credit rating: the standard normal CDF
# evaluated at minus the rating's mean DD
plot <- df %>%
  filter(!is.na(DD), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(prob_default = pnorm(-mean(DD, na.rm = TRUE))) %>%
  ggplot(aes(x = rating, y = prob_default)) +
  geom_col() +
  labs(x = "Credit rating", y = "Probability of default") +
  theme(axis.text.x = element_text(angle = 90))
# Interactive version of the chart
ggplotly(plot)
# Mean DD and the default probability it implies, by bankruptcy flag
df %>%
  filter(!is.na(DD), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-mean_DD)) %>%
  html_df()
bankrupt mean_DD prob_default
0 0.6096854 0.2710351
1 -2.4445081 0.9927475
# Implied default probability per credit rating, restricted to year >= 2000
plot <- df %>%
  filter(year >= 2000, !is.na(DD), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(prob_default = pnorm(-mean(DD, na.rm = TRUE))) %>%
  ggplot(aes(x = rating, y = prob_default)) +
  geom_col() +
  labs(x = "Credit rating", y = "Probability of default") +
  theme(axis.text.x = element_text(angle = 90))
# Interactive version of the chart
ggplotly(plot)
# Mean DD and implied default probability by bankruptcy flag, year >= 2000
df %>%
  filter(year >= 2000, !is.na(DD), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-mean_DD)) %>%
  html_df()
bankrupt mean_DD prob_default
0 0.8379932 0.2010172
1 -4.3001844 0.9999915
# Logistic regression of the bankruptcy flag on distance to default
fit_DD <- glm(
  formula = bankrupt ~ DD,
  family = binomial,
  data = df
)
summary(fit_DD)

Call:
glm(formula = bankrupt ~ DD, family = binomial, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9929  -0.0750  -0.0634  -0.0506   3.6503  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.16394    0.15322 -40.230  < 2e-16 ***
DD          -0.24459    0.03781  -6.469 9.89e-11 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 718.67  on 21563  degrees of freedom
Residual deviance: 677.27  on 21562  degrees of freedom
  (33618 observations deleted due to missingness)
AIC: 681.27

Number of Fisher Scoring iterations: 9
# Score the rows where both DD and the outcome are observed
dfDD <- df %>% filter(complete.cases(DD, bankrupt))
pred_DD <- predict(fit_DD, newdata = dfDD, type = "response")

# ROC curve for the DD model
ROCpred_DD <- prediction(predictions = as.numeric(pred_DD),
                         labels = as.numeric(dfDD$bankrupt))
ROCperf_DD <- performance(ROCpred_DD, measure = "tpr", x.measure = "fpr")
df_ROC_DD <- data.frame(FalsePositive = ROCperf_DD@x.values[[1]],
                        TruePositive = ROCperf_DD@y.values[[1]])

# Overlay the DD curve on the Z curve, with the 45-degree reference line
ggplot() +
  geom_line(data = df_ROC_DD,
            aes(x = FalsePositive, y = TruePositive, color = "DD")) +
  geom_line(data = df_ROC_Z,
            aes(x = FP, y = TP, color = "Z")) +
  geom_abline(intercept = 0, slope = 1)

# AUC of both models side by side
auc_DD <- performance(ROCpred_DD, measure = "auc")
AUCs <- c(Z = auc_Z@y.values[[1]], DD = auc_DD@y.values[[1]])
AUCs
        Z        DD 
0.8280943 0.8098304 
# calculate downgrade
# Sort by date first so that lag() inside each firm refers to that firm's
# previous observation; downgrade is 1 when the rating fell, 0 when it did
# not, and NA when either rating is missing.
df <- df %>%
  arrange(date) %>%
  group_by(gvkey) %>%
  mutate(downgrade = as.numeric(rating < lag(rating)),
         diff_Z = Z - lag(Z),
         diff_DD = DD - lag(DD)) %>%
  ungroup()


# training sample: years after 1985 and before 2014, complete on the
# outcome and both predictors
train <- df %>%
  filter(complete.cases(diff_Z, diff_DD, downgrade),
         year > 1985,
         year < 2014)
# testing sample: 2014 onward, same completeness requirement
test <- df %>%
  filter(complete.cases(diff_Z, diff_DD, downgrade),
         year >= 2014)

# glms: one single-predictor logit per measure, both fit on the training set
fit_Z2 <- glm(
  formula = downgrade ~ diff_Z,
  family = binomial,
  data = train
)
fit_DD2 <- glm(
  formula = downgrade ~ diff_DD,
  family = binomial,
  data = train
)
fit_Z2 %>% summary()

Call:
glm(formula = downgrade ~ diff_Z, family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.4115  -0.4428  -0.4428  -0.3928   2.7437  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.27310    0.06139 -37.029   <2e-16 ***
diff_Z      -0.77150    0.09245  -8.345   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2145.3  on 3277  degrees of freedom
Residual deviance: 2065.8  on 3276  degrees of freedom
AIC: 2069.8

Number of Fisher Scoring iterations: 5
# Coefficient table for the change-in-DD downgrade model
fit_DD2 %>% summary()

Call:
glm(formula = downgrade ~ diff_DD, family = binomial, data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5726  -0.4565  -0.4558  -0.4095   2.6804  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.21199    0.05926 -37.325  < 2e-16 ***
diff_DD     -0.21378    0.03723  -5.742 9.37e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2145.3  on 3277  degrees of freedom
Residual deviance: 2113.2  on 3276  degrees of freedom
AIC: 2117.2

Number of Fisher Scoring iterations: 5
# In-sample (training set) ROC curves for the two downgrade models

# Z model: predicted probabilities, with non-finite values set to NA
pred_Z2 <- predict(fit_Z2, newdata = train, type = "response")
pred_Z2 <- replace(pred_Z2, !is.finite(pred_Z2), NA)

# Keep only positions where both the label and the prediction are present
ROCpred_Z2 <- prediction(
  as.numeric(pred_Z2[!is.na(train$downgrade) & !is.na(pred_Z2)]),
  as.numeric(train$downgrade[!is.na(train$downgrade) & !is.na(pred_Z2)])
)
ROCperf_Z2 <- performance(ROCpred_Z2, measure = "tpr", x.measure = "fpr")
df_ROC_Z2 <- data.frame(FalsePositive = ROCperf_Z2@x.values[[1]],
                        TruePositive = ROCperf_Z2@y.values[[1]])
auc_Z2 <- performance(ROCpred_Z2, measure = "auc")

# DD model
pred_DD2 <- predict(fit_DD2, newdata = train, type = "response")
ROCpred_DD2 <- prediction(
  as.numeric(pred_DD2[!is.na(train$downgrade) & !is.na(pred_DD2)]),
  as.numeric(train$downgrade[!is.na(train$downgrade) & !is.na(pred_DD2)])
)
ROCperf_DD2 <- performance(ROCpred_DD2, measure = "tpr", x.measure = "fpr")
df_ROC_DD2 <- data.frame(FalsePositive = ROCperf_DD2@x.values[[1]],
                         TruePositive = ROCperf_DD2@y.values[[1]])

# Both curves plus the 45-degree reference line
ggplot() +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = "DD")) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = "Z")) +
  geom_abline(intercept = 0, slope = 1)

# In-sample AUCs
auc_DD2 <- performance(ROCpred_DD2, measure = "auc")
AUCs <- c(Z = auc_Z2@y.values[[1]], DD = auc_DD2@y.values[[1]])
AUCs
        Z        DD 
0.6465042 0.5847885 
# Out-of-sample (testing set) ROC curves for the two downgrade models

# Z model; positions with a missing label or prediction are dropped
pred_Z2 <- predict(fit_Z2, newdata = test, type = "response")
ROCpred_Z2 <- prediction(
  as.numeric(pred_Z2[!is.na(test$downgrade) & !is.na(pred_Z2)]),
  as.numeric(test$downgrade[!is.na(test$downgrade) & !is.na(pred_Z2)])
)
ROCperf_Z2 <- performance(ROCpred_Z2, measure = "tpr", x.measure = "fpr")
df_ROC_Z2 <- data.frame(FalsePositive = ROCperf_Z2@x.values[[1]],
                        TruePositive = ROCperf_Z2@y.values[[1]])
auc_Z2 <- performance(ROCpred_Z2, measure = "auc")

# DD model
pred_DD2 <- predict(fit_DD2, newdata = test, type = "response")
ROCpred_DD2 <- prediction(
  as.numeric(pred_DD2[!is.na(test$downgrade) & !is.na(pred_DD2)]),
  as.numeric(test$downgrade[!is.na(test$downgrade) & !is.na(pred_DD2)])
)
ROCperf_DD2 <- performance(ROCpred_DD2, measure = "tpr", x.measure = "fpr")
df_ROC_DD2 <- data.frame(FalsePositive = ROCperf_DD2@x.values[[1]],
                         TruePositive = ROCperf_DD2@y.values[[1]])

# Both curves plus the 45-degree reference line
ggplot() +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = "DD")) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = "Z")) +
  geom_abline(intercept = 0, slope = 1)

# Out-of-sample AUCs
auc_DD2 <- performance(ROCpred_DD2, measure = "auc")
AUCs <- c(Z = auc_Z2@y.values[[1]], DD = auc_DD2@y.values[[1]])
AUCs
        Z        DD 
0.8134671 0.7420213 
# Combined downgrade model: both first-differenced measures as predictors,
# fit on the training set
fit_comb <- glm(
  formula = downgrade ~ diff_Z + diff_DD,
  family = binomial,
  data = train
)
summary(fit_comb)

Call:
glm(formula = downgrade ~ diff_Z + diff_DD, family = binomial, 
    data = train)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.3263  -0.4431  -0.4430  -0.3892   2.7504  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.27217    0.06144 -36.980  < 2e-16 ***
diff_Z      -0.71374    0.10709  -6.665 2.65e-11 ***
diff_DD     -0.04884    0.04638  -1.053    0.292    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2145.3  on 3277  degrees of freedom
Residual deviance: 2064.7  on 3275  degrees of freedom
AIC: 2070.7

Number of Fisher Scoring iterations: 5
# Marginal effects of the combined model's predictors, summarised as a table.
# margins is called via `::`, so the package does not need to be attached.
fit_comb %>%
  margins::margins() %>%
  summary()
# Out-of-sample predictions from the combined model; non-finite values
# are set to NA
pred_comb <- predict(fit_comb, newdata = test, type = "response")
pred_comb <- replace(pred_comb, !is.finite(pred_comb), NA)

# ROC inputs: drop positions with a missing label or prediction
ROCpred_comb <- prediction(
  as.numeric(pred_comb[!is.na(test$downgrade) & !is.na(pred_comb)]),
  as.numeric(test$downgrade[!is.na(test$downgrade) & !is.na(pred_comb)])
)
ROCperf_comb <- performance(ROCpred_comb, measure = "tpr", x.measure = "fpr")
df_ROC_comb <- data.frame(FalsePositive = ROCperf_comb@x.values[[1]],
                          TruePositive = ROCperf_comb@y.values[[1]])
auc_comb <- performance(ROCpred_comb, measure = "auc")

# Combined model against the two single-predictor models
ggplot() +
  geom_line(data = df_ROC_comb,
            aes(x = FalsePositive, y = TruePositive, color = "Combined")) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = "Z")) +
  geom_abline(intercept = 0, slope = 1) +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = "DD"))

# AUCs of all three models on the testing set
AUCs <- c(Combined = auc_comb@y.values[[1]],
          Z = auc_Z2@y.values[[1]],
          DD = auc_DD2@y.values[[1]])
AUCs
 Combined         Z        DD 
0.8151596 0.8134671 0.7420213 
