Note that the directories used to store data are likely different on your computer, so the file paths in the code below will need to be changed before you run it.
library(knitr)
library(kableExtra)
# Render a table as a striped, hover-highlighted HTML table.
#
# Arguments:
#   text  table-like object handed to kable().
#   cols  column headers; when empty (the default NULL), colnames(text) is used.
#   col1  if TRUE, the first column is rendered in bold.
#   full  forwarded to kable_styling() as full_width.
#
# Returns the styled table object produced by kable_styling() (after
# column_spec() when col1 is TRUE).
html_df <- function(text, cols = NULL, col1 = FALSE, full = FALSE) {
  if (length(cols) == 0) {
    cols <- colnames(text)
  }
  # First column left-aligned, every remaining column centred.
  alignment <- c("l", rep("c", length(cols) - 1))
  styled <- kable(text, "html", col.names = cols, align = alignment) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = full)
  # TRUE/FALSE are used instead of T/F, which are ordinary variables and can
  # be reassigned by other code.
  if (col1) {
    styled <- column_spec(styled, 1, bold = TRUE)
  }
  styled
}
library(tidyverse)
library(plotly)
library(lubridate)
# Load the five input tables (relative paths; adjust for your machine).
df         <- read.csv("../../Data/Session_5-1.csv", stringsAsFactors = FALSE)
df_ratings <- read.csv("../../Data/Session_5-2.csv", stringsAsFactors = FALSE)
df_mve     <- read.csv("../../Data/Session_5-3.csv", stringsAsFactors = FALSE)
df_rf      <- read.csv("../../Data/Session_5-4.csv", stringsAsFactors = FALSE)
df_stock   <- read.csv("../../Data/Session_5-5.csv", stringsAsFactors = FALSE)

# Initial cleaning: keep rows with at >= 1 and revt >= 1, and drop gvkey
# 100338, which is an outlier in the bonds distribution.
df <- filter(df, at >= 1, revt >= 1, gvkey != 100338)

## Merge in stock value
# Parse the date key on the main table.
df <- mutate(df, date = as.Date(datadate))

# mve is csho * prcc_f; GVKEY is renamed so the key matches the main table.
df_mve <- rename(
  mutate(df_mve, date = as.Date(datadate), mve = csho * prcc_f),
  gvkey = GVKEY
)

# No `by` is given, so the join uses the columns the two tables share.
df <- left_join(df, df_mve[, c("gvkey", "date", "mve")])
Joining, by = c("gvkey", "date")
# Flag bankruptcies, build the five ratio inputs, scrub non-finite values and
# then compute the Z score. The scrub must run before Z is computed so that
# any non-finite ratio turns into an NA score.
df <- df %>%
  group_by(gvkey) %>%
  # bankrupt is 1 only on the last row of a gvkey, and only when dlrsn is 2.
  mutate(bankrupt = ifelse(!is.na(dlrsn) & dlrsn == 2 &
                             row_number() == n(), 1, 0)) %>%
  ungroup() %>%
  # Calculate the measures needed
  mutate(
    wcap_at = wcap / at,  # x1
    re_at   = re / at,    # x2
    ebit_at = ebit / at,  # x3
    mve_lt  = mve / lt,   # x4
    revt_at = revt / at   # x5
  ) %>%
  # Cleanup: every non-finite value in a numeric column becomes NA.
  mutate_if(is.numeric, list(~replace(., !is.finite(.), NA))) %>%
  # Calculate the score
  mutate(Z = 1.2 * wcap_at + 1.4 * re_at + 3.3 * ebit_at + 0.6 * mve_lt +
           0.999 * revt_at)

# Calculate date info for merging
df <- df %>%
  mutate(date = as.Date(datadate),
         year = year(date),
         month = month(date))

# df_ratings has ratings data in it
# Ratings, in order from worst to best
ratings <- c(
  "D", "C", "CC", "CCC-", "CCC", "CCC+",
  "B-", "B", "B+", "BB-", "BB", "BB+",
  "BBB-", "BBB", "BBB+", "A-", "A", "A+",
  "AA-", "AA", "AA+", "AAA-", "AAA", "AAA+"
)

# Convert the string ratings (splticrm) into an ordered factor and derive
# the same year/month keys as on the main table.
df_ratings[["rating"]] <- factor(df_ratings[["splticrm"]], levels = ratings,
                                 ordered = TRUE)
df_ratings[["date"]] <- as.Date(df_ratings[["datadate"]])
df_ratings[["year"]] <- year(df_ratings[["date"]])
df_ratings[["month"]] <- month(df_ratings[["date"]])

# Merge together data (join keys are the shared columns).
df <- left_join(df, df_ratings[, c("gvkey", "year", "month", "rating")])
Joining, by = c("gvkey", "year", "month")
# Mean Altman Z per credit rating, shown as an interactive bar chart.
plot <- df %>%
  filter(!is.na(Z), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  ggplot(aes(x = rating, y = mean_Z)) +
  geom_col() +
  xlab('Credit rating') +
  ylab('Mean Altman Z') +
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)

# Mean Altman Z split by the bankruptcy flag, rendered as an HTML table.
df %>%
  filter(!is.na(Z), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  html_df()
bankrupt |
mean_Z |
0 |
3.939223 |
1 |
0.927843 |
# Same two summaries as before, restricted to rows with year >= 2000.
# Mean Altman Z per credit rating as an interactive bar chart.
plot <- df %>%
  filter(year >= 2000, !is.na(Z), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  ggplot(aes(x = rating, y = mean_Z)) +
  geom_col() +
  xlab('Credit rating') +
  ylab('Mean Altman Z') +
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)

# Mean Altman Z split by the bankruptcy flag, rendered as an HTML table.
df %>%
  filter(year >= 2000, !is.na(Z), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_Z = mean(Z, na.rm = TRUE)) %>%
  html_df()
bankrupt |
mean_Z |
0 |
3.822281 |
1 |
1.417683 |
# Logistic regression (binomial family) of the bankrupt indicator on the
# Z score, fitted on all of df; glm drops rows with missing values.
fit_Z <- glm(bankrupt ~ Z, data=df, family=binomial)
# Print the coefficient table and fit statistics for the model.
summary(fit_Z)
Call:
glm(formula = bankrupt ~ Z, family = binomial, data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8297 -0.0676 -0.0654 -0.0624 3.7794
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.94354 0.11829 -50.245 < 2e-16 ***
Z -0.06383 0.01239 -5.151 2.59e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1085.2 on 35296 degrees of freedom
Residual deviance: 1066.5 on 35295 degrees of freedom
(15577 observations deleted due to missingness)
AIC: 1070.5
Number of Fisher Scoring iterations: 9
library(ROCR)
# Score every row that has both Z and the bankrupt flag with fit_Z, then
# build the ROCR prediction/performance objects from those scores.
dfZ <- filter(df, !is.na(Z), !is.na(bankrupt))
pred_Z <- predict(fit_Z, dfZ, type = "response")
ROCpred_Z <- prediction(as.numeric(pred_Z), as.numeric(dfZ$bankrupt))
ROCperf_Z <- performance(ROCpred_Z, "tpr", "fpr")

# Flatten the ROC curve into a data frame so ggplot2 can draw it.
df_ROC_Z <- data.frame(
  FP = ROCperf_Z@x.values[[1]],
  TP = ROCperf_Z@y.values[[1]]
)

# Bare ggplot2 version of the curve; the slope-1 line through the origin is
# the reference diagonal.
ggplot(df_ROC_Z, aes(x = FP, y = TP)) +
  geom_line() +
  geom_abline(slope = 1)

# ROCR's own plot method for the same performance object.
plot(ROCperf_Z)

# ggplot2 version again, this time with axis labels and a title.
ggplot(df_ROC_Z, aes(x = FP, y = TP)) +
  geom_line() +
  geom_abline(slope = 1) +
  labs(
    x = "False positive rate (1 - Specificity)",
    y = "True positive rate (Sensitivity)",
    title = "ROC Curve"
  )

# Area under the ROC curve.
auc_Z <- performance(ROCpred_Z, measure = "auc")
auc_Z@y.values[[1]]
[1] 0.8280943
# Parameters for the illustration: the cut-off score and the mean / standard
# deviation of the normal curve.
score <- 1
m <- 0
std <- 1

# Normal density (mean m, sd std, both read from the enclosing environment)
# evaluated at x, with every point where x < lower_bound blanked out to NA.
#
#   x            numeric vector of evaluation points
#   lower_bound  values of x below this are returned as NA
#
# Returns a numeric vector the same length as x.
funcShaded <- function(x, lower_bound) {
  replace(dnorm(x, mean = m, sd = std), x < lower_bound, NA)
}
# Illustration: a normal curve with the area to the right of `score` shaded
# (via funcShaded), a vertical marker line at x = 1, and two text labels.
#
# The labels and the marker are fixed annotations, so they are drawn with
# annotate() instead of geom_text()/geom_line() with constants inside aes().
# Putting `size = 30` inside aes() mapped it through a size scale rather than
# setting it, which is also why a legend had to be hidden; the labels now use
# the default text size and no legend is produced.
ggplot(data.frame(x = c(-3, 3)), aes(x = x)) +
  stat_function(fun = dnorm, args = list(mean = m, sd = std)) +
  stat_function(fun = funcShaded, args = list(lower_bound = score),
                geom = "area", fill = "black", alpha = 0.2) +
  scale_x_continuous(name = "Score", breaks = seq(-3, 3, std)) +
  annotate("text", x = 1.5, y = 0.05, label = "Prob(default)") +
  annotate("segment", x = 1, xend = 1, y = 0, yend = 0.4) +
  annotate("text", x = 1.3, y = 0.4, label = "DD")

# df_stock is an already prepped csv from CRSP data.
# Parse its date column, then attach ret and ret.sd to the main table
# (the join uses the columns the two tables share).
df_stock <- mutate(df_stock, date = as.Date(date))
df <- left_join(df, df_stock[, c("gvkey", "date", "ret", "ret.sd")])
Joining, by = c("gvkey", "date")
# Derive year/month keys from dateff and attach rf to the main table
# (the join uses the columns the two tables share).
df_rf <- df_rf %>%
  mutate(date = as.Date(dateff),
         year = year(date),
         month = month(date))
df <- left_join(df, df_rf[, c("year", "month", "rf")])
Joining, by = c("year", "month")
# Compute the DD measure, then clean it.
# ret.sd is scaled by sqrt(253) -- presumably to annualise a daily standard
# deviation; confirm against how df_stock was prepared.
df <- df %>%
  mutate(
    DD = (log(mve / lt) + (rf - (ret.sd * sqrt(253))^2 / 2)) /
      (ret.sd * sqrt(253))
  ) %>%
  # Clean the measure: non-finite values in any numeric column become NA.
  mutate_if(is.numeric, list(~replace(., !is.finite(.), NA)))
# pnorm(-mean DD) per credit rating, shown as an interactive bar chart.
plot <- df %>%
  filter(!is.na(DD), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-1 * mean_DD)) %>%
  select(rating, prob_default) %>%
  ggplot(aes(x = rating, y = prob_default)) +
  geom_col() +
  xlab('Credit rating') +
  ylab('Probability of default') +
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)

# Mean DD and pnorm(-mean DD) split by the bankruptcy flag, as an HTML table.
df %>%
  filter(!is.na(DD), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-1 * mean_DD)) %>%
  html_df()
bankrupt |
mean_DD |
prob_default |
0 |
0.6096854 |
0.2710351 |
1 |
-2.4445081 |
0.9927475 |
# Same two DD summaries as before, restricted to rows with year >= 2000.
# pnorm(-mean DD) per credit rating as an interactive bar chart.
plot <- df %>%
  filter(year >= 2000, !is.na(DD), !is.na(rating)) %>%
  group_by(rating) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-1 * mean_DD)) %>%
  select(rating, prob_default) %>%
  ggplot(aes(x = rating, y = prob_default)) +
  geom_col() +
  xlab('Credit rating') +
  ylab('Probability of default') +
  theme(axis.text.x = element_text(angle = 90))
ggplotly(plot)

# Mean DD and pnorm(-mean DD) split by the bankruptcy flag, as an HTML table.
df %>%
  filter(year >= 2000, !is.na(DD), !is.na(bankrupt)) %>%
  group_by(bankrupt) %>%
  summarise(mean_DD = mean(DD, na.rm = TRUE),
            prob_default = pnorm(-1 * mean_DD)) %>%
  html_df()
bankrupt |
mean_DD |
prob_default |
0 |
0.8379932 |
0.2010172 |
1 |
-4.3001844 |
0.9999915 |
# Logistic regression (binomial family) of the bankrupt indicator on DD,
# fitted on all of df; glm drops rows with missing values.
fit_DD <- glm(bankrupt ~ DD, data=df, family=binomial)
# Print the coefficient table and fit statistics for the model.
summary(fit_DD)
Call:
glm(formula = bankrupt ~ DD, family = binomial, data = df)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.9929 -0.0750 -0.0634 -0.0506 3.6503
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.16394 0.15322 -40.230 < 2e-16 ***
DD -0.24459 0.03781 -6.469 9.89e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 718.67 on 21563 degrees of freedom
Residual deviance: 677.27 on 21562 degrees of freedom
(33618 observations deleted due to missingness)
AIC: 681.27
Number of Fisher Scoring iterations: 9
# Score every row that has both DD and the bankrupt flag with fit_DD, then
# build the ROCR prediction/performance objects from those scores.
dfDD <- filter(df, !is.na(DD), !is.na(bankrupt))
pred_DD <- predict(fit_DD, dfDD, type = "response")
ROCpred_DD <- prediction(as.numeric(pred_DD), as.numeric(dfDD$bankrupt))
ROCperf_DD <- performance(ROCpred_DD, "tpr", "fpr")
df_ROC_DD <- data.frame(
  FalsePositive = ROCperf_DD@x.values[[1]],
  TruePositive = ROCperf_DD@y.values[[1]]
)

# Overlay the DD curve on the Z curve (df_ROC_Z), with the slope-1 reference
# line through the origin.
ggplot() +
  geom_line(data = df_ROC_DD,
            aes(x = FalsePositive, y = TruePositive, color = "DD")) +
  geom_line(data = df_ROC_Z,
            aes(x = FP, y = TP, color = "Z")) +
  geom_abline(slope = 1)

# AUC of both models, side by side.
auc_DD <- performance(ROCpred_DD, measure = "auc")
AUCs <- c(auc_Z@y.values[[1]], auc_DD@y.values[[1]])
names(AUCs) <- c("Z", "DD")
AUCs
Z DD
0.8280943 0.8098304
# Calculate downgrade: 1 when a gvkey's rating is lower than on its previous
# row (rows ordered by date), plus the row-to-row changes in Z and DD.
df <- df %>%
  group_by(gvkey) %>%
  arrange(date) %>%
  mutate(
    downgrade = ifelse(lag(rating) > rating, 1, 0),
    diff_Z = Z - lag(Z),
    diff_DD = DD - lag(DD)
  ) %>%
  ungroup()

# Training sample: complete rows with year after 1985 and before 2014.
train <- filter(df,
                !is.na(downgrade), !is.na(diff_Z), !is.na(diff_DD),
                year > 1985, year < 2014)
# Test sample: complete rows from 2014 onwards.
test <- filter(df,
               !is.na(downgrade), !is.na(diff_Z), !is.na(diff_DD),
               year >= 2014)

# One single-predictor logistic regression per measure, fitted on train.
fit_Z2 <- glm(downgrade ~ diff_Z, data = train, family = binomial)
fit_DD2 <- glm(downgrade ~ diff_DD, data = train, family = binomial)
summary(fit_Z2)
Call:
glm(formula = downgrade ~ diff_Z, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4115 -0.4428 -0.4428 -0.3928 2.7437
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.27310 0.06139 -37.029 <2e-16 ***
diff_Z -0.77150 0.09245 -8.345 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2145.3 on 3277 degrees of freedom
Residual deviance: 2065.8 on 3276 degrees of freedom
AIC: 2069.8
Number of Fisher Scoring iterations: 5
# Print the coefficient table and fit statistics for the diff_DD model.
summary(fit_DD2)
Call:
glm(formula = downgrade ~ diff_DD, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.5726 -0.4565 -0.4558 -0.4095 2.6804
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.21199 0.05926 -37.325 < 2e-16 ***
diff_DD -0.21378 0.03723 -5.742 9.37e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2145.3 on 3277 degrees of freedom
Residual deviance: 2113.2 on 3276 degrees of freedom
AIC: 2117.2
Number of Fisher Scoring iterations: 5
# In-sample (train) ROC curves and AUC for the two downgrade models.

# Z model: predict, turn non-finite predictions into NA, and keep only rows
# where both the outcome and the prediction are present.
pred_Z2 <- predict(fit_Z2, train, type = "response")
pred_Z2 <- ifelse(!is.finite(pred_Z2), NA, pred_Z2)
ok_Z2 <- !is.na(train$downgrade) & !is.na(pred_Z2)
ROCpred_Z2 <- prediction(as.numeric(pred_Z2[ok_Z2]),
                         as.numeric(train$downgrade[ok_Z2]))
ROCperf_Z2 <- performance(ROCpred_Z2, "tpr", "fpr")
df_ROC_Z2 <- data.frame(
  FalsePositive = ROCperf_Z2@x.values[[1]],
  TruePositive = ROCperf_Z2@y.values[[1]]
)
auc_Z2 <- performance(ROCpred_Z2, measure = "auc")

# DD model: same steps, without the non-finite scrub.
pred_DD2 <- predict(fit_DD2, train, type = "response")
ok_DD2 <- !is.na(train$downgrade) & !is.na(pred_DD2)
ROCpred_DD2 <- prediction(as.numeric(pred_DD2[ok_DD2]),
                          as.numeric(train$downgrade[ok_DD2]))
ROCperf_DD2 <- performance(ROCpred_DD2, "tpr", "fpr")
df_ROC_DD2 <- data.frame(
  FalsePositive = ROCperf_DD2@x.values[[1]],
  TruePositive = ROCperf_DD2@y.values[[1]]
)

# Both curves on one chart, with the slope-1 reference line.
ggplot() +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = 'DD')) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = 'Z')) +
  geom_abline(slope = 1)

auc_DD2 <- performance(ROCpred_DD2, measure = "auc")
AUCs <- c(auc_Z2@y.values[[1]], auc_DD2@y.values[[1]])
names(AUCs) <- c("Z", "DD")
AUCs
Z DD
0.6465042 0.5847885
# Out-of-sample (test) ROC curves and AUC for the two downgrade models.
# The *_Z2 / *_DD2 objects from the in-sample run are overwritten here.

# Z model: keep rows where both the outcome and the prediction are present.
pred_Z2 <- predict(fit_Z2, test, type = "response")
ok_Z2 <- !is.na(test$downgrade) & !is.na(pred_Z2)
ROCpred_Z2 <- prediction(as.numeric(pred_Z2[ok_Z2]),
                         as.numeric(test$downgrade[ok_Z2]))
ROCperf_Z2 <- performance(ROCpred_Z2, "tpr", "fpr")
df_ROC_Z2 <- data.frame(
  FalsePositive = ROCperf_Z2@x.values[[1]],
  TruePositive = ROCperf_Z2@y.values[[1]]
)
auc_Z2 <- performance(ROCpred_Z2, measure = "auc")

# DD model: same steps.
pred_DD2 <- predict(fit_DD2, test, type = "response")
ok_DD2 <- !is.na(test$downgrade) & !is.na(pred_DD2)
ROCpred_DD2 <- prediction(as.numeric(pred_DD2[ok_DD2]),
                          as.numeric(test$downgrade[ok_DD2]))
ROCperf_DD2 <- performance(ROCpred_DD2, "tpr", "fpr")
df_ROC_DD2 <- data.frame(
  FalsePositive = ROCperf_DD2@x.values[[1]],
  TruePositive = ROCperf_DD2@y.values[[1]]
)

# Both curves on one chart, with the slope-1 reference line.
ggplot() +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = 'DD')) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = 'Z')) +
  geom_abline(slope = 1)

auc_DD2 <- performance(ROCpred_DD2, measure = "auc")
AUCs <- c(auc_Z2@y.values[[1]], auc_DD2@y.values[[1]])
names(AUCs) <- c("Z", "DD")
AUCs
Z DD
0.8134671 0.7420213
# Logistic regression of downgrade on both diff_Z and diff_DD, fitted on
# the training sample.
fit_comb <- glm(downgrade ~ diff_Z + diff_DD, data=train, family=binomial)
# Print the coefficient table and fit statistics for the combined model.
summary(fit_comb)
Call:
glm(formula = downgrade ~ diff_Z + diff_DD, family = binomial,
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3263 -0.4431 -0.4430 -0.3892 2.7504
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.27217 0.06144 -36.980 < 2e-16 ***
diff_Z -0.71374 0.10709 -6.665 2.65e-11 ***
diff_DD -0.04884 0.04638 -1.053 0.292
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2145.3 on 3277 degrees of freedom
Residual deviance: 2064.7 on 3275 degrees of freedom
AIC: 2070.7
Number of Fisher Scoring iterations: 5
# Summary of the marginal effects of the combined model.
summary(margins::margins(fit_comb))

# Out-of-sample ROC for the combined model: predict on test, turn non-finite
# predictions into NA, and keep rows where outcome and prediction both exist.
pred_comb <- predict(fit_comb, test, type = "response")
pred_comb <- ifelse(!is.finite(pred_comb), NA, pred_comb)
ok_comb <- !is.na(test$downgrade) & !is.na(pred_comb)
ROCpred_comb <- prediction(as.numeric(pred_comb[ok_comb]),
                           as.numeric(test$downgrade[ok_comb]))
ROCperf_comb <- performance(ROCpred_comb, "tpr", "fpr")
df_ROC_comb <- data.frame(
  FalsePositive = ROCperf_comb@x.values[[1]],
  TruePositive = ROCperf_comb@y.values[[1]]
)
auc_comb <- performance(ROCpred_comb, measure = "auc")

# Combined curve against the single-measure curves currently held in
# df_ROC_Z2 / df_ROC_DD2, with the slope-1 reference line.
ggplot() +
  geom_line(data = df_ROC_comb,
            aes(x = FalsePositive, y = TruePositive, color = 'Combined')) +
  geom_line(data = df_ROC_Z2,
            aes(x = FalsePositive, y = TruePositive, color = 'Z')) +
  geom_abline(slope = 1) +
  geom_line(data = df_ROC_DD2,
            aes(x = FalsePositive, y = TruePositive, color = 'DD'))

# AUC of all three models, side by side.
AUCs <- c(auc_comb@y.values[[1]], auc_Z2@y.values[[1]],
          auc_DD2@y.values[[1]])
names(AUCs) <- c("Combined", "Z", "DD")
AUCs
Combined Z DD
0.8151596 0.8134671 0.7420213
