Note: the directories used to store data are likely different on your computer, so the file paths in the code below will need to be changed before you run it.
library(tidyverse)

# Read the session data set (the path is machine-specific).
df <- read.csv("../../Data/Session_2-1.csv", stringsAsFactors = FALSE)
# Keep a copy of the data exactly as loaded.
df_full <- df
# Single-firm subset, selected by its ISIN.
uol <- df %>%
  filter(isin == "SG1S83002349")
#clean_df <- subset(df,fyear==2017 & !is.na(revt) & !is.na(ni) & revt > 1)
# revt: Revenue, at: Assets
summary(uol[, c("revt", "at")])
revt at
Min. : 94.78 Min. : 1218
1st Qu.: 193.41 1st Qu.: 3044
Median : 427.44 Median : 3478
Mean : 666.38 Mean : 5534
3rd Qu.:1058.61 3rd Qu.: 7939
Max. :2103.15 Max. :19623
# Simple OLS: revenue regressed on assets for the single-firm subset.
mod1 <- lm(formula = revt ~ at, data = uol)
summary(mod1)
Call:
lm(formula = revt ~ at, data = uol)
Residuals:
Min 1Q Median 3Q Max
-295.01 -101.29 -41.09 47.17 926.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -13.831399 67.491305 -0.205 0.839
at 0.122914 0.009678 12.701 6.7e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 221.2 on 27 degrees of freedom
Multiple R-squared: 0.8566, Adjusted R-squared: 0.8513
F-statistic: 161.3 on 1 and 27 DF, p-value: 6.699e-13
# Graph showing squared error
# Every observation gets a square with side |residual|: one corner is the
# observed (at, revt) point, the opposite corner is (at - resid, revt - resid).
uolg <- uol %>%
  select(at, revt) %>%
  mutate(
    resid   = mod1$residuals,
    xleft   = pmin(at, at - resid),
    xright  = pmax(at, at - resid),
    ytop    = pmax(revt, revt - resid),
    ybottom = pmin(revt, revt - resid),
    point   = TRUE
  )
# Second copy of the rows, moved to the opposite corner of each square, so
# that the error bars drawn from both copies close the square.
uolg2 <- uolg %>%
  mutate(
    point = FALSE,
    at    = at - resid,
    revt  = revt - resid
  )
uolg <- rbind(uolg, uolg2)
# NOTE(review): group = point is inherited by geom_smooth(), so it looks like
# a second line is fit through the hidden corner rows -- confirm intended.
ggplot(uolg, aes(x = at, y = revt, group = point)) +
  geom_point(aes(shape = point)) +
  # FALSE rows get no marker (NA); TRUE rows use shape 18
  scale_shape_manual(values = c(NA, 18)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_errorbarh(aes(xmin = xleft, xmax = xright)) +
  geom_errorbar(aes(ymin = ybottom, ymax = ytop)) +
  theme(legend.position = "none")
# tidyverse
uol <- mutate(uol, revt_growth1 = revt / lag(revt) - 1)
# R way: shift revenue down one row by hand (NA first, last value dropped)
uol$revt_growth2 <- uol$revt / c(NA, head(uol$revt, -1)) - 1
# Check that both approaches produce exactly the same vector
identical(uol$revt_growth1, uol$revt_growth2)
[1] TRUE
# Make the other needed change
uol <- uol %>%
  rename(revt_growth = revt_growth1) %>%  # Rename for readability
  mutate(at_growth = at / lag(at) - 1)    # Calculate asset growth
# Run the OLS model
mod2 <- lm(formula = revt_growth ~ at_growth, data = uol)
summary(mod2)
Call:
lm(formula = revt_growth ~ at_growth, data = uol)
Residuals:
Min 1Q Median 3Q Max
-0.57736 -0.10534 -0.00953 0.15132 0.42284
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.09024 0.05620 1.606 0.1204
at_growth 0.53821 0.27717 1.942 0.0631 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2444 on 26 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.1267, Adjusted R-squared: 0.09307
F-statistic: 3.771 on 1 and 26 DF, p-value: 0.06307
# lct: short term liabilities, che: cash and equivalents, ebit: EBIT
# Calculate 3 growths; the list name "growth" is what yields the
# lct_growth / che_growth / ebit_growth columns used in the model below
uol <- mutate_at(
  uol,
  vars(lct, che, ebit),
  list(growth = ~ . / lag(.) - 1)
)
mod3 <- lm(
  formula = revt_growth ~ lct_growth + che_growth + ebit_growth,
  data = uol
)
summary(mod3)
Call:
lm(formula = revt_growth ~ lct_growth + che_growth + ebit_growth,
data = uol)
Residuals:
Min 1Q Median 3Q Max
-0.46531 -0.15097 0.00205 0.17601 0.31997
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.07498 0.04915 1.526 0.14018
lct_growth 0.23482 0.07319 3.209 0.00376 **
che_growth -0.11561 0.09227 -1.253 0.22230
ebit_growth 0.03808 0.02208 1.724 0.09751 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2228 on 24 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.33, Adjusted R-squared: 0.2462
F-statistic: 3.94 on 3 and 24 DF, p-value: 0.02033
# Simulated detector: rolls two six-sided dice and reports "exploded" only
# when both show a six (sum of 12); otherwise it reports "still there".
detector <- function() {
  rolls <- sample(1:6, size = 2, replace = TRUE)
  double_six <- sum(rolls) == 12
  if (double_six) "exploded" else "still there"
}
# Run the detector 1000 times
experiment <- replicate(1000, detector())
# p value
# NOTE(review): p is the share of runs that reported "still there"; confirm
# this is the quantity meant to be compared against 0.05.
p <- sum(experiment == "still there") / length(experiment)
paste(
  "p-value: ", p,
  if (p < 0.05) {
    "-- Fail to reject H_A, sun appears to have exploded"
  } else {
    "-- Reject H_A that sun exploded"
  }
)
[1] "p-value: 0.963 -- Reject H_A that sun exploded"
library(tidyverse)
# Scatter plot of revtq against atq with a fitted linear trend line
ggplot(
  read_csv("../../Data/Session_3-1.csv"),
  aes(x = atq, y = revtq)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(x = "Assets", y = "Revenue")
Parsed with column specification:
cols(
.default = col_double(),
gvkey = col_character(),
datadate = col_date(format = ""),
indfmt = col_character(),
consol = col_character(),
popsrc = col_character(),
datafmt = col_character(),
tic = col_character(),
conm = col_character(),
curcdq = col_character(),
datacqtr = col_character(),
datafqtr = col_character(),
costat = col_character()
)
See spec(...) for full column specifications.
#' Plot the standard normal density with both tails shaded
#'
#' Draws the N(0, 1) density over [-outer, outer] and fills, in red, the two
#' tail regions lying below -bound and above +bound.
#'
#' @param bound Cutoff on the z axis; the regions with |z| > bound are shaded.
#' @param outer Half-width of the plotted z range.
#' @return Invisibly, the value of the final `axis()` call.
plot_norm <- function(bound, outer) {
  # Spell out length.out rather than relying on partial matching of `length`
  x <- seq(-outer, outer, length.out = 100)
  hx <- dnorm(x)
  plot(x, hx, type = "n", xlab = "z values", ylab = "Normal PDF",
       main = "Normal Distribution", axes = FALSE)
  lines(x, hx)
  # When the cutoff is at or beyond the plotted range there is no visible
  # tail; skipping avoids max() on an empty vector (warning and -Inf height).
  if (bound < outer) {
    # Exact density at the cutoff, so the shaded edge meets the curve instead
    # of stopping at the height of the nearest grid point.
    edge <- dnorm(bound)
    left <- x < -bound
    polygon(c(-outer, x[left], -bound, -bound),
            c(0, hx[left], edge, 0), col = "red")
    right <- x > bound
    polygon(c(bound, bound, x[right], outer),
            c(0, edge, hx[right], 0), col = "red")
  }
  axis(1, at = -outer:outer, pos = 0)
}
# Shade the regions with |z| > 1.96 on a plot spanning [-4, 4]
plot_norm(bound = 1.96, outer = 4)
# Compare the fit of mod2 against mod3
# NOTE(review): mod2's regressor (at_growth) does not appear in mod3, so the
# two models are not nested -- confirm this comparison is meant as illustrative.
anova(mod2, mod3, test = "Chisq")
Analysis of Variance Table
Model 1: revt_growth ~ at_growth
Model 2: revt_growth ~ lct_growth + che_growth + ebit_growth
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 26 1.5534
2 24 1.1918 2 0.36168 0.0262 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Ensure firms have at least $1M (local currency), and have revenue
# df contains all real estate companies excluding North America
# Columns are referenced by bare name: filter() evaluates its conditions
# inside df, whereas df$at / df$revt would reach back to the outer object.
df_clean <- filter(df, at > 1, revt > 0)
# We cleaned out 578 observations!
print(c(nrow(df), nrow(df_clean)))
[1] 5161 4583
# Another useful cleaning function:
# Replaces NaN, Inf, and -Inf with NA for all numeric variables in the data!
df_clean <- mutate_if(
  df_clean,
  is.numeric,
  ~ replace(., !is.finite(.), NA)
)
# revt_lead holds the revenue from the following row of uol
uol <- mutate(uol, revt_lead = lead(revt)) # lead() is from dplyr
forecast1 <- lm(formula = revt_lead ~ lct + che + ebit, data = uol)
library(broom) # Lets us view bigger regression outputs in a tidy fashion
tidy(forecast1)   # Present regression output
glance(forecast1) # Present regression statistics
# Larger specification: adds revt, act and dp to the forecast1 regressors
forecast2 <- lm(
  formula = revt_lead ~ revt + act + che + lct + dp + ebit,
  data = uol
)
tidy(forecast2)
glance(forecast2)
# Test whether the extra regressors improve on forecast1
anova(forecast1, forecast2, test = "Chisq")
Analysis of Variance Table
Model 1: revt_lead ~ lct + che + ebit
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 24 3059182
2 21 863005 3 2196177 1.477e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Note the group_by -- without it, lead() will pull from the subsequent firm!
# ungroup() tells R that we finished grouping
# NOTE(review): lead() takes the next row within each isin; this presumably
# relies on the rows already being in chronological order -- confirm.
df_clean <- df_clean %>%
  group_by(isin) %>%
  mutate(revt_lead = lead(revt, n = 1)) %>%
  ungroup()
# forecast3: same specification, estimated on the rows where fic == "SGP"
forecast3 <- lm(
  formula = revt_lead ~ revt + act + che + lct + dp + ebit,
  data = df_clean[df_clean$fic == "SGP", ]
)
tidy(forecast3)
glance(forecast3)
# forecast4: same specification, estimated on all of df_clean
forecast4 <- lm(
  formula = revt_lead ~ revt + act + che + lct + dp + ebit,
  data = df_clean
)
tidy(forecast4)
glance(forecast4)
# Same SGP specification plus a dummy for every firm (factor(isin))
forecast3.1 <- lm(
  formula = revt_lead ~ revt + act + che + lct + dp + ebit + factor(isin),
  data = df_clean[df_clean$fic == "SGP", ]
)
# n=15 to prevent outputting every fixed effect
print(tidy(forecast3.1), n = 15)
glance(forecast3.1)
# Test whether the firm dummies improve on forecast3
anova(forecast3, forecast3.1, test = "Chisq")
Analysis of Variance Table
Model 1: revt_lead ~ revt + act + che + lct + dp + ebit
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit + factor(isin)
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 324 14331633
2 304 13215145 20 1116488 0.1765
library(lfe)
# Same SGP specification via felm(); the term after `|` is passed as the
# fixed-effect factor, so no per-firm coefficients appear in the summary.
forecast3.2 <- felm(
  formula = revt_lead ~ revt + act + che + lct + dp + ebit | factor(isin),
  data = df_clean[df_clean$fic == "SGP", ]
)
summary(forecast3.2)
Call:
felm(formula = revt_lead ~ revt + act + che + lct + dp + ebit | factor(isin), data = df_clean[df_clean$fic == "SGP", ])
Residuals:
Min 1Q Median 3Q Max
-1181.88 -23.25 -1.87 18.03 1968.86
Coefficients:
Estimate Std. Error t value Pr(>|t|)
revt 0.39200 0.09767 4.013 7.54e-05 ***
act -0.05382 0.06017 -0.894 0.37181
che 0.30370 0.17682 1.718 0.08690 .
lct 0.39209 0.09210 4.257 2.76e-05 ***
dp 4.71275 1.73168 2.721 0.00687 **
ebit -0.85080 0.32704 -2.602 0.00974 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 208.5 on 304 degrees of freedom
(29 observations deleted due to missingness)
Multiple R-squared(full model): 0.8558 Adjusted R-squared: 0.8435
Multiple R-squared(proj model): 0.7806 Adjusted R-squared: 0.7618
F-statistic(full model):69.41 on 26 and 304 DF, p-value: < 2.2e-16
F-statistic(proj model): 180.3 on 6 and 304 DF, p-value: < 2.2e-16
# Distribution of each SGP firm's mean revt_lead: density-scaled histogram
# with a kernel density overlay.
df_clean %>%
  filter(fic == "SGP") %>%
  group_by(isin) %>%
  # TRUE spelled out: T is an ordinary variable that can be reassigned
  mutate(mean_revt_lead = mean(revt_lead, na.rm = TRUE)) %>%
  slice(1) %>%  # keep one row per firm
  ungroup() %>%
  ggplot(aes(x = mean_revt_lead)) +
  # NOTE(review): newer ggplot2 releases prefer after_stat(density); left
  # as-is because the installed ggplot2 version is not visible here.
  geom_histogram(aes(y = ..density..)) +
  geom_density(alpha = .4, fill = "#FF6666")
# Exports for the following week
save(
  list = c("df_clean", "forecast2", "uol", "forecast4"),
  file = "../../Data/Session_2_export.RData"
)