library(knitr)
library(kableExtra)
#' Pretty formatted HTML tables
#'
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
if(!length(names)) {
names=colnames(df)
}
kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
{if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
{if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
{if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
{if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
library(tidyverse)
Code for Session 2
Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
# The data set is Compustat global, gind==602010 [note: Compustat changed this retroactively];
library(tidyverse)
df <- read.csv("../../Data/Session_2-1.csv", stringsAsFactors=FALSE)
df_fullyear <- df
df_full <- filter(df_fullyear, datadate < 20221231)
uol_fullyear <- filter(df, isin == "SG1S83002349")
uol <- filter(uol_fullyear, datadate < 20221231)
uol_answer = filter(df, isin == "SG1S83002349", datadate == 20221231)
#clean_df <- subset(df,fyear==2017 & !is.na(revt) & !is.na(ni) & revt > 1)
revt at
Min. : 155.1 Min. : 2366
1st Qu.: 347.1 1st Qu.: 3277
Median : 804.1 Median : 6138
Mean : 999.2 Mean : 8440
3rd Qu.:1380.7 3rd Qu.:11515
Max. :2606.8 Max. :21275
Call:
lm(formula = revt ~ at, data = uol)
Residuals:
Min 1Q Median 3Q Max
-362.48 -141.73 -33.20 61.29 951.62
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 51.069230 75.749121 0.674 0.506
at 0.112330 0.007174 15.657 9.41e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 240.8 on 26 degrees of freedom
Multiple R-squared: 0.9041, Adjusted R-squared: 0.9004
F-statistic: 245.1 on 1 and 26 DF, p-value: 9.408e-15
library(tidyverse)
# uolg is defined in the class session code file
uolg %>% ggplot(aes(y=revt, x=at)) +
geom_point(aes(shape=point, group=point)) +
scale_shape_manual(values=c(NA,18)) +
geom_smooth(method="lm", se=FALSE) +
geom_errorbarh(aes(xmax=xright, xmin = xleft)) +
geom_errorbar(aes(ymax=ytop, ymin = ybottom)) +
theme(legend.position="none") + xlim(0, 20000) + ylim(0, 2500)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 8 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 32 rows containing missing values (`geom_point()`).
Warning: Removed 8 rows containing missing values (`geom_errorbarh()`).
# tidyverse
uol <- uol %>%
mutate(revt_growth1 = revt / lag(revt) - 1)
# R way
uol$revt_growth2 = uol$revt / c(NA, uol$revt[-length(uol$revt)]) - 1
# Check that both ways are equivalent
identical(uol$revt_growth1, uol$revt_growth2)
[1] TRUE
# Make the other needed change
uol <- uol %>%
mutate(at_growth = at / lag(at) - 1) %>% # Calculate asset growth
rename(revt_growth = revt_growth1) # Rename for readability
# Run the OLS model
mod2 <- lm(revt_growth ~ at_growth, data = uol)
summary(mod2)
Call:
lm(formula = revt_growth ~ at_growth, data = uol)
Residuals:
Min 1Q Median 3Q Max
-0.57261 -0.13261 -0.00151 0.15371 0.42832
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.08725 0.05569 1.567 0.1298
at_growth 0.57277 0.29580 1.936 0.0642 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2496 on 25 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.1304, Adjusted R-squared: 0.09564
F-statistic: 3.749 on 1 and 25 DF, p-value: 0.06421
# lct: short term liabilities, che: cash and equivalents, ebit: EBIT
uol <- uol %>%
mutate(across(c(act, che, lct),
.fns = ~. / lag(.) - 1,
.names = paste0('{col}','_growth'))) # Calculate 3 growths
mod3 <- lm(revt_growth ~ act_growth + che_growth + lct_growth, data=uol)
summary(mod3)
Call:
lm(formula = revt_growth ~ act_growth + che_growth + lct_growth,
data = uol)
Residuals:
Min 1Q Median 3Q Max
-0.48371 -0.13474 -0.00062 0.19134 0.33432
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.09595 0.04937 1.944 0.0643 .
act_growth 0.26711 0.16472 1.622 0.1185
che_growth -0.25907 0.12340 -2.099 0.0470 *
lct_growth 0.18242 0.09680 1.884 0.0722 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2317 on 23 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.311, Adjusted R-squared: 0.2212
F-statistic: 3.461 on 3 and 23 DF, p-value: 0.03287
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 1 rows containing missing values (`geom_point()`).
anova(mod2, mod3, test="Chisq")
Analysis of Variance Table
Model 1: revt_growth ~ at_growth
Model 2: revt_growth ~ act_growth + che_growth + lct_growth
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 25 1.5580
2 23 1.2344 2 0.32359 0.04906 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Ensure firms have at least $1M (local currency), and have revenue
# df_full contains all real estate companies excluding North America
df_clean <- df_full %>%
filter(at>1, revt>0)
# We cleaned out 596 observations!
print(c(nrow(df_full), nrow(df_clean)))
[1] 6152 5556
uol <- uol %>% mutate(revt_lead = lead(revt)) # From dplyr
forecast1 <- lm(revt_lead ~ act + che + lct, data=uol)
library(broom) # Lets us view bigger regression outputs in a tidy fashion
tidy(forecast1) # Present regression output
# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 235. 139. 1.69 0.104
2 act 0.548 0.145 3.77 0.000999
3 che -0.181 0.322 -0.561 0.580
4 lct -0.0700 0.242 -0.289 0.775
glance(forecast1) # Present regression statistics
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.826 0.803 337. 36.4 0.00000000675 3 -193. 397. 403.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 148. 119. 1.24 0.228
2 revt 1.63 0.311 5.22 0.0000414
3 act 0.317 0.165 1.92 0.0687
4 che 0.124 0.322 0.384 0.705
5 lct -0.189 0.193 -0.981 0.338
6 dp -3.66 3.39 -1.08 0.293
7 ebit -3.63 0.995 -3.65 0.00159
glance(forecast2)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.929 0.907 231. 43.4 1.97e-10 6 -181. 379. 389.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
anova(forecast1, forecast2, test="Chisq")
Analysis of Variance Table
Model 1: revt_lead ~ act + che + lct
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 23 2616067
2 20 1071637 3 1544429 2.439e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
forecast3 <-
lm(revt_lead ~ revt + act + che + lct + dp + ebit,
data=df_clean[df_clean$fic=="SGP",])
tidy(forecast3)
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.0134 7.86 0.00170 9.99e- 1
2 revt 0.652 0.0555 11.7 2.00e-27
3 act 0.154 0.0306 5.03 7.48e- 7
4 che 0.234 0.0807 2.90 3.98e- 3
5 lct 0.0768 0.0575 1.34 1.82e- 1
6 dp 1.63 0.748 2.17 3.04e- 2
7 ebit -0.802 0.206 -3.90 1.15e- 4
glance(forecast3)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.884 0.883 123. 488. 5.63e-176 6 -2427. 4870. 4902.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 357. 666. 0.536 5.92e- 1
2 revt 1.03 0.00599 171. 0
3 act -0.0307 0.00602 -5.11 3.33e- 7
4 che 0.0274 0.0116 2.35 1.86e- 2
5 lct 0.0701 0.00919 7.63 2.78e-14
6 dp 0.237 0.166 1.42 1.55e- 1
7 ebit 0.0319 0.0490 0.651 5.15e- 1
glance(forecast4)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.948 0.948 46353. 15089. 0 6 -60617. 121249. 121302.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
library(tidyverse)
# First column, first 10 rows...
read_csv("../../Data/Session_2-Macro.csv") %>%
.[1:10, 1] %>%
DT::datatable()
# Skip the header
# Next 10 rows and columns
read_csv("../../Data/Session_2-Macro.csv", skip=10) %>%
.[1:10, 1:10] %>%
DT::datatable()
expectations <- read_csv("../../Data/Session_2-Macro.csv",
skip=10, na="na") %>% # Needed to load file
filter(row_number() < 21) %>% # Drop the footer
rename(industry=`Data Series`) %>% # Rename column
pivot_longer(!industry, names_to='yearQ',
values_to='fin_sentiment') %>% # Cast wide to long
mutate(year = as.numeric(substr(yearQ, 1, 4))) %>% # split out year
mutate(quarter = as.numeric(substr(yearQ, 6, 6))) %>% # split out quarter
select(-yearQ) # Remove measure
# extract out Q1, finance only
expectations_re <- expectations %>%
filter(quarter == 1, # Keep only the Q1
industry == "Real Estate") # Keep only real estate
expectations %>%
arrange(industry, year, quarter) %>% # sort the data
filter(year == 2022) %>%
DT::datatable(rownames=FALSE) # display using DT
# subset out our Singaporean data, since our macro data is Singapore-specific
df_SG <- df_clean %>% filter(fic == "SGP")
# Create year in df_SG (date is given by datadate as YYYYMMDD)
df_SG$year = round(df_SG$datadate / 10000, digits=0)
# Combine datasets
# Notice how it automatically figures out to join by "year"
df_SG_macro <- left_join(df_SG, expectations_re[,c("year","fin_sentiment")])
Joining with `by = join_by(year)`
macro1 <- lm(revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment,
data=df_SG_macro)
library(broom)
tidy(macro1)
# A tibble: 8 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.119 8.00 0.0149 9.88e- 1
2 revt 0.652 0.0563 11.6 1.01e-26
3 act 0.155 0.0316 4.90 1.41e- 6
4 che 0.231 0.0823 2.81 5.23e- 3
5 lct 0.0755 0.0582 1.30 1.96e- 1
6 dp 1.63 0.761 2.15 3.25e- 2
7 ebit -0.804 0.208 -3.86 1.35e- 4
8 fin_sentiment 0.0174 0.177 0.0980 9.22e- 1
df_SG_macro %>%
ggplot(aes(y=revt_lead,
x=fin_sentiment)) +
geom_point()
Warning: Removed 27 rows containing missing values (`geom_point()`).
df_SG_macro %>%
ggplot(aes(y=revt_lead,
x=fin_sentiment * revt)) +
geom_point()
Warning: Removed 27 rows containing missing values (`geom_point()`).
macro3 <-
lm(revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment:revt,
data=df_SG_macro)
tidy(macro3)
# A tibble: 8 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 1.83 7.91 0.231 8.18e- 1
2 revt 0.655 0.0556 11.8 1.63e-27
3 act 0.133 0.0316 4.21 3.21e- 5
4 che 0.267 0.0821 3.25 1.24e- 3
5 lct 0.0619 0.0577 1.07 2.84e- 1
6 dp 1.94 0.757 2.57 1.06e- 2
7 ebit -0.804 0.206 -3.90 1.12e- 4
8 revt:fin_sentiment -0.00175 0.000596 -2.94 3.51e- 3
glance(macro3)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.887 0.885 123. 421. 1.28e-173 7 -2388. 4794. 4830.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Ensure that we use the same data (fin_sentiment is missing in 1994)
baseline <-
lm(revt_lead ~ revt + act + che + lct + dp + ebit,
data=df_SG_macro[!is.na(df_SG_macro$fin_sentiment),])
glance(baseline)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.884 0.882 124. 480. 3.97e-173 6 -2392. 4801. 4832.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(macro3)
# A tibble: 1 × 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.887 0.885 123. 421. 1.28e-173 7 -2388. 4794. 4830.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
anova(baseline, macro3, test="Chisq")
Analysis of Variance Table
Model 1: revt_lead ~ revt + act + che + lct + dp + ebit
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment:revt
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 377 5799708
2 376 5669613 1 130094 0.003311 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
revenue_mean = mean(df_SG_macro$revt, na.rm=T)
r_sd <- round(sd(df_SG_macro$fin_sentiment, na.rm=T),1)
r_min <- min(df_SG_macro$fin_sentiment, na.rm=T)
r_max <- max(df_SG_macro$fin_sentiment, na.rm=T)
rev_base <- macro3$coefficients[["revt"]]
rev <- macro3$coefficients[["revt:fin_sentiment"]]
r_rev = round(100 * r_sd * rev,2)
r_rev_base = round(100 * rev_base, 2)
rev_min <- round((r_min * rev)*100,2)
rev_max <- round((r_max * rev)*100,2)
p_uol <- predict(forecast2, uol[uol$fyear==2021,])
p_base <- predict(baseline,
df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear==2021,])
p_macro <- predict(macro3,
df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear==2021,])
p_world <- predict(forecast4,
df_clean[df_clean$isin=="SG1S83002349" & df_clean$fyear==2021,])
preds <- c(p_uol, p_base, p_macro, p_world)
names(preds) <- c("UOL 2022 UOL", "UOL 2022 Base", "UOL 2022 Macro",
"UOL 2022 World")
preds
UOL 2022 UOL UOL 2022 Base UOL 2022 Macro UOL 2022 World
3608.571 2834.237 2745.834 3136.901
Attaching package: 'plotly'
The following object is masked from 'package:downlit':
highlight
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
actual_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$revt_lead
uol_series <- uol[uol$fyear < 2021,]$pred_uol
base_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$pred_base
macro_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$pred_macro
world_series <- df_clean[df_clean$isin=="SG1S83002349" & df_clean$fyear < 2021,]$pred_world
# series vectors calculated here -- See appendix
rmse <- function(v1, v2) {
sqrt(mean((v1 - v2)^2, na.rm=T))
}
rmse <- c(rmse(actual_series, uol_series), rmse(actual_series, base_series),
rmse(actual_series, macro_series), rmse(actual_series, world_series))
names(rmse) <- c("UOL 2018 UOL", "UOL 2018 Base", "UOL 2018 Macro", "UOL 2018 World")
rmse
UOL 2018 UOL UOL 2018 Base UOL 2018 Macro UOL 2018 World
199.2242 274.2474 266.2979 455.7594
preds
UOL 2022 UOL UOL 2022 Base UOL 2022 Macro UOL 2022 World
3608.571 2834.237 2745.834 3136.901
# Exports for the following week
save(df_clean, forecast2, forecast3, uol, forecast4, file = "../../Data/Session_2_export.RData")