Code for Session 2

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}

library(tidyverse)
# The data set is Compustat global, gind==602010 [note: Compustat changed this retroactively]; 
library(tidyverse)
df <- read.csv("../../Data/Session_2-1.csv", stringsAsFactors=FALSE)
df_fullyear <- df
df_full <- filter(df_fullyear, datadate < 20221231)
uol_fullyear <- filter(df, isin == "SG1S83002349")
uol <- filter(uol_fullyear, datadate < 20221231)
uol_answer = filter(df, isin == "SG1S83002349", datadate == 20221231)
#clean_df <- subset(df,fyear==2017 & !is.na(revt) & !is.na(ni) & revt > 1)
# revt: Revenue, at: Assets
summary(uol[,c("revt", "at")])
      revt              at       
 Min.   : 155.1   Min.   : 2366  
 1st Qu.: 347.1   1st Qu.: 3277  
 Median : 804.1   Median : 6138  
 Mean   : 999.2   Mean   : 8440  
 3rd Qu.:1380.7   3rd Qu.:11515  
 Max.   :2606.8   Max.   :21275  
mod1 <- lm(revt ~ at, data = uol)
summary(mod1)

Call:
lm(formula = revt ~ at, data = uol)

Residuals:
    Min      1Q  Median      3Q     Max 
-362.48 -141.73  -33.20   61.29  951.62 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 51.069230  75.749121   0.674    0.506    
at           0.112330   0.007174  15.657 9.41e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 240.8 on 26 degrees of freedom
Multiple R-squared:  0.9041,    Adjusted R-squared:  0.9004 
F-statistic: 245.1 on 1 and 26 DF,  p-value: 9.408e-15
library(tidyverse)
# uolg is defined in the class session code file
uolg %>% ggplot(aes(y=revt, x=at)) + 
         geom_point(aes(shape=point, group=point)) + 
         scale_shape_manual(values=c(NA,18)) + 
         geom_smooth(method="lm", se=FALSE) +
         geom_errorbarh(aes(xmax=xright, xmin = xleft)) + 
         geom_errorbar(aes(ymax=ytop, ymin = ybottom)) + 
         theme(legend.position="none") + xlim(0, 20000) + ylim(0, 2500)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 8 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 32 rows containing missing values (`geom_point()`).
Warning: Removed 8 rows containing missing values (`geom_errorbarh()`).

# tidyverse
uol <- uol %>%
  mutate(revt_growth1 = revt / lag(revt) - 1)

# R way
uol$revt_growth2 = uol$revt / c(NA, uol$revt[-length(uol$revt)]) - 1

# Check that both ways are equivalent
identical(uol$revt_growth1, uol$revt_growth2)
[1] TRUE
# Make the other needed change
uol <- uol %>%
  mutate(at_growth = at / lag(at) - 1) %>%  # Calculate asset growth
  rename(revt_growth = revt_growth1)        # Rename for readability

# Run the OLS model
mod2 <- lm(revt_growth ~ at_growth, data = uol)
summary(mod2)

Call:
lm(formula = revt_growth ~ at_growth, data = uol)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.57261 -0.13261 -0.00151  0.15371  0.42832 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  0.08725    0.05569   1.567   0.1298  
at_growth    0.57277    0.29580   1.936   0.0642 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2496 on 25 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.1304,    Adjusted R-squared:  0.09564 
F-statistic: 3.749 on 1 and 25 DF,  p-value: 0.06421
# lct: short term liabilities, che: cash and equivalents, ebit: EBIT
uol <- uol %>%
  mutate(across(c(act, che, lct),
                   .fns = ~. / lag(.) - 1,
                   .names = paste0('{col}','_growth')))  # Calculate 3 growths
mod3 <- lm(revt_growth ~ act_growth + che_growth + lct_growth, data=uol)
summary(mod3)

Call:
lm(formula = revt_growth ~ act_growth + che_growth + lct_growth, 
    data = uol)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.48371 -0.13474 -0.00062  0.19134  0.33432 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  0.09595    0.04937   1.944   0.0643 .
act_growth   0.26711    0.16472   1.622   0.1185  
che_growth  -0.25907    0.12340  -2.099   0.0470 *
lct_growth   0.18242    0.09680   1.884   0.0722 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2317 on 23 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.311, Adjusted R-squared:  0.2212 
F-statistic: 3.461 on 3 and 23 DF,  p-value: 0.03287
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 1 rows containing missing values (`geom_point()`).

anova(mod2, mod3, test="Chisq")
Analysis of Variance Table

Model 1: revt_growth ~ at_growth
Model 2: revt_growth ~ act_growth + che_growth + lct_growth
  Res.Df    RSS Df Sum of Sq Pr(>Chi)  
1     25 1.5580                        
2     23 1.2344  2   0.32359  0.04906 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Ensure firms have at least $1M (local currency), and have revenue
# df_full contains all real estate companies excluding North America
df_clean <- df_full %>%
  filter(at>1, revt>0)

# We cleaned out 596 observations!
print(c(nrow(df_full), nrow(df_clean)))
[1] 6152 5556
# Another useful cleaning function:
# Replaces NaN, Inf, and -Inf with NA for all numeric variables in the data!
df_clean <- df_clean %>%
  mutate(across(where(is.numeric), ~replace(., !is.finite(.), NA)))
uol <- uol %>% mutate(revt_lead = lead(revt))  # From dplyr
forecast1 <- lm(revt_lead ~ act + che + lct, data=uol)
library(broom)  # Lets us view bigger regression outputs in a tidy fashion
tidy(forecast1)  # Present regression output
# A tibble: 4 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept) 235.       139.        1.69  0.104   
2 act           0.548      0.145     3.77  0.000999
3 che          -0.181      0.322    -0.561 0.580   
4 lct          -0.0700     0.242    -0.289 0.775   
glance(forecast1)  # Present regression statistics
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic       p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>         <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.826         0.803  337.      36.4 0.00000000675     3  -193.  397.  403.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
forecast2 <- 
  lm(revt_lead ~ revt + act + che + lct + dp + ebit , data=uol)
tidy(forecast2)
# A tibble: 7 × 5
  term        estimate std.error statistic   p.value
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)  148.      119.        1.24  0.228    
2 revt           1.63      0.311     5.22  0.0000414
3 act            0.317     0.165     1.92  0.0687   
4 che            0.124     0.322     0.384 0.705    
5 lct           -0.189     0.193    -0.981 0.338    
6 dp            -3.66      3.39     -1.08  0.293    
7 ebit          -3.63      0.995    -3.65  0.00159  
glance(forecast2)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.929         0.907  231.      43.4 1.97e-10     6  -181.  379.  389.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
anova(forecast1, forecast2, test="Chisq")
Analysis of Variance Table

Model 1: revt_lead ~ act + che + lct
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit
  Res.Df     RSS Df Sum of Sq  Pr(>Chi)    
1     23 2616067                           
2     20 1071637  3   1544429 2.439e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Note the group_by -- without it, lead() will pull from the subsequent firm!
# ungroup() tells R that we finished grouping
df_clean <- df_clean %>% 
  group_by(isin) %>% 
  mutate(revt_lead = lead(revt)) %>%
  ungroup()
forecast3 <-
  lm(revt_lead ~ revt + act + che + lct + dp + ebit,
     data=df_clean[df_clean$fic=="SGP",])
tidy(forecast3)
# A tibble: 7 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   0.0134    7.86     0.00170 9.99e- 1
2 revt          0.652     0.0555  11.7     2.00e-27
3 act           0.154     0.0306   5.03    7.48e- 7
4 che           0.234     0.0807   2.90    3.98e- 3
5 lct           0.0768    0.0575   1.34    1.82e- 1
6 dp            1.63      0.748    2.17    3.04e- 2
7 ebit         -0.802     0.206   -3.90    1.15e- 4
glance(forecast3)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.884         0.883  123.      488. 5.63e-176     6 -2427. 4870. 4902.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
forecast4 <-
  lm(revt_lead ~ revt + act + che + lct + dp + ebit , data=df_clean)
tidy(forecast4)
# A tibble: 7 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept) 357.     666.          0.536 5.92e- 1
2 revt          1.03     0.00599   171.    0       
3 act          -0.0307   0.00602    -5.11  3.33e- 7
4 che           0.0274   0.0116      2.35  1.86e- 2
5 lct           0.0701   0.00919     7.63  2.78e-14
6 dp            0.237    0.166       1.42  1.55e- 1
7 ebit          0.0319   0.0490      0.651 5.15e- 1
glance(forecast4)
# A tibble: 1 × 12
  r.squared adj.r.squared  sigma statistic p.value    df  logLik     AIC     BIC
      <dbl>         <dbl>  <dbl>     <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
1     0.948         0.948 46353.    15089.       0     6 -60617. 121249. 121302.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
library(DT)
options(DT.options = list(scrollY="400px"))
library(tidyverse)
# First column, first 10 rows...
read_csv("../../Data/Session_2-Macro.csv") %>%
  .[1:10, 1] %>%
  DT::datatable()
library(DT)
options(DT.options = list(scrollY="400px"))
# Skip the header
# Next 10 rows and columns
read_csv("../../Data/Session_2-Macro.csv", skip=10) %>%
  .[1:10, 1:10] %>%
  DT::datatable()
expectations <- read_csv("../../Data/Session_2-Macro.csv",
                         skip=10, na="na") %>%          # Needed to load file
  filter(row_number() < 21) %>%                         # Drop the footer
  rename(industry=`Data Series`) %>%                    # Rename column
  pivot_longer(!industry, names_to='yearQ',
               values_to='fin_sentiment') %>%           # Cast wide to long
  mutate(year = as.numeric(substr(yearQ, 1, 4))) %>%    # split out year
  mutate(quarter = as.numeric(substr(yearQ, 6, 6))) %>% # split out quarter
  select(-yearQ)                                        # Remove measure

# extract out Q1, finance only
expectations_re <- expectations %>%
  filter(quarter == 1,                               # Keep only the Q1
         industry == "Real Estate")                  # Keep only real estate
expectations %>%
  arrange(industry, year, quarter) %>%  # sort the data
  filter(year == 2022) %>%
  DT::datatable(rownames=FALSE)  # display using DT
# subset out our Singaporean data, since our macro data is Singapore-specific
df_SG <- df_clean %>% filter(fic == "SGP")

# Create year in df_SG (date is given by datadate as YYYYMMDD)
df_SG$year = round(df_SG$datadate / 10000, digits=0)

# Combine datasets
# Notice how it automatically figures out to join by "year"
df_SG_macro <- left_join(df_SG, expectations_re[,c("year","fin_sentiment")])
Joining with `by = join_by(year)`
macro1 <- lm(revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment,
             data=df_SG_macro)
library(broom)
tidy(macro1)
# A tibble: 8 × 5
  term          estimate std.error statistic  p.value
  <chr>            <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     0.119     8.00      0.0149 9.88e- 1
2 revt            0.652     0.0563   11.6    1.01e-26
3 act             0.155     0.0316    4.90   1.41e- 6
4 che             0.231     0.0823    2.81   5.23e- 3
5 lct             0.0755    0.0582    1.30   1.96e- 1
6 dp              1.63      0.761     2.15   3.25e- 2
7 ebit           -0.804     0.208    -3.86   1.35e- 4
8 fin_sentiment   0.0174    0.177     0.0980 9.22e- 1
df_SG_macro %>%
  ggplot(aes(y=revt_lead,
             x=fin_sentiment)) +
  geom_point()
Warning: Removed 27 rows containing missing values (`geom_point()`).

df_SG_macro %>%
  ggplot(aes(y=revt_lead,
             x=fin_sentiment * revt)) +
  geom_point()
Warning: Removed 27 rows containing missing values (`geom_point()`).

macro3 <-
  lm(revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment:revt,
     data=df_SG_macro)
tidy(macro3)
# A tibble: 8 × 5
  term               estimate std.error statistic  p.value
  <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)         1.83     7.91         0.231 8.18e- 1
2 revt                0.655    0.0556      11.8   1.63e-27
3 act                 0.133    0.0316       4.21  3.21e- 5
4 che                 0.267    0.0821       3.25  1.24e- 3
5 lct                 0.0619   0.0577       1.07  2.84e- 1
6 dp                  1.94     0.757        2.57  1.06e- 2
7 ebit               -0.804    0.206       -3.90  1.12e- 4
8 revt:fin_sentiment -0.00175  0.000596    -2.94  3.51e- 3
glance(macro3)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.887         0.885  123.      421. 1.28e-173     7 -2388. 4794. 4830.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Ensure that we use the same data (fin_sentiment is missing in 1994)
baseline <-
  lm(revt_lead ~ revt + act + che + lct + dp + ebit,
     data=df_SG_macro[!is.na(df_SG_macro$fin_sentiment),])
glance(baseline)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.884         0.882  124.      480. 3.97e-173     6 -2392. 4801. 4832.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(macro3)
# A tibble: 1 × 12
  r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
      <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
1     0.887         0.885  123.      421. 1.28e-173     7 -2388. 4794. 4830.
# ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
anova(baseline, macro3, test="Chisq")
Analysis of Variance Table

Model 1: revt_lead ~ revt + act + che + lct + dp + ebit
Model 2: revt_lead ~ revt + act + che + lct + dp + ebit + fin_sentiment:revt
  Res.Df     RSS Df Sum of Sq Pr(>Chi)   
1    377 5799708                         
2    376 5669613  1    130094 0.003311 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
revenue_mean = mean(df_SG_macro$revt, na.rm=T)
r_sd <- round(sd(df_SG_macro$fin_sentiment, na.rm=T),1)
r_min <- min(df_SG_macro$fin_sentiment, na.rm=T)
r_max <- max(df_SG_macro$fin_sentiment, na.rm=T)
rev_base <- macro3$coefficients[["revt"]]
rev <- macro3$coefficients[["revt:fin_sentiment"]]
r_rev = round(100 * r_sd * rev,2)
r_rev_base = round(100 * rev_base, 2)
rev_min <- round((r_min * rev)*100,2)
rev_max <- round((r_max * rev)*100,2)
p_uol <- predict(forecast2, uol[uol$fyear==2021,])
p_base <- predict(baseline,
  df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear==2021,])
p_macro <- predict(macro3,
  df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear==2021,])
p_world <- predict(forecast4,
  df_clean[df_clean$isin=="SG1S83002349" & df_clean$fyear==2021,])
preds <- c(p_uol, p_base, p_macro, p_world)
names(preds) <- c("UOL 2022 UOL", "UOL 2022 Base", "UOL 2022 Macro",
                  "UOL 2022 World")
preds
  UOL 2022 UOL  UOL 2022 Base UOL 2022 Macro UOL 2022 World 
      3608.571       2834.237       2745.834       3136.901 

Attaching package: 'plotly'
The following object is masked from 'package:downlit':

    highlight
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
actual_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$revt_lead
uol_series <- uol[uol$fyear < 2021,]$pred_uol
base_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$pred_base
macro_series <- df_SG_macro[df_SG_macro$isin=="SG1S83002349" & df_SG_macro$fyear < 2021,]$pred_macro
world_series <- df_clean[df_clean$isin=="SG1S83002349" & df_clean$fyear < 2021,]$pred_world
# series vectors calculated here -- See appendix
rmse <- function(v1, v2) {
  sqrt(mean((v1 - v2)^2, na.rm=T))
}

rmse <- c(rmse(actual_series, uol_series), rmse(actual_series, base_series),
          rmse(actual_series, macro_series), rmse(actual_series, world_series))
names(rmse) <- c("UOL 2018 UOL", "UOL 2018 Base", "UOL 2018 Macro", "UOL 2018 World")
rmse
  UOL 2018 UOL  UOL 2018 Base UOL 2018 Macro UOL 2018 World 
      199.2242       274.2474       266.2979       455.7594 
preds
  UOL 2022 UOL  UOL 2022 Base UOL 2022 Macro UOL 2022 World 
      3608.571       2834.237       2745.834       3136.901 
# Exports for the following week
save(df_clean, forecast2, forecast3, uol, forecast4, file = "../../Data/Session_2_export.RData")