Dr. Richard M. Crowley
Quick survey: rmc.link/420hw1
A specific test is one of an infinite number of replications
Focus on distributions and beliefs
This is why we use more than 1 data point
detector <- function() {
dice <- sample(1:6, size=2, replace=TRUE)
if (sum(dice) == 12) {
"exploded"
} else {
"still there"
}
}
experiment <- replicate(1000,detector())
# p value
paste("p-value: ",
sum(experiment == "still there") / 1000,
"-- Reject H_A that sun exploded")
## [1] "p-value: 0.962 -- Reject H_A that sun exploded"
Frequentist: The sun didn’t explode
P(A|B) = \frac{P(B|A) P(A)}{P(B)}
P(A|B) = \frac{P(B|A) P(A)}{P(B)} = \frac{\frac{35}{36}\times\sim 0}{\frac{1}{36}} = 35\times \sim 0 \approx 0
Bayesian: The sun didn’t explode
We will use both to some extent – for our purposes, we will not debate the merits of either school of thought, but use tools derived from both
There are many ways to measure a model, each with their own merits. They don’t always agree, and it’s on us to pick a reasonable measure.
y = \alpha + \beta_1 x_1 + \beta_2 x_2 + \ldots + \varepsilon
\hat{y} = \alpha + \beta_1 \hat{x}_1 + \beta_2 \hat{x}_2 + \ldots + \hat{\varepsilon}
Models designed under a frequentist approach can only answer the question of “does this matter?”
Linear models are very flexible
Simple OLS measures a simple linear relationship between an input and an output
OLS measures simple linear relationships between a set of inputs and one output
IV/2SLS models linear relationships where the effect of some x_i on y may be confounded by outside factors.
SUR models systems with related error terms
3SLS models systems of equations with related outputs
SEM can model abstract and multi-level relationships
Pick what fits your problem!
Own knowledge
Or: Why do we like simpler models so much?
An overfitted model works really well on its own data, and quite poorly on new data
“The P value is defined as the probability under the assumption of no effect or no difference (null hypothesis), of obtaining a result equal to or more extreme than what was actually observed.”"
– Dahiru 2008
A \rightarrow B
A \rightarrow B or A \leftarrow B?
A_t \rightarrow B_{t+1}
Revenue_{t+1} = Revenue_t + \ldots
A_t \rightarrow B_{t+1}? \quad OR \quad C \rightarrow A_t and C \rightarrow B_{t+1}?
How can we predict quarterly revenue for retail companies, leveraging our knowledge of such companies
library(tidyverse) # As always
library(plotly) # interactive graphs
library(lubridate) # import some sensible date functions
# Generate quarter over quarter growth "revtq_gr"
df <- df %>% group_by(gvkey) %>% mutate(revtq_gr=revtq / lag(revtq) - 1) %>% ungroup()
# Generate year-over-year growth "revtq_yoy"
df <- df %>% group_by(gvkey) %>% mutate(revtq_yoy=revtq / lag(revtq, 4) - 1) %>% ungroup()
# Generate first difference "revtq_d"
df <- df %>% group_by(gvkey) %>% mutate(revtq_d=revtq - lag(revtq)) %>% ungroup()
# Generate a proper date
# Date was YYMMDDs10: YYYY/MM/DD, can be converted from text to date easily
df$date <- as.Date(df$datadate) # Built in to R
format=
argument
conm | date | revtq | revtq_gr | revtq_yoy | revtq_d |
---|---|---|---|---|---|
ALLIED STORES | 1962-04-30 | 156.5 | NA | NA | NA |
ALLIED STORES | 1962-07-31 | 161.9 | 0.0345048 | NA | 5.4 |
ALLIED STORES | 1962-10-31 | 176.9 | 0.0926498 | NA | 15.0 |
ALLIED STORES | 1963-01-31 | 275.5 | 0.5573770 | NA | 98.6 |
ALLIED STORES | 1963-04-30 | 171.1 | -0.3789474 | 0.0932907 | -104.4 |
ALLIED STORES | 1963-07-31 | 182.2 | 0.0648743 | 0.1253860 | 11.1 |
## # A tibble: 6 x 3
## conm date datadate
## <chr> <date> <chr>
## 1 ALLIED STORES 1962-04-30 1962/04/30
## 2 ALLIED STORES 1962-07-31 1962/07/31
## 3 ALLIED STORES 1962-10-31 1962/10/31
## 4 ALLIED STORES 1963-01-31 1963/01/31
## 5 ALLIED STORES 1963-04-30 1963/04/30
## 6 ALLIED STORES 1963-07-31 1963/07/31
# Custom Function to generate a series of lags
multi_lag <- function(df, lags, var, ext="") {
lag_names <- paste0(var,ext,lags)
lag_funs <- setNames(paste("dplyr::lag(.,",lags,")"), lag_names)
df %>% group_by(gvkey) %>% mutate_at(vars(var), funs_(lag_funs)) %>% ungroup()
}
# Generate lags "revtq_l#"
df <- multi_lag(df, 1:8, "revtq", "_l")
# Generate changes "revtq_gr#"
df <- multi_lag(df, 1:8, "revtq_gr")
# Generate year-over-year changes "revtq_yoy#"
df <- multi_lag(df, 1:8, "revtq_yoy")
# Generate first differences "revtq_d#"
df <- multi_lag(df, 1:8, "revtq_d")
# Equivalent brute force code for this is in the appendix
conm | date | revtq | revtq_l1 | revtq_l2 | revtq_l3 | revtq_l4 |
---|---|---|---|---|---|---|
ALLIED STORES | 1962-04-30 | 156.5 | NA | NA | NA | NA |
ALLIED STORES | 1962-07-31 | 161.9 | 156.5 | NA | NA | NA |
ALLIED STORES | 1962-10-31 | 176.9 | 161.9 | 156.5 | NA | NA |
ALLIED STORES | 1963-01-31 | 275.5 | 176.9 | 161.9 | 156.5 | NA |
ALLIED STORES | 1963-04-30 | 171.1 | 275.5 | 176.9 | 161.9 | 156.5 |
ALLIED STORES | 1963-07-31 | 182.2 | 171.1 | 275.5 | 176.9 | 161.9 |
# Clean the data: Replace NaN, Inf, and -Inf with NA
df <- df %>%
mutate_if(is.numeric, funs(replace(., !is.finite(.), NA)))
# Split into training and testing data
# Training data: We'll use data released before 2015
train <- filter(df, year(date) < 2015)
# Testing data: We'll use data released 2015 through 2018
test <- filter(df, year(date) >= 2015)
summary(df[,c("revtq","revtq_gr","revtq_yoy", "revtq_d","fqtr")])
## revtq revtq_gr revtq_yoy
## Min. : 0.00 Min. :-1.0000 Min. :-1.0000
## 1st Qu.: 64.46 1st Qu.:-0.1112 1st Qu.: 0.0077
## Median : 273.95 Median : 0.0505 Median : 0.0740
## Mean : 2439.38 Mean : 0.0650 Mean : 0.1273
## 3rd Qu.: 1254.21 3rd Qu.: 0.2054 3rd Qu.: 0.1534
## Max. :136267.00 Max. :14.3333 Max. :47.6600
## NA's :367 NA's :690 NA's :940
## revtq_d fqtr
## Min. :-24325.21 Min. :1.000
## 1st Qu.: -19.33 1st Qu.:1.000
## Median : 4.30 Median :2.000
## Mean : 22.66 Mean :2.478
## 3rd Qu.: 55.02 3rd Qu.:3.000
## Max. : 15495.00 Max. :4.000
## NA's :663
%>%
), but instead adds everything together (+
)library(ggplot2) # or tidyverse -- it's part of tidyverse
df %>%
ggplot(aes(y=var_for_y_axis, x=var_for_y_axis)) +
geom_point() # scatterplot
aes()
is for aesthetics – how the chart is set upgroup=
to set groups to list in the legend. Not needed if using the below thoughcolor=
to set color by some grouping variable. Put factor()
around the variable if you want discrete groups, otherwise it will do a color scale (light to dark)shape=
to set shapes for points – see here for a listlibrary(ggplot2) # or tidyverse -- it's part of tidyverse
df %>%
ggplot(aes(y=var_for_y_axis, x=var_for_y_axis)) +
geom_point() # scatterplot
geom
stands for geometry – any shapes, lines, etc. start with geom
geom_line()
: makes a line chartgeom_bar()
: makes a bar chart – y is the height, x is the categorygeom_smooth(method="lm")
: Adds a linear regression into the chartgeom_abline(slope=1)
: Adds a 45° linexlab("Label text here")
to change the x-axis labelylab("Label text here")
to change the y-axis labelggtitle("Title text here")
to add a titleRevenue
Year-over-year growth
log(revtq)
?Revenue
Year-over-year growth
Revenue
Year-over-year growth
cor(train[,c("revtq","revtq_l1","revtq_l2","revtq_l3", "revtq_l4")],
use="complete.obs")
## revtq revtq_l1 revtq_l2 revtq_l3 revtq_l4
## revtq 1.0000000 0.9916167 0.9938489 0.9905522 0.9972735
## revtq_l1 0.9916167 1.0000000 0.9914767 0.9936977 0.9898184
## revtq_l2 0.9938489 0.9914767 1.0000000 0.9913489 0.9930152
## revtq_l3 0.9905522 0.9936977 0.9913489 1.0000000 0.9906006
## revtq_l4 0.9972735 0.9898184 0.9930152 0.9906006 1.0000000
cor(train[,c("revtq_gr","revtq_gr1","revtq_gr2","revtq_gr3", "revtq_gr4")],
use="complete.obs")
## revtq_gr revtq_gr1 revtq_gr2 revtq_gr3 revtq_gr4
## revtq_gr 1.00000000 -0.32291329 0.06299605 -0.22769442 0.64570015
## revtq_gr1 -0.32291329 1.00000000 -0.31885020 0.06146805 -0.21923630
## revtq_gr2 0.06299605 -0.31885020 1.00000000 -0.32795121 0.06775742
## revtq_gr3 -0.22769442 0.06146805 -0.32795121 1.00000000 -0.31831023
## revtq_gr4 0.64570015 -0.21923630 0.06775742 -0.31831023 1.00000000
Retail revenue has really high autocorrelation! Concern for multicolinearity. Revenue growth is less autocorrelated and oscillates.
cor(train[,c("revtq_yoy","revtq_yoy1","revtq_yoy2","revtq_yoy3", "revtq_yoy4")],
use="complete.obs")
## revtq_yoy revtq_yoy1 revtq_yoy2 revtq_yoy3 revtq_yoy4
## revtq_yoy 1.0000000 0.6554179 0.4127263 0.4196003 0.1760055
## revtq_yoy1 0.6554179 1.0000000 0.5751128 0.3665961 0.3515105
## revtq_yoy2 0.4127263 0.5751128 1.0000000 0.5875643 0.3683539
## revtq_yoy3 0.4196003 0.3665961 0.5875643 1.0000000 0.5668211
## revtq_yoy4 0.1760055 0.3515105 0.3683539 0.5668211 1.0000000
cor(train[,c("revtq_d","revtq_d1","revtq_d2","revtq_d3", "revtq_d4")],
use="complete.obs")
## revtq_d revtq_d1 revtq_d2 revtq_d3 revtq_d4
## revtq_d 1.0000000 -0.6181516 0.3309349 -0.6046998 0.9119911
## revtq_d1 -0.6181516 1.0000000 -0.6155259 0.3343317 -0.5849841
## revtq_d2 0.3309349 -0.6155259 1.0000000 -0.6191366 0.3165450
## revtq_d3 -0.6046998 0.3343317 -0.6191366 1.0000000 -0.5864285
## revtq_d4 0.9119911 -0.5849841 0.3165450 -0.5864285 1.0000000
Year over year change fixes the multicollinearity issue. First difference oscillates like quarter over quarter growth.
mod1 <- lm(revtq ~ revtq_l1, data=train)
mod2 <- lm(revtq ~ revtq_l1 + revtq_l4, data=train)
mod3 <- lm(revtq ~ revtq_l1 + revtq_l2 + revtq_l3 + revtq_l4 +
revtq_l5 + revtq_l6 + revtq_l7 + revtq_l8, data=train)
mod4 <- lm(revtq ~ (revtq_l1 + revtq_l2 + revtq_l3 + revtq_l4 +
revtq_l5 + revtq_l6 + revtq_l7 + revtq_l8):factor(fqtr),
data=train)
summary(mod1)
##
## Call:
## lm(formula = revtq ~ revtq_l1, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24438.7 -34.0 -11.7 34.6 15200.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.639975 13.514877 1.157 0.247
## revtq_l1 1.003038 0.001556 644.462 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1152 on 7676 degrees of freedom
## (662 observations deleted due to missingness)
## Multiple R-squared: 0.9819, Adjusted R-squared: 0.9819
## F-statistic: 4.153e+05 on 1 and 7676 DF, p-value: < 2.2e-16
summary(mod2)
##
## Call:
## lm(formula = revtq ~ revtq_l1 + revtq_l4, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20245.7 -18.4 -4.4 19.1 9120.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.444986 7.145633 0.762 0.446
## revtq_l1 0.231759 0.005610 41.312 <2e-16 ***
## revtq_l4 0.815570 0.005858 139.227 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 592.1 on 7274 degrees of freedom
## (1063 observations deleted due to missingness)
## Multiple R-squared: 0.9954, Adjusted R-squared: 0.9954
## F-statistic: 7.94e+05 on 2 and 7274 DF, p-value: < 2.2e-16
summary(mod3)
##
## Call:
## lm(formula = revtq ~ revtq_l1 + revtq_l2 + revtq_l3 + revtq_l4 +
## revtq_l5 + revtq_l6 + revtq_l7 + revtq_l8, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5005.6 -12.9 -3.7 9.3 5876.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.02478 4.37003 0.921 0.3571
## revtq_l1 0.77379 0.01229 62.972 < 2e-16 ***
## revtq_l2 0.10497 0.01565 6.707 2.16e-11 ***
## revtq_l3 -0.03091 0.01538 -2.010 0.0445 *
## revtq_l4 0.91982 0.01213 75.800 < 2e-16 ***
## revtq_l5 -0.76459 0.01324 -57.749 < 2e-16 ***
## revtq_l6 -0.08080 0.01634 -4.945 7.80e-07 ***
## revtq_l7 0.01146 0.01594 0.719 0.4721
## revtq_l8 0.07924 0.01209 6.554 6.03e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 346 on 6666 degrees of freedom
## (1665 observations deleted due to missingness)
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9986
## F-statistic: 5.802e+05 on 8 and 6666 DF, p-value: < 2.2e-16
summary(mod4)
##
## Call:
## lm(formula = revtq ~ (revtq_l1 + revtq_l2 + revtq_l3 + revtq_l4 +
## revtq_l5 + revtq_l6 + revtq_l7 + revtq_l8):factor(fqtr),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6066.6 -13.9 0.1 15.1 4941.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.201107 4.004046 -0.050 0.959944
## revtq_l1:factor(fqtr)1 0.488584 0.021734 22.480 < 2e-16 ***
## revtq_l1:factor(fqtr)2 1.130563 0.023017 49.120 < 2e-16 ***
## revtq_l1:factor(fqtr)3 0.774983 0.028727 26.977 < 2e-16 ***
## revtq_l1:factor(fqtr)4 0.977353 0.026888 36.349 < 2e-16 ***
## revtq_l2:factor(fqtr)1 0.258024 0.035136 7.344 2.33e-13 ***
## revtq_l2:factor(fqtr)2 -0.100284 0.024664 -4.066 4.84e-05 ***
## revtq_l2:factor(fqtr)3 0.212954 0.039698 5.364 8.40e-08 ***
## revtq_l2:factor(fqtr)4 0.266761 0.035226 7.573 4.14e-14 ***
## revtq_l3:factor(fqtr)1 0.124187 0.036695 3.384 0.000718 ***
## revtq_l3:factor(fqtr)2 -0.042214 0.035787 -1.180 0.238197
## revtq_l3:factor(fqtr)3 -0.005758 0.024367 -0.236 0.813194
## revtq_l3:factor(fqtr)4 -0.308661 0.038974 -7.920 2.77e-15 ***
## revtq_l4:factor(fqtr)1 0.459768 0.038266 12.015 < 2e-16 ***
## revtq_l4:factor(fqtr)2 0.684943 0.033366 20.528 < 2e-16 ***
## revtq_l4:factor(fqtr)3 0.252169 0.035708 7.062 1.81e-12 ***
## revtq_l4:factor(fqtr)4 0.817136 0.017927 45.582 < 2e-16 ***
## revtq_l5:factor(fqtr)1 -0.435406 0.023278 -18.704 < 2e-16 ***
## revtq_l5:factor(fqtr)2 -0.725000 0.035497 -20.424 < 2e-16 ***
## revtq_l5:factor(fqtr)3 -0.160408 0.036733 -4.367 1.28e-05 ***
## revtq_l5:factor(fqtr)4 -0.473030 0.033349 -14.184 < 2e-16 ***
## revtq_l6:factor(fqtr)1 0.059832 0.034672 1.726 0.084453 .
## revtq_l6:factor(fqtr)2 0.154990 0.025368 6.110 1.05e-09 ***
## revtq_l6:factor(fqtr)3 -0.156840 0.041147 -3.812 0.000139 ***
## revtq_l6:factor(fqtr)4 -0.106082 0.037368 -2.839 0.004541 **
## revtq_l7:factor(fqtr)1 0.060031 0.038599 1.555 0.119936
## revtq_l7:factor(fqtr)2 0.061381 0.034510 1.779 0.075344 .
## revtq_l7:factor(fqtr)3 0.028149 0.025385 1.109 0.267524
## revtq_l7:factor(fqtr)4 -0.277337 0.039380 -7.043 2.08e-12 ***
## revtq_l8:factor(fqtr)1 -0.016637 0.033568 -0.496 0.620177
## revtq_l8:factor(fqtr)2 -0.152379 0.028014 -5.439 5.54e-08 ***
## revtq_l8:factor(fqtr)3 0.052208 0.027334 1.910 0.056179 .
## revtq_l8:factor(fqtr)4 0.103495 0.015777 6.560 5.78e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 299.7 on 6642 degrees of freedom
## (1665 observations deleted due to missingness)
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9989
## F-statistic: 1.935e+05 on 32 and 6642 DF, p-value: < 2.2e-16
rmse <- function(v1, v2) {
sqrt(mean((v1 - v2)^2, na.rm=T))
}
mae <- function(v1, v2) {
mean(abs(v1-v2), na.rm=T)
}
Both are commonly used for evaluating OLS out of sample
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.9818514 | 1151.3535 | 322.73819 | 2947.3619 | 1252.5196 |
1 and 4 periods | 0.9954393 | 591.9500 | 156.20811 | 1400.3841 | 643.9823 |
8 periods | 0.9985643 | 345.8053 | 94.91083 | 677.6218 | 340.8236 |
8 periods w/ quarters | 0.9989231 | 298.9557 | 91.28056 | 645.5415 | 324.9395 |
1 quarter model
8 period model, by quarter
Backing out a revenue prediction, revt_t=(1+growth_t)\times revt_{t-1}
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.0910390 | 1106.3730 | 308.48331 | 3374.728 | 1397.6541 |
1 and 4 periods | 0.4398456 | 530.6444 | 154.15086 | 1447.035 | 679.3536 |
8 periods | 0.6761666 | 456.2551 | 123.34075 | 1254.201 | 584.9709 |
8 periods w/ quarters | 0.7758834 | 378.4082 | 98.45751 | 1015.971 | 436.1522 |
1 quarter model
8 period model, by quarter
Backing out a revenue prediction, revt_t=(1+yoy\_growth_t)\times revt_{t-4}
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.4370372 | 513.3264 | 129.2309 | 1867.4957 | 798.0327 |
1 and 4 periods | 0.5392281 | 487.6441 | 126.6012 | 1677.4003 | 731.2841 |
8 periods | 0.5398870 | 384.2923 | 101.0104 | 822.0065 | 403.5445 |
8 periods w/ quarters | 0.1563169 | 714.4285 | 195.3204 | 1231.8436 | 617.2989 |
1 quarter model
8 period model
Backing out a revenue prediction, revt_t= change_t+ revt_{t-1}
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.3532044 | 896.7969 | 287.77940 | 2252.7605 | 1022.0960 |
1 and 4 periods | 0.8425348 | 454.8651 | 115.52694 | 734.8120 | 377.5281 |
8 periods | 0.9220849 | 333.0054 | 95.95924 | 651.4967 | 320.0567 |
8 periods w/ quarters | 0.9397434 | 292.3102 | 86.95563 | 659.4412 | 319.7305 |
1 quarter model
8 period model, by quarter
Predicting quarter over quarter revenue growth itself
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.0910390 | 0.3509269 | 0.2105219 | 0.2257396 | 0.1750580 |
1 and 4 periods | 0.4398456 | 0.2681899 | 0.1132003 | 0.1597771 | 0.0998087 |
8 periods | 0.6761666 | 0.1761825 | 0.0867347 | 0.1545298 | 0.0845826 |
8 periods w/ quarters | 0.7758834 | 0.1462979 | 0.0765792 | 0.1459460 | 0.0703554 |
1 quarter model
8 period model, by quarter
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.4370372 | 0.3116645 | 0.1114610 | 0.1515638 | 0.0942544 |
1 and 4 periods | 0.5392281 | 0.2451749 | 0.1015699 | 0.1498755 | 0.0896079 |
8 periods | 0.5398870 | 0.1928940 | 0.0764447 | 0.1346238 | 0.0658011 |
8 periods w/ quarters | 0.1563169 | 0.3006075 | 0.1402156 | 0.1841025 | 0.0963205 |
1 quarter model
8 period model
Predicting first difference in revenue itself
adj_r_sq | rmse_in | mae_in | rmse_out | mae_out | |
---|---|---|---|---|---|
1 period | 0.3532044 | 896.7969 | 287.77940 | 2252.7605 | 1022.0960 |
1 and 4 periods | 0.8425348 | 454.8651 | 115.52694 | 734.8120 | 377.5281 |
8 periods | 0.9220849 | 333.0054 | 95.95924 | 651.4967 | 320.0567 |
8 periods w/ quarters | 0.9397434 | 292.3102 | 86.95563 | 659.4412 | 319.7305 |
1 quarter model
8 period model, by quarter
Read the press release: rmc.link/420class4
# Brute force code for variable generation of quarterly data lags
df <- df %>%
group_by(gvkey) %>%
mutate(revtq_lag1=lag(revtq), revtq_lag2=lag(revtq, 2),
revtq_lag3=lag(revtq, 3), revtq_lag4=lag(revtq, 4),
revtq_lag5=lag(revtq, 5), revtq_lag6=lag(revtq, 6),
revtq_lag7=lag(revtq, 7), revtq_lag8=lag(revtq, 8),
revtq_lag9=lag(revtq, 9), revtq_gr=revtq / revtq_lag1 - 1,
revtq_gr1=lag(revtq_gr), revtq_gr2=lag(revtq_gr, 2),
revtq_gr3=lag(revtq_gr, 3), revtq_gr4=lag(revtq_gr, 4),
revtq_gr5=lag(revtq_gr, 5), revtq_gr6=lag(revtq_gr, 6),
revtq_gr7=lag(revtq_gr, 7), revtq_gr8=lag(revtq_gr, 8),
revtq_yoy=revtq / revtq_lag4 - 1, revtq_yoy1=lag(revtq_yoy),
revtq_yoy2=lag(revtq_yoy, 2), revtq_yoy3=lag(revtq_yoy, 3),
revtq_yoy4=lag(revtq_yoy, 4), revtq_yoy5=lag(revtq_yoy, 5),
revtq_yoy6=lag(revtq_yoy, 6), revtq_yoy7=lag(revtq_yoy, 7),
revtq_yoy8=lag(revtq_yoy, 8), revtq_d=revtq - revtq_l1,
revtq_d1=lag(revtq_d), revtq_d2=lag(revtq_d, 2),
revtq_d3=lag(revtq_d, 3), revtq_d4=lag(revtq_d, 4),
revtq_d5=lag(revtq_d, 5), revtq_d6=lag(revtq_d, 6),
revtq_d7=lag(revtq_d, 7), revtq_d8=lag(revtq_d, 8)) %>%
ungroup()
# Custom html table for small data frames
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
# These functions are a bit ugly, but can construct many charts quickly
# eval(parse(text=var)) is just a way to convert the string name to a variable reference
# Density plot for 1st to 99th percentile of data
plt_dist <- function(df,var) {
df %>%
filter(eval(parse(text=var)) < quantile(eval(parse(text=var)),0.99, na.rm=TRUE),
eval(parse(text=var)) > quantile(eval(parse(text=var)),0.01, na.rm=TRUE)) %>%
ggplot(aes(x=eval(parse(text=var)))) +
geom_density() + xlab(var)
}
# Density plot for 1st to 99th percentile of both columns
plt_bar <- function(df,var) {
df %>%
filter(eval(parse(text=var)) < quantile(eval(parse(text=var)),0.99, na.rm=TRUE),
eval(parse(text=var)) > quantile(eval(parse(text=var)),0.01, na.rm=TRUE)) %>%
ggplot(aes(y=eval(parse(text=var)), x=fqtr)) +
geom_bar(stat = "summary", fun.y = "mean") + xlab(var)
}
# Scatter plot with lag for 1st to 99th percentile of data
plt_sct <- function(df,var1, var2) {
df %>%
filter(eval(parse(text=var1)) < quantile(eval(parse(text=var1)),0.99, na.rm=TRUE),
eval(parse(text=var2)) < quantile(eval(parse(text=var2)),0.99, na.rm=TRUE),
eval(parse(text=var1)) > quantile(eval(parse(text=var1)),0.01, na.rm=TRUE),
eval(parse(text=var2)) > quantile(eval(parse(text=var2)),0.01, na.rm=TRUE)) %>%
ggplot(aes(y=eval(parse(text=var1)), x=eval(parse(text=var2)), color=factor(fqtr))) +
geom_point() + geom_smooth(method = "lm") + ylab(var1) + xlab(var2)
}
# Calculating various in and out of sample statistics
models <- list(mod1,mod2,mod3,mod4)
model_names <- c("1 period", "1 and 4 period", "8 periods", "8 periods w/ quarters")
df_test <- data.frame(adj_r_sq=sapply(models, function(x)summary(x)[["adj.r.squared"]]),
rmse_in=sapply(models, function(x)rmse(train$revtq, predict(x,train))),
mae_in=sapply(models, function(x)mae(train$revtq, predict(x,train))),
rmse_out=sapply(models, function(x)rmse(test$revtq, predict(x,test))),
mae_out=sapply(models, function(x)mae(test$revtq, predict(x,test))))
rownames(df_test) <- model_names
html_df(df_test) # Custom function using knitr and kableExtra