Code for Session 1 Sup

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}

library(tidyverse)
company <- c("Google", "Microsoft", "Goldman")
company
[1] "Google"    "Microsoft" "Goldman"  
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm
[1]  TRUE  TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings
[1] 12662 21204  4286
1:5
[1] 1 2 3 4 5
seq(from=0, to=100, by=5)
 [1]   0   5  10  15  20  25  30  35  40  45  50  55  60  65  70  75  80  85  90
[20]  95 100
rep(1,times=10)
 [1] 1 1 1 1 1 1 1 1 1 1
rep("hi",times=5)
[1] "hi" "hi" "hi" "hi" "hi"
earnings  # previously defined
[1] 12662 21204  4286
earnings + earnings  # Add element-wise
[1] 25324 42408  8572
earnings * earnings  # multiply element-wise
[1] 160326244 449609616  18369796
earnings + 10000  # Adding a scalar to a vector
[1] 22662 31204 14286
10000 + earnings  # Order doesn't matter
[1] 22662 31204 14286
earnings / 1000  # Dividing a vector by a scalar
[1] 12.662 21.204  4.286
# Dot product: sum of product of elements
earnings %*% earnings  # returns a matrix though...
          [,1]
[1,] 628305656
drop(earnings %*% earnings)  # Drop drops excess dimensions
[1] 628305656
length(earnings)  # returns the number of elements
[1] 3
sum(earnings)  # returns the sum of all elements
[1] 38152
earnings
[1] 12662 21204  4286
names(earnings) <- c("Google",
                     "Microsoft",
                     "Goldman")
earnings
   Google Microsoft   Goldman 
    12662     21204      4286 
# Equivalently:
names(earnings) <- company
earnings
   Google Microsoft   Goldman 
    12662     21204      4286 
earnings[1]
Google 
 12662 
earnings["Google"]
Google 
 12662 
# Each of the above 3 is equivalent
earnings[1:2]
   Google Microsoft 
    12662     21204 
c1 <- c(1,2,3)
c2 <- c(4,5,6)
c3 <- c(c1,c2)
c3
[1] 1 2 3 4 5 6
library(tidyverse)
df <- read.csv("../../Data/Session_1-2.csv")
# Filter to firms with at least $1M USD revenue, known net income, and fiscal year of 2017
clean_df <- df %>% filter(fyear==2017, !is.na(revt), !is.na(ni), revt > 1)
tech_df <- clean_df %>%
  filter(gsector==45) %>%
  mutate(revenue = revt,
         earnings = ni,
         margin = ni/revt)
earnings_2017 <- tech_df$ni
revenue_2017 <- tech_df$revt
names_2017 <- tech_df$conm
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017
# Calculating proit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017)  # Cleaned data from Compustat, in $M USD
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4307.49   -15.98     1.84   296.84    91.36 48351.00 
summary(revenue_2017)  # Cleaned data from Compustat, in $M USD
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     1.06    102.62    397.57   3023.78   1531.59 229234.00 
profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-13.97960  -0.10253   0.01353  -0.10967   0.09295   1.02655 
# These are the worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
profit_margin[order(profit_margin)][c(1,length(profit_margin)/2,length(profit_margin))]
HELIOS AND MATHESON ANALYTIC                   NLIGHT INC 
                -13.97960161                   0.01325588 
           CCUR HOLDINGS INC 
                  1.02654899 
columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")

# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data
      [,1]   [,2]  [,3]
[1,] 12662   4286 89950
[2,] 21204 110855 42254
firm_data + firm_data
      [,1]   [,2]   [,3]
[1,] 25324   8572 179900
[2,] 42408 221710  84508
firm_data / 1000
       [,1]    [,2]   [,3]
[1,] 12.662   4.286 89.950
[2,] 21.204 110.855 42.254
firm_data_T <- t(firm_data)
firm_data_T
      [,1]   [,2]
[1,] 12662  21204
[2,]  4286 110855
[3,] 89950  42254
firm_data %*% firm_data_T
           [,1]        [,2]
[1,] 8269698540  4544356878
[2,] 4544356878 14523841157
rownames(firm_data) <- rows
colnames(firm_data) <- columns
firm_data
         Google Microsoft Goldman
Earnings  12662      4286   89950
Revenue   21204    110855   42254
firm_data[2,3]
[1] 42254
firm_data[,c("Google","Microsoft")]
         Google Microsoft
Earnings  12662      4286
Revenue   21204    110855
firm_data[1,]
   Google Microsoft   Goldman 
    12662      4286     89950 
indcode <- c(45,45,40)
jpdata <- c(17370, 115475)
# Preloaded: industry codes as indcode (vector)
# Preloaded: industry codes as indcode (vector)
#     - GICS codes: 40=Financials, 45=Information Technology
#     - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)

mat <- rbind(firm_data,indcode)  # Add a row
rownames(mat)[3] <- "Industry"  # Name the new row
mat
         Google Microsoft Goldman
Earnings  12662      4286   89950
Revenue   21204    110855   42254
Industry     45        45      40
mat <- cbind(firm_data,jpdata)  # Add a column
colnames(mat)[4] <- "JPMorgan"  # Name the new column
mat
         Google Microsoft Goldman JPMorgan
Earnings  12662      4286   89950    17370
Revenue   21204    110855   42254   115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model

Call:
lm(formula = earnings ~ revenue, data = tech_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-16045.0     20.0    141.6    177.1  12104.6 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.837e+02  4.491e+01  -4.091 4.79e-05 ***
revenue      1.589e-01  3.564e-03  44.585  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared:  0.736, Adjusted R-squared:  0.7356 
F-statistic:  1988 on 1 and 713 DF,  p-value: < 2.2e-16
model["r.squared"]
$r.squared
[1] 0.7360059
model[["r.squared"]]
[1] 0.7360059
model$r.squared
[1] 0.7360059
earnings["Google"]
Google 
 12662 
earnings[["Google"]]
[1] 12662
#Can't use $ with vectors
str(model)
List of 11
 $ call         : language lm(formula = earnings ~ revenue, data = tech_df)
 $ terms        :Classes 'terms', 'formula'  language earnings ~ revenue
  .. ..- attr(*, "variables")= language list(earnings, revenue)
  .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "earnings" "revenue"
  .. .. .. ..$ : chr "revenue"
  .. ..- attr(*, "term.labels")= chr "revenue"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(earnings, revenue)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
 $ residuals    : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
  ..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
 $ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
 $ aliased      : Named logi [1:2] FALSE FALSE
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
 $ sigma        : num 1166
 $ df           : int [1:3] 2 713 2
 $ r.squared    : num 0.736
 $ adj.r.squared: num 0.736
 $ fstatistic   : Named num [1:3] 1988 1 713
  ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
 $ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
 - attr(*, "class")= chr "summary.lm"
library(DT) # This library is great for including larger collections of data in output
options(DT.options = list(scrollY="450px"))
# The DT library is great for including larger collections of data in output
DT::datatable(tech_df[1:20,c("conm","tic","margin")], rownames=FALSE)
df <- data.frame(companyName=company,
                 earnings=earnings,
                 tech_firm=tech_firm)
df
          companyName earnings tech_firm
Google         Google    12662      TRUE
Microsoft   Microsoft    21204      TRUE
Goldman       Goldman     4286     FALSE
df[,1]
[1] "Google"    "Microsoft" "Goldman"  
df$companyName
[1] "Google"    "Microsoft" "Goldman"  
df[[1]]
[1] "Google"    "Microsoft" "Goldman"  
df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)
companyName earnings tech_firm all_zero revenue margin
Google Google 12662 TRUE 0 110855 0.1142213
Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
Goldman Goldman 4286 FALSE 0 42254 0.1014342
sort(df$earnings)
[1]  4286 12662 21204
ordering <- order(df$earnings)
ordering
[1] 3 1 2
df <- df[ordering,]
df
          companyName earnings tech_firm all_zero revenue    margin
Goldman       Goldman     4286     FALSE        0   42254 0.1014342
Google         Google    12662      TRUE        0  110855 0.1142213
Microsoft   Microsoft    21204      TRUE        0   89950 0.2357310
# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
                      year=c(2017,2017,2016,2016))
example
       firm year
1    Google 2017
2 Microsoft 2017
3    Google 2016
4 Microsoft 2016
# with() allows us to avoiding prepending each column with "example$"
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example
       firm year
3    Google 2016
1    Google 2017
4 Microsoft 2016
2 Microsoft 2017
df[df$tech_firm,]  # Remember the comma!
          companyName earnings tech_firm all_zero revenue    margin
Google         Google    12662      TRUE        0  110855 0.1142213
Microsoft   Microsoft    21204      TRUE        0   89950 0.2357310
subset(df,earnings < 20000)
        companyName earnings tech_firm all_zero revenue    margin
Goldman     Goldman     4286     FALSE        0   42254 0.1014342
Google       Google    12662      TRUE        0  110855 0.1142213
df$earnings
[1]  4286 12662 21204
df$earnings < 20000
[1]  TRUE  TRUE FALSE
sum(tech_df$revenue > 10000)
[1] 46
sum(tech_df$revenue > 10000 & tech_df$earnings < 0)
[1] 4
columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns]
                            conm   tic  earnings  revenue
35                   CORNING INC   GLW  -497.000 10116.00
45  TELEFONAKTIEBOLAGET LM ERICS  ERIC -4307.493 24629.64
120        DELL TECHNOLOGIES INC 7732B -3728.000 78660.00
214                   NOKIA CORP   NOK -1796.087 27917.49
# Outputs odd for odd numbers and even for even numbers
even <- rep("even",5)
odd <- rep("odd",5)
numbers <- 1:5
ifelse(numbers %% 2, odd, even)
[1] "odd"  "even" "odd"  "even" "odd" 
i = 0
while(i < 5) {
  print(i)
  i = i + 2
}
[1] 0
[1] 2
[1] 4
for(i in c(0,2,4)) {
  print(i)
}
[1] 0
[1] 2
[1] 4
# Profit margin, all US tech firms
start <- Sys.time()
margin_1 <- rep(0,length(tech_df$ni))
for(i in seq_along(tech_df$ni)) {
  margin_1[i] <- tech_df$earnings[i] /
                 tech_df$revenue[i]
}
end <- Sys.time()
time_1 <- end - start
time_1
Time difference of 0.01099896 secs
# Profit margin, all US tech firms
start <- Sys.time()
margin_2 <- tech_df$earnings /
            tech_df$revenue
end <- Sys.time()
time_2 <- end - start
time_2
Time difference of 0.001000881 secs
identical(margin_1, margin_2)  # Are these calculations identical?  Yes they are.
[1] TRUE
paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?
[1] "10.9892806098142 times"
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, 
    fix.empty.names = TRUE, stringsAsFactors = FALSE) 
NULL
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, 
    fix.empty.names = TRUE, stringsAsFactors = FALSE) 
NULL

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
plot <- tech_df %>%
  subset(revenue > 10000) %>%
  ggplot(aes(x=revenue,y=earnings)) + # ggplot comes from ggplot2, part of tidyverse
  geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))  # Adds point, and ticker
Warning: Ignoring unknown aesthetics: text
ggplotly(plot)  # Makes the plot interactive
library(tidyverse)
library(plotly)

plot <- ggplot(subset(tech_df, revenue > 10000), aes(x=revenue,y=earnings)) +
  geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))
Warning: Ignoring unknown aesthetics: text
ggplotly(plot)  # Makes the plot interactive
vector = c(-2,-1,0,1,2)
sum(vector)
[1] 0
abs(vector)
[1] 2 1 0 1 2
sign(vector)
[1] -1 -1  0  1  1
quantile(tech_df$earnings)
        0%        25%        50%        75%       100% 
-4307.4930   -15.9765     1.8370    91.3550 48351.0000 
range(tech_df$earnings)
[1] -4307.493 48351.000
add_two <- function(n) {
  n + 2
}

add_two(500)
[1] 502
mult_together <- function(n1, n2=0, square=FALSE) {
  if (!square) {
    n1 * n2
  } else {
    n1 * n1
  }
}

mult_together(5,6)
[1] 30
mult_together(5,6,square=TRUE)
[1] 25
mult_together(5,square=TRUE)
[1] 25