Code for Session 1 Sup

Author

Dr. Richard M. Crowley

Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.

library(knitr)
library(kableExtra)

#' Pretty formatted HTML tables
#' 
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
  if(!length(names)) {
    names=colnames(df)
  }
  kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
    kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
    {if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
    {if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
    {if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
    {if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}

library(tidyverse)

company <- c("Google", "Microsoft", "Goldman")
company

[1] "Google"    "Microsoft" "Goldman"

tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm

[1]  TRUE  TRUE FALSE

earnings <- c(12662, 21204, 4286)
earnings

[1] 12662 21204  4286

1:5

[1] 1 2 3 4 5

seq(from=0, to=100, by=5)

 [1]   0   5  10  15  20  25  30  35  40  45  50  55  60  65  70  75  80  85  90
[20]  95 100

rep(1,times=10)

 [1] 1 1 1 1 1 1 1 1 1 1

rep("hi",times=5)

[1] "hi" "hi" "hi" "hi" "hi"

earnings  # previously defined

[1] 12662 21204  4286

earnings + earnings  # Add element-wise

[1] 25324 42408  8572

earnings * earnings  # multiply element-wise

[1] 160326244 449609616  18369796

earnings + 10000  # Adding a scalar to a vector

[1] 22662 31204 14286

10000 + earnings  # Order doesn't matter

[1] 22662 31204 14286

earnings / 1000  # Dividing a vector by a scalar

[1] 12.662 21.204  4.286

# Dot product: sum of product of elements
earnings %*% earnings  # returns a matrix though...

          [,1]
[1,] 628305656

drop(earnings %*% earnings)  # Drop drops excess dimensions

[1] 628305656

length(earnings)  # returns the number of elements

[1] 3

sum(earnings)  # returns the sum of all elements

[1] 38152

earnings

[1] 12662 21204  4286

names(earnings) <- c("Google",
                     "Microsoft",
                     "Goldman")
earnings

   Google Microsoft   Goldman 
    12662     21204      4286

# Equivalently:
names(earnings) <- company
earnings

   Google Microsoft   Goldman 
    12662     21204      4286

earnings[1]

Google 
 12662

earnings["Google"]

Google 
 12662

# Each of the above 3 is equivalent
earnings[1:2]

   Google Microsoft 
    12662     21204

c1 <- c(1,2,3)
c2 <- c(4,5,6)
c3 <- c(c1,c2)
c3

[1] 1 2 3 4 5 6

library(tidyverse)
df <- read.csv("../../Data/Session_1-2.csv")
# Filter to firms with at least $1M USD revenue, known net income, and fiscal year of 2017
clean_df <- df %>% filter(fyear==2017, !is.na(revt), !is.na(ni), revt > 1)
tech_df <- clean_df %>%
  filter(gsector==45) %>%
  mutate(revenue = revt,
         earnings = ni,
         margin = ni/revt)
earnings_2017 <- tech_df$ni
revenue_2017 <- tech_df$revt
names_2017 <- tech_df$conm
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017

# Calculating proit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017)  # Cleaned data from Compustat, in $M USD

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-4307.49   -15.98     1.84   296.84    91.36 48351.00

summary(revenue_2017)  # Cleaned data from Compustat, in $M USD

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     1.06    102.62    397.57   3023.78   1531.59 229234.00

profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin)

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-13.97960  -0.10253   0.01353  -0.10967   0.09295   1.02655

# These are the worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
profit_margin[order(profit_margin)][c(1,length(profit_margin)/2,length(profit_margin))]

HELIOS AND MATHESON ANALYTIC                   NLIGHT INC 
                -13.97960161                   0.01325588 
           CCUR HOLDINGS INC 
                  1.02654899

columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")

# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data

      [,1]   [,2]  [,3]
[1,] 12662   4286 89950
[2,] 21204 110855 42254

firm_data + firm_data

      [,1]   [,2]   [,3]
[1,] 25324   8572 179900
[2,] 42408 221710  84508

firm_data / 1000

       [,1]    [,2]   [,3]
[1,] 12.662   4.286 89.950
[2,] 21.204 110.855 42.254

firm_data_T <- t(firm_data)
firm_data_T

      [,1]   [,2]
[1,] 12662  21204
[2,]  4286 110855
[3,] 89950  42254

firm_data %*% firm_data_T

           [,1]        [,2]
[1,] 8269698540  4544356878
[2,] 4544356878 14523841157

rownames(firm_data) <- rows
colnames(firm_data) <- columns
firm_data

         Google Microsoft Goldman
Earnings  12662      4286   89950
Revenue   21204    110855   42254

firm_data[2,3]

[1] 42254

firm_data[,c("Google","Microsoft")]

         Google Microsoft
Earnings  12662      4286
Revenue   21204    110855

firm_data[1,]

   Google Microsoft   Goldman 
    12662      4286     89950

indcode <- c(45,45,40)
jpdata <- c(17370, 115475)
# Preloaded: industry codes as indcode (vector)

# Preloaded: industry codes as indcode (vector)
#     - GICS codes: 40=Financials, 45=Information Technology
#     - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)

mat <- rbind(firm_data,indcode)  # Add a row
rownames(mat)[3] <- "Industry"  # Name the new row
mat

         Google Microsoft Goldman
Earnings  12662      4286   89950
Revenue   21204    110855   42254
Industry     45        45      40

mat <- cbind(firm_data,jpdata)  # Add a column
colnames(mat)[4] <- "JPMorgan"  # Name the new column
mat

         Google Microsoft Goldman JPMorgan
Earnings  12662      4286   89950    17370
Revenue   21204    110855   42254   115475

# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model


Call:
lm(formula = earnings ~ revenue, data = tech_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-16045.0     20.0    141.6    177.1  12104.6 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.837e+02  4.491e+01  -4.091 4.79e-05 ***
revenue      1.589e-01  3.564e-03  44.585  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared:  0.736, Adjusted R-squared:  0.7356 
F-statistic:  1988 on 1 and 713 DF,  p-value: < 2.2e-16

model["r.squared"]

$r.squared
[1] 0.7360059

model[["r.squared"]]

[1] 0.7360059

model$r.squared

[1] 0.7360059

earnings["Google"]

Google 
 12662

earnings[["Google"]]

[1] 12662

#Can't use $ with vectors

str(model)

List of 11
 $ call         : language lm(formula = earnings ~ revenue, data = tech_df)
 $ terms        :Classes 'terms', 'formula'  language earnings ~ revenue
  .. ..- attr(*, "variables")= language list(earnings, revenue)
  .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "earnings" "revenue"
  .. .. .. ..$ : chr "revenue"
  .. ..- attr(*, "term.labels")= chr "revenue"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(earnings, revenue)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
 $ residuals    : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
  ..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
 $ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
  .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
 $ aliased      : Named logi [1:2] FALSE FALSE
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
 $ sigma        : num 1166
 $ df           : int [1:3] 2 713 2
 $ r.squared    : num 0.736
 $ adj.r.squared: num 0.736
 $ fstatistic   : Named num [1:3] 1988 1 713
  ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
 $ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
  .. ..$ : chr [1:2] "(Intercept)" "revenue"
 - attr(*, "class")= chr "summary.lm"

library(DT) # This library is great for including larger collections of data in output
options(DT.options = list(scrollY="450px"))

# The DT library is great for including larger collections of data in output
DT::datatable(tech_df[1:20,c("conm","tic","margin")], rownames=FALSE)

df <- data.frame(companyName=company,
                 earnings=earnings,
                 tech_firm=tech_firm)
df

          companyName earnings tech_firm
Google         Google    12662      TRUE
Microsoft   Microsoft    21204      TRUE
Goldman       Goldman     4286     FALSE

df[,1]

[1] "Google"    "Microsoft" "Goldman"

df$companyName

[1] "Google"    "Microsoft" "Goldman"

df[[1]]

[1] "Google"    "Microsoft" "Goldman"

df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)

	companyName	earnings	tech_firm	revenue	margin
Google	Google	12662	TRUE	110855	0.1142213
Microsoft	Microsoft	21204	TRUE	89950	0.2357310
Goldman	Goldman	4286	FALSE	42254	0.1014342

sort(df$earnings)

[1]  4286 12662 21204

ordering <- order(df$earnings)
ordering

[1] 3 1 2

df <- df[ordering,]
df

          companyName earnings tech_firm all_zero revenue    margin
Goldman       Goldman     4286     FALSE        0   42254 0.1014342
Google         Google    12662      TRUE        0  110855 0.1142213
Microsoft   Microsoft    21204      TRUE        0   89950 0.2357310

# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
                      year=c(2017,2017,2016,2016))
example

       firm year
1    Google 2017
2 Microsoft 2017
3    Google 2016
4 Microsoft 2016

# with() allows us to avoiding prepending each column with "example$"
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example

       firm year
3    Google 2016
1    Google 2017
4 Microsoft 2016
2 Microsoft 2017

df[df$tech_firm,]  # Remember the comma!

          companyName earnings tech_firm all_zero revenue    margin
Google         Google    12662      TRUE        0  110855 0.1142213
Microsoft   Microsoft    21204      TRUE        0   89950 0.2357310

subset(df,earnings < 20000)

        companyName earnings tech_firm all_zero revenue    margin
Goldman     Goldman     4286     FALSE        0   42254 0.1014342
Google       Google    12662      TRUE        0  110855 0.1142213

df$earnings

[1]  4286 12662 21204

df$earnings < 20000

[1]  TRUE  TRUE FALSE

sum(tech_df$revenue > 10000)

[1] 46

sum(tech_df$revenue > 10000 & tech_df$earnings < 0)

[1] 4

columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns]

                            conm   tic  earnings  revenue
35                   CORNING INC   GLW  -497.000 10116.00
45  TELEFONAKTIEBOLAGET LM ERICS  ERIC -4307.493 24629.64
120        DELL TECHNOLOGIES INC 7732B -3728.000 78660.00
214                   NOKIA CORP   NOK -1796.087 27917.49

# Outputs odd for odd numbers and even for even numbers
even <- rep("even",5)
odd <- rep("odd",5)
numbers <- 1:5
ifelse(numbers %% 2, odd, even)

[1] "odd"  "even" "odd"  "even" "odd"

i = 0
while(i < 5) {
  print(i)
  i = i + 2
}

[1] 0
[1] 2
[1] 4

for(i in c(0,2,4)) {
  print(i)
}

[1] 0
[1] 2
[1] 4

# Profit margin, all US tech firms
start <- Sys.time()
margin_1 <- rep(0,length(tech_df$ni))
for(i in seq_along(tech_df$ni)) {
  margin_1[i] <- tech_df$earnings[i] /
                 tech_df$revenue[i]
}
end <- Sys.time()
time_1 <- end - start
time_1

Time difference of 0.01099896 secs

# Profit margin, all US tech firms
start <- Sys.time()
margin_2 <- tech_df$earnings /
            tech_df$revenue
end <- Sys.time()
time_2 <- end - start
time_2

Time difference of 0.001000881 secs

identical(margin_1, margin_2)  # Are these calculations identical?  Yes they are.

[1] TRUE

paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?

[1] "10.9892806098142 times"

args(data.frame)

function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, 
    fix.empty.names = TRUE, stringsAsFactors = FALSE) 
NULL

args(data.frame)

function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, 
    fix.empty.names = TRUE, stringsAsFactors = FALSE) 
NULL

library(tidyverse)
library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

plot <- tech_df %>%
  subset(revenue > 10000) %>%
  ggplot(aes(x=revenue,y=earnings)) + # ggplot comes from ggplot2, part of tidyverse
  geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))  # Adds point, and ticker

Warning: Ignoring unknown aesthetics: text

ggplotly(plot)  # Makes the plot interactive

library(tidyverse)
library(plotly)

plot <- ggplot(subset(tech_df, revenue > 10000), aes(x=revenue,y=earnings)) +
  geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))

Warning: Ignoring unknown aesthetics: text

ggplotly(plot)  # Makes the plot interactive

vector = c(-2,-1,0,1,2)
sum(vector)

[1] 0

abs(vector)

[1] 2 1 0 1 2

sign(vector)

[1] -1 -1  0  1  1

quantile(tech_df$earnings)

        0%        25%        50%        75%       100% 
-4307.4930   -15.9765     1.8370    91.3550 48351.0000

range(tech_df$earnings)

[1] -4307.493 48351.000

add_two <- function(n) {
  n + 2
}

add_two(500)

[1] 502

mult_together <- function(n1, n2=0, square=FALSE) {
  if (!square) {
    n1 * n2
  } else {
    n1 * n1
  }
}

mult_together(5,6)

[1] 30

mult_together(5,6,square=TRUE)

[1] 25

mult_together(5,square=TRUE)

[1] 25