library(knitr)
library(kableExtra)
#' Pretty formatted HTML tables
#'
#' @param df The dataframe to format as a table
#' @param names Names to use for columns if not using the df column names. Vector
#' @param highlight_cols Columns to highlight. Default is c(1) for highlighting the first row. Use c() to remove.
#' @param highlight_rows Rows to highlight. Default is c() to not highlight any rows.
#' @param color_rows Rows to highlight in color. Default is c() to not highlight any rows.
#' @param highlight_end_rows Rows to highlight. Uses a reverse indexing where 1 is the last row, 2 is second to last, etc.
#' @param full Use to span the full width of a slide. FALSE by default.
#' @param fixed_header Whether the first row should be treated as a header row. TRUE by default.
html_df <- function(df, names=NULL, highlight_cols=c(1), highlight_rows=c(), color_rows=c(), highlight_end_rows=c(), full=F, fixed_header=T) {
if(!length(names)) {
names=colnames(df)
}
kbl(df,"html", col.names = names, align = c("l",rep('c',length(cols)-1))) %>%
kable_paper(c("striped","hover"), full_width=full, fixed_thead=F, html_font = "\"Source Sans Pro\", Helvetica, sans-serif") %>%
{if(length(highlight_cols)) column_spec(., highlight_cols,bold=T, background = "#ffffff99") else .} %>%
{if(length(highlight_rows)) row_spec(., highlight_rows,bold=T, background = "#ffffff99") else .} %>%
{if(length(color_rows)) row_spec(., color_rows,bold=T, background = "#aaaaff99") else .} %>%
{if(length(highlight_end_rows)) row_spec(., nrow(df) + 1 - highlight_end_rows,bold=T, background = "#ffffff99") else .}
}
library(tidyverse)
Code for Session 1 Sup
Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
company <- c("Google", "Microsoft", "Goldman")
company
[1] "Google" "Microsoft" "Goldman"
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm
[1] TRUE TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings
[1] 12662 21204 4286
1:5
[1] 1 2 3 4 5
seq(from=0, to=100, by=5)
[1] 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90
[20] 95 100
earnings # previously defined
[1] 12662 21204 4286
earnings + earnings # Add element-wise
[1] 25324 42408 8572
earnings * earnings # multiply element-wise
[1] 160326244 449609616 18369796
earnings + 10000 # Adding a scalar to a vector
[1] 22662 31204 14286
10000 + earnings # Order doesn't matter
[1] 22662 31204 14286
earnings / 1000 # Dividing a vector by a scalar
[1] 12.662 21.204 4.286
# Dot product: sum of product of elements
earnings %*% earnings # returns a matrix though...
[,1]
[1,] 628305656
[1] 628305656
length(earnings) # returns the number of elements
[1] 3
sum(earnings) # returns the sum of all elements
[1] 38152
earnings
[1] 12662 21204 4286
Google Microsoft Goldman
12662 21204 4286
# Equivalently:
names(earnings) <- company
earnings
Google Microsoft Goldman
12662 21204 4286
earnings[1]
Google
12662
earnings["Google"]
Google
12662
# Each of the above 3 is equivalent
earnings[1:2]
Google Microsoft
12662 21204
library(tidyverse)
df <- read.csv("../../Data/Session_1-2.csv")
# Filter to firms with at least $1M USD revenue, known net income, and fiscal year of 2017
clean_df <- df %>% filter(fyear==2017, !is.na(revt), !is.na(ni), revt > 1)
tech_df <- clean_df %>%
filter(gsector==45) %>%
mutate(revenue = revt,
earnings = ni,
margin = ni/revt)
earnings_2017 <- tech_df$ni
revenue_2017 <- tech_df$revt
names_2017 <- tech_df$conm
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017
# Calculating proit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017) # Cleaned data from Compustat, in $M USD
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4307.49 -15.98 1.84 296.84 91.36 48351.00
summary(revenue_2017) # Cleaned data from Compustat, in $M USD
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.06 102.62 397.57 3023.78 1531.59 229234.00
profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-13.97960 -0.10253 0.01353 -0.10967 0.09295 1.02655
# These are the worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
profit_margin[order(profit_margin)][c(1,length(profit_margin)/2,length(profit_margin))]
HELIOS AND MATHESON ANALYTIC NLIGHT INC
-13.97960161 0.01325588
CCUR HOLDINGS INC
1.02654899
columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")
# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data
[,1] [,2] [,3]
[1,] 12662 4286 89950
[2,] 21204 110855 42254
firm_data + firm_data
[,1] [,2] [,3]
[1,] 25324 8572 179900
[2,] 42408 221710 84508
firm_data / 1000
[,1] [,2] [,3]
[1,] 12.662 4.286 89.950
[2,] 21.204 110.855 42.254
firm_data_T <- t(firm_data)
firm_data_T
[,1] [,2]
[1,] 12662 21204
[2,] 4286 110855
[3,] 89950 42254
firm_data %*% firm_data_T
[,1] [,2]
[1,] 8269698540 4544356878
[2,] 4544356878 14523841157
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
firm_data[2,3]
[1] 42254
firm_data[,c("Google","Microsoft")]
Google Microsoft
Earnings 12662 4286
Revenue 21204 110855
firm_data[1,]
Google Microsoft Goldman
12662 4286 89950
# Preloaded: industry codes as indcode (vector)
# - GICS codes: 40=Financials, 45=Information Technology
# - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)
mat <- rbind(firm_data,indcode) # Add a row
rownames(mat)[3] <- "Industry" # Name the new row
mat
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
Industry 45 45 40
mat <- cbind(firm_data,jpdata) # Add a column
colnames(mat)[4] <- "JPMorgan" # Name the new column
mat
Google Microsoft Goldman JPMorgan
Earnings 12662 4286 89950 17370
Revenue 21204 110855 42254 115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model
Call:
lm(formula = earnings ~ revenue, data = tech_df)
Residuals:
Min 1Q Median 3Q Max
-16045.0 20.0 141.6 177.1 12104.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.837e+02 4.491e+01 -4.091 4.79e-05 ***
revenue 1.589e-01 3.564e-03 44.585 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared: 0.736, Adjusted R-squared: 0.7356
F-statistic: 1988 on 1 and 713 DF, p-value: < 2.2e-16
model["r.squared"]
$r.squared
[1] 0.7360059
model[["r.squared"]]
[1] 0.7360059
model$r.squared
[1] 0.7360059
earnings["Google"]
Google
12662
earnings[["Google"]]
[1] 12662
#Can't use $ with vectors
str(model)
List of 11
$ call : language lm(formula = earnings ~ revenue, data = tech_df)
$ terms :Classes 'terms', 'formula' language earnings ~ revenue
.. ..- attr(*, "variables")= language list(earnings, revenue)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "earnings" "revenue"
.. .. .. ..$ : chr "revenue"
.. ..- attr(*, "term.labels")= chr "revenue"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(earnings, revenue)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
$ residuals : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
$ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:2] FALSE FALSE
..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
$ sigma : num 1166
$ df : int [1:3] 2 713 2
$ r.squared : num 0.736
$ adj.r.squared: num 0.736
$ fstatistic : Named num [1:3] 1988 1 713
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:2] "(Intercept)" "revenue"
- attr(*, "class")= chr "summary.lm"
# The DT library is great for including larger collections of data in output
DT::datatable(tech_df[1:20,c("conm","tic","margin")], rownames=FALSE)
df <- data.frame(companyName=company,
earnings=earnings,
tech_firm=tech_firm)
df
companyName earnings tech_firm
Google Google 12662 TRUE
Microsoft Microsoft 21204 TRUE
Goldman Goldman 4286 FALSE
df[,1]
[1] "Google" "Microsoft" "Goldman"
df$companyName
[1] "Google" "Microsoft" "Goldman"
df[[1]]
[1] "Google" "Microsoft" "Goldman"
df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)
companyName | earnings | tech_firm | all_zero | revenue | margin | |
---|---|---|---|---|---|---|
12662 | TRUE | 0 | 110855 | 0.1142213 | ||
Microsoft | Microsoft | 21204 | TRUE | 0 | 89950 | 0.2357310 |
Goldman | Goldman | 4286 | FALSE | 0 | 42254 | 0.1014342 |
sort(df$earnings)
[1] 4286 12662 21204
ordering <- order(df$earnings)
ordering
[1] 3 1 2
df <- df[ordering,]
df
companyName earnings tech_firm all_zero revenue margin
Goldman Goldman 4286 FALSE 0 42254 0.1014342
Google Google 12662 TRUE 0 110855 0.1142213
Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
year=c(2017,2017,2016,2016))
example
firm year
1 Google 2017
2 Microsoft 2017
3 Google 2016
4 Microsoft 2016
# with() allows us to avoiding prepending each column with "example$"
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example
firm year
3 Google 2016
1 Google 2017
4 Microsoft 2016
2 Microsoft 2017
df[df$tech_firm,] # Remember the comma!
companyName earnings tech_firm all_zero revenue margin
Google Google 12662 TRUE 0 110855 0.1142213
Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
subset(df,earnings < 20000)
companyName earnings tech_firm all_zero revenue margin
Goldman Goldman 4286 FALSE 0 42254 0.1014342
Google Google 12662 TRUE 0 110855 0.1142213
df$earnings
[1] 4286 12662 21204
df$earnings < 20000
[1] TRUE TRUE FALSE
sum(tech_df$revenue > 10000)
[1] 46
sum(tech_df$revenue > 10000 & tech_df$earnings < 0)
[1] 4
columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns]
conm tic earnings revenue
35 CORNING INC GLW -497.000 10116.00
45 TELEFONAKTIEBOLAGET LM ERICS ERIC -4307.493 24629.64
120 DELL TECHNOLOGIES INC 7732B -3728.000 78660.00
214 NOKIA CORP NOK -1796.087 27917.49
# Outputs odd for odd numbers and even for even numbers
even <- rep("even",5)
odd <- rep("odd",5)
numbers <- 1:5
ifelse(numbers %% 2, odd, even)
[1] "odd" "even" "odd" "even" "odd"
i = 0
while(i < 5) {
print(i)
i = i + 2
}
[1] 0
[1] 2
[1] 4
# Profit margin, all US tech firms
start <- Sys.time()
margin_1 <- rep(0,length(tech_df$ni))
for(i in seq_along(tech_df$ni)) {
margin_1[i] <- tech_df$earnings[i] /
tech_df$revenue[i]
}
end <- Sys.time()
time_1 <- end - start
time_1
Time difference of 0.01099896 secs
# Profit margin, all US tech firms
start <- Sys.time()
margin_2 <- tech_df$earnings /
tech_df$revenue
end <- Sys.time()
time_2 <- end - start
time_2
Time difference of 0.001000881 secs
identical(margin_1, margin_2) # Are these calculations identical? Yes they are.
[1] TRUE
paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?
[1] "10.9892806098142 times"
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = FALSE)
NULL
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = FALSE)
NULL
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
plot <- tech_df %>%
subset(revenue > 10000) %>%
ggplot(aes(x=revenue,y=earnings)) + # ggplot comes from ggplot2, part of tidyverse
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic))) # Adds point, and ticker
Warning: Ignoring unknown aesthetics: text
ggplotly(plot) # Makes the plot interactive
library(tidyverse)
library(plotly)
plot <- ggplot(subset(tech_df, revenue > 10000), aes(x=revenue,y=earnings)) +
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))
Warning: Ignoring unknown aesthetics: text
ggplotly(plot) # Makes the plot interactive
quantile(tech_df$earnings)
0% 25% 50% 75% 100%
-4307.4930 -15.9765 1.8370 91.3550 48351.0000
range(tech_df$earnings)
[1] -4307.493 48351.000
add_two <- function(n) {
n + 2
}
add_two(500)
[1] 502
mult_together <- function(n1, n2=0, square=FALSE) {
if (!square) {
n1 * n2
} else {
n1 * n1
}
}
mult_together(5,6)
[1] 30
mult_together(5,6,square=TRUE)
[1] 25
mult_together(5,square=TRUE)
[1] 25