Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
library(knitr)
library(kableExtra)
Attaching package: 㤼㸱kableExtra㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
group_rows
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
library(tidyverse)
df <- read.csv("../../Data/Session_1-2.csv")
# Filter to firms with at least $1M USD revenue, known net income, and fiscal year of 2017
clean_df <- df %>% filter(fyear==2017, !is.na(revt), !is.na(ni), revt > 1)
tech_df <- clean_df %>%
filter(gsector==45) %>%
mutate(revenue = revt,
earnings = ni,
margin = ni/revt)
earnings_2017 <- tech_df$ni
revenue_2017 <- tech_df$revt
names_2017 <- tech_df$conm
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017
company <- c("Google", "Microsoft", "Goldman")
company
[1] "Google" "Microsoft" "Goldman"
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm
[1] TRUE TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings
[1] 12662 21204 4286
1:5
[1] 1 2 3 4 5
seq(from=0, to=100, by=5)
[1] 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100
rep(1,times=10)
[1] 1 1 1 1 1 1 1 1 1 1
rep("hi",times=5)
[1] "hi" "hi" "hi" "hi" "hi"
earnings # previously defined
[1] 12662 21204 4286
earnings + earnings # Add element-wise
[1] 25324 42408 8572
earnings * earnings # multiply element-wise
[1] 160326244 449609616 18369796
earnings + 10000 # Adding a scalar to a vector
[1] 22662 31204 14286
10000 + earnings # Order doesn't matter
[1] 22662 31204 14286
earnings / 1000 # Dividing a vector by a scalar
[1] 12.662 21.204 4.286
# Dot product: sum of product of elements
earnings %*% earnings # returns a matrix though...
[,1]
[1,] 628305656
drop(earnings %*% earnings) # Drop drops excess dimensions
[1] 628305656
length(earnings) # returns the number of elements
[1] 3
sum(earnings) # returns the sum of all elements
[1] 38152
earnings
[1] 12662 21204 4286
names(earnings) <- c("Google",
"Microsoft",
"Goldman")
earnings
Google Microsoft Goldman
12662 21204 4286
# Equivalently:
names(earnings) <- company
earnings
Google Microsoft Goldman
12662 21204 4286
earnings[1]
Google
12662
earnings["Google"]
Google
12662
# Each of the above 3 is equivalent
earnings[1:2]
Google Microsoft
12662 21204
c1 <- c(1,2,3)
c2 <- c(4,5,6)
c3 <- c(c1,c2)
c3
[1] 1 2 3 4 5 6
# Calculating proit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017) # Cleaned data from Compustat, in $M USD
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4307.49 -15.98 1.84 296.84 91.36 48351.00
summary(revenue_2017) # Cleaned data from Compustat, in $M USD
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.06 102.62 397.57 3023.78 1531.59 229234.00
profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-13.97960 -0.10253 0.01353 -0.10967 0.09295 1.02655
# These are the worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
profit_margin[order(profit_margin)][c(1,length(profit_margin)/2,length(profit_margin))]
HELIOS AND MATHESON ANALYTIC NLIGHT INC CCUR HOLDINGS INC
-13.97960161 0.01325588 1.02654899
columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")
# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data
[,1] [,2] [,3]
[1,] 12662 4286 89950
[2,] 21204 110855 42254
firm_data + firm_data
[,1] [,2] [,3]
[1,] 25324 8572 179900
[2,] 42408 221710 84508
firm_data / 1000
[,1] [,2] [,3]
[1,] 12.662 4.286 89.950
[2,] 21.204 110.855 42.254
firm_data_T <- t(firm_data)
firm_data_T
[,1] [,2]
[1,] 12662 21204
[2,] 4286 110855
[3,] 89950 42254
firm_data %*% firm_data_T
[,1] [,2]
[1,] 8269698540 4544356878
[2,] 4544356878 14523841157
rownames(firm_data) <- rows
colnames(firm_data) <- columns
firm_data
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
firm_data[2,3]
[1] 42254
firm_data[,c("Google","Microsoft")]
Google Microsoft
Earnings 12662 4286
Revenue 21204 110855
firm_data[1,]
Google Microsoft Goldman
12662 4286 89950
indcode <- c(45,45,40)
jpdata <- c(17370, 115475)
# Preloaded: industry codes as indcode (vector)
# Preloaded: industry codes as indcode (vector)
# - GICS codes: 40=Financials, 45=Information Technology
# - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)
mat <- rbind(firm_data,indcode) # Add a row
rownames(mat)[3] <- "Industry" # Name the new row
mat
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
Industry 45 45 40
mat <- cbind(firm_data,jpdata) # Add a column
colnames(mat)[4] <- "JPMorgan" # Name the new column
mat
Google Microsoft Goldman JPMorgan
Earnings 12662 4286 89950 17370
Revenue 21204 110855 42254 115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model
Call:
lm(formula = earnings ~ revenue, data = tech_df)
Residuals:
Min 1Q Median 3Q Max
-16045.0 20.0 141.6 177.1 12104.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.837e+02 4.491e+01 -4.091 4.79e-05 ***
revenue 1.589e-01 3.564e-03 44.585 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared: 0.736, Adjusted R-squared: 0.7356
F-statistic: 1988 on 1 and 713 DF, p-value: < 2.2e-16
model["r.squared"]
$r.squared
[1] 0.7360059
model[["r.squared"]]
[1] 0.7360059
model$r.squared
[1] 0.7360059
earnings["Google"]
Google
12662
earnings[["Google"]]
[1] 12662
#Can't use $ with vectors
str(model)
List of 11
$ call : language lm(formula = earnings ~ revenue, data = tech_df)
$ terms :Classes 'terms', 'formula' language earnings ~ revenue
.. ..- attr(*, "variables")= language list(earnings, revenue)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "earnings" "revenue"
.. .. .. ..$ : chr "revenue"
.. ..- attr(*, "term.labels")= chr "revenue"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(earnings, revenue)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
$ residuals : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
$ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:2] FALSE FALSE
..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
$ sigma : num 1166
$ df : int [1:3] 2 713 2
$ r.squared : num 0.736
$ adj.r.squared: num 0.736
$ fstatistic : Named num [1:3] 1988 1 713
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:2] "(Intercept)" "revenue"
- attr(*, "class")= chr "summary.lm"
library(DT) # This library is great for including larger collections of data in output
datatable(tech_df[1:20,c("conm","tic","margin")], rownames=FALSE)
df <- data.frame(companyName=company,
earnings=earnings,
tech_firm=tech_firm)
df
df[,1]
[1] "Google" "Microsoft" "Goldman"
df$companyName
[1] "Google" "Microsoft" "Goldman"
df[[1]]
[1] "Google" "Microsoft" "Goldman"
df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)
companyName | earnings | tech_firm | all_zero | revenue | margin | |
---|---|---|---|---|---|---|
12662 | TRUE | 0 | 110855 | 0.1142213 | ||
Microsoft | Microsoft | 21204 | TRUE | 0 | 89950 | 0.2357310 |
Goldman | Goldman | 4286 | FALSE | 0 | 42254 | 0.1014342 |
sort(df$earnings)
[1] 4286 12662 21204
ordering <- order(df$earnings)
ordering
[1] 3 1 2
df <- df[ordering,]
df
# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
year=c(2017,2017,2016,2016))
example
# with() allows us to avoiding prepending each column with "example$"
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example
df[df$tech_firm,] # Remember the comma!
subset(df,earnings < 20000)
df$earnings
[1] 4286 12662 21204
df$earnings < 20000
[1] TRUE TRUE FALSE
sum(tech_df$revenue > 10000)
[1] 46
sum(tech_df$revenue > 10000 & tech_df$earnings < 0)
[1] 4
columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns]
# Outputs odd for odd numbers and even for even numbers
even <- rep("even",5)
odd <- rep("odd",5)
numbers <- 1:5
ifelse(numbers %% 2, odd, even)
[1] "odd" "even" "odd" "even" "odd"
i = 0
while(i < 5) {
print(i)
i = i + 2
}
[1] 0
[1] 2
[1] 4
for(i in c(0,2,4)) {
print(i)
}
[1] 0
[1] 2
[1] 4
# Profit margin, all US tech firms
start <- Sys.time()
margin_1 <- rep(0,length(tech_df$ni))
for(i in seq_along(tech_df$ni)) {
margin_1[i] <- tech_df$earnings[i] /
tech_df$revenue[i]
}
end <- Sys.time()
time_1 <- end - start
time_1
Time difference of 0.07579708 secs
# Profit margin, all US tech firms
start <- Sys.time()
margin_2 <- tech_df$earnings /
tech_df$revenue
end <- Sys.time()
time_2 <- end - start
time_2
Time difference of 0.03490782 secs
identical(margin_1, margin_2) # Are these calculations identical? Yes they are.
[1] TRUE
paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?
[1] "2.17134973431502 times"
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
NULL
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
NULL
library(tidyverse)
library(plotly)
plot <- tech_df %>%
subset(revenue > 10000) %>%
ggplot(aes(x=revenue,y=earnings)) + # ggplot comes from ggplot2, part of tidyverse
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic))) # Adds point, and ticker
ggplotly(plot) # Makes the plot interactive
library(tidyverse)
library(plotly)
plot <- ggplot(subset(tech_df, revenue > 10000), aes(x=revenue,y=earnings)) +
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))
ggplotly(plot) # Makes the plot interactive
vector = c(-2,-1,0,1,2)
sum(vector)
[1] 0
abs(vector)
[1] 2 1 0 1 2
sign(vector)
[1] -1 -1 0 1 1
quantile(tech_df$earnings)
0% 25% 50% 75% 100%
-4307.4930 -15.9765 1.8370 91.3550 48351.0000
range(tech_df$earnings)
[1] -4307.493 48351.000
add_two <- function(n) {
n + 2
}
add_two(500)
[1] 502
mult_together <- function(n1, n2=0, square=FALSE) {
if (!square) {
n1 * n2
} else {
n1 * n1
}
}
mult_together(5,6)
[1] 30
mult_together(5,6,square=TRUE)
[1] 25
mult_together(5,square=TRUE)
[1] 25