Dr. Richard M. Crowley
Examples:
\[ \begin{matrix} \left(\begin{matrix} 1 \\ 2 \\ 3 \\ 4 \end{matrix}\right) & \text{or} & \left(\begin{matrix} 1 & 2 & 3 & 4 \end{matrix}\right) \end{matrix} \]
A row (or column) of data
company <- c("Google", "Microsoft", "Goldman")
company
## [1] "Google" "Microsoft" "Goldman"
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm
## [1] TRUE TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings
## [1] 12662 21204 4286
A vector in R is a 1 dimensional collection of 1 or more of the same data type
:
, e.g. 1:5
or 22:500
seq(from=0, to=100,
by=5)
1:5
## [1] 1 2 3 4 5
seq(from=0, to=100, by=5)
## [1] 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80
## [18] 85 90 95 100
\(\uparrow\) note that [18] means the 18th output
rep(1,times=10)
or rep("hi",times=5)
rep(1,times=10)
## [1] 1 1 1 1 1 1 1 1 1 1
rep("hi",times=5)
## [1] "hi" "hi" "hi" "hi" "hi"
Works the same as scalars, but applies element-wise
earnings # previously defined
## [1] 12662 21204 4286
earnings + earnings # Add element-wise
## [1] 25324 42408 8572
earnings * earnings # multiply element-wise
## [1] 160326244 449609616 18369796
Can also use 1 vector and 1 scalar
earnings + 10000 # Adding a scalar to a vector
## [1] 22662 31204 14286
10000 + earnings # Order doesn't matter
## [1] 22662 31204 14286
earnings / 1000 # Dividing a vector by a scalar
## [1] 12.662 21.204 4.286
%*%
# Dot product: sum of product of elements
earnings %*% earnings # returns a matrix though...
## [,1]
## [1,] 628305656
drop(earnings %*% earnings) # Drop drops excess dimensions
## [1] 628305656
length(earnings) # returns the number of elements
## [1] 3
sum(earnings) # returns the sum of all elements
## [1] 38152
Hard to read:
earnings
## [1] 12662 21204 4286
Easy to read:
names(earnings) <- c("Google",
"Microsoft",
"Goldman")
earnings
## Google Microsoft Goldman
## 12662 21204 4286
# Equivalently:
names(earnings) <- company
earnings
## Google Microsoft Goldman
## 12662 21204 4286
[1]
["Google"]
earnings[1]
## Google
## 12662
earnings["Google"]
## Google
## 12662
earnings[c(1,2)]
earnings[1:2]
earnings[c("Google",
"Microsoft")]
# Each of the above 3 is equivalent
earnings[1:2]
## Google Microsoft
## 12662 21204
c1 <- c(1,2,3)
c2 <- c(4,5,6)
c3 <- c(c1,c2)
c3
## [1] 1 2 3 4 5 6
# Calculating proit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017) # Cleaned data from Compustat, in $M USD
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4307.49 -15.98 1.84 296.84 91.36 48351.00
summary(revenue_2017) # Cleaned data from Compustat, in $M USD
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.06 102.62 397.57 3023.78 1531.59 229234.00
profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.97960 -0.10253 0.01353 -0.10967 0.09295 1.02655
# These are the worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
profit_margin[order(profit_margin)][c(1,length(profit_margin)/2,length(profit_margin))]
## HELIOS AND MATHESON ANALYTIC NLIGHT INC
## -13.97960161 0.01325588
## CCUR HOLDINGS INC
## 1.02654899
Example:
\[ \left(\begin{matrix} 1 & 2 & 3 & 4\\ 5 & 6 & 7 & 8\\ 9 & 10 & 11 & 12 \end{matrix}\right) \]
A rows and columns of data
columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")
# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data
## [,1] [,2] [,3]
## [1,] 12662 4286 89950
## [2,] 21204 110855 42254
Everything with matrices works just like vectors
firm_data + firm_data
## [,1] [,2] [,3]
## [1,] 25324 8572 179900
## [2,] 42408 221710 84508
firm_data / 1000
## [,1] [,2] [,3]
## [1,] 12.662 4.286 89.950
## [2,] 21.204 110.855 42.254
firm_data_T <- t(firm_data)
firm_data_T
## [,1] [,2]
## [1,] 12662 21204
## [2,] 4286 110855
## [3,] 89950 42254
%*%
firm_data %*% firm_data_T
## [,1] [,2]
## [1,] 8269698540 4544356878
## [2,] 4544356878 14523841157
We won’t use these much, but they can be useful
rownames(firm_data) <- rows
colnames(firm_data) <- columns
firm_data
## Google Microsoft Goldman
## Earnings 12662 4286 89950
## Revenue 21204 110855 42254
matrix_name[rows,columns]
firm_data[2,3]
## [1] 42254
firm_data[,c("Google","Microsoft")]
## Google Microsoft
## Earnings 12662 4286
## Revenue 21204 110855
firm_data[1,]
## Google Microsoft Goldman
## 12662 4286 89950
# Preloaded: industry codes as indcode (vector)
# - GICS codes: 40=Financials, 45=Information Technology
# - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)
mat <- rbind(firm_data,indcode) # Add a row
rownames(mat)[3] <- "Industry" # Name the new row
mat
## Google Microsoft Goldman
## Earnings 12662 4286 89950
## Revenue 21204 110855 42254
## Industry 45 45 40
mat <- cbind(firm_data,jpdata) # Add a column
colnames(mat)[4] <- "JPMorgan" # Name the new column
mat
## Google Microsoft Goldman JPMorgan
## Earnings 12662 4286 89950 17370
## Revenue 21204 110855 42254 115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model
##
## Call:
## lm(formula = earnings ~ revenue, data = tech_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16045.0 20.0 141.6 177.1 12104.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.837e+02 4.491e+01 -4.091 4.79e-05 ***
## revenue 1.589e-01 3.564e-03 44.585 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1166 on 713 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7356
## F-statistic: 1988 on 1 and 713 DF, p-value: < 2.2e-16
[[index]]
[[c()]]
will drill through lists, as opposed to pulling multiple values$
model["r.squared"]
## $r.squared
## [1] 0.7360059
model[["r.squared"]]
## [1] 0.7360059
model$r.squared
## [1] 0.7360059
earnings["Google"]
## Google
## 12662
earnings[["Google"]]
## [1] 12662
#Can't use $ with vectors
str(model)
## List of 11
## $ call : language lm(formula = earnings ~ revenue, data = tech_df)
## $ terms :Classes 'terms', 'formula' language earnings ~ revenue
## .. ..- attr(*, "variables")= language list(earnings, revenue)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "earnings" "revenue"
## .. .. .. ..$ : chr "revenue"
## .. ..- attr(*, "term.labels")= chr "revenue"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(earnings, revenue)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
## $ residuals : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
## ..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
## $ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "revenue"
## .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
## $ aliased : Named logi [1:2] FALSE FALSE
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
## $ sigma : num 1166
## $ df : int [1:3] 2 713 2
## $ r.squared : num 0.736
## $ adj.r.squared: num 0.736
## $ fstatistic : Named num [1:3] 1988 1 713
## ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
## $ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "revenue"
## .. ..$ : chr [1:2] "(Intercept)" "revenue"
## - attr(*, "class")= chr "summary.lm"
Like a matrix:
[]
Like a list:
$
Think of columns as variables, rows as observations
library(DT) # This library is great for including larger collections of data in output
datatable(tech_df[1:20,c("conm","tic","margin")], rownames=FALSE)
df <- data.frame(companyName=company,
earnings=earnings,
tech_firm=tech_firm,
stringsAsFactors=FALSE)
df
## companyName earnings tech_firm
## Google Google 12662 TRUE
## Microsoft Microsoft 21204 TRUE
## Goldman Goldman 4286 FALSE
Caution:
stringsAsFactors=FALSE
is needed for R to retain string data!
df[,1]
## [1] "Google" "Microsoft" "Goldman"
df$companyName
## [1] "Google" "Microsoft" "Goldman"
df[[1]]
## [1] "Google" "Microsoft" "Goldman"
All are relatively equivalent. Using
$
is generally most natural. Using[,]
is good for complex references.
Suggested method: use
$
df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)
companyName | earnings | tech_firm | all_zero | revenue | margin | |
---|---|---|---|---|---|---|
12662 | TRUE | 0 | 110855 | 0.1142213 | ||
Microsoft | Microsoft | 21204 | TRUE | 0 | 89950 | 0.2357310 |
Goldman | Goldman | 4286 | FALSE | 0 | 42254 | 0.1014342 |
Alternative method: use cbind() just like with matrices
sort()
sort(df$earnings)
## [1] 4286 12662 21204
THIS CAN’T SORT DATA FRAMES
order()
function
ordering <- order(df$earnings)
ordering
## [1] 3 1 2
df <- df[ordering,]
df
## companyName earnings tech_firm all_zero revenue margin
## Goldman Goldman 4286 FALSE 0 42254 0.1014342
## Google Google 12662 TRUE 0 110855 0.1142213
## Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
order(level1,level2,...)
, where level_
are vectors or data frame columns# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
year=c(2017,2017,2016,2016))
example
## firm year
## 1 Google 2017
## 2 Microsoft 2017
## 3 Google 2016
## 4 Microsoft 2016
# with() allows us to avoiding prepending each column with "example$"
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example
## firm year
## 3 Google 2016
## 1 Google 2017
## 4 Microsoft 2016
## 2 Microsoft 2017
df[df$tech_firm,] # Remember the comma!
## companyName earnings tech_firm all_zero revenue margin
## Google Google 12662 TRUE 0 110855 0.1142213
## Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
subset(df,earnings < 20000)
## companyName earnings tech_firm all_zero revenue margin
## Goldman Goldman 4286 FALSE 0 42254 0.1014342
## Google Google 12662 TRUE 0 110855 0.1142213
earnings < 20000
df$earnings
## [1] 4286 12662 21204
df$earnings < 20000
## [1] TRUE TRUE FALSE
==
!=
>
<
>=
<=
!
|
&
==
2 == 2
\(\rightarrow\) TRUE2 == 3
\(\rightarrow\) FALSE'dog'=='dog'
\(\rightarrow\) TRUE'dog'=='cat'
\(\rightarrow\) FALSE!=
==
2 != 2
\(\rightarrow\) FALSE2 != 3
\(\rightarrow\) TRUE'dog'!='cat'
\(\rightarrow\) TRUE==
!=
>
<
>=
<=
!
|
&
>
2 > 1
\(\rightarrow\) TRUE2 > 2
\(\rightarrow\) FALSE2 > 3
\(\rightarrow\) FALSE'dog'>'cat'
\(\rightarrow\) TRUE>
2 < 1
\(\rightarrow\) FALSE2 < 2
\(\rightarrow\) FALSE2 < 3
\(\rightarrow\) TRUE'dog'<'cat'
\(\rightarrow\) FALSE>
2 >= 1
\(\rightarrow\) TRUE2 >= 2
\(\rightarrow\) TRUE2 >= 3
\(\rightarrow\) FALSE>
2 <= 1
\(\rightarrow\) FALSE2 <= 2
\(\rightarrow\) TRUE2 <= 3
\(\rightarrow\) TRUE!
!TRUE
\(\rightarrow\) FALSE!FALSE
\(\rightarrow\) TRUE&
TRUE & TRUE
\(\rightarrow\) TRUETRUE & FALSE
\(\rightarrow\) FALSEFALSE & FALSE
\(\rightarrow\) FALSE|
(pipe, same key as ‘\’)
|
is evaluated after all &
sTRUE | TRUE
\(\rightarrow\) TRUETRUE | FALSE
\(\rightarrow\) TRUEFALSE | FALSE
\(\rightarrow\) FALSEsum(tech_df$revenue > 10000)
## [1] 46
sum(tech_df$revenue > 10000 & tech_df$earnings < 0)
## [1] 4
columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns]
## conm tic earnings revenue
## 35 CORNING INC GLW -497.000 10116.00
## 45 TELEFONAKTIEBOLAGET LM ERICS ERIC -4307.493 24629.64
## 120 DELL TECHNOLOGIES INC 7732B -3728.000 78660.00
## 214 NOKIA CORP NOK -1796.087 27917.49
TRUE
and FALSE
already
FALSE
can be represented as 0TRUE
can be represented as any non-zero numberInf
: Infinity, often caused by dividing something by 0NaN
: “Not a number,” likely that the expression 0/0 occurredNA
: A missing value, usually not due to a mathematical errorNull
: Indicates a variable has nothing in it# cond1, cond2, etc. can be any logical expression
if(cond1) {
# Code runs if cond1 is TRUE
} else if (cond2) { # Can repeat 'else if' as needed
# Code runs if this is the first condition that is TRUE
} else {
# Code runs if none of the above conditions TRUE
}
TRUE
or FALSE
TRUE
FALSE
# Outputs odd for odd numbers and even for even numbers
even <- rep("even",5)
odd <- rep("odd",5)
numbers <- 1:5
ifelse(numbers %% 2, odd, even)
## [1] "odd" "even" "odd" "even" "odd"
FALSE
i = 0
while(i < 5) {
print(i)
i = i + 2
}
## [1] 0
## [1] 2
## [1] 4
FALSE
, while incrementing a given variablefor(i in c(0,2,4)) {
print(i)
}
## [1] 0
## [1] 2
## [1] 4
# Profit margin, all US tech firms
start <- Sys.time()
margin_1 <- rep(0,length(tech_df$ni))
for(i in seq_along(tech_df$ni)) {
margin_1[i] <- tech_df$earnings[i] /
tech_df$revenue[i]
}
end <- Sys.time()
time_1 <- end - start
time_1
## Time difference of 0.009973049 secs
# Profit margin, all US tech firms
start <- Sys.time()
margin_2 <- tech_df$earnings /
tech_df$revenue
end <- Sys.time()
time_2 <- end - start
time_2
## Time difference of 0.001995087 secs
identical(margin_1, margin_2) # Are these calculations identical? Yes they are.
## [1] TRUE
paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?
## [1] "4.99880497131931 times"
?
and help()?data.frame
help(data.frame)
args(data.frame)
## function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
## fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
## NULL
args(data.frame)
## function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
## fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
## NULL
...
represents a series of inputs
name=data
, where name is the column name and data is a vector____ = ____
arguments are options for the function
stringsAsFactors = FALSE
from earlier?
command# To install the tidyverse package:
install.packages("tidyverse")
# To install ggplot2, dplyr, and magrittr packages:
install.packages(c("ggplot2", "dplyr", "magrittr"))
library()
# Load the tidyverse package
library(tidyverse)
Pipe notation is never necessary and not built in to R
%>%
Left %>% Right(arg2, ...)
is the same as Right(Left, arg2, ...)
Piping can drastically improve code readability
Plot tech firms’ earnings vs revenue, >$10B in revenue
library(tidyverse)
library(plotly)
plot <- tech_df %>%
subset(revenue > 10000) %>%
ggplot(aes(x=revenue,y=earnings)) + # ggplot comes from ggplot2, part of tidyverse
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic))) # Adds point, and ticker
ggplotly(plot) # Makes the plot interactive
library(tidyverse)
library(plotly)
plot <- ggplot(subset(tech_df, revenue > 10000), aes(x=revenue,y=earnings)) +
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic)))
ggplotly(plot) # Makes the plot interactive
Note: The ~ indicates a formula the left side is the y-axis and the right side is the x-axis
Note: The | tells lattice to make panels based on the variable(s) to the right
vector = c(-2,-1,0,1,2)
sum(vector)
## [1] 0
abs(vector)
## [1] 2 1 0 1 2
sign(vector)
## [1] -1 -1 0 1 1
quantile(tech_df$earnings)
## 0% 25% 50% 75% 100%
## -4307.4930 -15.9765 1.8370 91.3550 48351.0000
range(tech_df$earnings)
## [1] -4307.493 48351.000
my_func <- function(agruments) {code}
Simple function: Add 2 to a number
add_two <- function(n) {
n + 2
}
add_two(500)
## [1] 502
mult_together <- function(n1, n2=0, square=FALSE) {
if (!square) {
n1 * n2
} else {
n1 * n1
}
}
mult_together(5,6)
## [1] 30
mult_together(5,6,square=TRUE)
## [1] 25
mult_together(5,square=TRUE)
## [1] 25
# Custom code for small tables from dataframes
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text, "html", col.names=cols, align=c("l", rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options=c("striped","hover","responsive"), full_width=full)
} else {
kable(text, "html", col.names=cols, align=c("l", rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options=c("striped", "hover","responsive"), full_width=full) %>%
column_spec(1,bold=T)
}
}
# Custom code for pulling 1 day of ForEx data from OANDA
FXRate <- function(from="USD", to="SGD", dt=Sys.Date()) {
options("getSymbols.warning4.0"=FALSE)
require(quantmod)
data <- getSymbols(paste0(from, "/", to), from=dt-1, to=dt, src="oanda", auto.assign=F)
return(data[[1]])
}