company <- c("Google", "Microsoft", "Goldman")
company[1] "Google" "Microsoft" "Goldman"
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm[1] TRUE TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings[1] 12662 21204 4286
Examples:
\[ \begin{matrix} \left(\begin{matrix} 1 \\ 2 \\ 3 \\ 4 \end{matrix}\right) & \text{or} & \left(\begin{matrix} 1 & 2 & 3 & 4 \end{matrix}\right) \end{matrix} \]
A row (or column) of data
c() command
company <- c("Google", "Microsoft", "Goldman")
company[1] "Google" "Microsoft" "Goldman"
tech_firm <- c(TRUE, TRUE, FALSE)
tech_firm[1] TRUE TRUE FALSE
earnings <- c(12662, 21204, 4286)
earnings[1] 12662 21204 4286
A vector in R is a 1 dimensional collection of 1 or more of the same data type
Works the same as scalars, but applies element-wise
earnings # previously defined[1] 12662 21204 4286
earnings + earnings # Add element-wise[1] 25324 42408 8572
earnings * earnings # multiply element-wise[1] 160326244 449609616 18369796
Can also use 1 vector and 1 scalar
earnings + 10000 # Adding a scalar to a vector[1] 22662 31204 14286
10000 + earnings # Order doesn't matter[1] 22662 31204 14286
earnings / 1000 # Dividing a vector by a scalar[1] 12.662 21.204 4.286
%*%
# Dot product: sum of product of elements
earnings %*% earnings # returns a matrix though... [,1]
[1,] 628305656
[1] 628305656
names()
[1]
["Google"]
earnings[1]Google
12662
earnings["Google"]Google
12662
# Calculating profit margin for all public US tech firms
# 715 tech firms with >1M sales in 2017
summary(earnings_2017) # Cleaned data from Compustat, in $M USD Min. 1st Qu. Median Mean 3rd Qu. Max.
-4307.49 -15.98 1.84 296.84 91.36 48351.00
summary(revenue_2017) # Cleaned data from Compustat, in $M USD Min. 1st Qu. Median Mean 3rd Qu. Max.
1.06 102.62 397.57 3023.78 1531.59 229234.00
profit_margin <- earnings_2017 / revenue_2017
summary(profit_margin) Min. 1st Qu. Median Mean 3rd Qu. Max.
-13.97960 -0.10253 0.01353 -0.10967 0.09295 1.02655
Example:
\[ \left(\begin{matrix} 1 & 2 & 3 & 4\\ 5 & 6 & 7 & 8\\ 9 & 10 & 11 & 12 \end{matrix}\right) \]
Rows and columns of data
matrix() command
columns <- c("Google", "Microsoft", "Goldman")
rows <- c("Earnings","Revenue")
# equivalent: matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),ncol=3)
firm_data <- matrix(data=c(12662, 21204, 4286, 110855, 89950, 42254),nrow=2)
firm_data [,1] [,2] [,3]
[1,] 12662 4286 89950
[2,] 21204 110855 42254
Everything with matrices works just like vectors
firm_data + firm_data [,1] [,2] [,3]
[1,] 25324 8572 179900
[2,] 42408 221710 84508
firm_data / 1000 [,1] [,2] [,3]
[1,] 12.662 4.286 89.950
[2,] 21.204 110.855 42.254
t()
firm_data_T <- t(firm_data)
firm_data_T [,1] [,2]
[1,] 12662 21204
[2,] 4286 110855
[3,] 89950 42254
%*%
firm_data %*% firm_data_T [,1] [,2]
[1,] 8269698540 4544356878
[2,] 4544356878 14523841157
We won’t use these much, but they can be useful
rownames() for rows
colnames() for columns
matrix_name[rows,columns]
firm_data[2,3]
[1] 42254
firm_data[,c("Google","Microsoft")] Google Microsoft
Earnings 12662 4286
Revenue 21204 110855
firm_data[1,] Google Microsoft Goldman
12662 4286 89950
rbind()
cbind()
# Preloaded: industry codes as indcode (vector)
# - GICS codes: 40=Financials, 45=Information Technology
# - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)
mat <- rbind(firm_data,indcode) # Add a row
rownames(mat)[3] <- "Industry" # Name the new row
mat Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
Industry 45 45 40
mat <- cbind(firm_data,jpdata) # Add a column
colnames(mat)[4] <- "JPMorgan" # Name the new column
mat Google Microsoft Goldman JPMorgan
Earnings 12662 4286 89950 17370
Revenue 21204 110855 42254 115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model
Call:
lm(formula = earnings ~ revenue, data = tech_df)
Residuals:
Min 1Q Median 3Q Max
-16045.0 20.0 141.6 177.1 12104.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.837e+02 4.491e+01 -4.091 4.79e-05 ***
revenue 1.589e-01 3.564e-03 44.585 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared: 0.736, Adjusted R-squared: 0.7356
F-statistic: 1988 on 1 and 713 DF, p-value: < 2.2e-16
[[index]]
[[c()]] will drill through lists, as opposed to pulling multiple values
$
model["r.squared"]$r.squared
[1] 0.7360059
model[["r.squared"]][1] 0.7360059
model$r.squared[1] 0.7360059
earnings["Google"]Google
12662
earnings[["Google"]][1] 12662
# Can't use $ with vectors
str() will tell us what’s in this list
str(model)
List of 11
$ call : language lm(formula = earnings ~ revenue, data = tech_df)
$ terms :Classes 'terms', 'formula' language earnings ~ revenue
.. ..- attr(*, "variables")= language list(earnings, revenue)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "earnings" "revenue"
.. .. .. ..$ : chr "revenue"
.. ..- attr(*, "term.labels")= chr "revenue"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(earnings, revenue)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
$ residuals : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
$ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:2] FALSE FALSE
..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
$ sigma : num 1166
$ df : int [1:3] 2 713 2
$ r.squared : num 0.736
$ adj.r.squared: num 0.736
$ fstatistic : Named num [1:3] 1988 1 713
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:2] "(Intercept)" "revenue"
- attr(*, "class")= chr "summary.lm"
Like a matrix:
[]
Like a list:
$
Think of columns as variables, rows as observations
data.frame() function
df <- data.frame(companyName=company,
earnings=earnings,
tech_firm=tech_firm)
df companyName earnings tech_firm
Google Google 12662 TRUE
Microsoft Microsoft 21204 TRUE
Goldman Goldman 4286 FALSE
Note:
stringsAsFactors=FALSE is no longer needed as of R 4.0.0
df[,1][1] "Google" "Microsoft" "Goldman"
df$companyName[1] "Google" "Microsoft" "Goldman"
df[[1]][1] "Google" "Microsoft" "Goldman"
All are relatively equivalent. Using
$ is generally most natural. Using [,] is good for complex references.
Suggested method: use
$
df$all_zero <- 0
df$revenue <- c(110855, 89950, 42254)
df$margin <- df$earnings / df$revenue
# Custom function for small tables -- see last slide for code
html_df(df)
| | companyName | earnings | tech_firm | all_zero | revenue | margin |
|---|---|---|---|---|---|---|
| Google | Google | 12662 | TRUE | 0 | 110855 | 0.1142213 |
| Microsoft | Microsoft | 21204 | TRUE | 0 | 89950 | 0.2357310 |
| Goldman | Goldman | 4286 | FALSE | 0 | 42254 | 0.1014342 |
Alternative method: use
cbind()just like with matrices
sort()
sort(df$earnings)[1] 4286 12662 21204
Warning
THIS CAN’T SORT DATA FRAMES
order() function
ordering <- order(df$earnings)
ordering[1] 3 1 2
df <- df[ordering,]
df companyName earnings tech_firm all_zero revenue margin
Goldman Goldman 4286 FALSE 0 42254 0.1014342
Google Google 12662 TRUE 0 110855 0.1142213
Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
order(level1,level2,...), where level_ are vectors or data frame columns
# Example of multicolumn sorting:
example <- data.frame(firm=c("Google","Microsoft","Google","Microsoft"),
year=c(2017,2017,2016,2016))
example firm year
1 Google 2017
2 Microsoft 2017
3 Google 2016
4 Microsoft 2016
# with() would let us avoid prepending each column with "example$", e.g. with(example, order(firm, year))
ordering <- order(example$firm, example$year)
example <- example[ordering,]
example firm year
3 Google 2016
1 Google 2017
4 Microsoft 2016
2 Microsoft 2017
df[df$tech_firm,] # Remember the comma! companyName earnings tech_firm all_zero revenue margin
Google Google 12662 TRUE 0 110855 0.1142213
Microsoft Microsoft 21204 TRUE 0 89950 0.2357310
subset() function
subset(df,earnings < 20000) companyName earnings tech_firm all_zero revenue margin
Goldman Goldman 4286 FALSE 0 42254 0.1014342
Google Google 12662 TRUE 0 110855 0.1142213
earnings < 20000df$earnings[1] 4286 12662 21204
df$earnings < 20000[1] TRUE TRUE FALSE
== != > < >= <= ! | &
==
2 == 2 \(\rightarrow\) TRUE
2 == 3 \(\rightarrow\) FALSE
'dog'=='dog' \(\rightarrow\) TRUE
'dog'=='cat' \(\rightarrow\) FALSE
!=
The opposite of ==
2 != 2 \(\rightarrow\) FALSE
2 != 3 \(\rightarrow\) TRUE
'dog'!='cat' \(\rightarrow\) TRUE
== != > < >= <= ! | &
>
2 > 1 \(\rightarrow\) TRUE
2 > 2 \(\rightarrow\) FALSE
2 > 3 \(\rightarrow\) FALSE
'dog'>'cat' \(\rightarrow\) TRUE
<
2 < 1 \(\rightarrow\) FALSE
2 < 2 \(\rightarrow\) FALSE
2 < 3 \(\rightarrow\) TRUE
'dog'<'cat' \(\rightarrow\) FALSE
>=
2 >= 1 \(\rightarrow\) TRUE
2 >= 2 \(\rightarrow\) TRUE
2 >= 3 \(\rightarrow\) FALSE
<=
2 <= 1 \(\rightarrow\) FALSE
2 <= 2 \(\rightarrow\) TRUE
2 <= 3 \(\rightarrow\) TRUE
!
!TRUE \(\rightarrow\) FALSE
!FALSE \(\rightarrow\) TRUE
&
TRUE & TRUE \(\rightarrow\) TRUE
TRUE & FALSE \(\rightarrow\) FALSE
FALSE & FALSE \(\rightarrow\) FALSE
| (pipe, same key as ‘\’)
| is evaluated after all &s
TRUE | TRUE \(\rightarrow\) TRUE
TRUE | FALSE \(\rightarrow\) TRUE
FALSE | FALSE \(\rightarrow\) FALSE
sum(tech_df$revenue > 10000)
[1] 46
sum(tech_df$revenue > 10000 & tech_df$earnings < 0)[1] 4
columns <- c("conm","tic","earnings","revenue")
tech_df[tech_df$revenue > 10000 & tech_df$earnings < 0, columns] conm tic earnings revenue
35 CORNING INC GLW -497.000 10116.00
45 TELEFONAKTIEBOLAGET LM ERICS ERIC -4307.493 24629.64
120 DELL TECHNOLOGIES INC 7732B -3728.000 78660.00
214 NOKIA CORP NOK -1796.087 27917.49
TRUE and FALSE already
FALSE can be represented as 0
TRUE can be represented as any non-zero number
Inf: Infinity, often caused by dividing something by 0
NaN: “Not a number,” likely that the expression 0/0 occurred
NA: A missing value, usually not due to a mathematical error
NULL: Indicates a variable has nothing in it
# cond1, cond2, etc. can be any logical expression
if(cond1) {
# Code runs if cond1 is TRUE
} else if (cond2) { # Can repeat 'else if' as needed
# Code runs if this is the first condition that is TRUE
} else {
# Code runs if none of the above conditions TRUE
}ifelse()
TRUE or FALSE
TRUE
FALSE

while() loop executes code repeatedly until a specified condition is FALSE
i = 0
while(i < 5) {
print(i)
i = i + 2
}[1] 0
[1] 2
[1] 4
identical(margin_1, margin_2) # Are these calculations identical? Yes they are.[1] TRUE
paste(as.numeric(time_1) / as.numeric(time_2), "times") # How much slower is the loop?[1] "5.00047687172151 times"
? and help()
data.frame():
args()
args(data.frame)function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = FALSE)
NULL
args(data.frame)function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = FALSE)
NULL
... represents a series of inputs
name=data, where name is the column name and data is a vector____ = ____ arguments are options for the function
? commandargs() functioninstall.packages() command# To install the tidyverse package:
install.packages("tidyverse")
# To install ggplot2, dplyr, and magrittr packages:
install.packages(c("ggplot2", "dplyr", "magrittr"))library()
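Pipe notation is never necessary, and the %>% pipe used here is not built in to R (it comes from the magrittr package; R 4.1.0 and later also provide a native |> pipe)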
%>%
Left %>% Right(arg2, ...) is the same as Right(Left, arg2, ...)
Piping can drastically improve code readability
Plot tech firms’ earnings vs revenue, >$10B in revenue
Note: The ~ indicates a formula: the left side is the y-axis and the right side is the x-axis
Note: The | tells lattice to make panels based on the variable(s) to the right
function() function!
my_func <- function(arguments) {code}
Simple function: Add 2 to a number
#' Add two to a number.
#'
#' @param n A numeric value; `+` is applied element-wise, so a numeric
#'   vector works as well.
#' @return `n` increased by 2.
add_two <- function(n) {
  result <- n + 2
  result
}
add_two(500)[1] 502
#' Multiply two numbers together, or square the first one.
#'
#' @param n1 First factor.
#' @param n2 Second factor; defaults to 0 and is only used when `square`
#'   is FALSE.
#' @param square If TRUE, `n2` is ignored and `n1 * n1` is returned.
#' @return `n1 * n2` when `square` is FALSE, otherwise `n1 * n1`.
mult_together <- function(n1, n2=0, square=FALSE) {
  # Guard clause: the plain product is the default (non-square) case
  if (!square) {
    return(n1 * n2)
  }
  # Squaring requested: n2 is never touched on this path
  n1 * n1
}
mult_together(5,6)[1] 30
mult_together(5,6,square=TRUE)[1] 25
mult_together(5,square=TRUE)[1] 25
Having completed these slides, you should be ready for any R code in the class!
