Note that the directories used to store data are likely different on your computer, and such references will need to be changed before using any such code.
# Google ngrams for Data TODO
There were 30 warnings (use warnings() to see them)
library(ngramr)
library(ggplot2)
ngd <- c("(Analytics + analytics)", "(Machine learning + machine learning)")
ggram(ngd, year_start=1960, geom = "area", google_theme=F, smoothing = 3) + theme(legend.position="bottom", legend.direction="horizontal")
library(readr)
df <- read_csv("../../Data/Session_1-1.csv")
df <- df[df$indfmt != "FS",] # remove FS codes
df <- df[order(df$gvkey,df$datadate),]
df$ni_lag <- c(NA, df$ni[1:nrow(df)-1])
df <- df[floor(df$datadate/10000) == 2017,]
df <- df[!(is.na(df$ni) | is.na(df$ni_lag)),]
library(ggplot2)
suppressPackageStartupMessages(library(plotly))
cor <- cor(x=df$ni, y=df$ni_lag, method="pearson")
set.seed(11)
df_s <- df[sample(nrow(df), 400),]
plot <- ggplot(df_s, aes(x=ni_lag, y=ni)) +
geom_point(shape=1, aes(text=sprintf("Ticker: %s", tic))) +
geom_smooth(aes(x=ni_lag, y=ni), method=lm, se=T) +
labs(x="2016 Net Income ($M USD)",
y="2017 Net Income ($M USD)")
ggplotly(plot, tooltip="text")
library(readr)
# import GDP data
dfg <- read_csv("../../Data/per-capita-gni-and-per-capita-gdp-at-current-market-prices-annual-2019.08.csv")
dfg <- dfg[dfg$level_1 == "Per Capita GDP",]
dfg <- dfg[order(dfg$year),]
# import unemployment data
dfu <- read_csv("../../Data/overall-unemployment-rate-annual-2019.08.csv")
dfu <- dfu[order(dfu$year),]
# merge into one dataset
df <- data.frame(year=1993:2016,
growth=dfg[dfg$year>1992,]$value / dfg[dfg$year > 1991 & dfg$year < 2016,]$value - 1,
unemploy=dfu[dfu$year>1992,]$unemployment_rate,
unemploy_lag=dfu[dfu$year<2016,]$unemployment_rate)
library(ggplot2)
suppressPackageStartupMessages(library(plotly))
cor <- cor(x=df$growth, y=df$unemploy, method="pearson")
plot <- ggplot(df, aes(x=unemploy, y=growth)) +
geom_point(shape=1, aes(text=sprintf("Year: %i", year))) +
geom_smooth(method=lm, se=T) +
labs(x="Unemployment rate",
y="GDP Growth")
ggplotly(plot, tooltip="text")
drown <- c(109,102,102,98,85,95,96,98,123,94,102) # US CDC data, 1999 to 2009 (falling in)
drown2 <- c(421,465,494,538,430,530,511,600,582,605,603)
ic <- c(16.1, 15.8, 16.2, 15.9 ,14.6 ,15.1 ,15.3, 14.8, 14.2 ,13.9 ,14 ,13.2 ,13.2 ,13 ,12.5 ,12.9 ,13.1) # Statista, 2000 to 2016
icprice <- c(126.858,128.467,130.858,131.733,134.808,137.508,144.592,150.575,155.500,161.742,164.375,173.408,179.033,175.467,178.250,177.575,179.233,183.373,192.768,196.637,195.055,209.161,215.524,215.489,215.412,219.201,219.452,219.903,219.593) # CPI data for ice cream and related products, 1990-2018
cancerthursday <- c(80262,80994,81321,81988,81390,82682,82109,82516,82663,83166,84978,84742) # CDC (tylervigen.com)
# https://futurism.com/correlation-vs-causation-2/
df <- data.frame(drown=drown2[2:length(drown2)],
icecream=ic[1 : length(drown2)-1],
icecreamprice=icprice[11:(length(drown2)+9)],
year=2000:2009)
df$drown <- df$drown / mean(df$drown)
df$icecream <- df$icecream / mean(df$icecream)
df$icecream2 <- df$icecream*df$icecreamprice / mean(df$icecream*df$icecreamprice)
df$icecream3 <- df$icecreamprice / mean(df$icecreamprice)
cor_unit <- cor(x=df$icecream, y=df$drown)
cor_rev <- cor(x=df$icecream2, y=df$drown)
cor_price <- cor(x=df$icecreamprice, y=df$drown)
library(ggplot2)
library(reshape2)
dm <- melt(df[c("drown", "icecream2", "year")],id.var=3)
plot1 <- ggplot(dm, aes(x=year, y=value, colour=variable)) +
geom_point(shape=1) +
geom_smooth(method=loess, se=F) +
labs(x="Year",
y="% of average year") +
scale_x_continuous(breaks=c(2000,2003,2006,2009)) +
scale_colour_discrete(name="Statistic",
breaks=c("icecream2","drown"),
labels=c("Ice cream revenue","Drowning in pools"))
dm <- melt(df[c("drown", "icecream3", "year")],id.var=3)
plot2 <- ggplot(dm, aes(x=year, y=value, colour=variable)) +
geom_point(shape=1) +
geom_smooth(method=loess, se=F) +
labs(x="Year",
y="% of average year") +
scale_x_continuous(breaks=c(2000,2003,2006,2009)) +
scale_colour_discrete(name="Statistic",
breaks=c("icecream3","drown"),
labels=c("Ice cream volume sold","Drowning in pools"))
dm <- melt(df[c("drown", "icecream", "year")],id.var=3)
plot3 <- ggplot(dm, aes(x=year, y=value, colour=variable)) +
geom_point(shape=1) +
geom_smooth(method=loess, se=F) +
labs(x="Year",
y="% of average year") +
scale_x_continuous(breaks=c(2000,2003,2006,2009)) +
scale_colour_discrete(name="Statistic",
breaks=c("icecream","drown"),
labels=c("Ice cream price","Drowning in pools"))
suppressPackageStartupMessages(library(plotly))
ggplotly(plot1)
suppressPackageStartupMessages(library(plotly))
ggplotly(plot3)
suppressPackageStartupMessages(library(plotly))
ggplotly(plot2)
# Addition uses '+'
1 + 1
[1] 2
# Subtraction uses '-'
2 - 1
[1] 1
# Multiplication uses '*'
3 * 3
[1] 9
# Division uses '/'
4 / 2
[1] 2
# Exponentiation uses '^'
5 ^ 5
[1] 3125
# Modulus (aka the remainder) uses '%%'
46 %% 6
[1] 4
# Integer division uses '%/%'
46 %/% 6
[1] 7
# Store 2 in 'x'
x <- 2
# Check the value of x
x
[1] 2
# Store arithmetic in y
y <- x * 2
# Check the value of y
y
[1] 4
# Previous value of x and y
paste(x, y)
[1] "2 4"
# Change x, then recheck the value
# of x and y
x <- 200
paste(x, y)
[1] "200 4"
# Data from Singtel's earnings reports, in Millions of SGD
singtel_2017 <- 3831.0
singtel_2018 <- 5430.3
# Compute growth
growth <- singtel_2018 / singtel_2017 - 1
# Check the value of growth
growth
[1] 0.4174628
company_name <- "Google" # character
company_name
[1] "Google"
company_name <- 'Google' # character
company_name
[1] "Google"
tech_firm <- TRUE # boolean
tech_firm
[1] TRUE
earnings <- 12662 # numeric, $M USD
earnings
[1] 12662
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v tibble 3.0.3 v dplyr 1.0.1
v tidyr 1.1.1 v stringr 1.4.0
v purrr 0.3.4 v forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks plotly::filter(), stats::filter()
x dplyr::lag() masks stats::lag()
df <- read.csv("../../Data/Session_1-2.csv", stringsAsFactors=FALSE)
# Filter to firms with at least $1M USD revenue, known net income, and fiscal year of 2017
clean_df <- df %>% filter(fyear==2017, !is.na(revt), !is.na(ni), revt > 1)
tech_df <- clean_df %>%
filter(gsector==45) %>%
mutate(revenue = revt,
earnings = ni,
margin = ni/revt)
earnings_2017 <- tech_df$ni
revenue_2017 <- tech_df$revt
names_2017 <- tech_df$conm
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017
# Calculating proit margin for all public US tech firms
# 715 tech firms in Compustat with >1M sales in 2017
# Data:
# earnings_2017: vector of earnings, $M USD
# revenue_2017: vector of revenue, $M USD
# names_2017: a vector of tickers (strings)
# Namining the vectors
names(earnings_2017) <- names_2017
names(revenue_2017) <- names_2017
earnings_2017[1:6]
AVX CORP BK TECHNOLOGIES ADVANCED MICRO DEVICES
4.910 -3.626 43.000
ASM INTERNATIONAL NV SKYWORKS SOLUTIONS INC ANALOG DEVICES
543.878 1010.200 727.259
revenue_2017[1:6]
AVX CORP BK TECHNOLOGIES ADVANCED MICRO DEVICES
1562.474 39.395 5329.000
ASM INTERNATIONAL NV SKYWORKS SOLUTIONS INC ANALOG DEVICES
886.503 3651.400 5107.503
# Summarizing vectors
summary(earnings_2017)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4307.49 -15.98 1.84 296.84 91.36 48351.00
summary(revenue_2017)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.06 102.62 397.57 3023.78 1531.59 229234.00
# Calculating profit margin
margin <- earnings_2017 / revenue_2017
summary(margin)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-13.97960 -0.10253 0.01353 -0.10967 0.09295 1.02655
# Worst, midpoint, and best profit margin firms in 2017. Our names carried over :)
margin[order(margin)][c(1,length(margin)/2,length(margin))]
HELIOS AND MATHESON ANALYTIC NLIGHT INC CCUR HOLDINGS INC
-13.97960161 0.01325588 1.02654899
columns <- c("Google", "Microsoft",
"Goldman")
rows <- c("Earnings","Revenue")
firm_data <- matrix(data=
c(12662, 21204, 4286, 110855,
89950, 42254), nrow=2)
# Equivalent:
# matrix(data=c(12662, 21204, 4286,
# 110855, 89950, 42254), ncol=3)
# Apply names
rownames(firm_data) <- rows
colnames(firm_data) <- columns
# Print the matrix
firm_data
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
firm_data[2, 3]
[1] 42254
firm_data[, c("Google", "Microsoft")]
Google Microsoft
Earnings 12662 4286
Revenue 21204 110855
firm_data[1,]
Google Microsoft Goldman
12662 4286 89950
firm_data["Revenue", "Goldman"]
[1] 42254
indcode <- c(45,45,40)
jpdata <- c(17370, 115475)
# Preloaded: industry codes as indcode (vector)
# Preloaded: industry codes as indcode (vector)
# - GICS codes: 40=Financials, 45=Information Technology
# - See: https://en.wikipedia.org/wiki/Global_Industry_Classification_Standard
# Preloaded: JPMorgan data as jpdata (vector)
mat <- rbind(firm_data,indcode) # Add a row
rownames(mat)[3] <- "Industry" # Name the new row
mat
Google Microsoft Goldman
Earnings 12662 4286 89950
Revenue 21204 110855 42254
Industry 45 45 40
mat <- cbind(firm_data,jpdata) # Add a column
colnames(mat)[4] <- "JPMorgan" # Name the new column
mat
Google Microsoft Goldman JPMorgan
Earnings 12662 4286 89950 17370
Revenue 21204 110855 42254 115475
# Ignore this code for now...
model <- summary(lm(earnings ~ revenue, data=tech_df))
#Note that this function is hiding something...
model
Call:
lm(formula = earnings ~ revenue, data = tech_df)
Residuals:
Min 1Q Median 3Q Max
-16045.0 20.0 141.6 177.1 12104.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.837e+02 4.491e+01 -4.091 4.79e-05 ***
revenue 1.589e-01 3.564e-03 44.585 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1166 on 713 degrees of freedom
Multiple R-squared: 0.736, Adjusted R-squared: 0.7356
F-statistic: 1988 on 1 and 713 DF, p-value: < 2.2e-16
model["r.squared"]
$r.squared
[1] 0.7360059
model[["r.squared"]]
[1] 0.7360059
model$r.squared
[1] 0.7360059
earnings <- c(12662, 21204, 4286)
company <- c("Google", "Microsoft", "Goldman")
names(earnings) <- company
earnings["Google"]
Google
12662
earnings[["Google"]]
[1] 12662
#Can't use $ with vectors
str(model)
List of 11
$ call : language lm(formula = earnings ~ revenue, data = tech_df)
$ terms :Classes 'terms', 'formula' language earnings ~ revenue
.. ..- attr(*, "variables")= language list(earnings, revenue)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "earnings" "revenue"
.. .. .. ..$ : chr "revenue"
.. ..- attr(*, "term.labels")= chr "revenue"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(earnings, revenue)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "earnings" "revenue"
$ residuals : Named num [1:715] -59.7 173.8 -620.2 586.7 613.6 ...
..- attr(*, "names")= chr [1:715] "1" "2" "3" "4" ...
$ coefficients : num [1:2, 1:4] -1.84e+02 1.59e-01 4.49e+01 3.56e-03 -4.09 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:2] FALSE FALSE
..- attr(*, "names")= chr [1:2] "(Intercept)" "revenue"
$ sigma : num 1166
$ df : int [1:3] 2 713 2
$ r.squared : num 0.736
$ adj.r.squared: num 0.736
$ fstatistic : Named num [1:3] 1988 1 713
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:2, 1:2] 1.48e-03 -2.83e-08 -2.83e-08 9.35e-12
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "revenue"
.. ..$ : chr [1:2] "(Intercept)" "revenue"
- attr(*, "class")= chr "summary.lm"
library(tidyverse) # Imports most tidy packages
# Base R data import -- stringsAsFactors is important here
df <- read.csv("../../Data/Session_1-2.csv", stringsAsFactors=FALSE)
df <- subset(df, fyear == 2017 & !is.na(revt) & !is.na(ni) &
revt > 1 & gsector == 45)
df$margin = df$ni / df$revt
summary(df)
gvkey datadate fyear indfmt
Min. : 1072 Min. :20170630 Min. :2017 Length:715
1st Qu.: 20231 1st Qu.:20171231 1st Qu.:2017 Class :character
Median : 33232 Median :20171231 Median :2017 Mode :character
Mean : 79699 Mean :20172029 Mean :2017
3rd Qu.:148393 3rd Qu.:20171231 3rd Qu.:2017
Max. :315629 Max. :20180430 Max. :2017
consol popsrc datafmt tic
Length:715 Length:715 Length:715 Length:715
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
conm curcd ni revt
Length:715 Length:715 Min. :-4307.49 Min. : 1.06
Class :character Class :character 1st Qu.: -15.98 1st Qu.: 102.62
Mode :character Mode :character Median : 1.84 Median : 397.57
Mean : 296.84 Mean : 3023.78
3rd Qu.: 91.36 3rd Qu.: 1531.59
Max. :48351.00 Max. :229234.00
cik costat gind gsector gsubind
Min. : 2186 Length:715 Min. :451010 Min. :45 Min. :45101010
1st Qu.: 887604 Class :character 1st Qu.:451020 1st Qu.:45 1st Qu.:45102020
Median :1102307 Mode :character Median :451030 Median :45 Median :45103020
Mean :1086969 Mean :451653 Mean :45 Mean :45165290
3rd Qu.:1405497 3rd Qu.:452030 3rd Qu.:45 3rd Qu.:45203012
Max. :1725579 Max. :453010 Max. :45 Max. :45301020
NA's :3
margin
Min. :-13.97960
1st Qu.: -0.10253
Median : 0.01353
Mean : -0.10967
3rd Qu.: 0.09295
Max. : 1.02655
# Tidy import
df <- read_csv("../../Data/Session_1-2.csv") %>%
filter(fyear == 2017, # fiscal year
!is.na(revt), # revenue not missing
!is.na(ni), # net income not missing
revt > 1, # at least 1M USD in revenue
gsector == 45) %>% # tech firm
mutate(margin = ni/revt) # profit margin
summary(df)
gvkey datadate fyear indfmt
Length:715 Min. :20170630 Min. :2017 Length:715
Class :character 1st Qu.:20171231 1st Qu.:2017 Class :character
Mode :character Median :20171231 Median :2017 Mode :character
Mean :20172029 Mean :2017
3rd Qu.:20171231 3rd Qu.:2017
Max. :20180430 Max. :2017
consol popsrc datafmt tic
Length:715 Length:715 Length:715 Length:715
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
conm curcd ni revt
Length:715 Length:715 Min. :-4307.49 Min. : 1.06
Class :character Class :character 1st Qu.: -15.98 1st Qu.: 102.62
Mode :character Mode :character Median : 1.84 Median : 397.57
Mean : 296.84 Mean : 3023.78
3rd Qu.: 91.36 3rd Qu.: 1531.59
Max. :48351.00 Max. :229234.00
cik costat gind gsector gsubind
Length:715 Length:715 Min. :451010 Min. :45 Min. :45101010
Class :character Class :character 1st Qu.:451020 1st Qu.:45 1st Qu.:45102020
Mode :character Mode :character Median :451030 Median :45 Median :45103020
Mean :451653 Mean :45 Mean :45165290
3rd Qu.:452030 3rd Qu.:45 3rd Qu.:45203012
Max. :453010 Max. :45 Max. :45301020
margin
Min. :-13.97960
1st Qu.: -0.10253
Median : 0.01353
Mean : -0.10967
3rd Qu.: 0.09295
Max. : 1.02655
df <- mutate(
filter(
read_csv("../../Data/Session_1-2.csv"),
fyear == 2017, # fiscal year
!is.na(revt), # revenue not missing
!is.na(ni), # net income not missing
revt > 1, # at least 1M USD in revenue
gsector == 45), # tech firm
margin = ni/revt) # profit margin
summary(df)
gvkey datadate fyear indfmt
Length:715 Min. :20170630 Min. :2017 Length:715
Class :character 1st Qu.:20171231 1st Qu.:2017 Class :character
Mode :character Median :20171231 Median :2017 Mode :character
Mean :20172029 Mean :2017
3rd Qu.:20171231 3rd Qu.:2017
Max. :20180430 Max. :2017
consol popsrc datafmt tic
Length:715 Length:715 Length:715 Length:715
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
conm curcd ni revt
Length:715 Length:715 Min. :-4307.49 Min. : 1.06
Class :character Class :character 1st Qu.: -15.98 1st Qu.: 102.62
Mode :character Mode :character Median : 1.84 Median : 397.57
Mean : 296.84 Mean : 3023.78
3rd Qu.: 91.36 3rd Qu.: 1531.59
Max. :48351.00 Max. :229234.00
cik costat gind gsector gsubind
Length:715 Length:715 Min. :451010 Min. :45 Min. :45101010
Class :character Class :character 1st Qu.:451020 1st Qu.:45 1st Qu.:45102020
Mode :character Mode :character Median :451030 Median :45 Median :45103020
Mean :451653 Mean :45 Mean :45165290
3rd Qu.:452030 3rd Qu.:45 3rd Qu.:45203012
Max. :453010 Max. :45 Max. :45301020
margin
Min. :-13.97960
1st Qu.: -0.10253
Median : 0.01353
Mean : -0.10967
3rd Qu.: 0.09295
Max. : 1.02655
# To see the arguments a function takes, run:
args(data.frame)
function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE,
fix.empty.names = TRUE, stringsAsFactors = default.stringsAsFactors())
NULL
add_two <- function(n) {
n + 2
}
add_two(500)
[1] 502
mult_together <- function(n1, n2=0, square=FALSE) {
if (!square) {
n1 * n2
} else {
n1 * n1
}
}
mult_together(5,6)
[1] 30
mult_together(5,6,square=TRUE)
[1] 25
mult_together(5,square=TRUE)
[1] 25
FXRate <- function(from="USD", to="SGD", dt=Sys.Date()) {
options("getSymbols.warning4.0"=FALSE)
require(quantmod)
data <- getSymbols(paste0(from, "/", to), from=dt-3, to=dt, src="oanda", auto.assign=F)
return(data[[1]])
}
date()
[1] "Mon Aug 17 01:54:47 2020"
FXRate(from="USD", to="SGD") # Today's SGD to USD rate
[1] 1.371802
FXRate(from="SGD", to="CNY") # Today's SGD to CNY rate
[1] 5.065638
FXRate(from="USD", to="SGD", dt=Sys.Date()-90) # Last quarter's SGD to USD rate
[1] 1.42746