Note that the directories used to store data are likely different on your computer; update these paths before running any of the code below.
library(knitr)
library(kableExtra)
# Helper to render a data frame as a striped, hoverable HTML table
#   cols: optional display names for the columns (defaults to colnames(text))
#   col1: if TRUE, bold the first column
#   full: if TRUE, let the table span the full page width
html_df <- function(text, cols=NULL, col1=FALSE, full=FALSE) {
  if (!length(cols)) {
    cols <- colnames(text)
  }
  tbl <- kable(text, "html", col.names = cols,
               align = c("l", rep('c', length(cols) - 1))) %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = full)
  if (col1) {
    tbl <- tbl %>% column_spec(1, bold = TRUE)
  }
  tbl
}
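A hypothetical usage example (mtcars is just R's built-in demo data, standing in for any data frame):
# Render the first rows of mtcars as a styled HTML table, bolding the first column
html_df(head(mtcars[, 1:3]), col1 = TRUE)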
library(tidyverse)
# Note: read_csv is like read.csv, but returns a tibble (the tidyverse's data frame)
df_long <- read_csv("../../Data/general-business-expectations-by-detailed-services-industry-quarterly.csv")
df_long[1:3,c(1,4:5)]
# This file is in wide format: one column per time period
df_wide <- read_csv("../../Assignments/Assignment_1/Zip_Zhvi_SingleFamilyResidence.csv")
df_wide[1:3,c(1,8:10)]
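Long and wide are two layouts of the same information, and tidyr (part of the tidyverse) converts between them. A minimal sketch on made-up data, not the Zillow file:
# Sketch: reshape hypothetical wide data (one column per year) into long format
wide <- tibble(region = c("A", "B"),
               `2019` = c(100, 200),
               `2020` = c(110, 210))
pivot_longer(wide, cols = -region, names_to = "year", values_to = "value")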
# Read text from a .txt file using read_file()
doc <- read_file("../../Data/0001104659-14-015152.txt")
# str_wrap is from stringr from tidyverse
cat(str_wrap(substring(doc,1,500), 80))
UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM
10-K ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE
ACT OF 1934 For the fiscal year ended December 31, 2013 Commission file number
1-9924 Citigroup Inc. (Exact name of registrant as specified in its charter)
Securities registered pursuant to Section 12(b) of the Act: See Exhibit 99.01
Securities registered pursuant to Section 12(g) of the Act: none Indicate by
check mark if the registrant is a
library(httr)
library(XML)
httpResponse <- GET('https://coinmarketcap.com/currencies/ethereum/')
html <- httr::content(httpResponse, "text")
paste0('...', str_wrap(substring(html, 135418, 135480), 80), '...')
[1] "...div>$349.36 USD /</div><div>$352.16 USD</div></td></tr><tr><th..."
xpath <- '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[1]/div/div[2]/span[1]/span[1]/text()'
hdoc <- htmlParse(html, asText=TRUE) # from XML
price <- xpathSApply(hdoc, xpath, xmlValue)
print(paste0("Ethereum was priced at $", price,
" when these slides were compiled"))
[1] "Ethereum was priced at $$349.39 when these slides were compiled"
# The actual version I use (with caching to avoid repeated lookups) is in the appendix
cryptoMC <- function(name) {
  httpResponse <- GET(paste0('https://coinmarketcap.com/currencies/', name, '/'))
  html <- httr::content(httpResponse, "text")  # content of this coin's page
  xpath <- '//*[@id="__next"]/div[1]/div[2]/div[1]/div[2]/div[1]/div/div[2]/span[1]/span[1]/text()'
  hdoc <- htmlParse(html, asText=TRUE)
  xpathSApply(hdoc, xpath, xmlValue)
}
paste("Ethereum was priced at", cryptoMC("ethereum"))
[1] "Ethereum was priced at $349.39"
paste("Litecoin was priced at", cryptoMC("litecoin"))
[1] "Litecoin was priced at $349.39"
cat(str_wrap(str_sub(doc, 9896, 9929), 80))
Citis net income was $13.5 billion
cat(str_wrap(str_sub(doc, 28900,29052), 80))
Net income decreased 14%, mainly driven by lower revenues and lower loan loss
reserve releases, partially offset by lower net credit losses and expenses.
sentence <- str_sub(doc, 9896, 9929)
str_to_lower(sentence)
[1] "citis net income was $13.5 billion"
str_to_upper(sentence)
[1] "CITIS NET INCOME WAS $13.5 BILLION"
str_to_title(sentence)
[1] "Citis Net Income Was $13.5 Billion"
board <- c("Michael L. Corbat", "Michael E. O’Neill", "Anthony M. Santomero",
"William S. Thompson, Jr.", "Duncan P. Hennes", "Gary M. Reiner",
"Joan E. Spero", "James S. Turley", "Franz B. Humer",
"Judith Rodin", "Robert L. Ryan", "Diana L. Taylor",
"Ernesto Zedillo Ponce de Leon", "Robert L. Joss")
titles <- c("CEO", "Chairman", "Former president, Fed (Philidelphia)",
"CEO, Retired, PIMCO", "Co-Founder/Partner, Atrevida Partners",
"Operating Partner, General Atlantic",
"Senior Research Scholar, Columbia University",
"Former Chairman & CEO, E&Y", "Chairman, Roche",
"President, Rockefeller Foundation", "CFO, Retired, Medtronic",
"MD, Wolfensohn Fund Management", "Professor, Yale University",
"Professor/Dean Emeritus, Stanford GSB")
# board is a character vector of director names
# titles is a character vector of the directors' titles
paste(board, titles, sep=", ")
[1] "Michael L. Corbat, CEO"
[2] "Michael E. O’Neill, Chairman"
[3] "Anthony M. Santomero, Former president, Fed (Philidelphia)"
[4] "William S. Thompson, Jr., CEO, Retired, PIMCO"
[5] "Duncan P. Hennes, Co-Founder/Partner, Atrevida Partners"
[6] "Gary M. Reiner, Operating Partner, General Atlantic"
[7] "Joan E. Spero, Senior Research Scholar, Columbia University"
[8] "James S. Turley, Former Chairman & CEO, E&Y"
[9] "Franz B. Humer, Chairman, Roche"
[10] "Judith Rodin, President, Rockefeller Foundation"
[11] "Robert L. Ryan, CFO, Retired, Medtronic"
[12] "Diana L. Taylor, MD, Wolfensohn Fund Management"
[13] "Ernesto Zedillo Ponce de Leon, Professor, Yale University"
[14] "Robert L. Joss, Professor/Dean Emeritus, Stanford GSB"
cat(str_wrap(paste0("Citi's board consists of: ",
paste(board[1:length(board)-1], collapse=", "),
", and ", board[length(board)], "."), 80))
Citi's board consists of: Michael L. Corbat, Michael E. O’Neill, Anthony M.
Santomero, William S. Thompson, Jr., Duncan P. Hennes, Gary M. Reiner, Joan E.
Spero, James S. Turley, Franz B. Humer, Judith Rodin, Robert L. Ryan, Diana L.
Taylor, Ernesto Zedillo Ponce de Leon, and Robert L. Joss.
sentence
[1] "Citis net income was $13.5 billion"
str_replace_all(sentence, "\\$13\\.5", "over $10")
[1] "Citis net income was over $10 billion"
paragraphs <- str_split(doc, '\n')[[1]]
# number of paragraphs
length(paragraphs)
[1] 206
# Last paragraph
cat(str_wrap(paragraphs[206], 80))
The total amount of securities authorized pursuant to any instrument defining
rights of holders of long-term debt of the Company does not exceed 10% of the
total assets of the Company and its consolidated subsidiaries. The Company
will furnish copies of any such instrument to the SEC upon request. Copies of
any of the exhibits referred to above will be furnished at a cost of $0.25 per
page (although no charge will be made for the 2013 Annual Report on Form 10-
K) to security holders who make written request to Citigroup Inc., Corporate
Governance, 153 East 53 rd Street, 19 th Floor, New York, New York 10022. *
Denotes a management contract or compensatory plan or arrangement. + Filed
herewith.
str_locate_all(tolower(doc), "net income")
[[1]]
start end
[1,] 8508 8517
[2,] 9902 9911
[3,] 16549 16558
[4,] 17562 17571
[5,] 28900 28909
[6,] 32197 32206
[7,] 35077 35086
[8,] 37252 37261
[9,] 40187 40196
[10,] 43257 43266
[11,] 45345 45354
[12,] 47618 47627
[13,] 51865 51874
[14,] 51953 51962
[15,] 52663 52672
[16,] 52748 52757
[17,] 54970 54979
[18,] 58817 58826
[19,] 96022 96031
[20,] 96717 96726
[21,] 99297 99306
[22,] 188340 188349
[23,] 189049 189058
[24,] 201462 201471
[25,] 456097 456106
[26,] 460158 460167
[27,] 460446 460455
[28,] 460467 460476
[29,] 475016 475025
[30,] 475298 475307
[31,] 545581 545590
[32,] 554362 554371
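The start/end matrix combines naturally with str_sub() to pull context around each match. A small sketch; the 15-character window is an arbitrary choice:
# Show 15 characters of context on either side of the first few matches
locs <- str_locate_all(str_to_lower(doc), "net income")[[1]]
head(str_sub(doc, locs[, "start"] - 15, locs[, "end"] + 15), 3)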
x <- str_detect(str_to_lower(paragraphs), "net income")
x[1:10]
[1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
sum(x)
[1] 13
x <- str_count(str_to_lower(paragraphs), "net income")
x[1:10]
[1] 0 0 0 0 0 4 0 0 2 2
max(x)
[1] 4
str_locate(str_to_lower(doc), "net income")
start end
[1,] 8508 8517
str_extract(str_to_lower(doc), "net income")
[1] "net income"
# Extract all emails from the annual report
str_extract_all(doc,'[:graph:]+@[:alnum:]+\\.[.[:alnum:]]+')
[[1]]
[1] "shareholder@computershare.com" "shareholder@computershare.com"
[3] "docserve@citi.com" "shareholderrelations@citi.com"
text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
alpha=str_detect(text,'[:alpha:]'),
lower=str_detect(text,'[:lower:]'),
upper=str_detect(text,'[:upper:]'),
digit=str_detect(text,'[:digit:]'),
alnum=str_detect(text,'[:alnum:]')
))
text | alpha | lower | upper | digit | alnum |
---|---|---|---|---|---|
abcde | TRUE | TRUE | FALSE | FALSE | TRUE |
ABCDE | TRUE | FALSE | TRUE | FALSE | TRUE |
12345 | FALSE | FALSE | FALSE | TRUE | TRUE |
!?!?. | FALSE | FALSE | FALSE | FALSE | FALSE |
ABC123? | TRUE | FALSE | TRUE | TRUE | TRUE |
With space | TRUE | TRUE | TRUE | FALSE | TRUE |
New line | TRUE | TRUE | TRUE | FALSE | TRUE |
text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
punct=str_detect(text,'[:punct:]'),
graph=str_detect(text,'[:graph:]'),
space=str_detect(text,'[:space:]'),
blank=str_detect(text,'[:blank:]'),
period=str_detect(text,'.')  # '.' matches ANY character, not just a period ('\\.' matches a literal dot)
))
text | punct | graph | space | blank | period |
---|---|---|---|---|---|
abcde | FALSE | TRUE | FALSE | FALSE | TRUE |
ABCDE | FALSE | TRUE | FALSE | FALSE | TRUE |
12345 | FALSE | TRUE | FALSE | FALSE | TRUE |
!?!?. | TRUE | TRUE | FALSE | FALSE | TRUE |
ABC123? | TRUE | TRUE | FALSE | FALSE | TRUE |
With space | FALSE | TRUE | TRUE | TRUE | TRUE |
New line | FALSE | TRUE | TRUE | FALSE | TRUE |
df_RE <- read.csv("../../Data/Session_2-1.csv")
RE_names <- unique(df_RE$conm)
# Real estate firm names with 3 vowels in a row
str_subset(RE_names, '[AEIOU]{3}')
[1] "STADLAUER MALZFABRIK" "JOAO FORTES ENGENHARIA SA"
# Real estate firm names with no vowels
str_subset(RE_names, '^[^AEIOU]+$')
[1] "FGP LTD" "MBK PCL" "MYP LTD" "MCT BHD" "R T C L LTD"
# Real estate firm names with at least 11 vowels
str_subset(RE_names, '([^AEIOU]*[AEIOU]){11,}')
[1] "INTERNATIONAL ENTERTAINMENT" "PREMIERE HORIZON ALLIANCE"
[3] "JOAO FORTES ENGENHARIA SA" "OVERSEAS CHINESE TOWN (ASIA)"
[5] "COOPERATIVE CONSTRUCTION CO" "FRANCE TOURISME IMMOBILIER"
[7] "BONEI HATICHON CIVIL ENGINE"
# Real estate firm names with a repeated 4 letter pattern
str_subset(RE_names, '([:upper:]{4}).*\\1')
[1] "INTERNATIONAL ENTERTAINMENT" "CHONG HONG CONSTRUCTION CO"
[3] "ZHONGHONG HOLDING CO LTD" "DEUTSCHE GEOTHERMISCHE IMMOB"
# Compustat firm names example
df_RE_names <- df_RE %>%
group_by(isin) %>%
slice(1) %>%
mutate(SG_in_name = str_detect(conm, "(SG|SINGAPORE)"),
name_length = str_length(conm),
SG_firm = ifelse(fic=="SGP",1,0)) %>%
ungroup()
df_RE_names %>%
group_by(SG_firm) %>%
mutate(pct_SG = mean(SG_in_name) * 100) %>%
slice(1) %>%
ungroup() %>%
select(SG_firm, pct_SG)
library(DT)
df_RE_names %>%
group_by(fic) %>%
mutate(avg_name_length = mean(name_length)) %>%
slice(1) %>%
ungroup() %>%
select(fic, avg_name_length) %>%
arrange(desc(avg_name_length), fic) %>%
datatable(options = list(pageLength = 5))
library(quanteda)
textstat_readability(doc, "Flesch.Kincaid")
textstat_readability(doc, "FOG")
textstat_readability(doc, "Coleman.Liau.short")
# Example of "tokenizing"
library(tidytext)
df_doc <- data.frame(ID=c("0001104659-14-015152"), text=c(doc)) %>%
unnest_tokens(word, text)
# word is the name for the new column
# text is the name of the string column in the input data
html_df(head(df_doc))
 | ID | word |
---|---|---|
1 | 0001104659-14-015152 | united |
1.1 | 0001104659-14-015152 | states |
1.2 | 0001104659-14-015152 | securities |
1.3 | 0001104659-14-015152 | and |
1.4 | 0001104659-14-015152 | exchange |
1.5 | 0001104659-14-015152 | commission |
# get a list of stopwords
stop_en <- stopwords::stopwords("english") # Snowball English
paste0(length(stop_en), " words: ", paste(stop_en[1:5], collapse=", "))
[1] "175 words: i, me, my, myself, we"
stop_SMART <- stopwords::stopwords(source="smart") # SMART English
paste0(length(stop_SMART), " words: ", paste(stop_SMART[1:5], collapse=", "))
[1] "571 words: a, a's, able, about, above"
stop_fr <- stopwords::stopwords("french") # Snowball French
paste0(length(stop_fr), " words: ", paste(stop_fr[1:5], collapse=", "))
[1] "164 words: au, aux, avec, ce, ces"
df_doc_stop <- df_doc %>%
anti_join(data.frame(word=stop_SMART))
Joining, by = "word"
nrow(df_doc)
[1] 128728
nrow(df_doc_stop)
[1] 74985
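Removing the SMART stopwords cuts the token count from 128,728 to 74,985, so stopwords accounted for roughly 42% of all tokens in this filing.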
# Term counts per document, using the stopword-removed tokens
terms <- df_doc_stop %>%
  count(ID, word, sort=TRUE) %>%
  ungroup()
total_terms <- terms %>%
group_by(ID) %>%
summarize(total = sum(n))
tf <- left_join(terms, total_terms) %>% mutate(tf=n/total)
tf
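tidytext can also attach tf-idf directly with bind_tf_idf(); since there is only one document here, idf is log(1/1) = 0 for every term, so this is a sketch of the mechanics rather than a useful weighting in this case:
# Sketch: tf, idf, and tf-idf columns via tidytext (idf is 0 with one document)
terms %>% bind_tf_idf(word, ID, n)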
# Same calculation with stopwords retained (the _ns / no_stop suffix
# means "no stopword removal")
terms_ns <- df_doc %>%
  count(ID, word, sort=TRUE) %>%
  ungroup()
total_terms_ns <- terms_ns %>%
group_by(ID) %>%
summarize(total = sum(n))
tf_no_stop <- left_join(terms_ns, total_terms_ns) %>% mutate(tf=n/total)
get_sentiments("afinn") %>%
group_by(value) %>%
slice(1) %>%
ungroup()
get_sentiments("bing") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
get_sentiments("nrc") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
get_sentiments("loughran") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
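One practical note: of these four lexicons, only "bing" ships with tidytext itself; "afinn", "nrc", and "loughran" are fetched through the textdata package, which prompts for a one-time download on first use.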
tf_sent <- tf %>% left_join(get_sentiments("loughran"))
Joining, by = "word"
tf_sent[1:5,]
tf_sent[!is.na(tf_sent$sentiment),][1:5,]
tf_sent %>%
spread(sentiment, tf, fill=0) %>%
select(constraining, litigious, negative, positive, superfluous, uncertainty) %>%
colSums()
constraining litigious negative positive superfluous uncertainty
0.013242649 0.020750817 0.034780289 0.007054744 0.000373408 0.025325065
#library(plotly)
p <- tf_sent %>%
  filter(!is.na(sentiment)) %>%
  group_by(sentiment) %>%
  arrange(desc(n)) %>%
  mutate(row = row_number()) %>%
  filter(row < 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(y=n, x=word)) +
  geom_col() +
  theme(axis.text.x = element_text(angle=90, hjust=1)) +
  facet_wrap(~sentiment, ncol=3, scales="free_x")
p
#ggplotly(p)
# cast_dfm() turns the tidy term counts into a quanteda dfm
corp <- cast_dfm(tf, ID, word, n)
# Word cloud of the stopword-removed terms
# (in quanteda 3.0+, textplot_wordcloud() lives in the quanteda.textplots package)
textplot_wordcloud(corp, color = RColorBrewer::brewer.pal(9, "Set1"))
corp_no_stop <- cast_dfm(tf_no_stop, ID, word, n)
# With stopwords retained, common function words dominate the cloud
textplot_wordcloud(corp_no_stop, color = RColorBrewer::brewer.pal(9, "Set1"))