Dr. Richard M. Crowley
We’ll get started on these today – sentiment and readability
We will cover making topic models in a later session
This is what we will work on today, and we will revisit some of this in the remaining class sessions
Long format
## # A tibble: 3 x 3
## quarter level_3 value
## <chr> <chr> <chr>
## 1 1995-Q1 Wholesale Trade 17
## 2 1995-Q1 Retail Trade -18
## 3 1995-Q1 Accommodation 16
Wide format
## # A tibble: 3 x 4
## RegionID `1996-04` `1996-05` `1996-06`
## <int> <int> <int> <int>
## 1 84654 334200 335400 336500
## 2 90668 235700 236900 236700
## 3 91982 210400 212200 212200
The structure is given by the IDs, dates, and variables
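tidyr can convert between the two formats. A minimal sketch with made-up values shaped like the housing data above – gather() is the counterpart of the spread() used later in these slides:
library(tidyr)
# Hypothetical wide data mimicking the housing example
wide <- data.frame(RegionID = c(84654, 90668),
                   `1996-04` = c(334200, 235700),
                   `1996-05` = c(335400, 236900),
                   check.names = FALSE)
gather(wide, key = "month", value = "value", -RegionID)  # wide -> long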
"JANUARY"
, "ONE"
, "FEMALE"
All of these require us to determine and impose structure
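For instance, a quick sketch of imposing structure on such strings (the coding scheme here is hypothetical):
# Map month names to numbers; code categories as integers
match(stringr::str_to_title("JANUARY"), month.name)         # 1
as.integer(factor("FEMALE", levels = c("MALE", "FEMALE")))  # 2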
What are call centers using NLP for?
How does NLP help call centers with their business?
Where can we make use of NLP in business?
Regular expressions treat these as special characters: \ | ( ) [ { } ^ $ * + ? . !
\ is itself a special character, so we’ll need to put \ before \ to match a literal backslash. Likewise, to match a special character such as $, we would use \\$ – one \ to escape it in the regex, and a second because \ is also special in R strings.
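For example, a small sketch with made-up strings:
library(stringr)
# Matching a literal $ requires the escaped pattern \\$
str_detect("Net income was $13.5 billion", "\\$")  # TRUE
str_detect("Net income was 13.5 billion", "\\$")   # FALSE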
Common escape sequences:
\t is tab
\r is newline (files from Macs)
\r\n is newline (files from Windows)
\n is newline (files from Unix, Linux, etc.)

# Read text from a .txt file using read_file()
doc <- read_file("../../Data/0001104659-14-015152.txt")
# str_wrap is from stringr from tidyverse
cat(str_wrap(substring(doc,1,500), 80))
## UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM
## 10-K ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE
## ACT OF 1934 For the fiscal year ended December 31, 2013 Commission file number
## 1-9924 Citigroup Inc. (Exact name of registrant as specified in its charter)
## Securities registered pursuant to Section 12(b) of the Act: See Exhibit 99.01
## Securities registered pursuant to Section 12(g) of the Act: none Indicate by
## check mark if the registrant is a
Web pages are structured by HTML tags such as <a>, <table>, <img>, etc.

library(RCurl)
library(XML)
html <- getURL('https://coinmarketcap.com/currencies/ethereum/')
cat(str_wrap(substring(html, 46320, 46427), 80))
## n class="h2 text-semi-bold details-panel-item--price__value" data-currency-
## value>208.90</span> <span class="
xpath <- '//*[@id="quote_price"]/span[1]/text()'
hdoc = htmlParse(html, asText=TRUE) # from XML
price <- xpathSApply(hdoc, xpath, xmlValue)
print(paste0("Ethereum was priced at $", price,
" when these slides were compiled"))
## [1] "Ethereum was priced at $208.90 when these slides were compiled"
# The actual version I use (with caching to avoid repeated lookups) is in the appendix
cryptoMC <- function(name) {
html <- getURL(paste('https://coinmarketcap.com/currencies/',name,'/',sep=''))
xpath <- '//*[@id="quote_price"]/span[1]/text()'
hdoc = htmlParse(html, asText=TRUE)
plain.text <- xpathSApply(hdoc, xpath, xmlValue)
plain.text
}
paste("Ethereum was priced at", cryptoMC("ethereum"))
## [1] "Ethereum was priced at 208.90"
paste("Litecoin was priced at", cryptoMC("litecoin"))
## [1] "Litecoin was priced at 54.71"
We will cover these using stringr as opposed to base R – stringr’s commands are much more consistent
cat(str_wrap(str_sub(doc, 9896, 9929), 80))
## Citis net income was $13.5 billion
cat(str_wrap(str_sub(doc, 28900,29052), 80))
## Net income decreased 14%, mainly driven by lower revenues and lower loan loss
## reserve releases, partially offset by lower net credit losses and expenses.
paste() separates its arguments with the sep= option and can collapse a vector of strings into one using the collapse= option with the desired separator. paste0() is paste() with sep="".
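A quick sketch of the difference, with illustrative strings:
paste("net", "income", sep = "_")         # "net_income"
paste(c("a", "b", "c"), collapse = ", ")  # "a, b, c"
paste0("FY", 2013)                        # "FY2013"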
sentence <- str_sub(doc, 9896, 9929)
str_to_lower(sentence)
## [1] "citis net income was $13.5 billion"
str_to_upper(sentence)
## [1] "CITIS NET INCOME WAS $13.5 BILLION"
str_to_title(sentence)
## [1] "Citis Net Income Was $13.5 Billion"
str_ prefixed functions support non-English languages as well.

# You can run this in an R terminal! (It doesn't work in Rmarkdown though)
str_to_upper("Citis net income was $13.5 billion", locale='tr') # Turkish
paste() also joins vectors of strings element-wise:
# board is a list of director names
# titles is a list of the director's titles
paste(board, titles, sep=", ")
## [1] "Michael L. Corbat, CEO"
## [2] "Michael E. O’Neill, Chairman"
## [3] "Anthony M. Santomero, Former president, Fed (Philadelphia)"
## [4] "William S. Thompson, Jr., CEO, Retired, PIMCO"
## [5] "Duncan P. Hennes, Co-Founder/Partner, Atrevida Partners"
## [6] "Gary M. Reiner, Operating Partner, General Atlantic"
## [7] "Joan E. Spero, Senior Research Scholar, Columbia University"
## [8] "James S. Turley, Former Chairman & CEO, E&Y"
## [9] "Franz B. Humer, Chairman, Roche"
## [10] "Judith Rodin, President, Rockefeller Foundation"
## [11] "Robert L. Ryan, CFO, Retired, Medtronic"
## [12] "Diana L. Taylor, MD, Wolfensohn Fund Management"
## [13] "Ernesto Zedillo Ponce de Leon, Professor, Yale University"
## [14] "Robert L. Joss, Professor/Dean Emeritus, Stanford GSB"
cat(str_wrap(paste0("Citi's board consists of: ",
paste(board[-length(board)], collapse=", "),
", and ", board[length(board)], "."), 80))
## Citi's board consists of: Michael L. Corbat, Michael E. O’Neill, Anthony M.
## Santomero, William S. Thompson, Jr., Duncan P. Hennes, Gary M. Reiner, Joan E.
## Spero, James S. Turley, Franz B. Humer, Judith Rodin, Robert L. Ryan, Diana L.
## Taylor, Ernesto Zedillo Ponce de Leon, and Robert L. Joss.
sentence
## [1] "Citis net income was $13.5 billion"
str_replace_all(sentence, "\\$13.5", "over $10")
## [1] "Citis net income was over $10 billion"
str_split() returns a list of character vectors; [[1]] can extract the first vector. The n= option limits the number of pieces to split into.
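A sketch with an illustrative string:
str_split("10-K|10-Q|8-K", "\\|")[[1]]         # "10-K" "10-Q" "8-K"
str_split("10-K|10-Q|8-K", "\\|", n = 2)[[1]]  # "10-K" "10-Q|8-K"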
paragraphs <- str_split(doc, '\n')[[1]]
# number of paragraphs
length(paragraphs)
## [1] 206
# Last paragraph
cat(str_wrap(paragraphs[206], 80))
## The total amount of securities authorized pursuant to any instrument defining
## rights of holders of long-term debt of the Company does not exceed 10% of the
## total assets of the Company and its consolidated subsidiaries. The Company
## will furnish copies of any such instrument to the SEC upon request. Copies of
## any of the exhibits referred to above will be furnished at a cost of $0.25 per
## page (although no charge will be made for the 2013 Annual Report on Form 10-
## K) to security holders who make written request to Citigroup Inc., Corporate
## Governance, 153 East 53 rd Street, 19 th Floor, New York, New York 10022. *
## Denotes a management contract or compensatory plan or arrangement. + Filed
## herewith.
str_locate_all(tolower(doc), "net income")
## [[1]]
## start end
## [1,] 8508 8517
## [2,] 9902 9911
## [3,] 16549 16558
## [4,] 17562 17571
## [5,] 28900 28909
## [6,] 32197 32206
## [7,] 35077 35086
## [8,] 37252 37261
## [9,] 40187 40196
## [10,] 43257 43266
## [11,] 45345 45354
## [12,] 47618 47627
## [13,] 51865 51874
## [14,] 51953 51962
## [15,] 52663 52672
## [16,] 52748 52757
## [17,] 54970 54979
## [18,] 58817 58826
## [19,] 96022 96031
## [20,] 96717 96726
## [21,] 99297 99306
## [22,] 188340 188349
## [23,] 189049 189058
## [24,] 201462 201471
## [25,] 456097 456106
## [26,] 460158 460167
## [27,] 460446 460455
## [28,] 460467 460476
## [29,] 475016 475025
## [30,] 475298 475307
## [31,] 545581 545590
## [32,] 554362 554371
str_detect() returns TRUE or FALSE for the presence of a string in the text.

x <- str_detect(str_to_lower(paragraphs), "net income")
x[1:10]
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
sum(x)
## [1] 13
x <- str_count(str_to_lower(paragraphs), "net income")
x[1:10]
## [1] 0 0 0 0 0 4 0 0 2 2
max(x)
## [1] 4
str_locate(str_to_lower(doc), "net income")
## start end
## [1,] 8508 8517
str_extract(str_to_lower(doc), "net income")
## [1] "net income"
Emails contain an @ sign, and [:graph:]+ matches everything before it. Domains are letters and numbers with at least one . in them though, so [:alnum:]+\\.[.[:alnum:]]+ covers everything after the @.
# Extract all emails from the annual report
str_extract_all(doc,'[:graph:]+@[:alnum:]+\\.[.[:alnum:]]+')
## [[1]]
## [1] "shareholder@computershare.com" "shareholder@computershare.com"
## [3] "docserve@citi.com" "shareholderrelations@citi.com"
@ was itself – it isn’t a special character in strings in R
\\. is just a period – we need to escape . because it is special in regexes
[: :] is a set of characters
[:graph:] means any letter, number, or punctuation
[:alnum:] means any letter or number
+ is used to indicate that we want 1 or more of the preceding element – as many as it can match
[:graph:]+ meant “Give us every letter, number, and punctuation you can, but make sure there is at least 1.”
[ ] asks for anything inside it
[.[:alnum:]]+ meant “Give us every letter, number, and . you can, but make sure there is at least 1.”

Breaking shareholder@computershare.com down against [:graph:]+@[:alnum:]+\\.[.[:alnum:]]+:
[:graph:]+ ⇒ shareholder
@ ⇒ @
[:alnum:]+ ⇒ computershare
\\. ⇒ .
[.[:alnum:]]+ ⇒ com
. matches everything
[:alpha:] matches all letters
[:lower:] matches all lowercase letters
[:upper:] matches all UPPERCASE letters
[:digit:] matches all numbers 0 through 9
[:alnum:] matches all letters and numbers
[:punct:] matches all punctuation
[:graph:] matches all letters, numbers, and punctuation
[:space:] or \s match ANY whitespace
\S is the exact opposite
[:blank:] matches whitespace except newlines

text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
alpha=str_detect(text,'[:alpha:]'),
lower=str_detect(text,'[:lower:]'),
upper=str_detect(text,'[:upper:]'),
digit=str_detect(text,'[:digit:]'),
alnum=str_detect(text,'[:alnum:]')
))
text | alpha | lower | upper | digit | alnum |
---|---|---|---|---|---|
abcde | TRUE | TRUE | FALSE | FALSE | TRUE |
ABCDE | TRUE | FALSE | TRUE | FALSE | TRUE |
12345 | FALSE | FALSE | FALSE | TRUE | TRUE |
!?!?. | FALSE | FALSE | FALSE | FALSE | FALSE |
ABC123? | TRUE | FALSE | TRUE | TRUE | TRUE |
With space | TRUE | TRUE | TRUE | FALSE | TRUE |
New line | TRUE | TRUE | TRUE | FALSE | TRUE |
text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
punct=str_detect(text,'[:punct:]'),
graph=str_detect(text,'[:graph:]'),
space=str_detect(text,'[:space:]'),
blank=str_detect(text,'[:blank:]'),
period=str_detect(text,'.')
))
text | punct | graph | space | blank | period |
---|---|---|---|---|---|
abcde | FALSE | TRUE | FALSE | FALSE | TRUE |
ABCDE | FALSE | TRUE | FALSE | FALSE | TRUE |
12345 | FALSE | TRUE | FALSE | FALSE | TRUE |
!?!?. | TRUE | TRUE | FALSE | FALSE | TRUE |
ABC123? | TRUE | TRUE | FALSE | FALSE | TRUE |
With space | FALSE | TRUE | TRUE | TRUE | TRUE |
New line | FALSE | TRUE | TRUE | FALSE | TRUE |
[ ] can be used to create a class of characters to look for
[abc] matches anything that is a, b, or c
[^ ] can be used to create a class of everything else
[^abc] matches anything that isn’t a, b, or c
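A sketch with illustrative strings:
str_subset(c("cat", "dog", "bird"), "[abc]")      # "cat"  "bird"
str_subset(c("cat", "dog", "bird"), "^[^abc]+$")  # "dog"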
Where x is some element:
x? looks for 0 or 1 of x
x* looks for 0 or more of x
x+ looks for 1 or more of x
x{n} looks for n (a number) of x
x{n,} looks for at least n of x
x{n,m} looks for at least n and at most m of x
Add ? to any quantity operator to make it prefer the shortest match possible.
^ indicates the start of the string
$ indicates the end of the string
( ) can be used to group components
| can be used within groups as a logical or
\\1 refers to the first group
\\2 refers to the second group

# Real estate firm names with 3 vowels in a row
str_subset(RE_names, '[AEIOU]{3}')
## [1] "STADLAUER MALZFABRIK" "JOAO FORTES ENGENHARIA SA"
# Real estate firm names with no vowels
str_subset(RE_names, '^[^AEIOU]+$')
## [1] "FGP LTD" "MBK PCL" "MYP LTD" "MCT BHD" "R T C L LTD"
# Real estate firm names with at least 11 vowels
str_subset(RE_names, '([^AEIOU]*[AEIOU]){11,}')
## [1] "INTERNATIONAL ENTERTAINMENT" "PREMIERE HORIZON ALLIANCE"
## [3] "JOAO FORTES ENGENHARIA SA" "OVERSEAS CHINESE TOWN (ASIA)"
## [5] "COOPERATIVE CONSTRUCTION CO" "FRANCE TOURISME IMMOBILIER"
## [7] "BONEI HATICHON CIVIL ENGINE"
# Real estate firm names with a repeated 4 letter pattern
str_subset(RE_names, '([:upper:]{4}).*\\1')
## [1] "INTERNATIONAL ENTERTAINMENT" "CHONG HONG CONSTRUCTION CO"
## [3] "ZHONGHONG HOLDING CO LTD" "DEUTSCHE GEOTHERMISCHE IMMOB"
Building blocks for matching dates: [12][90][:digit:][:digit:] for years, [01][:digit:] for months, and [0123][:digit:] for days.
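A sketch combining those pieces into a YYYY-MM-DD matcher (illustrative data; the hyphen-separated format is an assumption):
dates <- c("2013-12-31", "1995-01-15", "not a date")
str_subset(dates, "[12][90][:digit:][:digit:]-[01][:digit:]-[0123][:digit:]")
## [1] "2013-12-31" "1995-01-15"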
Caveat: regexes are generally slow. If you can code something to avoid them, that is often better, but often that is infeasible.
While str_*() functions use regex by default, they actually have four pattern modes: regex() (the default), fixed() for literal matches, coll() for locale-aware collation, and boundary() for matching boundaries, e.g., boundary("word").
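A sketch contrasting the modes, with illustrative strings:
str_detect("3.14", fixed("."))  # TRUE: fixed() treats . as a literal period
str_detect("314", fixed("."))   # FALSE
str_count("Net income was $13.5 billion", boundary("word"))  # counts words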
# Compustat firm names example
df_RE_names <- df_RE %>%
group_by(isin) %>%
slice(1) %>%
mutate(SG_in_name = str_detect(conm, "(SG|SINGAPORE)"),
name_length = str_length(conm),
SG_firm = ifelse(fic=="SGP",1,0)) %>%
ungroup()
df_RE_names %>%
group_by(SG_firm) %>%
mutate(pct_SG = mean(SG_in_name) * 100) %>%
slice(1) %>%
ungroup() %>%
select(SG_firm, pct_SG)
## # A tibble: 2 x 2
## SG_firm pct_SG
## <dbl> <dbl>
## 1 0 0.369
## 2 1 4.76
library(DT)
df_RE_names %>%
group_by(fic) %>%
mutate(avg_name_length = mean(name_length)) %>%
slice(1) %>%
ungroup() %>%
select(fic, avg_name_length) %>%
arrange(desc(avg_name_length), fic) %>%
datatable(options = list(pageLength = 5))
Flesch Reading Ease (the Flesch-Kincaid measure computed below rescales the same inputs into a U.S. grade level):
206.835 - 1.015 \left(\frac{\#~words}{\#~sentences}\right) - 84.6\left(\frac{\#~syllables}{\#~words}\right)
library(quanteda)
textstat_readability(doc, "Flesch.Kincaid")
## document Flesch.Kincaid
## 1 text1 17.56528
Gunning Fog index:
\left[Mean(Words~per~sentence) + (\%~of~words~\geq 3~syllables)\right] \times 0.4
textstat_readability(doc, "FOG")
## document FOG
## 1 text1 21.63388
Coleman-Liau index:
5.88\left(\frac{\#~letters}{\#~words}\right)-29.6\left(\frac{\#~sentences}{\#~words}\right)-15.8
textstat_readability(doc, "Coleman.Liau")
## document Coleman.Liau
## 1 text1 29.03967
unnest_tokens() converts text to lowercase by default; use to_lower=FALSE to avoid this if needed.

# Example of "tokenizing"
library(tidytext)
df_doc <- data.frame(ID=c("0001104659-14-015152"), text=c(doc),
stringsAsFactors = F) %>%
unnest_tokens(word, text)
# word is the name for the new column
# text is the name of the string column in the input data
unnest_tokens() can tokenize into other units via the token= option (e.g., token="ngrams"), with n= setting the n-gram length.
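A sketch of bigrams, mirroring the tokenizing example above:
df_bigrams <- data.frame(ID = c("0001104659-14-015152"), text = c(doc),
                         stringsAsFactors = FALSE) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)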
We can use the tm package (or, as below, the stopwords package) to remove stopwords.
# get a list of stopwords
library(stopwords)
stop_en <- stopwords("english") # Snowball English
paste0(length(stop_en), " words: ", paste(stop_en[1:5], collapse=", "))
## [1] "175 words: i, me, my, myself, we"
stop_SMART <- stopwords(source="smart") # SMART English
paste0(length(stop_SMART), " words: ", paste(stop_SMART[1:5], collapse=", "))
## [1] "571 words: a, a's, able, about, above"
stop_fr <- stopwords("french") # Snowball French
paste0(length(stop_fr), " words: ", paste(stop_fr[1:5], collapse=", "))
## [1] "164 words: au, aux, avec, ce, ces"
df_doc_stop <- df_doc %>%
anti_join(data.frame(word=stop_SMART, stringsAsFactors = F))
## Joining, by = "word"
nrow(df_doc)
## [1] 128728
nrow(df_doc_stop)
## [1] 74985
terms <- df_doc_stop %>%
count(ID, word, sort=TRUE) %>%
ungroup()
total_terms <- terms %>%
group_by(ID) %>%
summarize(total = sum(n))
tf <- left_join(terms, total_terms) %>% mutate(tf=n/total)
## Joining, by = "ID"
tf
## # A tibble: 5,543 x 5
## ID word n total tf
## <chr> <chr> <int> <int> <dbl>
## 1 0001104659-14-015152 citi 826 74985 0.0110
## 2 0001104659-14-015152 2013 743 74985 0.00991
## 3 0001104659-14-015152 credit 704 74985 0.00939
## 4 0001104659-14-015152 citis 660 74985 0.00880
## 5 0001104659-14-015152 risk 624 74985 0.00832
## 6 0001104659-14-015152 december 523 74985 0.00697
## 7 0001104659-14-015152 financial 513 74985 0.00684
## 8 0001104659-14-015152 31 505 74985 0.00673
## 9 0001104659-14-015152 loans 495 74985 0.00660
## 10 0001104659-14-015152 assets 488 74985 0.00651
## # ... with 5,533 more rows
get_sentiments("afinn") %>%
group_by(score) %>%
slice(1) %>%
ungroup()
## # A tibble: 11 x 2
## word score
## <chr> <int>
## 1 bastard -5
## 2 ass -4
## 3 abhor -3
## 4 abandon -2
## 5 absentee -1
## 6 some kind 0
## 7 aboard 1
## 8 abilities 2
## 9 admire 3
## 10 amazing 4
## 11 breathtaking 5
get_sentiments("bing") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 2 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 a+ positive
get_sentiments("nrc") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 abandoned anger
## 2 abundance anticipation
## 3 aberration disgust
## 4 abandon fear
## 5 absolution joy
## 6 abandon negative
## 7 abba positive
## 8 abandon sadness
## 9 abandonment surprise
## 10 abacus trust
Loughran & McDonald dictionary – finance specific, targeted at annual reports
get_sentiments("loughran") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 abide constraining
## 2 abovementioned litigious
## 3 abandon negative
## 4 able positive
## 5 aegis superfluous
## 6 abeyance uncertainty
tf_sent <- tf %>% left_join(get_sentiments("loughran"))
## Joining, by = "word"
tf_sent[1:5,]
## # A tibble: 5 x 6
## ID word n total tf sentiment
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 0001104659-14-015152 citi 826 74985 0.0110 <NA>
## 2 0001104659-14-015152 2013 743 74985 0.00991 <NA>
## 3 0001104659-14-015152 credit 704 74985 0.00939 <NA>
## 4 0001104659-14-015152 citis 660 74985 0.00880 <NA>
## 5 0001104659-14-015152 risk 624 74985 0.00832 uncertainty
tf_sent[!is.na(tf_sent$sentiment),][1:5,]
## # A tibble: 5 x 6
## ID word n total tf sentiment
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 0001104659-14-015152 risk 624 74985 0.00832 uncertainty
## 2 0001104659-14-015152 loss 267 74985 0.00356 negative
## 3 0001104659-14-015152 losses 265 74985 0.00353 negative
## 4 0001104659-14-015152 approximately 232 74985 0.00309 uncertainty
## 5 0001104659-14-015152 regulatory 216 74985 0.00288 litigious
tf_sent %>%
spread(sentiment, tf, fill=0) %>%
select(constraining, litigious, negative, positive, superfluous, uncertainty) %>%
colSums()
## constraining litigious negative positive superfluous
## 0.013242649 0.020750817 0.034780289 0.007054744 0.000373408
## uncertainty
## 0.025325065
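One way to condense these proportions into a single tone measure is (positive - negative) / (positive + negative); a sketch, not from the original slides:
pos <- sum(tf_sent$tf[tf_sent$sentiment == "positive"], na.rm = TRUE)
neg <- sum(tf_sent$tf[tf_sent$sentiment == "negative"], na.rm = TRUE)
(pos - neg) / (pos + neg)  # about -0.66 here: a strongly negative tone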
# Wordcloud with stopwords removed
corp <- corpus(df_doc_stop, docid_field="ID", text_field="word")
textplot_wordcloud(dfm(corp), color = RColorBrewer::brewer.pal(10, "RdBu"))
# Wordcloud without stopword removal, for comparison
corp <- corpus(df_doc, docid_field="ID", text_field="word")
textplot_wordcloud(dfm(corp), color = RColorBrewer::brewer.pal(10, "RdBu"))
# Appendix: helper used throughout these slides to render tables
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
# Appendix: cached version of cryptoMC() to avoid repeated lookups
cryptoMC <- function(name) {
if (exists(name)) {
get(name)
} else{
html <- getURL(paste('https://coinmarketcap.com/currencies/',name,'/',sep=''))
xpath <- '//*[@id="quote_price"]/span[1]/text()'
doc = htmlParse(html, asText=TRUE)
plain.text <- xpathSApply(doc, xpath, xmlValue)
assign(name, gsub("\n","",gsub(" ", "", paste(plain.text, collapse = ""), fixed = TRUE), fixed = TRUE),envir = .GlobalEnv)
get(name)
}
}
# Loads line-by-line by default
# This makes it document-by-document
library(textreadr)
df2 <- read_dir("G:/2014/2014/") %>%
group_by(document) %>%
mutate(text=paste(content, collapse="\n")) %>%
select(document, text) %>%
slice(1) %>%
ungroup()
# Create a plot of the top words by sentiment
tf_sent %>%
filter(!is.na(sentiment)) %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
mutate(row = row_number()) %>%
filter(row < 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(y=n, x=word)) + geom_col() + theme(axis.text.x = element_text(angle=90, hjust=1)) +
facet_wrap(~sentiment, ncol=3, scales="free_x")