Calculating quantities with text
Note: Due to missing packages in DataCamp light, namely quanteda
, textdata
, and tidytext
, I have provided sample code that you can run on your own computer in RStudio. Make sure to run install.packages("quanteda")
, install.packages("textdata")
, and install.packages("tidytext")
to install those packages if you don’t have them.
Each of the three exercises below can be run as standalone scripts, as they contain all needed imports within their code blocks
Exercise 6: Readability with Quanteda
How does the readability of JPMorgan’s annual report compare to the Citigroup annual report from class?
# load in readr (or tidyverse) to get read_file() function
library(readr)
# Load in all of JPM's 2014 annual report
doc <- read_file("https://rmc.link/Slides/acct420v3/Session_7/0000019617-14-000289.txt")
# Load in quanteda
library(quanteda)
# Calculate the three readability measures
textstat_readability(doc, "Flesch.Kincaid")
textstat_readability(doc, "FOG")
textstat_readability(doc, "Coleman.Liau")
Warning in for (i in seq_len(n)) { :
closing unused connection 3 (https://rmc.link/Slides/acct420v3/Session_7/0000019617-14-000289.txt)
#END
Exercise 7: Readability with Quanteda
How does the sentiment of JPMorgan’s annual report compare to the Citigroup annual report from class?
# load in readr (or tidyverse) to get read_file() function
library(readr)
# Load in all of JPM's 2014 annual report
doc <- read_file("https://rmc.link/Slides/acct420v3/Session_7/0000019617-14-000289.txt")
# Load in tidytext
library(tidytext)
# Load some components of tidyverse
library(dplyr) # for the usual commands
library(tidyr) # for spread
# convert document to tidy format
df_doc <- data.frame(ID=c("0000019617-14-000289"), text=c(doc),
stringsAsFactors = F) %>%
unnest_tokens(word, text)
# Calculate term frequency
terms <- df_doc %>%
count(ID, word, sort=TRUE) %>%
ungroup()
total_terms <- terms %>%
group_by(ID) %>%
summarize(total = sum(n))
`summarise()` ungrouping output (override with `.groups` argument)
tf <- left_join(terms, total_terms) %>% mutate(tf=n/total)
Joining, by = "ID"
# Get the Loughran McDonald sentiment dictionary
sentiment <- get_sentiments("loughran")
# Merge in sentiment
tf_sent <- tf %>% left_join(sentiment)
Joining, by = "word"
# Calculate the three readability measures
tf_sent %>%
spread(sentiment, tf, fill=0) %>%
select(constraining, litigious, negative, positive, superfluous, uncertainty) %>%
colSums()
constraining litigious negative positive superfluous uncertainty
0.0127076134 0.0178352469 0.0308215361 0.0055735147 0.0001672054 0.0220153829
#END
Exercise 8: Make a word cloud after removing stopwords
# load in readr (or tidyverse) to get read_file() function
library(readr)
# Load in all of JPM's 2014 annual report
doc <- read_file("https://rmc.link/Slides/acct420v3/Session_7/0000019617-14-000289.txt")
# Load in quanteda and tidytext
library(quanteda)
library(tidytext)
# Load in some of tidyverse
library(dplyr)
# convert document to tidy format
df_doc <- data.frame(ID=c("0000019617-14-000289"), text=c(doc),
stringsAsFactors = F) %>%
unnest_tokens(word, text)
# Pull a list of stopwords
stopwords <- stopwords::stopwords(source="smart")
# Remove stopwords
df_doc_stop <- df_doc %>%
anti_join(data.frame(word=stopwords, stringsAsFactors=F))
Joining, by = "word"
# Calculate term frequency
terms <- df_doc_stop %>%
count(ID, word, sort=TRUE) %>%
ungroup()
total_terms <- terms %>%
group_by(ID) %>%
summarize(total = sum(n))
`summarise()` ungrouping output (override with `.groups` argument)
tf <- left_join(terms, total_terms) %>% mutate(tf=n/total)
Joining, by = "ID"
# Build a corpus object for quanteda
corp <- cast_dfm(tf, ID, word, n)
# Plot a word cloud -- If you don't have RColorBrewer installed, you can
# remove the `color=` option.
textplot_wordcloud(dfm(corp), color = RColorBrewer::brewer.pal(9, "Set1"))
#END
LS0tDQp0aXRsZTogIlNlc3Npb24gNyBSIHByYWN0aWNlIChvZmZsaW5lKSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KYXV0aG9yOiAiRHIuIFJpY2hhcmQgTS4gQ3Jvd2xleSINCmRhdGU6ICJBQ0NUIDQyMCBTZXNzaW9uIDciDQotLS0NCg0KIyMgQ2FsY3VsYXRpbmcgcXVhbnRpdGllcyB3aXRoIHRleHQNCg0KTm90ZTogRHVlIHRvIG1pc3NpbmcgcGFja2FnZXMgaW4gRGF0YUNhbXAgbGlnaHQsIG5hbWVseSBgcXVhbnRlZGFgLCBgdGV4dGRhdGFgLCBhbmQgYHRpZHl0ZXh0YCwgSSBoYXZlIHByb3ZpZGVkIHNhbXBsZSBjb2RlIHRoYXQgeW91IGNhbiBydW4gb24geW91ciBvd24gY29tcHV0ZXIgaW4gUlN0dWRpby4gIE1ha2Ugc3VyZSB0byBydW4gYGluc3RhbGwucGFja2FnZXMoInF1YW50ZWRhIilgLCBgaW5zdGFsbC5wYWNrYWdlcygidGV4dGRhdGEiKWAsIGFuZCBgaW5zdGFsbC5wYWNrYWdlcygidGlkeXRleHQiKWAgdG8gaW5zdGFsbCB0aG9zZSBwYWNrYWdlcyBpZiB5b3UgZG9uJ3QgaGF2ZSB0aGVtLg0KDQpFYWNoIG9mIHRoZSB0aHJlZSBleGVyY2lzZXMgYmVsb3cgY2FuIGJlIHJ1biBhcyBzdGFuZGFsb25lIHNjcmlwdHMsIGFzIHRoZXkgY29udGFpbiBhbGwgbmVlZGVkIGltcG9ydHMgd2l0aGluIHRoZWlyIGNvZGUgYmxvY2tzDQoNCiMjIyBFeGVyY2lzZSA2OiBSZWFkYWJpbGl0eSB3aXRoIFF1YW50ZWRhDQoNCkhvdyBkb2VzIHRoZSByZWFkYWJpbGl0eSBvZiBKUE1vcmdhbidzIGFubnVhbCByZXBvcnQgY29tcGFyZSB0byB0aGUgQ2l0aWdyb3VwIGFubnVhbCByZXBvcnQgZnJvbSBjbGFzcz8NCg0KYGBge3J9DQojIGxvYWQgaW4gcmVhZHIgKG9yIHRpZHl2ZXJzZSkgdG8gZ2V0IHJlYWRfZmlsZSgpIGZ1bmN0aW9uDQpsaWJyYXJ5KHJlYWRyKQ0KDQojIExvYWQgaW4gYWxsIG9mIEpQTSdzIDIwMTQgYW5udWFsIHJlcG9ydA0KZG9jIDwtIHJlYWRfZmlsZSgiaHR0cHM6Ly9ybWMubGluay9TbGlkZXMvYWNjdDQyMHYzL1Nlc3Npb25fNy8wMDAwMDE5NjE3LTE0LTAwMDI4OS50eHQiKQ0KDQojIExvYWQgaW4gcXVhbnRlZGENCmxpYnJhcnkocXVhbnRlZGEpDQoNCiMgQ2FsY3VsYXRlIHRoZSB0aHJlZSByZWFkYWJpbGl0eSBtZWFzdXJlcw0KdGV4dHN0YXRfcmVhZGFiaWxpdHkoZG9jLCAiRmxlc2NoLktpbmNhaWQiKQ0KdGV4dHN0YXRfcmVhZGFiaWxpdHkoZG9jLCAiRk9HIikNCnRleHRzdGF0X3JlYWRhYmlsaXR5KGRvYywgIkNvbGVtYW4uTGlhdSIpDQoNCiNFTkQNCmBgYA0KDQojIyMgRXhlcmNpc2UgNzogUmVhZGFiaWxpdHkgd2l0aCBRdWFudGVkYQ0KDQpIb3cgZG9lcyB0aGUgc2VudGltZW50IG9mIEpQTW9yZ2FuJ3MgYW5udWFsIHJlcG9ydCBjb21wYXJlIHRvIHRoZSBDaXRpZ3JvdXAgYW5udWFsIHJlcG9ydCBmcm9tIGNsYXNzPw0KDQpgYGB7cn0NCiMgbG9hZCBpbiByZWFkciAob3IgdGlkeXZlcnNlKSB0byBnZXQgcmVhZF9maWxlKCkgZnVuY3Rpb24NCmxpYnJhcnkocmVhZHIpDQoNCiMgTG9hZCBpbiBhbGwgb2YgSlBNJ3MgMjAxNCBhbm51YWwgcmVwb3J0DQpkb2MgPC0gcmVhZF9maWxlKCJodHRwczovL3JtYy5saW5rL1NsaWRlcy9hY2N0NDIwdjMvU2Vzc2lvbl83LzAwMDAwMTk2MTctMTQtMDAwMjg5LnR4dCIpDQoNCiMgTG9hZCBpbiB0aWR5dGV4dA0KbGlicmFyeSh0aWR5dGV4dCkNCg0KIyBMb2FkIHNvbWUgY29tcG9uZW50cyBvZiB0aWR5dmVyc2UNCmxpYnJhcnkoZHBseXIpICAjIGZvciB0aGUgdXN1YWwgY29tbWFuZHMNCmxpYnJhcnkodGlkeXIpICAjIGZvciBzcHJlYWQNCg0KIyBjb252ZXJ0IGRvY3VtZW50IHRvIHRpZHkgZm9ybWF0DQpkZl9kb2MgPC0gZGF0YS5mcmFtZShJRD1jKCIwMDAwMDE5NjE3LTE0LTAwMDI4OSIpLCB0ZXh0PWMoZG9jKSwNCiAgICAgICAgICAgICAgICAgICAgIHN0cmluZ3NBc0ZhY3RvcnMgPSBGKSAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KDQojIENhbGN1bGF0ZSB0ZXJtIGZyZXF1ZW5jeQ0KdGVybXMgPC0gZGZfZG9jICU+JQ0KICBjb3VudChJRCwgd29yZCwgc29ydD1UUlVFKSAlPiUNCiAgdW5ncm91cCgpDQp0b3RhbF90ZXJtcyA8LSB0ZXJtcyAlPiUgDQogIGdyb3VwX2J5KElEKSAlPiUgDQogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkNCnRmIDwtIGxlZnRfam9pbih0ZXJtcywgdG90YWxfdGVybXMpICU+JSBtdXRhdGUodGY9bi90b3RhbCkNCg0KIyBHZXQgdGhlIExvdWdocmFuIE1jRG9uYWxkIHNlbnRpbWVudCBkaWN0aW9uYXJ5DQpzZW50aW1lbnQgPC0gZ2V0X3NlbnRpbWVudHMoImxvdWdocmFuIikNCg0KIyBNZXJnZSBpbiBzZW50aW1lbnQNCnRmX3NlbnQgPC0gdGYgJT4lIGxlZnRfam9pbihzZW50aW1lbnQpDQoNCiMgQ2FsY3VsYXRlIHRoZSB0aHJlZSByZWFkYWJpbGl0eSBtZWFzdXJlcw0KdGZfc2VudCAlPiUNCiAgc3ByZWFkKHNlbnRpbWVudCwgdGYsIGZpbGw9MCkgJT4lDQogIHNlbGVjdChjb25zdHJhaW5pbmcsIGxpdGlnaW91cywgbmVnYXRpdmUsIHBvc2l0aXZlLCBzdXBlcmZsdW91cywgdW5jZXJ0YWludHkpICU+JQ0KICBjb2xTdW1zKCkNCg0KI0VORA0KYGBgDQoNCiMjIyBFeGVyY2lzZSA4OiBNYWtlIGEgd29yZCBjbG91ZCBhZnRlciByZW1vdmluZyBzdG9wd29yZHMNCg0KYGBge3J9DQojIGxvYWQgaW4gcmVhZHIgKG9yIHRpZHl2ZXJzZSkgdG8gZ2V0IHJlYWRfZmlsZSgpIGZ1bmN0aW9uDQpsaWJyYXJ5KHJlYWRyKQ0KDQojIExvYWQgaW4gYWxsIG9mIEpQTSdzIDIwMTQgYW5udWFsIHJlcG9ydA0KZG9jIDwtIHJlYWRfZmlsZSgiaHR0cHM6Ly9ybWMubGluay9TbGlkZXMvYWNjdDQyMHYzL1Nlc3Npb25fNy8wMDAwMDE5NjE3LTE0LTAwMDI4OS50eHQiKQ0KDQojIExvYWQgaW4gcXVhbnRlZGEgYW5kIHRpZHl0ZXh0DQpsaWJyYXJ5KHF1YW50ZWRhKQ0KbGlicmFyeSh0aWR5dGV4dCkNCg0KIyBMb2FkIGluIHNvbWUgb2YgdGlkeXZlcnNlDQpsaWJyYXJ5KGRwbHlyKQ0KDQojIGNvbnZlcnQgZG9jdW1lbnQgdG8gdGlkeSBmb3JtYXQNCmRmX2RvYyA8LSBkYXRhLmZyYW1lKElEPWMoIjAwMDAwMTk2MTctMTQtMDAwMjg5IiksIHRleHQ9Yyhkb2MpLA0KICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycyA9IEYpICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpDQoNCiMgUHVsbCBhIGxpc3Qgb2Ygc3RvcHdvcmRzDQpzdG9wd29yZHMgPC0gc3RvcHdvcmRzOjpzdG9wd29yZHMoc291cmNlPSJzbWFydCIpDQoNCiMgUmVtb3ZlIHN0b3B3b3Jkcw0KZGZfZG9jX3N0b3AgPC0gZGZfZG9jICU+JQ0KICBhbnRpX2pvaW4oZGF0YS5mcmFtZSh3b3JkPXN0b3B3b3Jkcywgc3RyaW5nc0FzRmFjdG9ycz1GKSkNCg0KIyBDYWxjdWxhdGUgdGVybSBmcmVxdWVuY3kNCnRlcm1zIDwtIGRmX2RvY19zdG9wICU+JQ0KICBjb3VudChJRCwgd29yZCwgc29ydD1UUlVFKSAlPiUNCiAgdW5ncm91cCgpDQp0b3RhbF90ZXJtcyA8LSB0ZXJtcyAlPiUgDQogIGdyb3VwX2J5KElEKSAlPiUgDQogIHN1bW1hcml6ZSh0b3RhbCA9IHN1bShuKSkNCnRmIDwtIGxlZnRfam9pbih0ZXJtcywgdG90YWxfdGVybXMpICU+JSBtdXRhdGUodGY9bi90b3RhbCkNCg0KIyBCdWlsZCBhIGNvcnB1cyBvYmplY3QgZm9yIHF1YW50ZWRhDQpjb3JwIDwtIGNhc3RfZGZtKHRmLCBJRCwgd29yZCwgbikNCg0KIyBQbG90IGEgd29yZCBjbG91ZCAtLSBJZiB5b3UgZG9uJ3QgaGF2ZSBSQ29sb3JCcmV3ZXIgaW5zdGFsbGVkLCB5b3UgY2FuDQojIHJlbW92ZSB0aGUgYGNvbG9yPWAgb3B0aW9uLg0KdGV4dHBsb3Rfd29yZGNsb3VkKGRmbShjb3JwKSwgY29sb3IgPSBSQ29sb3JCcmV3ZXI6OmJyZXdlci5wYWwoOSwgIlNldDEiKSkNCg0KI0VORA0KYGBg