Calculating quantities with text
Note: Due to missing packages in DataCamp light, namely quanteda, textdata, and tidytext, I have provided sample code that you can run on your own computer in RStudio. Make sure to run install.packages("quanteda"), install.packages("textdata"), and install.packages("tidytext") to install those packages if you don’t have them.
Each of the three exercises below can be run as a standalone script, as each contains all needed imports within its code block.
Exercise 6: Readability with Quanteda
How does the readability of JPMorgan’s annual report compare to the Citigroup annual report from class?
# read_file() lives in readr (loading the full tidyverse works as well)
library(readr)
# Pull the complete text of JPMorgan's 2014 annual report into one string
doc <- read_file(
  file = "https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt"
)
# quanteda supplies the readability statistics computed in this exercise
library(quanteda)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Package version: 1.5.1
Parallel computing: 2 of 8 threads used.
See https://quanteda.io for tutorials and examples.
Attaching package: ‘quanteda’
The following object is masked from ‘package:utils’:
View
# Calculate the three readability measures (Flesch-Kincaid, Gunning FOG and
# Coleman-Liau) in a single call, so the long report is processed once
# instead of once per measure. The result is one data frame with one column
# per measure.
textstat_readability(
  doc,
  measure = c("Flesch.Kincaid", "FOG", "Coleman.Liau")
)
#END
Exercise 7: Sentiment with tidytext
How does the sentiment of JPMorgan’s annual report compare to the Citigroup annual report from class?
# read_file() lives in readr (loading the full tidyverse works as well)
library(readr)
# Pull the complete text of JPMorgan's 2014 annual report into one string
doc <- read_file(
  file = "https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt"
)
# tidytext: tokenization and sentiment dictionaries
library(tidytext)
# Pieces of the tidyverse
library(dplyr) # pipes, joins, and the usual data verbs
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr) # for spread
# Convert the document to tidy format: one row per word token.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
df_doc <- data.frame(
  ID = "0000019617-14-000289",
  text = doc,
  stringsAsFactors = FALSE
) %>%
  unnest_tokens(word, text)
# Calculate term frequency: count each word, then divide by the document total
terms <- df_doc %>%
  count(ID, word, sort = TRUE) %>%
  ungroup()
total_terms <- terms %>%
  group_by(ID) %>%
  summarize(total = sum(n))
# No `by` is given, so the join uses the shared column (ID) and reports it
tf <- left_join(terms, total_terms) %>%
  mutate(tf = n / total)
Joining, by = "ID"
# Fetch the Loughran-McDonald sentiment word list
sentiment <- get_sentiments(lexicon = "loughran")
# Attach a sentiment label to every word that has one; no `by` is given,
# so the join key is whichever column name the two tables share
tf_sent <- left_join(tf, sentiment)
Joining, by = "word"
# Calculate the sentiment measures: the share of all words in the document
# that falls in each Loughran-McDonald category
tf_sent %>%
  # one column per sentiment category, holding that row's term frequency
  spread(sentiment, tf, fill = 0) %>%
  # keep only the six category columns so they can be summed
  select(
    constraining, litigious, negative, positive, superfluous, uncertainty
  ) %>%
  colSums()
constraining litigious negative positive superfluous uncertainty
0.0127076134 0.0178352469 0.0308215361 0.0055735147 0.0001672054 0.0220153829
#END
Exercise 8: Make a word cloud after removing stopwords
# read_file() lives in readr (loading the full tidyverse works as well)
library(readr)
# Pull the complete text of JPMorgan's 2014 annual report into one string
doc <- read_file(
  file = "https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt"
)
# Text-analysis packages: quanteda and tidytext
library(quanteda)
library(tidytext)
# Part of the tidyverse: dplyr for pipes and joins
library(dplyr)
# Convert the document to tidy format: one row per word token.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
df_doc <- data.frame(
  ID = "0000019617-14-000289",
  text = doc,
  stringsAsFactors = FALSE
) %>%
  unnest_tokens(word, text)
# Pull the SMART stopword list. It is stored under its own name so the
# variable does not shadow the stopwords() function it was created with.
smart_stopwords <- stopwords(source = "smart")
# Remove stopwords: anti_join() keeps only the rows of df_doc with no match
# in the stopword table (no `by` is given, so the shared column `word` is used)
df_doc_stop <- df_doc %>%
  anti_join(data.frame(word = smart_stopwords, stringsAsFactors = FALSE))
Joining, by = "word"
# Wrap the filtered words in a quanteda corpus, taking document ids from the
# ID column and the text from the word column
corp <- corpus(df_doc_stop, docid_field = "ID", text_field = "word")
# Draw the word cloud from the corpus' document-feature matrix. The palette
# comes from RColorBrewer; without that package installed, drop the
# `color=` option.
textplot_wordcloud(
  x = dfm(corp),
  color = RColorBrewer::brewer.pal(n = 9, name = "Set1")
)
#END
LS0tDQp0aXRsZTogIlNlc3Npb24gNyBSIHByYWN0aWNlIChvZmZsaW5lKSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KYXV0aG9yOiAiRHIuIFJpY2hhcmQgTS4gQ3Jvd2xleSINCmRhdGU6ICJBQ0NUIDQyMCwgRmFsbCAyMDE5LCBTZXNzaW9uIDciDQotLS0NCg0KIyMgQ2FsY3VsYXRpbmcgcXVhbnRpdGllcyB3aXRoIHRleHQNCg0KTm90ZTogRHVlIHRvIG1pc3NpbmcgcGFja2FnZXMgaW4gRGF0YUNhbXAgbGlnaHQsIG5hbWVseSBgcXVhbnRlZGFgLCBgdGV4dGRhdGFgLCBhbmQgYHRpZHl0ZXh0YCwgSSBoYXZlIHByb3ZpZGVkIHNhbXBsZSBjb2RlIHRoYXQgeW91IGNhbiBydW4gb24geW91ciBvd24gY29tcHV0ZXIgaW4gUlN0dWRpby4gIE1ha2Ugc3VyZSB0byBydW4gYGluc3RhbGwucGFja2FnZXMoInF1YW50ZWRhIilgLCBgaW5zdGFsbC5wYWNrYWdlcygidGV4dGRhdGEiKWAsIGFuZCBgaW5zdGFsbC5wYWNrYWdlcygidGlkeXRleHQiKWAgdG8gaW5zdGFsbCB0aG9zZSBwYWNrYWdlcyBpZiB5b3UgZG9uJ3QgaGF2ZSB0aGVtLg0KDQpFYWNoIG9mIHRoZSB0aHJlZSBleGVyY2lzZXMgYmVsb3cgY2FuIGJlIHJ1biBhcyBzdGFuZGFsb25lIHNjcmlwdHMsIGFzIHRoZXkgY29udGFpbiBhbGwgbmVlZGVkIGltcG9ydHMgd2l0aGluIHRoZWlyIGNvZGUgYmxvY2tzDQoNCiMjIyBFeGVyY2lzZSA2OiBSZWFkYWJpbGl0eSB3aXRoIFF1YW50ZWRhDQoNCkhvdyBkb2VzIHRoZSByZWFkYWJpbGl0eSBvZiBKUE1vcmdhbidzIGFubnVhbCByZXBvcnQgY29tcGFyZSB0byB0aGUgQ2l0aWdyb3VwIGFubnVhbCByZXBvcnQgZnJvbSBjbGFzcz8NCg0KYGBge3J9DQoNCiMgbG9hZCBpbiByZWFkciAob3IgdGlkeXZlcnNlKSB0byBnZXQgcmVhZF9maWxlKCkgZnVuY3Rpb24NCmxpYnJhcnkocmVhZHIpDQoNCiMgTG9hZCBpbiBhbGwgb2YgSlBNJ3MgMjAxNCBhbm51YWwgcmVwb3J0DQpkb2MgPC0gcmVhZF9maWxlKCJodHRwczovL3JtYy5saW5rL1NsaWRlcy9hY2N0NDIwdjIvU2Vzc2lvbl83LzAwMDAwMTk2MTctMTQtMDAwMjg5LnR4dCIpDQoNCiMgTG9hZCBpbiBxdWFudGVkYQ0KbGlicmFyeShxdWFudGVkYSkNCg0KIyBDYWxjdWxhdGUgdGhlIHRocmVlIHJlYWRhYmlsaXR5IG1lYXN1cmVzDQp0ZXh0c3RhdF9yZWFkYWJpbGl0eShkb2MsICJGbGVzY2guS2luY2FpZCIpDQp0ZXh0c3RhdF9yZWFkYWJpbGl0eShkb2MsICJGT0ciKQ0KdGV4dHN0YXRfcmVhZGFiaWxpdHkoZG9jLCAiQ29sZW1hbi5MaWF1IikNCg0KI0VORA0KYGBgDQoNCiMjIyBFeGVyY2lzZSA3OiBSZWFkYWJpbGl0eSB3aXRoIFF1YW50ZWRhDQoNCkhvdyBkb2VzIHRoZSBzZW50aW1lbnQgb2YgSlBNb3JnYW4ncyBhbm51YWwgcmVwb3J0IGNvbXBhcmUgdG8gdGhlIENpdGlncm91cCBhbm51YWwgcmVwb3J0IGZyb20gY2xhc3M/DQoNCmBgYHtyfQ0KIyBsb2FkIGluIHJlYWRyIChvciB0aWR5dmVyc2UpIHRvIGdldCByZWFkX2ZpbGUoKSBmdW5jdGlvbg0KbGlicmFyeShyZWFkcikNCg0KIyBM
b2FkIGluIGFsbCBvZiBKUE0ncyAyMDE0IGFubnVhbCByZXBvcnQNCmRvYyA8LSByZWFkX2ZpbGUoImh0dHBzOi8vcm1jLmxpbmsvU2xpZGVzL2FjY3Q0MjB2Mi9TZXNzaW9uXzcvMDAwMDAxOTYxNy0xNC0wMDAyODkudHh0IikNCg0KIyBMb2FkIGluIHRpZHl0ZXh0DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQojIExvYWQgc29tZSBjb21wb25lbnRzIG9mIHRpZHl2ZXJzZQ0KbGlicmFyeShkcGx5cikgICMgZm9yIHRoZSB1c3VhbCBjb21tYW5kcw0KbGlicmFyeSh0aWR5cikgICMgZm9yIHNwcmVhZA0KDQojIGNvbnZlcnQgZG9jdW1lbnQgdG8gdGlkeSBmb3JtYXQNCmRmX2RvYyA8LSBkYXRhLmZyYW1lKElEPWMoIjAwMDAwMTk2MTctMTQtMDAwMjg5IiksIHRleHQ9Yyhkb2MpLA0KICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycyA9IEYpICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpDQoNCiMgQ2FsY3VsYXRlIHRlcm0gZnJlcXVlbmN5DQp0ZXJtcyA8LSBkZl9kb2MgJT4lDQogIGNvdW50KElELCB3b3JkLCBzb3J0PVRSVUUpICU+JQ0KICB1bmdyb3VwKCkNCnRvdGFsX3Rlcm1zIDwtIHRlcm1zICU+JSANCiAgZ3JvdXBfYnkoSUQpICU+JSANCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQ0KdGYgPC0gbGVmdF9qb2luKHRlcm1zLCB0b3RhbF90ZXJtcykgJT4lIG11dGF0ZSh0Zj1uL3RvdGFsKQ0KDQojIEdldCB0aGUgTG91Z2hyYW4gTWNEb25hbGQgc2VudGltZW50IGRpY3Rpb25hcnkNCnNlbnRpbWVudCA8LSBnZXRfc2VudGltZW50cygibG91Z2hyYW4iKQ0KDQojIE1lcmdlIGluIHNlbnRpbWVudA0KdGZfc2VudCA8LSB0ZiAlPiUgbGVmdF9qb2luKHNlbnRpbWVudCkNCg0KIyBDYWxjdWxhdGUgdGhlIHRocmVlIHJlYWRhYmlsaXR5IG1lYXN1cmVzDQp0Zl9zZW50ICU+JQ0KICBzcHJlYWQoc2VudGltZW50LCB0ZiwgZmlsbD0wKSAlPiUNCiAgc2VsZWN0KGNvbnN0cmFpbmluZywgbGl0aWdpb3VzLCBuZWdhdGl2ZSwgcG9zaXRpdmUsIHN1cGVyZmx1b3VzLCB1bmNlcnRhaW50eSkgJT4lDQogIGNvbFN1bXMoKQ0KDQojRU5EDQpgYGANCg0KIyMjIEV4ZXJjaXNlIDg6IE1ha2UgYSB3b3JkIGNsb3VkIGFmdGVyIHJlbW92aW5nIHN0b3B3b3Jkcw0KDQpgYGB7cn0NCiMgbG9hZCBpbiByZWFkciAob3IgdGlkeXZlcnNlKSB0byBnZXQgcmVhZF9maWxlKCkgZnVuY3Rpb24NCmxpYnJhcnkocmVhZHIpDQoNCiMgTG9hZCBpbiBhbGwgb2YgSlBNJ3MgMjAxNCBhbm51YWwgcmVwb3J0DQpkb2MgPC0gcmVhZF9maWxlKCJodHRwczovL3JtYy5saW5rL1NsaWRlcy9hY2N0NDIwdjIvU2Vzc2lvbl83LzAwMDAwMTk2MTctMTQtMDAwMjg5LnR4dCIpDQoNCiMgTG9hZCBpbiBxdWFudGVkYSBhbmQgdGlkeXRleHQNCmxpYnJhcnkocXVhbnRlZGEpDQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQojIExvYWQgaW4gc29tZSBvZiB0aWR5dmVyc2UNCmxpYnJhcnkoZHBseXIpDQoNCiMgY29udmVydCBkb2N1bWVudCB0byB0aWR5IGZvcm1hdA0K
ZGZfZG9jIDwtIGRhdGEuZnJhbWUoSUQ9YygiMDAwMDAxOTYxNy0xNC0wMDAyODkiKSwgdGV4dD1jKGRvYyksDQogICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzID0gRikgJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkNCg0KIyBQdWxsIGEgbGlzdCBvZiBzdG9wd29yZHMNCnN0b3B3b3JkcyA8LSBzdG9wd29yZHMoc291cmNlPSJzbWFydCIpDQoNCiMgUmVtb3ZlIHN0b3B3b3Jkcw0KZGZfZG9jX3N0b3AgPC0gZGZfZG9jICU+JQ0KICBhbnRpX2pvaW4oZGF0YS5mcmFtZSh3b3JkPXN0b3B3b3Jkcywgc3RyaW5nc0FzRmFjdG9ycz1GKSkNCg0KIyBCdWlsZCBhIGNvcnB1cyBvYmplY3QgZm9yIHF1YW50ZWRhDQpjb3JwIDwtIGNvcnB1cyhkZl9kb2Nfc3RvcCwgZG9jaWRfZmllbGQ9IklEIiwgdGV4dF9maWVsZD0id29yZCIpDQoNCiMgUGxvdCBhIHdvcmQgY2xvdWQgLS0gSWYgeW91IGRvbid0IGhhdmUgUkNvbG9yQnJld2VyIGluc3RhbGxlZCwgeW91IGNhbg0KIyByZW1vdmUgdGhlIGBjb2xvcj1gIG9wdGlvbi4NCnRleHRwbG90X3dvcmRjbG91ZChkZm0oY29ycCksIGNvbG9yID0gUkNvbG9yQnJld2VyOjpicmV3ZXIucGFsKDksICJTZXQxIikpDQoNCiNFTkQNCmBgYA==