Calculating quantities with text

Note: Due to missing packages in DataCamp light, namely quanteda, textdata, and tidytext, I have provided sample code that you can run on your own computer in RStudio. Make sure to run install.packages("quanteda"), install.packages("textdata"), and install.packages("tidytext") to install those packages if you don’t have them.

Each of the three exercises below can be run as a standalone script, as each contains all needed imports within its code block.

Exercise 6: Readability with Quanteda

How does the readability of JPMorgan’s annual report compare to the Citigroup annual report from class?

# readr supplies read_file() (attaching tidyverse would also work)
library(readr)
# Pull the full text of JPM's 2014 annual report into one string
report_url <- "https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt"
doc <- read_file(report_url)
# quanteda provides the readability statistics used below
library(quanteda)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Package version: 1.5.1
Parallel computing: 2 of 8 threads used.
See https://quanteda.io for tutorials and examples.

Attaching package: ‘quanteda’

The following object is masked from ‘package:utils’:

    View
# Calculate the three readability measures in a single call so the report
# is only processed once; the result has one column per measure
textstat_readability(
  doc,
  measure = c("Flesch.Kincaid", "FOG", "Coleman.Liau")
)
#END

Exercise 7: Sentiment with tidytext

How does the sentiment of JPMorgan’s annual report compare to the Citigroup annual report from class?

# readr supplies read_file() (attaching tidyverse would also work)
library(readr)
# Pull the full text of JPM's 2014 annual report into one string
report_url <- "https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt"
doc <- read_file(report_url)
# tidytext handles the tokenization and sentiment lexicon lookups below
library(tidytext)
# Attach the individual tidyverse packages that are needed
library(dplyr)  # for the usual commands

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(tidyr)  # for spread
# Convert the document to tidy format: one row per word token
df_doc <- data.frame(
  ID = "0000019617-14-000289",
  text = doc,
  # Spelled out because `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
) %>%
  unnest_tokens(word, text)
# Calculate term frequency: each word's count divided by the total number
# of words in the document
terms <- df_doc %>%
  count(ID, word, sort = TRUE) %>%
  ungroup()
total_terms <- terms %>%
  group_by(ID) %>%
  summarize(total = sum(n))
# No `by` is given, so dplyr joins on the shared ID column and reports it
tf <- left_join(terms, total_terms) %>%
  mutate(tf = n / total)
Joining, by = "ID"
# Fetch the Loughran McDonald sentiment dictionary
sentiment <- get_sentiments("loughran")
# Attach each word's sentiment category; words absent from the dictionary
# end up with NA in the sentiment column
tf_sent <- left_join(tf, sentiment)
Joining, by = "word"
# Total term frequency in each sentiment category: spread the categories
# into columns (0 where a word is not in that category), then sum each column
colSums(
  tf_sent %>%
    spread(key = sentiment, value = tf, fill = 0) %>%
    select(
      constraining, litigious, negative,
      positive, superfluous, uncertainty
    )
)
constraining    litigious     negative     positive  superfluous  uncertainty 
0.0127076134 0.0178352469 0.0308215361 0.0055735147 0.0001672054 0.0220153829 
#END

Exercise 8: Make a word cloud after removing stopwords

# load in readr (or tidyverse) to get read_file() function
library(readr)
# Load in all of JPM's 2014 annual report
doc <- read_file("https://rmc.link/Slides/acct420v2/Session_7/0000019617-14-000289.txt")
# Load in quanteda and tidytext
library(quanteda)
library(tidytext)
# Load in some of tidyverse
library(dplyr)
# Convert the document to tidy format: one row per word token
df_doc <- data.frame(
  ID = "0000019617-14-000289",
  text = doc,
  # Spelled out because `F` is an ordinary variable that can be reassigned
  stringsAsFactors = FALSE
) %>%
  unnest_tokens(word, text)
# Pull the "smart" stopword list. The variable shares its name with the
# stopwords() function; R still resolves the call to the function.
stopwords <- stopwords(source = "smart")
# Remove stopwords: keep only the rows whose word is not in the list.
# No `by` is given, so dplyr joins on the shared word column and reports it.
df_doc_stop <- df_doc %>%
  anti_join(data.frame(word = stopwords, stringsAsFactors = FALSE))
Joining, by = "word"
# Wrap the stopword-filtered tokens in a quanteda corpus object
corp <- corpus(df_doc_stop, text_field = "word", docid_field = "ID")
# Draw the word cloud from the corpus's document-feature matrix. The
# `color =` argument can be dropped if RColorBrewer is not installed.
corp %>%
  dfm() %>%
  textplot_wordcloud(color = RColorBrewer::brewer.pal(9, "Set1"))

#END
LS0tDQp0aXRsZTogIlNlc3Npb24gNyBSIHByYWN0aWNlIChvZmZsaW5lKSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KYXV0aG9yOiAiRHIuIFJpY2hhcmQgTS4gQ3Jvd2xleSINCmRhdGU6ICJBQ0NUIDQyMCwgRmFsbCAyMDE5LCBTZXNzaW9uIDciDQotLS0NCg0KIyMgQ2FsY3VsYXRpbmcgcXVhbnRpdGllcyB3aXRoIHRleHQNCg0KTm90ZTogRHVlIHRvIG1pc3NpbmcgcGFja2FnZXMgaW4gRGF0YUNhbXAgbGlnaHQsIG5hbWVseSBgcXVhbnRlZGFgLCBgdGV4dGRhdGFgLCBhbmQgYHRpZHl0ZXh0YCwgSSBoYXZlIHByb3ZpZGVkIHNhbXBsZSBjb2RlIHRoYXQgeW91IGNhbiBydW4gb24geW91ciBvd24gY29tcHV0ZXIgaW4gUlN0dWRpby4gIE1ha2Ugc3VyZSB0byBydW4gYGluc3RhbGwucGFja2FnZXMoInF1YW50ZWRhIilgLCBgaW5zdGFsbC5wYWNrYWdlcygidGV4dGRhdGEiKWAsIGFuZCBgaW5zdGFsbC5wYWNrYWdlcygidGlkeXRleHQiKWAgdG8gaW5zdGFsbCB0aG9zZSBwYWNrYWdlcyBpZiB5b3UgZG9uJ3QgaGF2ZSB0aGVtLg0KDQpFYWNoIG9mIHRoZSB0aHJlZSBleGVyY2lzZXMgYmVsb3cgY2FuIGJlIHJ1biBhcyBzdGFuZGFsb25lIHNjcmlwdHMsIGFzIHRoZXkgY29udGFpbiBhbGwgbmVlZGVkIGltcG9ydHMgd2l0aGluIHRoZWlyIGNvZGUgYmxvY2tzDQoNCiMjIyBFeGVyY2lzZSA2OiBSZWFkYWJpbGl0eSB3aXRoIFF1YW50ZWRhDQoNCkhvdyBkb2VzIHRoZSByZWFkYWJpbGl0eSBvZiBKUE1vcmdhbidzIGFubnVhbCByZXBvcnQgY29tcGFyZSB0byB0aGUgQ2l0aWdyb3VwIGFubnVhbCByZXBvcnQgZnJvbSBjbGFzcz8NCg0KYGBge3J9DQoNCiMgbG9hZCBpbiByZWFkciAob3IgdGlkeXZlcnNlKSB0byBnZXQgcmVhZF9maWxlKCkgZnVuY3Rpb24NCmxpYnJhcnkocmVhZHIpDQoNCiMgTG9hZCBpbiBhbGwgb2YgSlBNJ3MgMjAxNCBhbm51YWwgcmVwb3J0DQpkb2MgPC0gcmVhZF9maWxlKCJodHRwczovL3JtYy5saW5rL1NsaWRlcy9hY2N0NDIwdjIvU2Vzc2lvbl83LzAwMDAwMTk2MTctMTQtMDAwMjg5LnR4dCIpDQoNCiMgTG9hZCBpbiBxdWFudGVkYQ0KbGlicmFyeShxdWFudGVkYSkNCg0KIyBDYWxjdWxhdGUgdGhlIHRocmVlIHJlYWRhYmlsaXR5IG1lYXN1cmVzDQp0ZXh0c3RhdF9yZWFkYWJpbGl0eShkb2MsICJGbGVzY2guS2luY2FpZCIpDQp0ZXh0c3RhdF9yZWFkYWJpbGl0eShkb2MsICJGT0ciKQ0KdGV4dHN0YXRfcmVhZGFiaWxpdHkoZG9jLCAiQ29sZW1hbi5MaWF1IikNCg0KI0VORA0KYGBgDQoNCiMjIyBFeGVyY2lzZSA3OiBSZWFkYWJpbGl0eSB3aXRoIFF1YW50ZWRhDQoNCkhvdyBkb2VzIHRoZSBzZW50aW1lbnQgb2YgSlBNb3JnYW4ncyBhbm51YWwgcmVwb3J0IGNvbXBhcmUgdG8gdGhlIENpdGlncm91cCBhbm51YWwgcmVwb3J0IGZyb20gY2xhc3M/DQoNCmBgYHtyfQ0KIyBsb2FkIGluIHJlYWRyIChvciB0aWR5dmVyc2UpIHRvIGdldCByZWFkX2ZpbGUoKSBmdW5jdGlvbg0KbGlicmFyeShyZWFkcikNCg0KIyBM
b2FkIGluIGFsbCBvZiBKUE0ncyAyMDE0IGFubnVhbCByZXBvcnQNCmRvYyA8LSByZWFkX2ZpbGUoImh0dHBzOi8vcm1jLmxpbmsvU2xpZGVzL2FjY3Q0MjB2Mi9TZXNzaW9uXzcvMDAwMDAxOTYxNy0xNC0wMDAyODkudHh0IikNCg0KIyBMb2FkIGluIHRpZHl0ZXh0DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQojIExvYWQgc29tZSBjb21wb25lbnRzIG9mIHRpZHl2ZXJzZQ0KbGlicmFyeShkcGx5cikgICMgZm9yIHRoZSB1c3VhbCBjb21tYW5kcw0KbGlicmFyeSh0aWR5cikgICMgZm9yIHNwcmVhZA0KDQojIGNvbnZlcnQgZG9jdW1lbnQgdG8gdGlkeSBmb3JtYXQNCmRmX2RvYyA8LSBkYXRhLmZyYW1lKElEPWMoIjAwMDAwMTk2MTctMTQtMDAwMjg5IiksIHRleHQ9Yyhkb2MpLA0KICAgICAgICAgICAgICAgICAgICAgc3RyaW5nc0FzRmFjdG9ycyA9IEYpICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpDQoNCiMgQ2FsY3VsYXRlIHRlcm0gZnJlcXVlbmN5DQp0ZXJtcyA8LSBkZl9kb2MgJT4lDQogIGNvdW50KElELCB3b3JkLCBzb3J0PVRSVUUpICU+JQ0KICB1bmdyb3VwKCkNCnRvdGFsX3Rlcm1zIDwtIHRlcm1zICU+JSANCiAgZ3JvdXBfYnkoSUQpICU+JSANCiAgc3VtbWFyaXplKHRvdGFsID0gc3VtKG4pKQ0KdGYgPC0gbGVmdF9qb2luKHRlcm1zLCB0b3RhbF90ZXJtcykgJT4lIG11dGF0ZSh0Zj1uL3RvdGFsKQ0KDQojIEdldCB0aGUgTG91Z2hyYW4gTWNEb25hbGQgc2VudGltZW50IGRpY3Rpb25hcnkNCnNlbnRpbWVudCA8LSBnZXRfc2VudGltZW50cygibG91Z2hyYW4iKQ0KDQojIE1lcmdlIGluIHNlbnRpbWVudA0KdGZfc2VudCA8LSB0ZiAlPiUgbGVmdF9qb2luKHNlbnRpbWVudCkNCg0KIyBDYWxjdWxhdGUgdGhlIHRocmVlIHJlYWRhYmlsaXR5IG1lYXN1cmVzDQp0Zl9zZW50ICU+JQ0KICBzcHJlYWQoc2VudGltZW50LCB0ZiwgZmlsbD0wKSAlPiUNCiAgc2VsZWN0KGNvbnN0cmFpbmluZywgbGl0aWdpb3VzLCBuZWdhdGl2ZSwgcG9zaXRpdmUsIHN1cGVyZmx1b3VzLCB1bmNlcnRhaW50eSkgJT4lDQogIGNvbFN1bXMoKQ0KDQojRU5EDQpgYGANCg0KIyMjIEV4ZXJjaXNlIDg6IE1ha2UgYSB3b3JkIGNsb3VkIGFmdGVyIHJlbW92aW5nIHN0b3B3b3Jkcw0KDQpgYGB7cn0NCiMgbG9hZCBpbiByZWFkciAob3IgdGlkeXZlcnNlKSB0byBnZXQgcmVhZF9maWxlKCkgZnVuY3Rpb24NCmxpYnJhcnkocmVhZHIpDQoNCiMgTG9hZCBpbiBhbGwgb2YgSlBNJ3MgMjAxNCBhbm51YWwgcmVwb3J0DQpkb2MgPC0gcmVhZF9maWxlKCJodHRwczovL3JtYy5saW5rL1NsaWRlcy9hY2N0NDIwdjIvU2Vzc2lvbl83LzAwMDAwMTk2MTctMTQtMDAwMjg5LnR4dCIpDQoNCiMgTG9hZCBpbiBxdWFudGVkYSBhbmQgdGlkeXRleHQNCmxpYnJhcnkocXVhbnRlZGEpDQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQojIExvYWQgaW4gc29tZSBvZiB0aWR5dmVyc2UNCmxpYnJhcnkoZHBseXIpDQoNCiMgY29udmVydCBkb2N1bWVudCB0byB0aWR5IGZvcm1hdA0K
ZGZfZG9jIDwtIGRhdGEuZnJhbWUoSUQ9YygiMDAwMDAxOTYxNy0xNC0wMDAyODkiKSwgdGV4dD1jKGRvYyksDQogICAgICAgICAgICAgICAgICAgICBzdHJpbmdzQXNGYWN0b3JzID0gRikgJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgdGV4dCkNCg0KIyBQdWxsIGEgbGlzdCBvZiBzdG9wd29yZHMNCnN0b3B3b3JkcyA8LSBzdG9wd29yZHMoc291cmNlPSJzbWFydCIpDQoNCiMgUmVtb3ZlIHN0b3B3b3Jkcw0KZGZfZG9jX3N0b3AgPC0gZGZfZG9jICU+JQ0KICBhbnRpX2pvaW4oZGF0YS5mcmFtZSh3b3JkPXN0b3B3b3Jkcywgc3RyaW5nc0FzRmFjdG9ycz1GKSkNCg0KIyBCdWlsZCBhIGNvcnB1cyBvYmplY3QgZm9yIHF1YW50ZWRhDQpjb3JwIDwtIGNvcnB1cyhkZl9kb2Nfc3RvcCwgZG9jaWRfZmllbGQ9IklEIiwgdGV4dF9maWVsZD0id29yZCIpDQoNCiMgUGxvdCBhIHdvcmQgY2xvdWQgLS0gSWYgeW91IGRvbid0IGhhdmUgUkNvbG9yQnJld2VyIGluc3RhbGxlZCwgeW91IGNhbg0KIyByZW1vdmUgdGhlIGBjb2xvcj1gIG9wdGlvbi4NCnRleHRwbG90X3dvcmRjbG91ZChkZm0oY29ycCksIGNvbG9yID0gUkNvbG9yQnJld2VyOjpicmV3ZXIucGFsKDksICJTZXQxIikpDQoNCiNFTkQNCmBgYA==