Dr. Richard M. Crowley
We’ll get started on these today – sentiment and readability
We will cover making topic models in a later session
This is what we will work on today, and we will revisit some of this in the remaining class sessions
Long format
## # A tibble: 3 x 3
## quarter level_3 value
## <chr> <chr> <chr>
## 1 1995-Q1 Wholesale Trade 17
## 2 1995-Q1 Retail Trade -18
## 3 1995-Q1 Accommodation 16
Wide format
## # A tibble: 3 x 4
## RegionID `1996-04` `1996-05` `1996-06`
## <int> <int> <int> <int>
## 1 84654 334200 335400 336500
## 2 90668 235700 236900 236700
## 3 91982 210400 212200 212200
The structure is given by the IDs, dates, and variables
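tidyr can convert between the two formats. A minimal sketch with made-up values shaped like the housing data above – gather() is the counterpart of the spread() used later in these slides:
library(tidyr)
# Hypothetical wide data mimicking the housing example
wide <- data.frame(RegionID = c(84654, 90668),
                   `1996-04` = c(334200, 235700),
                   `1996-05` = c(335400, 236900),
                   check.names = FALSE)
gather(wide, key = "month", value = "value", -RegionID)  # wide -> long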
"JANUARY"
, "ONE"
, "FEMALE"
All of these require us to determine and impose structure
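For instance, a quick sketch of imposing structure on such strings (the coding scheme here is hypothetical):
# Map month names to numbers; code categories as integers
match(stringr::str_to_title("JANUARY"), month.name)         # 1
as.integer(factor("FEMALE", levels = c("MALE", "FEMALE")))  # 2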
What are call centers using NLP for?
How does NLP help call centers with their business?
Where can we make use of NLP in business?
Regular expressions treat these as special characters: \ | ( ) [ { } ^ $ * + ? . !
\ is itself a special character, so we’ll need to put \ before \ to match a literal backslash. Likewise, to match a special character such as $, we would use \\$ – one \ to escape it in the regex, and a second because \ is also special in R strings.
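For example, a small sketch with made-up strings:
library(stringr)
# Matching a literal $ requires the escaped pattern \\$
str_detect("Net income was $13.5 billion", "\\$")  # TRUE
str_detect("Net income was 13.5 billion", "\\$")   # FALSE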
Common escape sequences:
\t is tab
\r is newline (files from Macs)
\r\n is newline (files from Windows)
\n is newline (files from Unix, Linux, etc.)

# Read text from a .txt file using read_file()
doc <- read_file("../../Data/0001104659-14-015152.txt")
# str_wrap is from stringr from tidyverse
cat(str_wrap(substring(doc,1,500), 80))
## UNITED STATES SECURITIES AND EXCHANGE COMMISSION WASHINGTON, D.C. 20549 FORM
## 10-K ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE
## ACT OF 1934 For the fiscal year ended December 31, 2013 Commission file number
## 1-9924 Citigroup Inc. (Exact name of registrant as specified in its charter)
## Securities registered pursuant to Section 12(b) of the Act: See Exhibit 99.01
## Securities registered pursuant to Section 12(g) of the Act: none Indicate by
## check mark if the registrant is a
Web pages are structured by HTML tags such as <a>, <table>, <img>, etc.

library(RCurl)
library(XML)
html <- getURL('https://coinmarketcap.com/currencies/ethereum/')
cat(str_wrap(substring(html, 46320, 46427), 80))
## n class="h2 text-semi-bold details-panel-item--price__value" data-currency-
## value>208.90</span> <span class="
xpath <- '//*[@id="quote_price"]/span[1]/text()'
hdoc = htmlParse(html, asText=TRUE) # from XML
price <- xpathSApply(hdoc, xpath, xmlValue)
print(paste0("Ethereum was priced at $", price,
" when these slides were compiled"))
## [1] "Ethereum was priced at $208.90 when these slides were compiled"
# The actual version I use (with caching to avoid repeated lookups) is in the appendix
cryptoMC <- function(name) {
html <- getURL(paste('https://coinmarketcap.com/currencies/',name,'/',sep=''))
xpath <- '//*[@id="quote_price"]/span[1]/text()'
hdoc = htmlParse(html, asText=TRUE)
plain.text <- xpathSApply(hdoc, xpath, xmlValue)
plain.text
}
paste("Ethereum was priced at", cryptoMC("ethereum"))
## [1] "Ethereum was priced at 208.90"
paste("Litecoin was priced at", cryptoMC("litecoin"))
## [1] "Litecoin was priced at 54.71"
We will cover these using stringr as opposed to base R – stringr’s commands are much more consistent
cat(str_wrap(str_sub(doc, 9896, 9929), 80))
## Citis net income was $13.5 billion
cat(str_wrap(str_sub(doc, 28900,29052), 80))
## Net income decreased 14%, mainly driven by lower revenues and lower loan loss
## reserve releases, partially offset by lower net credit losses and expenses.
paste() separates its arguments with the sep= option and can collapse a vector of strings into one using the collapse= option with the desired separator. paste0() is paste() with sep="".
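A quick sketch of the difference, with illustrative strings:
paste("net", "income", sep = "_")         # "net_income"
paste(c("a", "b", "c"), collapse = ", ")  # "a, b, c"
paste0("FY", 2013)                        # "FY2013"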
sentence <- str_sub(doc, 9896, 9929)
str_to_lower(sentence)
## [1] "citis net income was $13.5 billion"
str_to_upper(sentence)
## [1] "CITIS NET INCOME WAS $13.5 BILLION"
str_to_title(sentence)
## [1] "Citis Net Income Was $13.5 Billion"
str_ prefixed functions support non-English languages as well.

# You can run this in an R terminal! (It doesn't work in Rmarkdown though)
str_to_upper("Citis net income was $13.5 billion", locale='tr') # Turkish
paste() also joins vectors of strings element-wise:
# board is a list of director names
# titles is a list of the director's titles
paste(board, titles, sep=", ")
## [1] "Michael L. Corbat, CEO"
## [2] "Michael E. O’Neill, Chairman"
## [3] "Anthony M. Santomero, Former president, Fed (Philadelphia)"
## [4] "William S. Thompson, Jr., CEO, Retired, PIMCO"
## [5] "Duncan P. Hennes, Co-Founder/Partner, Atrevida Partners"
## [6] "Gary M. Reiner, Operating Partner, General Atlantic"
## [7] "Joan E. Spero, Senior Research Scholar, Columbia University"
## [8] "James S. Turley, Former Chairman & CEO, E&Y"
## [9] "Franz B. Humer, Chairman, Roche"
## [10] "Judith Rodin, President, Rockefeller Foundation"
## [11] "Robert L. Ryan, CFO, Retired, Medtronic"
## [12] "Diana L. Taylor, MD, Wolfensohn Fund Management"
## [13] "Ernesto Zedillo Ponce de Leon, Professor, Yale University"
## [14] "Robert L. Joss, Professor/Dean Emeritus, Stanford GSB"
cat(str_wrap(paste0("Citi's board consists of: ",
paste(board[-length(board)], collapse=", "),
", and ", board[length(board)], "."), 80))
## Citi's board consists of: Michael L. Corbat, Michael E. O’Neill, Anthony M.
## Santomero, William S. Thompson, Jr., Duncan P. Hennes, Gary M. Reiner, Joan E.
## Spero, James S. Turley, Franz B. Humer, Judith Rodin, Robert L. Ryan, Diana L.
## Taylor, Ernesto Zedillo Ponce de Leon, and Robert L. Joss.
sentence
## [1] "Citis net income was $13.5 billion"
str_replace_all(sentence, "\\$13.5", "over $10")
## [1] "Citis net income was over $10 billion"
str_split() returns a list of character vectors; [[1]] can extract the first vector. The n= option limits the number of pieces to split into.
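A sketch with an illustrative string:
str_split("10-K|10-Q|8-K", "\\|")[[1]]         # "10-K" "10-Q" "8-K"
str_split("10-K|10-Q|8-K", "\\|", n = 2)[[1]]  # "10-K" "10-Q|8-K"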
paragraphs <- str_split(doc, '\n')[[1]]
# number of paragraphs
length(paragraphs)
## [1] 206
# Last paragraph
cat(str_wrap(paragraphs[206], 80))
## The total amount of securities authorized pursuant to any instrument defining
## rights of holders of long-term debt of the Company does not exceed 10% of the
## total assets of the Company and its consolidated subsidiaries. The Company
## will furnish copies of any such instrument to the SEC upon request. Copies of
## any of the exhibits referred to above will be furnished at a cost of $0.25 per
## page (although no charge will be made for the 2013 Annual Report on Form 10-
## K) to security holders who make written request to Citigroup Inc., Corporate
## Governance, 153 East 53 rd Street, 19 th Floor, New York, New York 10022. *
## Denotes a management contract or compensatory plan or arrangement. + Filed
## herewith.
str_locate_all(tolower(doc), "net income")
## [[1]]
## start end
## [1,] 8508 8517
## [2,] 9902 9911
## [3,] 16549 16558
## [4,] 17562 17571
## [5,] 28900 28909
## [6,] 32197 32206
## [7,] 35077 35086
## [8,] 37252 37261
## [9,] 40187 40196
## [10,] 43257 43266
## [11,] 45345 45354
## [12,] 47618 47627
## [13,] 51865 51874
## [14,] 51953 51962
## [15,] 52663 52672
## [16,] 52748 52757
## [17,] 54970 54979
## [18,] 58817 58826
## [19,] 96022 96031
## [20,] 96717 96726
## [21,] 99297 99306
## [22,] 188340 188349
## [23,] 189049 189058
## [24,] 201462 201471
## [25,] 456097 456106
## [26,] 460158 460167
## [27,] 460446 460455
## [28,] 460467 460476
## [29,] 475016 475025
## [30,] 475298 475307
## [31,] 545581 545590
## [32,] 554362 554371
str_detect() returns TRUE or FALSE for the presence of a string in the text.

x <- str_detect(str_to_lower(paragraphs), "net income")
x[1:10]
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
sum(x)
## [1] 13
x <- str_count(str_to_lower(paragraphs), "net income")
x[1:10]
## [1] 0 0 0 0 0 4 0 0 2 2
max(x)
## [1] 4
str_locate(str_to_lower(doc), "net income")
## start end
## [1,] 8508 8517
str_extract(str_to_lower(doc), "net income")
## [1] "net income"
Emails contain an @ sign, and [:graph:]+ matches everything before it. Domains are letters and numbers with at least one . in them though, so [:alnum:]+\\.[.[:alnum:]]+ covers everything after the @.
# Extract all emails from the annual report
str_extract_all(doc,'[:graph:]+@[:alnum:]+\\.[.[:alnum:]]+')
## [[1]]
## [1] "shareholder@computershare.com" "shareholder@computershare.com"
## [3] "docserve@citi.com" "shareholderrelations@citi.com"
@ was itself – it isn’t a special character in strings in R
\\. is just a period – we need to escape . because it is special in regexes
[: :] is a set of characters
[:graph:] means any letter, number, or punctuation
[:alnum:] means any letter or number
+ is used to indicate that we want 1 or more of the preceding element – as many as it can match
[:graph:]+ meant “Give us every letter, number, and punctuation you can, but make sure there is at least 1.”
[ ] asks for anything inside it
[.[:alnum:]]+ meant “Give us every letter, number, and . you can, but make sure there is at least 1.”

Breaking shareholder@computershare.com down against [:graph:]+@[:alnum:]+\\.[.[:alnum:]]+:
[:graph:]+ ⇒ shareholder
@ ⇒ @
[:alnum:]+ ⇒ computershare
\\. ⇒ .
[.[:alnum:]]+ ⇒ com
. matches everything
[:alpha:] matches all letters
[:lower:] matches all lowercase letters
[:upper:] matches all UPPERCASE letters
[:digit:] matches all numbers 0 through 9
[:alnum:] matches all letters and numbers
[:punct:] matches all punctuation
[:graph:] matches all letters, numbers, and punctuation
[:space:] or \s match ANY whitespace
\S is the exact opposite
[:blank:] matches whitespace except newlines

text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
alpha=str_detect(text,'[:alpha:]'),
lower=str_detect(text,'[:lower:]'),
upper=str_detect(text,'[:upper:]'),
digit=str_detect(text,'[:digit:]'),
alnum=str_detect(text,'[:alnum:]')
))
text | alpha | lower | upper | digit | alnum |
---|---|---|---|---|---|
abcde | TRUE | TRUE | FALSE | FALSE | TRUE |
ABCDE | TRUE | FALSE | TRUE | FALSE | TRUE |
12345 | FALSE | FALSE | FALSE | TRUE | TRUE |
!?!?. | FALSE | FALSE | FALSE | FALSE | FALSE |
ABC123? | TRUE | FALSE | TRUE | TRUE | TRUE |
With space | TRUE | TRUE | TRUE | FALSE | TRUE |
New line | TRUE | TRUE | TRUE | FALSE | TRUE |
text <- c("abcde", 'ABCDE', '12345', '!?!?.', 'ABC123?', "With space", "New\nline")
html_df(data.frame(
text=text,
punct=str_detect(text,'[:punct:]'),
graph=str_detect(text,'[:graph:]'),
space=str_detect(text,'[:space:]'),
blank=str_detect(text,'[:blank:]'),
period=str_detect(text,'.')
))
text | punct | graph | space | blank | period |
---|---|---|---|---|---|
abcde | FALSE | TRUE | FALSE | FALSE | TRUE |
ABCDE | FALSE | TRUE | FALSE | FALSE | TRUE |
12345 | FALSE | TRUE | FALSE | FALSE | TRUE |
!?!?. | TRUE | TRUE | FALSE | FALSE | TRUE |
ABC123? | TRUE | TRUE | FALSE | FALSE | TRUE |
With space | FALSE | TRUE | TRUE | TRUE | TRUE |
New line | FALSE | TRUE | TRUE | FALSE | TRUE |
[ ] can be used to create a class of characters to look for
[abc] matches anything that is a, b, or c
[^ ] can be used to create a class of everything else
[^abc] matches anything that isn’t a, b, or c
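A sketch with illustrative strings:
str_subset(c("cat", "dog", "bird"), "[abc]")      # "cat"  "bird"
str_subset(c("cat", "dog", "bird"), "^[^abc]+$")  # "dog"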
Where x is some element:
x? looks for 0 or 1 of x
x* looks for 0 or more of x
x+ looks for 1 or more of x
x{n} looks for n (a number) of x
x{n,} looks for at least n of x
x{n,m} looks for at least n and at most m of x
Add ? to any quantity operator to make it prefer the shortest match possible.
^ indicates the start of the string
$ indicates the end of the string
( ) can be used to group components
| can be used within groups as a logical or
\\1 refers to the first group
\\2 refers to the second group

# Real estate firm names with 3 vowels in a row
str_subset(RE_names, '[AEIOU]{3}')
## [1] "STADLAUER MALZFABRIK" "JOAO FORTES ENGENHARIA SA"
# Real estate firm names with no vowels
str_subset(RE_names, '^[^AEIOU]+$')
## [1] "FGP LTD" "MBK PCL" "MYP LTD" "MCT BHD" "R T C L LTD"
# Real estate firm names with at least 11 vowels
str_subset(RE_names, '([^AEIOU]*[AEIOU]){11,}')
## [1] "INTERNATIONAL ENTERTAINMENT" "PREMIERE HORIZON ALLIANCE"
## [3] "JOAO FORTES ENGENHARIA SA" "OVERSEAS CHINESE TOWN (ASIA)"
## [5] "COOPERATIVE CONSTRUCTION CO" "FRANCE TOURISME IMMOBILIER"
## [7] "BONEI HATICHON CIVIL ENGINE"
# Real estate firm names with a repeated 4 letter pattern
str_subset(RE_names, '([:upper:]{4}).*\\1')
## [1] "INTERNATIONAL ENTERTAINMENT" "CHONG HONG CONSTRUCTION CO"
## [3] "ZHONGHONG HOLDING CO LTD" "DEUTSCHE GEOTHERMISCHE IMMOB"
Building blocks for matching dates: [12][90][:digit:][:digit:] for years, [01][:digit:] for months, and [0123][:digit:] for days.
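A sketch combining those pieces into a YYYY-MM-DD matcher (illustrative data; the hyphen-separated format is an assumption):
dates <- c("2013-12-31", "1995-01-15", "not a date")
str_subset(dates, "[12][90][:digit:][:digit:]-[01][:digit:]-[0123][:digit:]")
## [1] "2013-12-31" "1995-01-15"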
Caveat: regexes are generally slow. If you can code something to avoid them, that is often better, but often that is infeasible.
While str_*() functions use regex by default, they actually have four pattern modes: regex() (the default), fixed() for literal matches, coll() for locale-aware collation, and boundary() for matching boundaries, e.g., boundary("word").
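A sketch contrasting the modes, with illustrative strings:
str_detect("3.14", fixed("."))  # TRUE: fixed() treats . as a literal period
str_detect("314", fixed("."))   # FALSE
str_count("Net income was $13.5 billion", boundary("word"))  # counts words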
# Compustat firm names example
df_RE_names <- df_RE %>%
group_by(isin) %>%
slice(1) %>%
mutate(SG_in_name = str_detect(conm, "(SG|SINGAPORE)"),
name_length = str_length(conm),
SG_firm = ifelse(fic=="SGP",1,0)) %>%
ungroup()
df_RE_names %>%
group_by(SG_firm) %>%
mutate(pct_SG = mean(SG_in_name) * 100) %>%
slice(1) %>%
ungroup() %>%
select(SG_firm, pct_SG)
## # A tibble: 2 x 2
## SG_firm pct_SG
## <dbl> <dbl>
## 1 0 0.369
## 2 1 4.76
library(DT)
df_RE_names %>%
group_by(fic) %>%
mutate(avg_name_length = mean(name_length)) %>%
slice(1) %>%
ungroup() %>%
select(fic, avg_name_length) %>%
arrange(desc(avg_name_length), fic) %>%
datatable(options = list(pageLength = 5))
Flesch Reading Ease (the Flesch-Kincaid measure computed below rescales the same inputs into a U.S. grade level):
206.835 - 1.015 \left(\frac{\#~words}{\#~sentences}\right) - 84.6\left(\frac{\#~syllables}{\#~words}\right)
library(quanteda)
textstat_readability(doc, "Flesch.Kincaid")
## document Flesch.Kincaid
## 1 text1 17.56528
Gunning Fog index:
\left[Mean(Words~per~sentence) + (\%~of~words~\geq 3~syllables)\right] \times 0.4
textstat_readability(doc, "FOG")
## document FOG
## 1 text1 21.63388
Coleman-Liau index:
5.88\left(\frac{\#~letters}{\#~words}\right)-29.6\left(\frac{\#~sentences}{\#~words}\right)-15.8
textstat_readability(doc, "Coleman.Liau")
## document Coleman.Liau
## 1 text1 29.03967
unnest_tokens() converts text to lowercase by default; use to_lower=FALSE to avoid this if needed.

# Example of "tokenizing"
library(tidytext)
df_doc <- data.frame(ID=c("0001104659-14-015152"), text=c(doc),
stringsAsFactors = F) %>%
unnest_tokens(word, text)
# word is the name for the new column
# text is the name of the string column in the input data
unnest_tokens() can tokenize into other units via the token= option (e.g., token="ngrams"), with n= setting the n-gram length.
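A sketch of bigrams, mirroring the tokenizing example above:
df_bigrams <- data.frame(ID = c("0001104659-14-015152"), text = c(doc),
                         stringsAsFactors = FALSE) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)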
We can use the tm package (or, as below, the stopwords package) to remove stopwords.
# get a list of stopwords
library(stopwords)
stop_en <- stopwords("english") # Snowball English
paste0(length(stop_en), " words: ", paste(stop_en[1:5], collapse=", "))
## [1] "175 words: i, me, my, myself, we"
stop_SMART <- stopwords(source="smart") # SMART English
paste0(length(stop_SMART), " words: ", paste(stop_SMART[1:5], collapse=", "))
## [1] "571 words: a, a's, able, about, above"
stop_fr <- stopwords("french") # Snowball French
paste0(length(stop_fr), " words: ", paste(stop_fr[1:5], collapse=", "))
## [1] "164 words: au, aux, avec, ce, ces"
df_doc_stop <- df_doc %>%
anti_join(data.frame(word=stop_SMART, stringsAsFactors = F))
## Joining, by = "word"
nrow(df_doc)
## [1] 128728
nrow(df_doc_stop)
## [1] 74985
terms <- df_doc_stop %>%
count(ID, word, sort=TRUE) %>%
ungroup()
total_terms <- terms %>%
group_by(ID) %>%
summarize(total = sum(n))
tf <- left_join(terms, total_terms) %>% mutate(tf=n/total)
## Joining, by = "ID"
tf
## # A tibble: 5,543 x 5
## ID word n total tf
## <chr> <chr> <int> <int> <dbl>
## 1 0001104659-14-015152 citi 826 74985 0.0110
## 2 0001104659-14-015152 2013 743 74985 0.00991
## 3 0001104659-14-015152 credit 704 74985 0.00939
## 4 0001104659-14-015152 citis 660 74985 0.00880
## 5 0001104659-14-015152 risk 624 74985 0.00832
## 6 0001104659-14-015152 december 523 74985 0.00697
## 7 0001104659-14-015152 financial 513 74985 0.00684
## 8 0001104659-14-015152 31 505 74985 0.00673
## 9 0001104659-14-015152 loans 495 74985 0.00660
## 10 0001104659-14-015152 assets 488 74985 0.00651
## # ... with 5,533 more rows
get_sentiments("afinn") %>%
group_by(score) %>%
slice(1) %>%
ungroup()
## # A tibble: 11 x 2
## word score
## <chr> <int>
## 1 bastard -5
## 2 ass -4
## 3 abhor -3
## 4 abandon -2
## 5 absentee -1
## 6 some kind 0
## 7 aboard 1
## 8 abilities 2
## 9 admire 3
## 10 amazing 4
## 11 breathtaking 5
get_sentiments("bing") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 2 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 a+ positive
get_sentiments("nrc") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 10 x 2
## word sentiment
## <chr> <chr>
## 1 abandoned anger
## 2 abundance anticipation
## 3 aberration disgust
## 4 abandon fear
## 5 absolution joy
## 6 abandon negative
## 7 abba positive
## 8 abandon sadness
## 9 abandonment surprise
## 10 abacus trust
Loughran & McDonald dictionary – finance specific, targeted at annual reports
get_sentiments("loughran") %>%
group_by(sentiment) %>%
slice(1) %>%
ungroup()
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 abide constraining
## 2 abovementioned litigious
## 3 abandon negative
## 4 able positive
## 5 aegis superfluous
## 6 abeyance uncertainty
tf_sent <- tf %>% left_join(get_sentiments("loughran"))
## Joining, by = "word"
tf_sent[1:5,]
## # A tibble: 5 x 6
## ID word n total tf sentiment
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 0001104659-14-015152 citi 826 74985 0.0110 <NA>
## 2 0001104659-14-015152 2013 743 74985 0.00991 <NA>
## 3 0001104659-14-015152 credit 704 74985 0.00939 <NA>
## 4 0001104659-14-015152 citis 660 74985 0.00880 <NA>
## 5 0001104659-14-015152 risk 624 74985 0.00832 uncertainty
tf_sent[!is.na(tf_sent$sentiment),][1:5,]
## # A tibble: 5 x 6
## ID word n total tf sentiment
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 0001104659-14-015152 risk 624 74985 0.00832 uncertainty
## 2 0001104659-14-015152 loss 267 74985 0.00356 negative
## 3 0001104659-14-015152 losses 265 74985 0.00353 negative
## 4 0001104659-14-015152 approximately 232 74985 0.00309 uncertainty
## 5 0001104659-14-015152 regulatory 216 74985 0.00288 litigious
tf_sent %>%
spread(sentiment, tf, fill=0) %>%
select(constraining, litigious, negative, positive, superfluous, uncertainty) %>%
colSums()
## constraining litigious negative positive superfluous
## 0.013242649 0.020750817 0.034780289 0.007054744 0.000373408
## uncertainty
## 0.025325065
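One way to condense these proportions into a single tone measure is (positive - negative) / (positive + negative); a sketch, not from the original slides:
pos <- sum(tf_sent$tf[tf_sent$sentiment == "positive"], na.rm = TRUE)
neg <- sum(tf_sent$tf[tf_sent$sentiment == "negative"], na.rm = TRUE)
(pos - neg) / (pos + neg)  # about -0.66 here: a strongly negative tone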
# Wordcloud with stopwords removed
corp <- corpus(df_doc_stop, docid_field="ID", text_field="word")
textplot_wordcloud(dfm(corp), color = RColorBrewer::brewer.pal(10, "RdBu"))
# Wordcloud without stopword removal, for comparison
corp <- corpus(df_doc, docid_field="ID", text_field="word")
textplot_wordcloud(dfm(corp), color = RColorBrewer::brewer.pal(10, "RdBu"))
# Appendix: helper used throughout these slides to render tables
library(knitr)
library(kableExtra)
html_df <- function(text, cols=NULL, col1=FALSE, full=F) {
if(!length(cols)) {
cols=colnames(text)
}
if(!col1) {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full)
} else {
kable(text,"html", col.names = cols, align = c("l",rep('c',length(cols)-1))) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width=full) %>%
column_spec(1,bold=T)
}
}
# Appendix: cached version of cryptoMC() to avoid repeated lookups
cryptoMC <- function(name) {
if (exists(name)) {
get(name)
} else{
html <- getURL(paste('https://coinmarketcap.com/currencies/',name,'/',sep=''))
xpath <- '//*[@id="quote_price"]/span[1]/text()'
doc = htmlParse(html, asText=TRUE)
plain.text <- xpathSApply(doc, xpath, xmlValue)
assign(name, gsub("\n","",gsub(" ", "", paste(plain.text, collapse = ""), fixed = TRUE), fixed = TRUE),envir = .GlobalEnv)
get(name)
}
}
# Loads line-by-line by default
# This makes it document-by-document
library(textreadr)
df2 <- read_dir("G:/2014/2014/") %>%
group_by(document) %>%
mutate(text=paste(content, collapse="\n")) %>%
select(document, text) %>%
slice(1) %>%
ungroup()
# Create a plot of the top words by sentiment
tf_sent %>%
filter(!is.na(sentiment)) %>%
group_by(sentiment) %>%
arrange(desc(n)) %>%
mutate(row = row_number()) %>%
filter(row < 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(y=n, x=word)) + geom_col() + theme(axis.text.x = element_text(angle=90, hjust=1)) +
facet_wrap(~sentiment, ncol=3, scales="free_x")