Dr. Richard M. Crowley
For reading large files, readr is your friend
library(readr) # or library(tidyverse)
df <- read_csv("really_big_file.csv.zip")
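If a file is too big to hold in memory even with readr, a chunked read keeps memory bounded. A minimal sketch, assuming we only need a filtered or aggregated subset; the callback body and chunk size here are illustrative, not from the original slides:
# Hedged sketch: stream the file in chunks instead of loading it all at once
library(readr)
df <- read_csv_chunked(
  "really_big_file.csv.zip",
  callback = DataFrameCallback$new(function(chunk, pos) {
    chunk  # in practice, filter or aggregate here so only what you need is kept
  }),
  chunk_size = 100000
)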
library(readtext)
library(quanteda)
# Needs ~1.5GB
corp <- corpus(readtext("/media/Scratch/Data/Parser2/10-K/2014/*.txt"))
summary(corp)
## Text Types Tokens Sentences
## 1 0000002178-14-000010.txt 2929 22450 798
## 2 0000003499-14-000005.txt 2710 23907 769
## 3 0000003570-14-000031.txt 3866 55142 1541
## 4 0000004187-14-000020.txt 2902 26959 934
## 5 0000004457-14-000036.txt 3050 23941 883
## 6 0000004904-14-000019.txt 3408 30358 1119
## 7 0000004904-14-000029.txt 370 1308 40
## 8 0000004904-14-000031.txt 362 1302 45
## 9 0000004904-14-000034.txt 358 1201 42
## 10 0000004904-14-000037.txt 367 1269 45
## 11 0000004977-14-000052.txt 4859 73718 2457
## 12 0000005513-14-000008.txt 5316 91413 2918
## 13 0000006201-14-000004.txt 5377 113072 3437
## 14 0000006845-14-000009.txt 3232 28186 981
## 15 0000007039-14-000002.txt 2977 19710 697
## 16 0000007084-14-000011.txt 3912 46631 1531
## 17 0000007332-14-000004.txt 4802 58263 1766
## 18 0000008868-14-000013.txt 4252 62537 1944
## 19 0000008947-14-000068.txt 2904 26081 881
## 20 0000009092-14-000004.txt 3033 25204 896
## 21 0000009346-14-000004.txt 2909 27542 863
## 22 0000009984-14-000030.txt 3953 44728 1550
## 23 0000011199-14-000006.txt 3446 29982 1062
## 24 0000011544-14-000012.txt 3838 41611 1520
## 25 0000012208-14-000020.txt 3870 39709 1301
## 26 0000012400-14-000004.txt 2807 19214 646
## 27 0000012779-14-000010.txt 3295 34173 1102
## 28 0000012927-14-000004.txt 4371 48588 1676
## 29 0000013239-14-000010.txt 5538 78463 2522
## 30 0000014272-14-000054.txt 4976 59373 2123
## 31 0000014693-14-000014.txt 3832 33197 1167
## 32 0000014707-14-000020.txt 4125 47644 1593
## 33 0000014930-14-000039.txt 4283 54619 1778
## 34 0000015511-14-000004.txt 2825 19799 656
## 35 0000015615-14-000027.txt 4776 64639 2106
## 36 0000016918-14-000011.txt 4084 50041 1517
## 37 0000018172-14-000009.txt 3786 42474 1372
## 38 0000018230-14-000058.txt 2810 17821 603
## 39 0000018255-14-000009.txt 2930 23789 804
## 40 0000018396-14-000004.txt 3095 30957 1083
## 41 0000018498-14-000012.txt 4024 50836 1497
## 42 0000019149-14-000005.txt 3471 41261 1312
## 43 0000019612-14-000013.txt 4571 68707 2108
## 44 0000019617-14-000289.txt 2769 19899 547
## 45 0000020212-14-000007.txt 5003 63132 1970
## 46 0000020232-14-000009.txt 3104 28789 930
## 47 0000020286-14-000009.txt 4930 64244 2312
## 48 0000020520-14-000017.txt 3613 34691 1115
## 49 0000020629-14-000037.txt 3091 30904 990
## 50 0000020639-14-000016.txt 2273 22888 651
## 51 0000020740-14-000007.txt 2964 22086 670
## 52 0000021175-14-000021.txt 3670 43256 1551
## 53 0000021344-14-000008.txt 5060 81806 2707
## 54 0000022356-14-000010.txt 4386 54007 1921
## 55 0000023082-14-000039.txt 4911 75305 2284
## 56 0000023111-14-000003.txt 2831 28449 895
## 57 0000023675-14-000011.txt 3267 27989 1009
## 58 0000024545-14-000004.txt 5072 73746 2393
## 59 0000025232-14-000004.txt 3454 37013 1198
## 60 0000025475-14-000033.txt 3944 50297 1618
## 61 0000026172-14-000008.txt 4129 51559 1704
## 62 0000026324-14-000004.txt 4748 51365 1777
## 63 0000026780-14-000005.txt 3752 41220 1479
## 64 0000027419-14-000014.txt 3120 26800 959
## 65 0000027904-14-000003.txt 4193 45453 1714
## 66 0000027996-14-000013.txt 4224 56718 2015
## 67 0000028385-14-000003.txt 295 707 31
## 68 0000028412-14-000067.txt 4919 72537 2312
## 69 0000028823-14-000032.txt 4247 45524 1402
## 70 0000028917-14-000049.txt 3091 28061 963
## 71 0000029332-14-000034.txt 3291 30825 1085
## 72 0000029905-14-000012.txt 3949 39853 1204
## 73 0000029915-14-000010.txt 3037 24109 787
## 74 0000029915-14-000011.txt 5287 68694 2258
## 75 0000029989-14-000002.txt 3209 37369 1267
## 76 0000030554-14-000002.txt 3425 23638 844
## 77 0000030625-14-000017.txt 4595 52765 1746
## 78 0000030697-14-000006.txt 4214 56586 1706
## 79 0000031107-14-000006.txt 2662 22442 738
## 80 0000031347-14-000006.txt 3868 34788 1123
## 81 0000031978-14-000006.txt 3784 45236 1550
## 82 0000034067-14-000003.txt 4050 35325 1214
## 83 0000034088-14-000012.txt 4024 37020 1397
## 84 0000034782-14-000008.txt 4198 41171 1390
## 85 0000034903-14-000009.txt 3527 38999 1222
## 86 0000036047-14-000006.txt 4185 49537 1523
## 87 0000037785-14-000006.txt 4181 39318 1350
## 88 0000037996-14-000010.txt 4735 45858 1695
## 89 0000038009-14-000027.txt 3249 26019 963
## 90 0000038074-14-000019.txt 4967 67626 2309
## 91 0000038079-14-000018.txt 3911 46199 1452
## 92 0000038725-14-000042.txt 3385 31007 1098
## 93 0000039263-14-000013.txt 4881 73192 2331
## 94 0000039899-14-000010.txt 4935 50546 1868
## 95 0000039911-14-000045.txt 3196 30736 1073
## 96 0000040211-14-000010.txt 3837 41081 1458
## 97 0000040533-14-000002.txt 3902 34523 1385
## 98 0000040554-14-000023.txt 5488 85180 2831
## 99 0000040554-14-000024.txt 4348 60770 2007
## 100 0000040729-14-000007.txt 5848 100971 3599
# Uses ~20GB of RAM... Break corp into chunks if RAM constrained
corp_FOG <- textstat_readability(corp, "FOG")
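# If RAM constrained, one hedged workaround (illustrative, not from the slides):
# score the corpus in chunks and row-bind the results.
# docs   <- docnames(corp)
# chunks <- split(docs, ceiling(seq_along(docs) / 500))
# corp_FOG <- do.call(rbind, lapply(chunks, function(d) {
#   textstat_readability(corpus_subset(corp, docnames(corp) %in% d), "FOG")
# }))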
corp_FOG %>%
  head() %>%
  html_df()
document | FOG |
---|---|
0000002178-14-000010.txt | 21.03917 |
0000003499-14-000005.txt | 20.36549 |
0000003570-14-000031.txt | 22.24386 |
0000004187-14-000020.txt | 18.75720 |
0000004457-14-000036.txt | 19.22683 |
0000004904-14-000019.txt | 20.51594 |
Recall that Citi’s annual report had a Fog index of 21.63
summary(corp_FOG$FOG)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.33 20.32 21.01 21.05 21.75 35.37
ggplot(corp_FOG, aes(x=FOG)) + geom_density()
df_SIC <- read.csv('../../Data/Filings2014.csv') %>%
  select(accession, regsic) %>%
  mutate(accession = paste0(accession, ".txt")) %>%
  rename(document = accession) %>%
  mutate(industry = case_when(
    regsic >= 0100 & regsic <= 0999 ~ "Agriculture",
    regsic >= 1000 & regsic <= 1499 ~ "Mining",
    regsic >= 1500 & regsic <= 1799 ~ "Construction",
    regsic >= 2000 & regsic <= 3999 ~ "Manufacturing",
    regsic >= 4000 & regsic <= 4999 ~ "Utilities",
    regsic >= 5000 & regsic <= 5199 ~ "Wholesale Trade",
    regsic >= 5200 & regsic <= 5999 ~ "Retail Trade",
    regsic >= 6000 & regsic <= 6799 ~ "Finance",
    regsic >= 7000 & regsic <= 8999 ~ "Services",
    regsic >= 9100 & regsic <= 9999 ~ "Public Admin"
  )) %>%
  group_by(document) %>%
  slice(1) %>%
  ungroup()
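Before joining, a quick sanity check on the mapping is worthwhile; this one-liner is illustrative rather than from the original slides:
# Illustrative check: how many filings land in each industry bucket?
df_SIC %>% count(industry, sort = TRUE)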
corp_FOG <- corp_FOG %>% left_join(df_SIC)
## Joining, by = "document"
corp_FOG %>%
head() %>%
html_df()
document | FOG | regsic | industry |
---|---|---|---|
0000002178-14-000010.txt | 21.03917 | 5172 | Wholesale Trade |
0000003499-14-000005.txt | 20.36549 | 6798 | Finance |
0000003570-14-000031.txt | 22.24386 | 4924 | Utilities |
0000004187-14-000020.txt | 18.75720 | 4950 | Utilities |
0000004457-14-000036.txt | 19.22683 | 7510 | Services |
0000004904-14-000019.txt | 20.51594 | 4911 | Utilities |
ggplot(corp_FOG[!is.na(corp_FOG$industry),], aes(x=factor(industry), y=FOG)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(corp_FOG[!is.na(corp_FOG$industry),], aes(x=FOG)) +
  geom_density() + facet_wrap(~industry)
library(lattice)
densityplot(~FOG | industry,
            data = corp_FOG,
            plot.points = FALSE,
            main = "Fog index distribution by industry (SIC)",
            xlab = "Fog index",
            layout = c(3, 3))
library(DT)  # provides datatable()
kwic(corp, phrase("global warming")) %>%
  mutate(text = paste(pre, keyword, post)) %>%
  select(docname, text) %>%
  datatable(options = list(pageLength = 5), rownames = FALSE)
It’s quite complicated in the background, but it boils down to a system where each topic is a distribution over words, and each document is modeled as a mixture of topics.
There is also the stmBrowser package for visualization (on GitHub).
Think about the structure of a matrix where rows are document names and columns are individual words. How much of this matrix will be 0s? (A quick sketch follows the option list below.)
- stem=TRUE: code similar words as the same
- remove=c(...): you can supply a list of stop words to remove
  - remove=stopwords() for a simple list
  - remove=stopwords(source='smart') for a more thorough list
  - remove=stopwords("zh", source="stopwords-iso") for stop words in other languages, here Chinese
# adding industry to the corpus
docs <- docnames(corp)
docs <- data.frame(document = docs, stringsAsFactors = FALSE)
docs <- docs %>% left_join(df_SIC)
docvars(corp, field = "industry") <- docs$industry

# Simplest way
tdm <- dfm(corp)

# With stopwords
tdm <- dfm(corp,
           remove = stopwords(source = 'smart'))

# With stopwords and stemming -> Used in next slides
# 2.6B elements in the matrix
tdm <- dfm(corp,
           stem = TRUE,
           remove = stopwords(source = 'smart'),
           remove_punct = TRUE,
           remove_numbers = TRUE) %>%
  dfm_trim(min_termfreq = 10, termfreq_type = "count")
topfeatures(tdm, n=5, groups="industry")
## $`Wholesale Trade`
## compani oper million financi product
## 30371 20340 18085 17552 17300
##
## $Finance
## compani loan financi decemb million
## 438185 392164 299978 286791 274376
##
## $Utilities
## oper million compani financi includ
## 112038 107322 101971 79010 76604
##
## $Services
## compani oper million financi servic
## 222276 145506 138397 131881 120817
##
## $Manufacturing
## compani product million oper financi
## 434805 368900 275829 240181 231687
##
## $Mining
## compani oper gas oil decemb
## 97798 92076 74150 65532 60475
##
## $Construction
## compani million oper financi decemb
## 15479 14885 12431 10899 10149
##
## $`Retail Trade`
## compani oper million financi includ
## 62780 43637 41428 35824 32478
##
## $Agriculture
## compani oper financi year product
## 4200 3016 2949 2756 2750
##
## $<NA>
## numeric(0)
\frac{\text{How many times a word is in the document}}{\text{How many documents the word is in}}
More formally, where f_{w,d} is the count of word w in document d, f_d is the total number of words in d, n_t is the number of documents containing the term, and N is the total number of documents:
\frac{f_{w,d}}{f_d} \cdot -\log_2\left(\frac{n_t}{N}\right)
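To make the weights concrete, a toy calculation with purely hypothetical counts:
# Hypothetical: a word used 5 times in a 1,000-word filing,
# and appearing in 2 of the 100 filings in the corpus
tf  <- 5 / 1000        # proportional term frequency, f_wd / f_d
idf <- -log2(2 / 100)  # inverse document frequency, -log2(n_t / N)
tf * idf               # ~0.028; words used often but rare across filings score highest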
tfidf_mat <- dfm_tfidf(tdm, base=2, scheme_tf="prop")
topfeatures(tfidf_mat, n=5, groups="industry")
## $`Wholesale Trade`
## graybar grainger oil million bottl
## 0.3140485 0.2899255 0.2187512 0.2184815 0.2122642
##
## $Finance
## ab mortgag depositor loan reit
## 9.863862 7.414096 6.192815 5.109854 5.046502
##
## $Utilities
## gas fcc pipelin energi aircraft
## 2.005220 1.484092 1.227766 1.164767 1.020255
##
## $Services
## game client casino million softwar
## 2.394468 1.760647 1.635549 1.496073 1.404740
##
## $Manufacturing
## clinic fda trial drug patient
## 7.057913 5.487707 3.949705 3.935010 3.799611
##
## $Mining
## gas oil drill well explor
## 6.550322 6.308205 4.935983 2.412994 2.035304
##
## $Construction
## homebuild home iveda layn alp
## 0.5143533 0.3827212 0.3557692 0.2360279 0.2303252
##
## $`Retail Trade`
## restaur store merchandis franchise franchis
## 2.6829714 1.5131383 1.3382872 0.8695339 0.5706876
##
## $Agriculture
## yew uspb mushroom prc nbp
## 0.2894138 0.2140732 0.2066838 0.2031097 0.1862960
##
## $<NA>
## numeric(0)
topfeatures(tfidf_mat, n=20, groups="industry")$Finance
# Load a saved copy of these results
readRDS('../../Data/corp_tfidf_bank.rds')
## ab mortgag depositor loan reit trust
## 9.863862 7.414096 6.192815 5.109854 5.046502 4.394811
## reinsur truste estat tenant instruct partnership
## 3.809024 3.607591 3.188824 3.100092 2.970419 2.697215
## real million pool fdic residenti bancorp
## 2.506670 2.482285 2.287610 2.238533 2.149133 2.074819
## obligor rmbs
## 2.055811 2.055453
# quanteda's conversion for the stm package
out <- convert(tdm, to = 'stm')
# quanteda's conversion for the lda package
# out <- convert(tdm, to = 'lda')
# quanteda's conversion for the topicmodels package
# out <- convert(tdm, to = 'topicmodels')
- out$documents: index number of each word and its count for each document
- out$vocab: words and their index numbers
- out$meta: a data frame of the other information from the corpus (like industry)
out$documents[[1]][,386:390]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 14590 14593 14598 14614 14625
## [2,] 1 1 38 3 1
out$vocab[c(out$documents[[1]][,386:390][1,])]
## [1] "earlier" "earliest" "earn" "earthen" "eas"
- K, the number of topics we want. I’ll use 10 for simplicity, but often we need more to neatly categorize the text
- K=100 is a popular choice when we are using the output as an input to another model
- K=31 has also been used, as that captures the most restatements in sample
library(stm)
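Before committing to a K, stm's searchK() can compare candidate values on diagnostics such as held-out likelihood and semantic coherence. A sketch, commented out because it is slow on a corpus this size; the grid of K values is illustrative:
# Hedged sketch: model diagnostics across candidate K values
# kres <- searchK(out$documents, out$vocab, K = c(5, 10, 20, 30))
# plot(kres)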
topics <- stm(out$documents, out$vocab, K=10)
labelTopics(topics)
## Topic 1 Top Words:
## Highest Prob: properti, oper, million, decemb, compani, interest, leas
## FREX: ffo, efih, efh, tenant, hotel, casino, guc
## Lift: aliansc, baluma, change-of-ownership, crj700s, directly-reimburs, escena, hhmk
## Score: reit, hotel, game, ffo, tenant, casino, efih
## Topic 2 Top Words:
## Highest Prob: compani, stock, share, common, financi, director, offic
## FREX: prc, asher, shaanxi, wfoe, eit, hubei, yew
## Lift: aagc, abramowitz, accello, akash, alix, alkam, almati
## Score: prc, compani, penni, stock, share, rmb, director
## Topic 3 Top Words:
## Highest Prob: product, develop, compani, clinic, market, includ, approv
## FREX: dose, preclin, nda, vaccin, oncolog, anda, fdas
## Lift: 1064nm, 12-001hr, 25-gaug, 2ml, 3shape, 503b, 600mg
## Score: clinic, fda, preclin, dose, patent, nda, product
## Topic 4 Top Words:
## Highest Prob: invest, fund, manag, market, asset, trade, interest
## FREX: uscf, nfa, unl, uga, mlai, bno, dno
## Lift: a-1t, aion, apx-endex, bessey, bolduc, broyhil, buran
## Score: uscf, fhlbank, rmbs, uga, invest, mlai, ung
## Topic 5 Top Words:
## Highest Prob: servic, report, file, program, provid, network, requir
## FREX: echostar, fcc, fccs, telesat, ilec, starz, retransmiss
## Lift: 1100-n, 2-usb, 2011-c1, 2012-ccre4, 2013-c9, aastra, accreditor
## Score: entergi, fcc, echostar, wireless, broadcast, video, cabl
## Topic 6 Top Words:
## Highest Prob: loan, bank, compani, financi, decemb, million, interest
## FREX: nonaccru, oreo, tdrs, bancorp, fdic, charge-off, alll
## Lift: 100bp, 4-famili, acnb, acquired-impair, amerihom, ameriserv, annb
## Score: fhlb, loan, bank, mortgag, risk-weight, tdrs, nonaccru
## Topic 7 Top Words:
## Highest Prob: compani, million, oper, financi, revenu, result, includ
## FREX: vmware, imax, franchise, merchandis, affinion, exhibitor, softwar
## Lift: 4.75x, 9corpor, accessdm, acvc, adtech, adxpos, aecsoft
## Score: million, product, restaur, custom, game, video, merchandis
## Topic 8 Top Words:
## Highest Prob: compani, insur, million, loss, financi, invest, rate
## FREX: policyhold, reinsur, lae, dac, annuiti, ambac, cede
## Lift: agcpl, ahccc, amcareco, argoglob, asil, connecticut-domicil, feinsod
## Score: reinsur, policyhold, lae, onebeacon, insur, million, dac
## Topic 9 Top Words:
## Highest Prob: million, compani, oper, financi, cost, product, decemb
## FREX: wafer, alcoa, pepco, dpl, nstar, usec, kcsm
## Lift: 1.5mw, 11-hour, 1ynanomet, 3dfx, 3ms, 3pd, 40g
## Score: million, product, ameren, cleco, manufactur, wafer, postretir
## Topic 10 Top Words:
## Highest Prob: gas, oper, oil, natur, million, cost, decemb
## FREX: ngl, ngls, oneok, mgp, permian, qep, wes
## Lift: 12asset, 1businesscommerci, 94-mile, aivh, amargo, amopp, angell
## Score: gas, drill, oil, ngl, crude, unithold, ngls
Highest Prob is the most straightforward measure to interpret: the most probable words within each topic. FREX, Lift, and Score instead favor words that are relatively exclusive to a topic.
out$meta$industry <- factor(out$meta$industry)
out$meta$industry <- addNA(out$meta$industry)
doc_topics = data.frame(document = names(out$documents),
                        industry = out$meta$industry,
                        topic = 1,
                        weight = topics$theta[, 1])
for (i in 2:10) {
  temp = data.frame(document = names(out$documents),
                    industry = out$meta$industry,
                    topic = i,
                    weight = topics$theta[, i])
  doc_topics = rbind(doc_topics, temp)
}
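The same long data frame can be built without the loop; a sketch using tidyr's gather() (used again later in this deck), assuming theta's columns correspond to topics 1 through 10:
# Hedged alternative: reshape theta in one step
library(tidyr)  # for gather()
doc_topics2 <- as.data.frame(topics$theta) %>%
  setNames(1:10) %>%
  mutate(document = names(out$documents),
         industry = out$meta$industry) %>%
  gather("topic", "weight", -document, -industry) %>%
  mutate(topic = as.numeric(topic))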
# Proportional topics (%)
# (theta's rows already sum to 1, so topic_prop simply equals weight here)
doc_topics <- doc_topics %>%
  group_by(document) %>%
  mutate(topic_prop = weight / sum(weight)) %>%
  ungroup()

# Manually label topics
topic_labels = data.frame(topic = 1:10,
                          topic_name = c('Real Estate', 'Management', 'Product',
                                         'Investment', 'Services', 'Financing',
                                         'Service2', 'Insurance', 'Industrial',
                                         'Utility'))
doc_topics <- doc_topics %>% left_join(topic_labels)
doc_topics %>% filter(document=='0001104659-14-015152.txt')
## # A tibble: 10 x 6
## document industry topic weight topic_prop topic_name
## <fct> <fct> <dbl> <dbl> <dbl> <fct>
## 1 0001104659-14-015152.txt Finance 1 0.000316 0.000316 Real Estate
## 2 0001104659-14-015152.txt Finance 2 0.0000594 0.0000594 Management
## 3 0001104659-14-015152.txt Finance 3 0.0000153 0.0000153 Product
## 4 0001104659-14-015152.txt Finance 4 0.168 0.168 Investment
## 5 0001104659-14-015152.txt Finance 5 0.0172 0.0172 Services
## 6 0001104659-14-015152.txt Finance 6 0.433 0.433 Financing
## 7 0001104659-14-015152.txt Finance 7 0.00332 0.00332 Service2
## 8 0001104659-14-015152.txt Finance 8 0.303 0.303 Insurance
## 9 0001104659-14-015152.txt Finance 9 0.0755 0.0755 Industrial
## 10 0001104659-14-015152.txt Finance 10 0.0000558 0.0000558 Utility
doc_topics %>%
  filter(document == '0001104659-14-015152.txt' |
         document == '0000019617-14-000289.txt') %>%
  mutate(Company = ifelse(document == '0001104659-14-015152.txt', 'Citi', 'JPM')) %>%
  ggplot(aes(x = factor(topic_name), y = topic_prop, fill = factor(topic_name))) +
  geom_col() + facet_wrap(~Company) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
doc_topics %>%
  group_by(industry, topic) %>%
  mutate(topic_prop = mean(topic_prop)) %>%
  slice(1) %>%
  ungroup() %>%
  ggplot(aes(x = factor(topic_name), y = topic_prop, fill = factor(topic_name))) +
  geom_col() + facet_wrap(~industry) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
These interactive visualizations come from the stm package's toLDAvis() function and the stmBrowser package:
# Code to generate LDAvis
toLDAvis(topics, out$documents, R = 10)

# Code to generate stmBrowser
stmBrowser(topics,
           data = data.frame(text = names(out$documents),
                             industry = out$meta$industry),
           c('industry'), text = 'text')
How might we leverage LDA (or other topic modeling methods) to improve and simplify analytics?
We’ll build a different classification system, based on what firms discuss in their annual reports
\min_{C_1,\dots,C_K} \sum_{k=1}^{K} \sum_{x_i\in C_k} \left\lVert x_i - \mu_k\right\rVert^2
Here each x_i is a document's vector of topic proportions, and \mu_k is the centroid (mean) of cluster C_k.
library(tidyr)
# Columns 1, 2, 5, 6 of doc_topics: document, industry, topic_prop, topic_name
wide_topics <- spread(doc_topics[, c(1,2,5,6)], topic_name, topic_prop)
mat <- wide_topics[, 3:12]  # just the 10 topic-proportion columns
mat[, 1:6] %>% head() %>% html_df()
Financing | Industrial | Insurance | Investment | Management | Product |
---|---|---|---|---|---|
0.0105862 | 0.1578543 | 0.1088631 | 0.0004632 | 0.1161191 | 0.0002101 |
0.0467173 | 0.0059438 | 0.0235389 | 0.0005284 | 0.0801189 | 0.0001432 |
0.0069105 | 0.0351987 | 0.0003661 | 0.0201215 | 0.0023672 | 0.0000186 |
0.0870371 | 0.8271759 | 0.0003259 | 0.0003334 | 0.0206444 | 0.0000485 |
0.0036086 | 0.2680866 | 0.2677154 | 0.0008808 | 0.0026448 | 0.0000949 |
0.0000976 | 0.5299432 | 0.0001593 | 0.0007533 | 0.0009532 | 0.0000318 |
set.seed(6845868)
clusters <- kmeans(mat, 9)
clusters$cluster %>% head()
## [1] 9 1 9 2 2 2
cbind(as.data.frame(clusters$center), data.frame(kmean = 1:9)) %>%
  gather("Topics", "weights", -kmean) %>%
  ggplot(aes(x = factor(Topics), y = weights, fill = factor(Topics))) +
  geom_col() +
  facet_wrap(~kmean) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
library(cluster)  # Uses PCA (principal component analysis)
clusplot(mat, clusters$cluster, color = TRUE, shade = TRUE,
         labels = 4)
library(Rtsne)
dups <- which(duplicated(mat))
wide_nodup <- wide_topics[-dups, ]
wide_nodup$kmean <- clusters$cluster[-dups]
# Rtsne's Barnes-Hut algorithm is O(n log n); the original t-SNE was O(n^2)
tsne_data <- Rtsne(mat[-dups, ])
wide_nodup <- wide_nodup %>%
  mutate(tsne1 = tsne_data$Y[, 1], tsne2 = tsne_data$Y[, 2])
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, colour = industry)) +
  geom_point(alpha = 0.3) + theme_bw()
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, colour = factor(kmean))) +
  geom_point(alpha = 0.3) + theme_bw()
ggplot(wide_nodup, aes(x = kmean)) + geom_bar() + facet_wrap(~factor(industry))
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, color = factor(kmean))) + geom_point() +
  facet_wrap(~factor(industry))
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, color = factor(industry))) + geom_point() +
  facet_wrap(~factor(kmean))
# Note: rowSums() here sums each row's differences before squaring; because topic
# proportions and cluster centers both sum to 1, these row sums are nearly 0
wide_topics$dist <- sqrt(rowSums(mat - fitted(clusters))^2)
wide_topics[,c(1,2,3,13)] %>% arrange(desc(dist)) %>% slice(1:5) %>% html_df()
document | industry | Financing | dist |
---|---|---|---|
0000771266-14-000007.txt | Manufacturing | 0.0071376 | 0 |
0001193125-14-098013.txt | Manufacturing | 0.0079638 | 0 |
0000880177-14-000019.txt | Manufacturing | 0.0000968 | 0 |
0001193125-14-109073.txt | Services | 0.0004547 | 0 |
0001046311-14-000004.txt | Services | 0.0375458 | 0 |
dist is shown as 0 because the distances are very, very small (but positive)
wide_topics[, c(1,2,4,7,9,10,13)] %>%
  filter(industry != "Finance") %>%
  arrange(desc(dist)) %>%
  slice(1:5)
## # A tibble: 5 x 7
## document industry Industrial Management `Real Estate` Service2 dist
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00007712… Manufac… 0.351 0.0385 0.000277 0.568 1.13e-15
## 2 00011931… Manufac… 0.143 0.0579 0.0107 0.739 1.13e-15
## 3 00008801… Manufac… 0.367 0.00317 0.000111 0.560 1.11e-15
## 4 00011931… Services 0.00167 0.169 0.263 0.504 1.11e-15
## 5 00010463… Services 0.00242 0.00510 0.373 0.540 1.11e-15
Text based industry classification using 10-Ks has been shown to be quite viable, such as in work by Hoberg and Phillips.
What else could we use clustering to solve?
Recall that some filings' industry measure was NA; we can train on the filings with a known industry and predict it for the rest.
train <- wide_topics[!is.na(wide_topics$industry), ]
label <- wide_topics[is.na(wide_topics$industry), ]
library(caret)
trControl <- trainControl(method = 'cv', number = 10)  # 10-fold cross-validation
tout <- train(industry ~ .,
              method = 'knn',
              tuneGrid = expand.grid(k = 1:20),
              trControl = trControl,
              metric = "Accuracy",
              data = train[, -1])
saveRDS(tout, '../../Data/corp_knn.rds')
tout
## k-Nearest Neighbors
##
## 5804 samples
## 10 predictor
## 9 classes: 'Agriculture', 'Construction', 'Finance', 'Manufacturing', 'Mining', 'Retail Trade', 'Services', 'Utilities', 'Wholesale Trade'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5226, 5222, 5223, 5224, 5223, 5226, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.6922669 0.6037548
## 2 0.6883222 0.5984635
## 3 0.7219205 0.6397779
## 4 0.7305403 0.6495724
## 5 0.7374387 0.6581581
## 6 0.7384702 0.6592123
## 7 0.7460449 0.6686815
## 8 0.7505306 0.6741651
## 9 0.7515604 0.6753179
## 10 0.7512102 0.6749574
## 11 0.7489795 0.6718804
## 12 0.7491537 0.6719035
## 13 0.7525919 0.6764543
## 14 0.7508766 0.6741010
## 15 0.7529349 0.6766597
## 16 0.7506983 0.6737148
## 17 0.7500110 0.6727821
## 18 0.7488041 0.6711643
## 19 0.7494908 0.6718556
## 20 0.7496676 0.6719961
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 15.
ggplot(tout$results, aes(x = k, y = Accuracy)) +
  geom_line() +
  geom_ribbon(aes(ymin = Accuracy - AccuracySD * 1.96,
                  ymax = Accuracy + AccuracySD * 1.96), alpha = 0.2) +
  geom_vline(xintercept = 15, color = "blue") +
  xlab("k, optimal = 15")
label$industry_pred <- predict(tout, label)
label[, c("document", "industry_pred")] %>%
  head() %>%
  html_df()
document | industry_pred |
---|---|
0000817473-14-000010.txt | Finance |
0000820027-14-000025.txt | Finance |
0000837465-14-000002.txt | Manufacturing |
0000837919-14-000002.txt | Finance |
0000891092-14-000570.txt | Finance |
0000891092-14-002078.txt | Finance |
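These filings have no ground-truth labels, but we can at least inspect the tuned model's in-sample fit. This check is illustrative (not from the original slides) and will be optimistic relative to the ~75% cross-validated accuracy above:
# Illustrative: in-sample confusion matrix for the tuned kNN model
confusionMatrix(predict(tout, train), factor(train$industry))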
“Any sufficiently advanced technology is indistinguishable from magic.” – Sir Arthur Charles Clarke
Today, we:
- Built a corpus out of a year of 10-K filings
- Measured readability at scale with the Fog index
- Weighted terms with tf-idf
- Extracted topics with LDA (via the stm package)
- Clustered filings with k-means and visualized them with t-SNE
- Filled in missing industry classifications with kNN
# Appendix: html_df(), the table-rendering helper used throughout these slides
library(knitr)
library(kableExtra)
html_df <- function(text, cols = NULL, col1 = FALSE, full = F) {
  if (!length(cols)) {
    cols = colnames(text)
  }
  if (!col1) {
    kable(text, "html", col.names = cols, align = c("l", rep('c', length(cols) - 1))) %>%
      kable_styling(bootstrap_options = c("striped", "hover"), full_width = full)
  } else {
    kable(text, "html", col.names = cols, align = c("l", rep('c', length(cols) - 1))) %>%
      kable_styling(bootstrap_options = c("striped", "hover"), full_width = full) %>%
      column_spec(1, bold = T)
  }
}
# Appendix: repeating the clustering with 50 clusters
clusters <- kmeans(mat, 50)
clusters$cluster %>% head()
## [1] 32 35 42 7 27 22
wide_nodup$kmean2 <- clusters$cluster[-dups]
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, colour = factor(kmean2))) +
  geom_point(alpha = 0.3) + theme_bw()
ggplot(wide_nodup, aes(x = kmean2)) + geom_bar() + facet_wrap(~factor(industry))
ggplot(wide_nodup, aes(x = tsne1, y = tsne2, color = factor(kmean2))) + geom_point() +
  facet_wrap(~factor(industry))