# `articles` holds the cleaned documents to model (the saved corpus is named
# corpus_WSJ.pkl, so presumably WSJ articles — the earlier "10-K filings /
# doc_names" wording looked stale; TODO confirm against the loading step).
# Prepare the inputs for gensim's LDA implementation.
words = gensim.corpora.Dictionary(articles)
# Drop tokens appearing in fewer than 3 documents or in more than 50% of them.
words.filter_extremes(no_below=3, no_above=0.5)
# '_' is not treated as a symbol by spaCy, so remove it from the vocabulary.
# Guard the lookup: '_' may already have been dropped by filter_extremes (or
# never occurred), in which case token2id['_'] would raise KeyError.
if '_' in words.token2id:
    words.filter_tokens(bad_ids=[words.token2id['_']])
# Bag-of-words representation of every document.
corpus = [words.doc2bow(doc) for doc in articles]
# Save the intermediate data -- useful if we want to tweak model parameters and re-run later
with open('../../Data/corpus_WSJ.pkl', 'wb') as f:
    pickle.dump([corpus, words], f, protocol=pickle.HIGHEST_PROTOCOL)
# Run the model: 10 topics, 5 full passes, online updates every 5 chunks,
# with alpha/eta learned from the data ('auto').
lda = gensim.models.ldamodel.LdaModel(corpus, id2word=words, num_topics=10, passes=5,
                                      update_every=5, alpha='auto', eta='auto')
# Main application: Analyzing Wall Street Journal articles
# - On eLearn you will find a full month of the WSJ in text format
# Tasks:
# - Apply a topic model to the documents
# - Analyze the effect of gender on topic discussion using STM
# - Apply ETM to the documents to see how words and topics relate






