def _split_chunks(text, limit=200000):
    """Yield `text` in pieces of at most `limit` characters.

    Each split point is the last newline before `limit`; if the backward scan
    reaches limit//2 without finding a newline, the split instead falls just
    past the first whitespace-delimited token at the midpoint, so a chunk is
    never degenerately small. The final (possibly short) remainder is always
    yielded, even when `text` is empty.
    """
    while len(text) > limit:
        i = limit - 1
        while text[i] != '\n':
            i -= 1
            if i == limit // 2:
                # No newline in the back half of the window -- advance past
                # the next whitespace-separated token instead.
                i = i + len(text[i:limit].split()[0])
                break
        yield text[:i]
        text = text[i:]
    yield text


def filing_iterator(path_to_10K_folder):
    """Yield the text of every file under `path_to_10K_folder`, split into
    chunks of at most 200,000 characters (see `_split_chunks`)."""
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        # Context manager: the original leaked the file handle.
        with open(file, 'rt') as fh:
            text = fh.read()
        yield from _split_chunks(text)


def name_iterator(path_to_10K_folder):
    """Yield each file's basename once per chunk that `filing_iterator`
    produces for it, so the two iterators stay aligned 1:1."""
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        with open(file, 'rt') as fh:
            text = fh.read()
        name = file.split('/')[-1]
        for _chunk in _split_chunks(text):
            yield name


def reassemble_docs(docs, names):
    """Regroup chunked docs: concatenate every docs[i] that shares a name.

    Returns (compiled_docs, doc_names); output order follows the first
    appearance of each name in `names`.
    """
    # Determine which docs fit together (dict preserves insertion order).
    name_ids = {}
    for idx, name in enumerate(names):  # `idx`, not `id`: avoid shadowing the builtin
        name_ids.setdefault(name, []).append(idx)
    doc_names = []
    compiled_docs = []
    for name, ids in name_ids.items():
        doc_names.append(name)
        doc = []
        for idx in ids:
            doc += docs[idx]
        compiled_docs.append(doc)
    return (compiled_docs, doc_names)


# This lemmatizes words, drops stopwords, drops symbols, drops numbers, and lowercases words
def cleaner(doc):
    """Return lowercased lemmas of `doc`, skipping stopwords and tokens
    tagged SYM/NUM/SPACE/PUNCT/X."""
    return [token.lemma_.lower() for token in doc
            if (not token.is_stop)
            and (token.pos_ not in ['SYM', 'NUM', 'SPACE', 'PUNCT', 'X'])]


# Define our pipe.
# We keep the lemmatizer to lemmatize with, and the tagger because it is a
# dependency of the lemmatizer.
nlp = spacy.load("en_core_web_sm", exclude=['ner', 'parser'])
# filing_iterator() caps every chunk at 200,000 characters, so max_length only
# needs to cover one chunk. (spaCy needs up to ~1GB of RAM per 1M characters.)
nlp.max_length = 200000

total_docs = len(list(filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/')))
doc_names = list(name_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'))
%%time # 27.5 minutes to run and as much as 500MB*n_process to process; most RAM usage released on completion # Benchmark time based on running on an AMD Ryzen 5900X CPU docs = [] for doc in tqdm(nlp.pipe(filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'), batch_size=1, n_process=20), total=total_docs): docs.append(cleaner(doc))# Fix docs -- they were split into multiple list when greater than 200,000 characters in length docs, doc_names = reassemble_docs(docs, doc_names)doc_names = [doc.replace('2014\\', '') for doc in doc_names]# Save to a file with open('M:/Scratch/docs.pkl', 'wb') as f: pickle.dump([docs, doc_names], f, protocol=pickle.HIGHEST_PROTOCOL)with open('M:/Scratch/docs.pkl', 'rb') as f: docs, _ = pickle.load(f)