Session 6: Embeddings and Topic Modeling

Getting started

First, we will load the packages we need for these exercises.

To start, we need to load in the text data from last session and process it again.

A lot of the work in text analytics typically goes into cleaning up the documents. In the case of the above, we probably want the 119 articles separated out such that each article is an element of a list. To do this, we can use some basic looping and conditional statements (a sketch of the splitting loop follows the list below). Note three key observations:

  1. Each article starts with the phrase Full text:. This phrase is unlikely to appear in the article text itself, so it can serve as a delimiter for the start of an article.
  2. Articles end with one of the following tags: Company / organization:, Credit:, or Copyright:
  3. One article is missing; its entry just says Full text: Not available
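
A minimal sketch of that splitting logic is shown below. Note that raw_text is a placeholder for the full file contents loaded as a single string, and the exact start/end tags may need adjusting to match your export.

    # A sketch of splitting the raw export into one article per list element.
    # raw_text is assumed to hold the full file contents as a single string.
    articles = []
    current = []
    in_article = False

    for line in raw_text.splitlines():
        if line.startswith('Full text:'):
            # 'Full text:' marks the start of a new article
            if 'Not available' in line:
                continue  # skip the one missing article
            in_article = True
            current = [line.replace('Full text:', '', 1).strip()]
        elif line.startswith(('Company / organization:', 'Credit:', 'Copyright:')):
            # Any of these tags marks the end of the current article
            if in_article:
                articles.append(' '.join(current))
                in_article = False
                current = []
        elif in_article:
            current.append(line.strip())

    print(len(articles))  # ideally 119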

Given the above list, we are now in good shape to do whatever analysis we need.

Additional functions for USE analysis

Word2vec

Analogies in Word2Vec

A common way of demonstrating the power of word vectors is to show how they handle analogies. Consider the following:

Man is to king, as woman is to ___

Algorithmically, we can represent this as King - man + woman = ?.
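
A sketch of how this query might look in gensim, assuming the pretrained word2vec-google-news-300 model that is used again in the exercises below:

    import gensim.downloader as api

    # Load the pretrained Google News vectors (a large download on first use)
    base_w2v = api.load('word2vec-google-news-300')

    # King - man + woman: 'king' and 'woman' are added, 'man' is subtracted
    base_w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)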

As we can see above, the algorithm correctly guesses Queen.

The sleight of hand of Word2Vec analogies

Generally, word2vec implementations do not allow the model to return one of the input words in the output. If we instead compare the raw distances between King and the analogy vector versus Queen and the analogy vector, we will, perhaps surprisingly, find that word2vec really thinks the following:

Man is to King, as woman is to King.

That being said, it isn't all sleight of hand. Queen is not the closest word to King in the word2vec structure, yet it is the second guess for the analogy. As such, we can see that the algorithm does encapsulate some meaning.
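
One way to see this for yourself is to build the analogy vector by hand and compare raw cosine similarities, reusing base_w2v from the sketch above:

    import numpy as np

    # Build the raw analogy vector: king - man + woman
    analogy = base_w2v['king'] - base_w2v['man'] + base_w2v['woman']

    def cos_sim(a, b):
        # Cosine similarity between two vectors
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Without excluding the input words, 'king' sits closer to the
    # analogy vector than 'queen' does
    print(cos_sim(analogy, base_w2v['king']))
    print(cos_sim(analogy, base_w2v['queen']))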

Exercise -- Try it out!

If you have Tensorflow installed, you can run this locally. If not, you can run it at https://rmc.link/colab_w2v

Exercise 1: Importing word2vec

As there is only one pretrained English word2vec model available through gensim's downloader, word2vec-google-news-300, we will use that model.

Note: Running the cell below will download ~1.7GB of data if you haven't already downloaded it previously. The data will be stored in ~/gensim-data/ (where ~ represents your home folder). You can safely delete the data when you are done experimenting.
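
A sketch of the import and download step:

    import gensim.downloader as api

    # Downloads the model on the first run, then loads it from the local cache
    base_w2v = api.load('word2vec-google-news-300')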

Exercise 2: Exploring word similarity in word2vec

To gain some familiarity with how word2vec functions, pick a few words and run the base_w2v.most_similar() function with those words (once per word). This will list the 10 closest words to the word you chose, based on how those words were used in Google News. Pay attention to the different linkages it makes (some you may have expected, some you may not have!). If you have no idea where to start, 'Enron' is an interesting one.

If you get an error, it just means your word wasn't in the data -- pick a new word.

Note: Some phrases are also in the data. You can try a phrase by replacing any spaces with underscores (_).

Next, take a look at how word2vec captures antonyms. Here you will most likely find that word2vec is a bit shaky, offering up words that are either rather odd or nonsensical. An interesting one to try here is Accounting (note: the capitalization matters!).
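
For reference, these calls look something like the following (the example words here are only suggestions):

    # Ten nearest neighbors of a single word
    base_w2v.most_similar('Enron')

    # Phrases use underscores in place of spaces
    base_w2v.most_similar('New_York')

    # Capitalization matters: 'Accounting' and 'accounting' are different tokens
    base_w2v.most_similar('Accounting')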

Exercise 3: Analogies with word2vec

To input an analogy, we make use of the positive= and negative= parameters.

For an analogy of the form A : B :: C : ?, we would specify positive=['B', 'C'] and negative=['A'].

Try to come up with an analogy, and see whether your predicted word ends up in word2vec's top 10! An example is given below.
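
For instance, one analogy of this form (a placeholder standing in for the original example) could be:

    # Paris : France :: Tokyo : ?
    # Here A = 'Paris', B = 'France', and C = 'Tokyo'
    base_w2v.most_similar(positive=['France', 'Tokyo'], negative=['Paris'], topn=10)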

LDA

Preparations

Setup for longer documents -- not needed for our data, but useful to have

Reading a corpus efficiently can be tricky. A good practice is to construct an iterator: rather than loading everything up front, it loads your data on the fly as it is needed for processing, which can save a lot of RAM. sorted() is used to wrap glob.glob() to ensure a consistent file order across calls.
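
A minimal sketch of such an iterator, assuming one plain-text document per file (the corpus_files/ path and the whitespace tokenization are placeholders):

    import glob

    class CorpusReader:
        """Yields one tokenized document at a time instead of loading everything."""

        def __init__(self, pattern):
            # sorted() ensures a consistent file order across calls
            self.files = sorted(glob.glob(pattern))

        def __iter__(self):
            for path in self.files:
                with open(path, encoding='utf-8') as f:
                    # Naive whitespace tokenization as a placeholder
                    yield f.read().lower().split()

    docs = CorpusReader('corpus_files/*.txt')
    for tokens in docs:
        pass  # each pass over docs re-reads the files on the fly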

Cleaning documents using spaCy

Cleaning up our articles for gensim

Needed for STM later

The cell below converts the data to tidytext format.

Building the gensim dictionary and corpus
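
A sketch of this step with gensim, assuming cleaned_docs is the list of token lists produced by the spaCy cleaning step above (the filtering thresholds are illustrative; 10 topics matches the labels below):

    from gensim import corpora, models

    # Map each token to an integer id
    dictionary = corpora.Dictionary(cleaned_docs)

    # Trim very rare and very common tokens (thresholds are illustrative)
    dictionary.filter_extremes(no_below=5, no_above=0.5)

    # Bag-of-words representation of each document
    corpus = [dictionary.doc2bow(doc) for doc in cleaned_docs]

    # Fit a 10-topic LDA model
    lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary,
                          passes=10, random_state=42)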

Examining the LDA model
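
To see the top words in each topic, continuing from the lda model sketched above:

    # Print the highest-weighted words for each of the 10 topics
    for topic_id, words in lda.print_topics(num_topics=10, num_words=10):
        print(topic_id, words)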

For the sake of exposition, I will label the above:

  1. Politics
  2. Government services
  3. Politics
  4. Sports
  5. Working life
  6. Economics
  7. Foreign investment
  8. NYC
  9. Local government
  10. Foreign policy

Applying the topics to our documents
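
A sketch of scoring each document with the fitted model:

    # Per-document topic distributions from the fitted LDA model
    doc_topics = [lda.get_document_topics(bow, minimum_probability=0.0)
                  for bow in corpus]

    # Each entry is a list of (topic_id, weight) pairs summing to roughly 1
    print(doc_topics[0])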

Normalization

Using a Universal Sentence Encoder (USE) model

We can grab the model from Tensorflow Hub, which makes this quite easy to use.
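
A sketch of loading the encoder and embedding a few texts (the module URL below is the standard USE v4 address on TensorFlow Hub):

    import tensorflow_hub as hub

    # Load the Universal Sentence Encoder (cached locally after the first download)
    use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

    # Texts of very different lengths all come out as 512-dimensional vectors
    embeddings = use(['A short sentence.',
                      'A much longer passage of text, such as a full news article, '
                      'still comes out as a single fixed-length vector.'])

    print(embeddings.shape)  # (2, 512)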

Note how these embeddings are all the same shape. Regardless of the length of the text, USE will compress it into a 512-dimensional space.

Exercise -- Try it out yourself!

If you have Tensorflow installed, you can run this locally. If not, you can run it at https://rmc.link/colab_use