First, we will load the packages we need for these exercises.
# From base python
from collections import Counter # Counters
import csv # Read and write csv files
import glob
import pickle # Reading and writing python objects to file
import os # Work with file paths
import re # Regular expressions
# External
import numpy as np # Simple mathematical functions and linear algebra
# External
import gensim # Simple topic modeling
import gensim.downloader # For importing a pre-built word2vec model
# Note: As of July 2021, gensim's installation process is moderately broken on Python 3.9 and above.
# You can use Python 3.8 instead, or install the 6.42 GB of development tools needed to build
# the package yourself. To install those, download the installer from:
# https://visualstudio.microsoft.com/visual-cpp-build-tools/
# then select "Desktop development with C++" and install. You must keep all default selections
# checked for the install to work.
# After installing, reboot. Then `pip install gensim` will work just fine.
import matplotlib.pyplot as plt # Charting functions
plt.rcParams['figure.figsize'] = [12, 8] # Make plots larger by default
import pandas as pd # Work with data frames
import pyLDAvis # Easily visualize your topic models
import pyLDAvis.gensim_models # Easily visualize your topic models
pyLDAvis.enable_notebook() # Enables notebook support -- only needed when using Jupyter
import seaborn as sns
%matplotlib inline
import spacy # SpaCy -- for NLP parsing
import tensorflow as tf # Neural network backend
import tensorflow_hub as hub # Pretrained neural network models
from tqdm import tqdm # Provides a progress bar
M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\past\builtins\misc.py:45: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses from imp import reload M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\seaborn\rcmod.py:82: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. if LooseVersion(mpl.__version__) >= "3.0": M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\setuptools\_distutils\version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. other = LooseVersion(other) M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:18: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. DESCRIPTOR = _descriptor.FileDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:36: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:43: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:29: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. 
Please use get/find descriptors from generated code or query the descriptor_pool. _TENSORSHAPEPROTO_DIM = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:73: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:80: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_shape_pb2.py:66: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _TENSORSHAPEPROTO = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:19: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. DESCRIPTOR = _descriptor.FileDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:33: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:37: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:41: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:45: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:49: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:53: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:57: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:61: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:65: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:69: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:73: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:77: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:81: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:85: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:89: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:93: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:97: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:101: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:105: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:109: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:113: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:117: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:121: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:125: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:129: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:133: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:137: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:141: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:145: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:149: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:153: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:157: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:161: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:165: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:169: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:173: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:177: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:181: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:185: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:189: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:193: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:197: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:201: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:205: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:209: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:213: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:217: DeprecationWarning: Call to deprecated create function EnumValueDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.EnumValueDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\types_pb2.py:27: DeprecationWarning: Call to deprecated create function EnumDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _DATATYPE = _descriptor.EnumDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:20: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. DESCRIPTOR = _descriptor.FileDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:39: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:46: DeprecationWarning: Call to deprecated create function FieldDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:32: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _RESOURCEHANDLEPROTO_DTYPEANDSHAPE = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:76: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:83: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:90: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:97: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:104: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:111: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\resource_handle_pb2.py:69: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _RESOURCEHANDLEPROTO = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. DESCRIPTOR = _descriptor.FileDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:47: DeprecationWarning: Call to deprecated create function FieldDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:54: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:61: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:68: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:75: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:82: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:89: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:96: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:103: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:110: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:117: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:124: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. 
Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:131: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:138: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:145: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:152: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:33: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_TENSORPROTO = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:183: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:190: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:197: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\tensor_pb2.py:176: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _VARIANTTENSORDATAPROTO = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:21: DeprecationWarning: Call to deprecated create function FileDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. DESCRIPTOR = _descriptor.FileDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:40: DeprecationWarning: Call to deprecated create function FieldDescriptor(). 
Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:47: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:54: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:61: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:68: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:75: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. 
_descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:82: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:89: DeprecationWarning: Call to deprecated create function FieldDescriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _descriptor.FieldDescriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow\core\framework\attr_value_pb2.py:33: DeprecationWarning: Call to deprecated create function Descriptor(). Note: Create unlinked descriptors is going to go away. Please use get/find descriptors from generated code or query the descriptor_pool. _ATTRVALUE_LISTVALUE = _descriptor.Descriptor( M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:23: DeprecationWarning: NEAREST is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.NEAREST or Dither.NONE instead. 'nearest': pil_image.NEAREST, M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:24: DeprecationWarning: BILINEAR is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BILINEAR instead. 'bilinear': pil_image.BILINEAR, M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:25: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead. 
'bicubic': pil_image.BICUBIC, M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:28: DeprecationWarning: HAMMING is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.HAMMING instead. if hasattr(pil_image, 'HAMMING'): M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:29: DeprecationWarning: HAMMING is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.HAMMING instead. _PIL_INTERPOLATION_METHODS['hamming'] = pil_image.HAMMING M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:30: DeprecationWarning: BOX is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BOX instead. if hasattr(pil_image, 'BOX'): M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:31: DeprecationWarning: BOX is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BOX instead. _PIL_INTERPOLATION_METHODS['box'] = pil_image.BOX M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:33: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead. if hasattr(pil_image, 'LANCZOS'): M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\keras_preprocessing\image\utils.py:34: DeprecationWarning: LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead. _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow_hub\__init__.py:74: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. if (distutils.version.LooseVersion(tf.__version__) < M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\tensorflow_hub\__init__.py:75: DeprecationWarning: distutils Version classes are deprecated. 
Use packaging.version instead. distutils.version.LooseVersion(required_tensorflow_version)):
To start, we need to load in the text data from last session and process it again.
# Read the raw WSJ export into a list holding one string per line of the file.
with open('../../Data/S4_WSJ_2013.09.09.txt', 'rt') as wsj_file:
    text2 = list(wsj_file)
A lot of work with text analytics typically goes into cleaning up the document. In the case of the above, we probably want 119 articles separated out such that each article is an element of a list. To do this, we can use some basic looping and conditional statements. Note 2 key insights for this:
1. Each article starts with `Full text: `. This is unlikely to be used in the article text itself, so it can serve as a delimiter for the start of an article.
2. Each article ends at a line containing `Company / organization: `, `Credit: `, or `Copyright: `.
Entries marked `Full text: Not available` have no article text and are skipped.
# Split the raw lines into one string per article and record an author for each.
articles = []
authors = []
# Default value so that an article closed before any 'Author: ' line has been
# seen cannot raise a NameError when its author is appended.
author = ''
article = ''
reading = False
# A line containing any of these field labels ends the article being read.
end_markers = ('Company / organization: ', 'Credit: ', 'Copyright: ')
for line in text2:
    if 'Author: ' in line:
        # Keep the text after the last colon on the line.
        author = line.split(':')[-1].strip()
        if ',' in author:
            # Keep the first word after the first ', '
            # (presumably 'Last, First ...' -> 'First'; verify against the data).
            author = author.split(', ')[1].split()[0]
    if reading:  # check for the end of an article
        if any(marker in line for marker in end_markers):
            # Done reading the article: output it
            reading = False
            articles.append(article)
            authors.append(author)
            article = ''
        else:
            article += line
    elif 'Full text: ' in line and 'Full text: Not available' not in line:
        # Start reading the article in, dropping the first 11 characters
        # (the length of 'Full text: '; assumes the line starts with that label).
        article = line[11:]
        reading = True
Given the above list, we are now in good shape to do whichever analysis it is that we need.
def distance_matrix_np(pts):
    """Compute every pairwise Euclidean distance between the rows of ``pts``.

    For a 2-D input of shape (n, d) the result has shape (n, n): broadcasting
    ``pts[None, :]`` against ``pts[:, None]`` forms all row differences, which
    are squared, summed over the last axis and square-rooted.
    """
    deltas = pts[None, :] - pts[:, None]
    squared_lengths = np.sum(deltas ** 2, -1)
    return squared_lengths ** 0.5
def plot_similarity(messages, embeddings, rotation):
    """Draw a heatmap of pairwise similarity scores between embeddings.

    Args:
        messages: Strings used as tick labels, one per embedding.
        embeddings: One vector per message, passed to distance_matrix_np().
            Assumes unit-length rows so that scores land in [0, 1] -- TODO confirm.
        rotation: Rotation, in degrees, applied to the x-axis tick labels.

    Returns:
        The object returned by ``sns.heatmap``.
    """
    # Wrap long tick labels: a message with more than four words gets a line
    # break after every fourth word; shorter messages are used unchanged.
    wrapped = []
    for message in messages:
        tokens = message.split()
        if len(tokens) > 4:
            rows = [' '.join(tokens[start:start + 4])
                    for start in range(0, len(tokens), 4)]
            wrapped.append('\n'.join(rows))
        else:
            wrapped.append(message)
    messages = wrapped

    # Convert each Euclidean distance d into the score 1 - d/2.
    similarity = 1 - distance_matrix_np(embeddings) / 2

    sns.set(font_scale=1.2)
    g = sns.heatmap(
        similarity,
        xticklabels=messages,
        yticklabels=messages,
        vmin=0,
        vmax=1,
        cmap="YlOrRd")
    g.set_xticklabels(messages, rotation=rotation)
    g.set_yticklabels(messages, rotation=0)
    g.set_title("Semantic Textual Similarity")
    return g
# Note: Running this cell will download ~1.7GB of data
# It is stored in ~/gensim_data/
# (where ~ represents your Users folder)
# You can safely delete the data when you are done experimenting
base_w2v = gensim.downloader.load('word2vec-google-news-300')
base_w2v.doesnt_match(['Queen', 'King', 'Prince', 'Peasant'])
'Peasant'
# Ask which of the four country names fits least well with the others.
base_w2v.doesnt_match(['Singapore', 'Malaysia', 'Indonesia', 'Germany'])
'Germany'
base_w2v.doesnt_match(['Euro', 'USD', 'RMB', 'computer'])
'computer'
base_w2v.doesnt_match(['mee goreng', 'char kway teoh', 'laksa', 'hamburger'])
'laksa'
base_w2v.most_similar(['Earnings'])
[('earnings', 0.7311209440231323), ('Profit', 0.715435266494751), ('F#Q##_Earnings', 0.6902576684951782), ('Profits', 0.6605858206748962), ('Net_Income', 0.6565544605255127), ('3rd_Quarter_Earnings', 0.6558468341827393), ('3Q_Earnings', 0.6529892086982727), ('2nd_Quarter_Earnings', 0.6502183675765991), ('2Q_Earnings', 0.6475957036018372), ('1Q_Earnings', 0.6449595093727112)]
base_w2v.most_similar(['KPMG'])
[('PwC', 0.8044511675834656), ('PricewaterhouseCoopers', 0.8032213449478149), ('Deloitte', 0.7856793403625488), ('Grant_Thornton', 0.7815380096435547), ('PriceWaterhouseCoopers', 0.7609084844589233), ('KMPG', 0.7575339674949646), ('PricewaterhouseCoopers_PwC', 0.7438498139381409), ('Pricewaterhouse_Coopers', 0.7163810729980469), ('Delloitte', 0.7009097933769226), ('KPMG_LLP', 0.7008421421051025)]
base_w2v.most_similar(['Accrual'])
[('accrual', 0.6091281771659851), ('accruals', 0.544994592666626), ('Accrued_severance', 0.5363413691520691), ('Accruals', 0.5340796709060669), ('Termination_Benefits', 0.5254215002059937), ('----------------------_Adjusted', 0.5224087238311768), ('-----------_Non_GAAP', 0.522375762462616), ('------------------------------------_Adjusted', 0.5210174322128296), ('--------------------------------------------_Adjusted', 0.5202162861824036), ('Deferral', 0.5182511210441589)]
A common way of demonstrating the power of word vectors is to demonstrate their usage in analogies. Consider the below:
Man is to king, as woman is to ___
Algorithmically, we can represent this as `King - man + woman = ?`.
base_w2v.most_similar(positive=['King', 'woman'], negative=['man'])
[('Queen', 0.551562488079071), ('Oprah_BFF_Gayle', 0.4759754538536072), ('Geoffrey_Rush_Exit', 0.4646016061306), ('Princess', 0.45336732268333435), ('Yvonne_Stickney', 0.4507041871547699), ('L._Bonauto', 0.44221362471580505), ('gal_pal_Gayle', 0.44083890318870544), ('Alveda_C.', 0.44027918577194214), ('Tupou_V.', 0.43738630414009094), ('K._Letourneau', 0.4351031482219696)]
As we can see above, the algorithm correctly guesses Queen
.
Generally, word2vec implementations don't allow it to return one of the input words in the output. As such, if we compare raw distances between King
and the analogy versus Queen
and the analogy, we will surprisingly find that word2vec really thinks the following:
Man is to King, as woman is to King.
# Build the analogy vector King - man + woman, then scale it to unit length.
# NOTE(review): any distances recorded in the notebook output were produced
# before 'man' was subtracted; re-run the cells below to refresh them.
analogy = base_w2v['King'] - base_w2v['man'] + base_w2v['woman']
analogy = analogy / np.linalg.norm(analogy)
print('King', np.linalg.norm(analogy - base_w2v['King']))
King 1.988859
print('Queen', np.linalg.norm(analogy - base_w2v['Queen']))
Queen 2.7364812
That being said, it isn't all sleight of hand. Queen is not the closest word to King in the word2vec structure, yet it is the second guess for the analogy. As such, we can see that the algorithm does encapsulate some meaning.
base_w2v.most_similar('King')
[('Jackson', 0.5326346158981323), ('Prince', 0.5306329727172852), ('Tupou_V.', 0.529282808303833), ('KIng', 0.5227501392364502), ('e_mail_robert.king_@', 0.5173625349998474), ('king', 0.515891969203949), ('Queen', 0.5157252550125122), ('Geoffrey_Rush_Exit', 0.4992094337940216), ('prosecutor_Dan_Satterberg', 0.4985077977180481), ('NECN_Alison', 0.49128594994544983)]
If you have Tensorflow installed, you can run this locally. If not, you can run it at https://rmc.link/colab_w2v
print(list(gensim.downloader.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
As there is only 1 English word2vec model available, word2vec-google-news-300
, we will use that model.
Note: Running the cell below will download ~1.7GB of data if you haven't already downloaded it previously. The data will be stored in ~/gensim_data/
(where ~ represents your Users folder). You can safely delete the data when you are done experimenting.
# this code is done for you -- just run it to load the model
# If you get a bunch of warnings when downloading, ignore them -- it's a bug in some versions of Jupyter.
base_w2v = gensim.downloader.load('word2vec-google-news-300')
To gain some familiarity with how word2vec functions, pick a few words and run the `base_w2v.most_similar()` function with those words (once per word). This will list out the 10 closest words to the word you chose, based on how those words were used in Google News. Pay attention to the different linkages it made (some you may have expected, some you may not have!). If you have no ideas of where to start, 'Enron' is an interesting one.
If you get an error, it just means your word wasn't in the data -- pick a new word.
Note: Some phrases are also in the data. You can try a phrase by replacing any spaces with underscores (_
).
# Replace YOUR_WORD_HERE with your chosen word.
base_w2v.most_similar('Enron')
[('Skilling', 0.6776296496391296), ('WorldCom', 0.648767352104187), ('Fastow', 0.6305845379829407), ('ENRON', 0.6234410405158997), ('Enron_Broadband', 0.6094325184822083), ('CFO_Andrew_Fastow', 0.6074832081794739), ('Corp._ENRNQ', 0.6013128161430359), ('Enron_Corp._ENRNQ.PK', 0.5955372452735901), ('Enron_Kenneth_Lay', 0.592575192451477), ('WorldCom_debacles', 0.5907949805259705)]
Next, take a look at how word2vec captures antonyms. Here you will most likely find that word2vec is a bit shaky, offering up words that are either rather odd or nonsensical. An interesting one to try here is 'Accounting' (note: the capitalization matters!).
# Replace YOUR_WORD_HERE with your chosen word.
base_w2v.most_similar(negative=['Accounting'])
[('Ironic_eh', 0.24367554485797882), ('HuMax_TAC_TM', 0.2254338413476944), ('legend_Alberto_Tomba', 0.22333985567092896), ('MAURICE_RIVER_TWP', 0.2193179577589035), ('scotch_swilling', 0.21673253178596497), ('NEW_BEDFORD_MASS', 0.21074101328849792), ('Nasdaq_NASDAQ_TRIN', 0.20925845205783844), ('lovingly_curated', 0.20696194469928741), ('minuter', 0.2062486708164215), ('giggling_gaggle', 0.2035253494977951)]
To input an analogy, we make use of the `positive=` and `negative=` parameters.
For an analogy of the form A : B :: C : ?, we would specify `positive=['B', 'C']` and `negative=['A']`.
# Function to format analogies and parse them with word2vec
def analogize(question):
    """Complete an analogy written as 'A : B :: C : ?' using base_w2v.

    The string is split on ':' and each piece is stripped. When the split
    yields five pieces (as 'A : B :: C : ?' does, because '::' leaves an empty
    piece in the middle), the piece at index 2 is dropped. The first three
    remaining pieces are A, B and C; the top result of
    ``most_similar(positive=[B, C], negative=[A])`` fills in the answer.

    Returns:
        The string 'A : B :: C : <top match>'.
    """
    terms = [piece.strip() for piece in question.split(':')]
    if len(terms) == 5:
        terms.pop(2)
    first, second, third = terms[0], terms[1], terms[2]
    matches = base_w2v.most_similar(positive=[second, third], negative=[first])
    best_word = matches[0][0]
    return f'{first} : {second} :: {third} : {best_word}'
Try to come up with an analogy, and see whether your predicted word ends up in word2vec's top 10! An example is given below.
# Example
analogize('Paris : France :: London : ?')
'Paris : France :: London : Britain'
# Fill in on your own!
analogize(' : :: : ?')
Reading a corpus efficiently is tricky. A good practice is to construct an iterator. What this means is that it will load your data on the fly as it is needed for processing, rather than loading it up front. You can consequently save a lot of RAM usage by doing this. sorted()
is used to wrap glob.glob()
to ensure consistent behavior on multiple calls.
def _split_filing_text(text, max_chars=200000, min_chars=100000):
    """Yield ``text`` in consecutive chunks of at most ``max_chars`` characters.

    While more than ``max_chars`` characters remain, the cut point is found by
    scanning backwards from index ``max_chars - 1`` for a newline. If the scan
    reaches ``min_chars`` without finding one, the cut is placed ``min_chars``
    plus the length of the first whitespace-delimited word found in
    ``text[min_chars:max_chars]`` (or exactly at ``min_chars`` when that window
    holds only whitespace). Concatenating the chunks reproduces ``text``.
    """
    while len(text) > max_chars:
        i = max_chars - 1
        while text[i] != '\n':
            i -= 1
            if i == min_chars:
                words = text[i:max_chars].split()
                if words:  # guard: a whitespace-only window has no word to measure
                    i = i + len(words[0])
                break
        yield text[0:i]
        text = text[i:]
    yield text


def filing_iterator(path_to_10K_folder):
    """Yield the text of every file matching ``path_to_10K_folder + '*'``.

    Files are visited in sorted order. Each file is read fully, closed, and
    then yielded as one or more chunks of at most 200,000 characters.
    """
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        with open(file, 'rt') as f:  # close the handle before yielding anything
            text = f.read()
        yield from _split_filing_text(text)


def name_iterator(path_to_10K_folder):
    """Yield one file name per chunk that filing_iterator() yields.

    Both generators walk the same sorted file list and use the same splitting
    helper, so the i-th name corresponds to the i-th chunk. The name is
    whatever follows the last '/' in the path returned by glob.
    """
    for file in sorted(glob.glob(path_to_10K_folder + '*')):
        with open(file, 'rt') as f:
            text = f.read()
        name = file.split('/')[-1]
        for _ in _split_filing_text(text):
            yield name
def reassemble_docs(docs, names):
    """Merge document pieces that share a name back into single documents.

    Args:
        docs: Sequence of token lists, one per piece.
        names: Sequence of names, where ``names[i]`` labels ``docs[i]``.

    Returns:
        A tuple ``(compiled_docs, doc_names)``. ``doc_names`` lists each
        distinct name in order of first appearance, and ``compiled_docs[k]``
        is the concatenation, in original order, of every piece whose name is
        ``doc_names[k]``.
    """
    # Map each name to the positions of its pieces, in order of appearance.
    positions = {}
    for index, name in enumerate(names):
        positions.setdefault(name, []).append(index)

    doc_names = list(positions)
    compiled_docs = []
    for name in doc_names:
        merged = []
        for index in positions[name]:
            merged.extend(docs[index])
        compiled_docs.append(merged)
    return (compiled_docs, doc_names)
# This lemmatizes words, drops stopwords, drops symbols, drops numbers, and lowercases words
def cleaner(doc):
    """Return the lowercased lemmas of the tokens in ``doc`` worth keeping.

    A token is skipped when ``is_stop`` is true or when its ``pos_`` is one of
    'SYM', 'NUM', 'SPACE', 'PUNCT' or 'X'.
    """
    dropped_pos = ['SYM', 'NUM', 'SPACE', 'PUNCT', 'X']
    kept = []
    for token in doc:
        if token.is_stop or token.pos_ in dropped_pos:
            continue
        kept.append(token.lemma_.lower())
    return kept
# Define our pipe
# We keep the lemmatizer to lemmatize with, and the tagger because the
# lemmatizer presumably depends on its part-of-speech tags (see spaCy docs).
nlp = spacy.load("en_core_web_sm", exclude=['ner', 'parser'])
# filing_iterator() hands spaCy chunks of at most 200,000 characters, so the
# maximum document length is set to match that chunk size.
# spaCy needs up to 1GB of RAM per 1M characters to parse a file.
nlp.max_length = 200000
# Count the chunks lazily so the whole corpus is never held in memory at once.
total_docs = sum(1 for _ in filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'))
# One name per chunk, in the same order as filing_iterator() yields chunks.
doc_names = list(name_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'))
%%time
# 27.5 minutes to run and as much as 500MB*n_process to process; most RAM usage released on completion
# Benchmark time based on running on an AMD Ryzen 5900X CPU
# Parse every chunk with spaCy (20 worker processes, one chunk per batch) and
# keep only its cleaned tokens; tqdm shows progress against total_docs.
docs = [
    cleaner(parsed)
    for parsed in tqdm(
        nlp.pipe(filing_iterator('M:/Data_emulated/SEC_Filings/10-K_clean/2014/'),
                 batch_size=1, n_process=20),
        total=total_docs)
]
# Fix docs -- they were split into multiple lists when greater than 200,000 characters in length
docs, doc_names = reassemble_docs(docs, doc_names)
# Strip the '2014\' folder prefix from any name that still carries it
doc_names = [name.replace('2014\\', '') for name in doc_names]
# Save to a file
with open('M:/Scratch/docs.pkl', 'wb') as pkl_out:
    pickle.dump([docs, doc_names], pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
# Load the documents back in (the saved names are not needed here)
with open('M:/Scratch/docs.pkl', 'rb') as pkl_in:
    docs, _ = pickle.load(pkl_in)
# This lemmatizes words, drops stopwords, drops symbols, drops numbers, and lowercases words
def cleaner(doc):
    """Reduce ``doc`` to a list of lowercased lemmas.

    Stopwords are removed, as are tokens whose ``pos_`` tag is 'SYM', 'NUM',
    'SPACE', 'PUNCT' or 'X'.
    """
    excluded_pos = ('SYM', 'NUM', 'SPACE', 'PUNCT', 'X')
    return [
        tok.lemma_.lower()
        for tok in doc
        if not (tok.is_stop or tok.pos_ in excluded_pos)
    ]
# Define our pipe
# We keep the lemmatizer to lemmatize with, and the tagger because the
# lemmatizer presumably depends on its part-of-speech tags (see spaCy docs).
nlp = spacy.load("en_core_web_sm", exclude=['ner', 'parser'])
%%time
# 5 seconds to run
# Parse each WSJ article with spaCy and reduce it to its cleaned tokens
docs = [cleaner(parsed) for parsed in nlp.pipe(articles)]
CPU times: total: 5.25 s Wall time: 5.25 s
The below cell converts the data to tidytext format
# Build the table: a header row, then one row per document holding a 1-based
# id, the matching author, and the cleaned tokens joined with single spaces.
out = [['id', 'author', 'text']]
for row_id, doc in enumerate(docs, start=1):
    out.append([row_id, authors[row_id - 1], ' '.join(doc)])
# newline='' lets the csv module control line endings itself; without it,
# extra blank rows can appear on platforms that translate '\n' on write.
with open('../../Data/S6_WSJ_tidytext.csv', 'wt', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(out)
# Takes trivial time and trivial RAM usage
# Build a dictionary that maps each word to an integer ID
# (LDA works on IDs to make things computationally efficient)
words = gensim.corpora.Dictionary(docs)
# Drop words that are too frequent (unlikely to be useful) and those that are extremely rare
words.filter_extremes(no_below=3, no_above=0.5)
# Gensim corpus -- one bag-of-words representation per document
corpus = list(map(words.doc2bow, docs))
# Save to a file
with open('M:/Scratch/corpus_WSJ.pkl', 'wb') as pkl_out:
    pickle.dump([corpus, words], pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
# Load the corpus and dictionary back from that file
with open('M:/Scratch/corpus_WSJ.pkl', 'rb') as pkl_in:
    corpus, words = pickle.load(pkl_in)
# Note: not using gensim.models.ldamulticore.LdaMulticore, as the model's performance is quite bad unless you tune
# the alpha and eta parameters yourself. The defaults will give poorly specified output on our corpus.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=words,
    num_topics=10,
    passes=5,
    update_every=5,
    alpha='auto',
    eta='auto',
)
# Write the fitted model to disk, then load it back from the same path
lda.save('../../Data/lda_WSJ')
lda = gensim.models.ldamodel.LdaModel.load('../../Data/lda_WSJ')
# Reload the corpus and dictionary that accompany the model
with open('M:/Scratch/corpus_WSJ.pkl', 'rb') as pkl_in:
    corpus, words = pickle.load(pkl_in)
lda.show_topic(0, 10)
[('abbott', 0.012434472), ('party', 0.008847061), ('government', 0.007975474), ('power', 0.007954226), ('labor', 0.007714725), ('conservative', 0.006868049), ('s&p', 0.006789061), ('political', 0.0066726357), ('rudd', 0.006448531), ('policy', 0.006251966)]
# Print each of the 10 topics on its own line as "<topic id>: <its 10 top words>"
for topic_id in range(10):
    top_words = [word for word, _weight in lda.show_topic(topic_id, 10)]
    print(f"{topic_id}: {' '.join(top_words)}")
0: abbott party government power labor conservative s&p political rudd policy 1: school ms. people white district security service inc. user officer 2: benefit city life home trip live people de blasio york 3: play williams city game partner set season . good azarenka 4: company work price day people end start share take retirement 5: % fund bank fee economy government investor mortgage financial crisis 6: market % country china u.s. report group car buy investor 7: company lhota city catsimatidis retiree work health plan people york 8: % health blasio de voter likely city old york support 9: president house rule u.s. congress vote company syria obama include
For the sake of exposition, I will label the above:
ldavis = pyLDAvis.gensim_models.prepare(lda, corpus, words, sort_topics=False)
M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\pyLDAvis\_prepare.py:246: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only. default_term_info = default_term_info.sort_values(
pyLDAvis.display(ldavis)
pyLDAvis.save_html(ldavis, 'ldavis.html')
gamma, _ = lda.inference(corpus)
gamma
array([[5.4804064e-02, 5.0945837e-02, 6.3865036e-02, ..., 6.6675432e-02, 5.7522774e-02, 2.3574228e+01], [5.4804064e-02, 5.0945837e-02, 6.3865043e-02, ..., 6.6675439e-02, 5.7522766e-02, 6.9161303e-02], [5.4804068e-02, 5.0945837e-02, 6.3865036e-02, ..., 6.6675417e-02, 5.7522770e-02, 2.8097081e+00], ..., [5.4804064e-02, 5.0945837e-02, 6.3865043e-02, ..., 2.4901863e+02, 5.7522770e-02, 6.9161311e-02], [5.4804064e-02, 5.0945841e-02, 6.3865058e-02, ..., 7.0250870e+01, 5.7522774e-02, 2.1621773e+01], [5.4804064e-02, 5.0945837e-02, 1.9057749e+01, ..., 6.6675410e-02, 5.7522770e-02, 6.9161288e-02]], dtype=float32)
Normalization
topic_dist = gamma / gamma.sum(axis=1)[:,None]
topic_dist.shape
(118, 10)
# Human-readable label for each of the 10 topics, in topic-id order.
# NOTE(review): topics 0 and 2 are both labelled 'Politics', which gives the
# data frame two identically named columns -- confirm that this is intended.
topic_names = ['Politics', 'Government services', 'Politics', 'Sports',
               'Working life', 'Economics', 'Foreign investment', 'NYC',
               'Local government', 'Foreign policy']
# One row per document and one labelled column per topic
df = pd.DataFrame(data=topic_dist, columns=topic_names)
# Heatmap of the pairwise correlations between the topic columns
sns.heatmap(df.corr(), cmap="RdYlGn", annot=True)
<AxesSubplot:>
We can grab the model from Tensorflow Hub, which makes this quite easy to use.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
messages = ['Two words',
'This is a sentence.',
'This is a few sentences. They are strung together. They are in one string'
]
embeddings = embed(messages)
embeddings
<tf.Tensor: shape=(3, 512), dtype=float32, numpy= array([[-1.01800682e-02, -3.09843477e-02, -4.27960455e-02, ..., 1.08032905e-01, 9.26830471e-05, -6.09762501e-03], [-1.20698074e-02, -3.86185870e-02, 1.53449713e-03, ..., 3.33885401e-02, -7.09105358e-02, -1.68962975e-03], [ 3.62214185e-02, 1.81559310e-03, -7.63325114e-03, ..., 5.97876869e-02, -1.07888736e-01, -6.03896379e-03]], dtype=float32)>
Note how these embeddings are all the same shape. Regardless of the length of the text, USE will compress it into a 512-dimensional space.
for e in embeddings:
print(e.shape)
(512,) (512,) (512,)
messages = [
"How are you feeling?",
"How are you?",
"What's up?",
"How old are you?",
"How old are you, in years?",
"What is your age?",
]
embeddings = embed(messages)
plot_similarity(messages, embeddings, 90)
M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\seaborn\rcmod.py:400: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. if LooseVersion(mpl.__version__) >= "3.0": M:\Python_environments\Anaconda3\envs\MLSS\lib\site-packages\setuptools\_distutils\version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead. other = LooseVersion(other)
<AxesSubplot:title={'center':'Semantic Textual Similarity'}>
If you have Tensorflow installed, you can run this locally. If not, you can run it at https://rmc.link/colab_use
messages = [
# Add your own messages here
"Message 1",
"..."
]
embeddings = embed(messages)
plot_similarity(messages, embeddings, 90)