Session 3: Ensembling and Clustering¶
Getting started¶
First, we will load the packages we need for these exercises.
# This line is needed to force matplotlib to display inline in the notebook
%matplotlib inline
from collections import Counter
import pickle
import os
# The 3 lines below suppress ConvergenceWarnings -- this is not necessarily something
# I would recommend doing in practice, but it avoids a flood of warnings in a particular
# cell towards the end of this file
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
import pandas as pd # Work with data frames
import numpy as np # Simple mathematical functions and linear algebra
import matplotlib.pyplot as plt # Charting functions
plt.rcParams['figure.figsize'] = [12, 8] # Make plots larger by default
import seaborn as sns # More visualization tools that build on matplotlib
from sklearn import cluster # For using kmeans
from sklearn import ensemble # Ensembling algorithms
from sklearn import linear_model # GLM-type models -- both econometric and ML
from sklearn import metrics # Model performance evaluation
from sklearn import model_selection # Training and testing splits
from sklearn import neighbors # For KNN
from sklearn import preprocessing # For standardizing data for LASSO and Elastic net
from sklearn import svm # For SVM-type models
#import xgboost as xgb # For xgboost models
# Note: the below is from the umap-learn package, NOT the umap package (despite the name)
import umap.umap_ as umap # Needed for the custom function `umap_compare_svm()`
Next, we need to import the dataset for the ensembling exercise. The file contains a set of models along with data in two formats: the matrix data used for ML models in Scikit Learn, and the pandas data frames it is derived from.
# Load data
with open('../../Data/S1_models.pkl', 'rb') as f:
    models = pickle.load(f)
with open('../../Data/S2_models.pkl', 'rb') as f:
    models.update(pickle.load(f))
list(models.keys())
['ElasticNet', 'LASSO', 'logit', 'train_pd', 'test_pd', 'train_X_ML', 'test_X_ML', 'train_y', 'test_y', 'vars', 'SVC', 'XGBoost']
Next, we will build two dataframes of predictions: an in-sample prediction set and an out-of-sample prediction set.
To do this, we will use the algorithms and data we imported above, being careful to use the exact syntax each algorithm expects. Note that the XGBoost model was trained using xgboost; the SVM classifier (SVC), Elastic Net, and LASSO models were all trained with Scikit Learn; and the logistic model was trained with statsmodels.
As the SVC model outputs decision-function values (logits) rather than probabilities, it is convenient to convert them to probabilities using the logistic, a.k.a. sigmoid, function, which is defined below.
def logistic(x):
    return 1 / (1 + np.exp(-1 * x))
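As a quick self-contained sanity check of this mapping (the inputs below are illustrative values, not model outputs): the sigmoid maps 0 to 0.5 and pushes large positive scores toward 1.

```python
import numpy as np

def logistic(x):
    # Sigmoid: maps any real-valued score into (0, 1)
    return 1 / (1 + np.exp(-1 * x))

print(logistic(0))                            # midpoint of the sigmoid
print(logistic(np.array([-5.0, 0.0, 5.0])))   # monotone, bounded in (0, 1)
```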
train_X_ens = pd.DataFrame({'XGBoost': models['XGBoost'].predict_proba(models['train_X_ML'])[:,1],
'SVC': logistic(models['SVC'].decision_function(models['train_X_ML'])),
'ElasticNet': models['ElasticNet'].predict_proba(models['train_X_ML'])[:,1],
'LASSO': models['LASSO'].predict_proba(models['train_X_ML'])[:,1],
'logit': models['logit'].predict(models['train_pd'][models['vars']])})
test_X_ens = pd.DataFrame({'XGBoost': models['XGBoost'].predict_proba(models['test_X_ML'])[:,1],
'SVC': logistic(models['SVC'].decision_function(models['test_X_ML'])),
'ElasticNet': models['ElasticNet'].predict_proba(models['test_X_ML'])[:,1],
'LASSO': models['LASSO'].predict_proba(models['test_X_ML'])[:,1],
'logit': models['logit'].predict(models['test_pd'][models['vars']])})
labels_ens = ['XGBoost', 'SVC', 'ElasticNet', 'LASSO', 'logit']
M:\Python_environments\miniconda\envs\MLSS\lib\site-packages\statsmodels\discrete\discrete_model.py:2385: RuntimeWarning: overflow encountered in exp
  return 1/(1+np.exp(-X))
Ensembling¶
Simple, manual example¶
# Rank each model's out-of-sample predictions, then sum the ranks across models
rank_X_ens = test_X_ens.rank()
arank_X_ens = rank_X_ens.XGBoost + rank_X_ens.SVC + rank_X_ens.ElasticNet + rank_X_ens.LASSO + rank_X_ens.logit
auc = metrics.roc_auc_score(models['test_pd'].Restate_Int, arank_X_ens)
print(auc)
fpr, tpr, thresholds = metrics.roc_curve(models['test_pd'].Restate_Int, arank_X_ens)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc)
display.plot()
0.6527995433553007
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x265c8a1af70>
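The rank-sum idea above can be illustrated on synthetic data as well. The sketch below (entirely hypothetical data and model names, not the session's models) shows why summing ranks is convenient when models output scores on very different scales: ranks put every model on an equal footing before combining.

```python
import numpy as np
import pandas as pd
from sklearn import metrics

rng = np.random.default_rng(42)
y = rng.integers(0, 2, size=500)              # synthetic binary labels
s1 = y + rng.normal(0, 1.0, size=500)         # noisy signal seen by "model 1"
s2 = y + rng.normal(0, 1.0, size=500)         # independent noisy signal for "model 2"

# Model 1 outputs probabilities in (0, 1); model 2 outputs unbounded scores
preds = pd.DataFrame({'m1': 1 / (1 + np.exp(-s1)),
                      'm2': 10 * s2})

# Rank each column, then sum ranks across models -- same recipe as above
rank_ens = preds.rank().sum(axis=1)

print('m1 alone: ', metrics.roc_auc_score(y, preds.m1))
print('ensemble: ', metrics.roc_auc_score(y, rank_ens))
```

Because AUC is itself rank-based, the scale mismatch between the two models never matters for the rank-sum ensemble.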
Building a voting classifier from scratch¶
First, define the set of classifiers that will vote. Below we use 3 classifiers: a logistic regression, a random forest, and a GBM.
clf1 = linear_model.LogisticRegression(random_state=1)
clf2 = ensemble.RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=2)
voting = ensemble.VotingClassifier(estimators = [
('linear', clf1), ('rf', clf2), ('gbm', clf3)
], voting = 'soft')
voting.fit(models['train_X_ML'], models['train_pd'].Restate_Int)
VotingClassifier(estimators=[('linear', LogisticRegression(random_state=1)),
('rf',
RandomForestClassifier(n_estimators=50,
random_state=1)),
('gbm',
GradientBoostingClassifier(learning_rate=1.0,
max_depth=1,
random_state=2))],
voting='soft')
auc = metrics.roc_auc_score(models['test_pd'].Restate_Int, voting.predict_proba(models['test_X_ML'])[:,1])
auc
fpr, tpr, thresholds = metrics.roc_curve(models['test_pd'].Restate_Int, voting.predict_proba(models['test_X_ML'])[:,1])
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auc)
display.plot()
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x265cc7b3d90>
Clustering¶
Setup¶
# From umap.plot source code on Github
def _get_embedding(umap_object):
    if hasattr(umap_object, "embedding_"):
        return umap_object.embedding_
    elif hasattr(umap_object, "embedding"):
        return umap_object.embedding
    else:
        raise ValueError("Could not find embedding attribute of umap_object")

# Cut-down version of umap.plot.points to remove dependencies on datashader, bokeh,
# holoviews, scikit-image, and colorcet -- it introduces a dependency on seaborn though
def umap_color(data_map, data_color, cmap='viridis', subset=None, title=None):
    reducer = umap.UMAP()
    umap_object = reducer.fit(data_map)
    embed = _get_embedding(umap_object)
    if subset is not None:
        embed_X = embed[subset, 0]
        embed_Y = embed[subset, 1]
        data_color = np.array(data_color[subset])
    else:
        embed_X = embed[:, 0]
        embed_Y = embed[:, 1]
    point_size = 100.0 / np.sqrt(len(embed_X))
    # Color points by the provided values
    fig, ax = plt.subplots(figsize=(12, 8))
    g = sns.scatterplot(ax=ax, x=embed_X, y=embed_Y, hue=data_color, size=point_size)
    _ = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    return g
UMAP and kmeans¶
For this example, we will use firms' discussion in their annual reports to analyze their industry classification. The annual report information is the same topic data that we were using for the misreporting problems.
df = models['train_pd']
topic_names = ['Topic_' + str(i) + '_n_oI' for i in range(1, 32)]
Below, we define the overall industry classification following SIC codes (the default industry coding used by the U.S. SEC). Each annual report is tagged with a SIC code to indicate the firm's industry.
df['industry'] = np.select(
[
df['sic'].between(0, 999, inclusive="both"),
df['sic'].between(1000, 1499, inclusive="both"),
df['sic'].between(1500, 1799, inclusive="both"),
df['sic'].between(2000, 3999, inclusive="both"),
df['sic'].between(4000, 4999, inclusive="both"),
df['sic'].between(5000, 5199, inclusive="both"),
df['sic'].between(5200, 5999, inclusive="both"),
df['sic'].between(6000, 6799, inclusive="both"),
df['sic'].between(7000, 8999, inclusive="both"),
df['sic'].between(9100, 9999, inclusive="both")
],
[
"Agriculture",
"Mining",
"Construction",
"Manufacturing",
"Utilities",
"Wholesale Trade",
"Retail Trade",
"Finance",
"Services",
"Public Admin"
],
default='Unknown'
)
df[['industry', 'sic']]
|  | industry | sic |
|---|---|---|
| 0 | Wholesale Trade | 5080 |
| 1 | Wholesale Trade | 5080 |
| 3 | Manufacturing | 3661 |
| 4 | Manufacturing | 3661 |
| 6 | Manufacturing | 2834 |
| ... | ... | ... |
| 14246 | Retail Trade | 5531 |
| 14249 | Wholesale Trade | 5040 |
| 14251 | Manufacturing | 2835 |
| 14254 | Services | 8051 |
| 14299 | Services | 7370 |
11478 rows × 2 columns
As we have previously discussed, UMAP is a way of representing high-dimensional data in a low-dimensional space (i.e., projecting it).
set(df.industry.to_list())
{'Agriculture',
'Construction',
'Manufacturing',
'Mining',
'Public Admin',
'Retail Trade',
'Services',
'Utilities',
'Wholesale Trade'}
umap_color(df[topic_names], df.industry)
<Axes: >
model = cluster.KMeans(n_clusters=9, n_init='auto')
kmeans = model.fit(df[topic_names])
df['cluster'] = kmeans.labels_
umap_color(df[topic_names], df.cluster.astype("category"))
<Axes: >
Optimizing the k parameter¶
The gap statistic provides an unsupervised, statistically grounded way to choose the number of clusters for kmeans. It is simulation-based, though (bootstrap-ish), which means it will take a while to run.
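For reference, the quantity computed in the loop below follows the gap statistic of Tibshirani, Walther, and Hastie. Writing $W_k$ for the within-cluster inertia at $k$ clusters and $W^*_{kb}$ for the inertia of the $b$-th of $B$ uniform reference datasets (here $B = 10$):

$$\mathrm{Gap}(k) = \frac{1}{B}\sum_{b=1}^{B}\log W^*_{kb} \; - \; \log W_k,
\qquad
s_k = \sqrt{1 + \tfrac{1}{B}}\;\sqrt{\frac{1}{B}\sum_{b=1}^{B}\Bigl(\log W^*_{kb} - \tfrac{1}{B}\textstyle\sum_{b'}\log W^*_{kb'}\Bigr)^2},$$

and the search stops at the smallest $k$ satisfying $\mathrm{Gap}(k) \ge \mathrm{Gap}(k+1) - s_{k+1}$. (The code below compares against the $s$ from the previous step rather than $s_{k+1}$, a minor variant.)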
iterations = 10
ks = []
gaps = []
sks = []
inertias = []
k = 2
init_k = k
optimizing = True
while optimizing:
    model = cluster.KMeans(n_clusters=k, n_init='auto')
    kmeans = model.fit(df[topic_names])
    inertia_at_k = kmeans.inertia_
    # Run `iterations` simulations on random data to estimate the s.d.
    sim_inertias = []
    for i in range(0, iterations):
        model = cluster.KMeans(n_clusters=k, n_init='auto')
        # Simulate on uniform random data with the same shape as df[topic_names]
        kmeans = model.fit(np.random.rand(11478, 31))
        sim_inertias.append(kmeans.inertia_)
    l = np.mean(np.log(sim_inertias))
    gap = l - np.log(inertia_at_k)
    sk = (sum((1/iterations) * (np.log(sim_inertias) - l)**2)**(1/2)) * (1 + 1/iterations)**(1/2)
    ks.append(k)
    sks.append(sk)
    gaps.append(gap)
    inertias.append(inertia_at_k)
    if k > init_k:
        thresh = gap - gaps[-2] - sks[-2]
        if thresh < 0:
            print('Optimal k found: ' + str(k-1))
            print(thresh)
            break
        else:
            print('k={} Not optimal. To optimize: {:.2f}'.format(k-1, thresh))
            k += 1
    else:
        k += 1
k=2 Not optimal. To optimize: 0.08
k=3 Not optimal. To optimize: 0.09
k=4 Not optimal. To optimize: 0.14
k=5 Not optimal. To optimize: 0.06
k=6 Not optimal. To optimize: 0.07
k=7 Not optimal. To optimize: 0.05
k=8 Not optimal. To optimize: 0.08
k=9 Not optimal. To optimize: 0.01
k=10 Not optimal. To optimize: 0.11
Optimal k found: 11
-0.04015949701038385
print('Using optimal k = ' + str(k-1))
model = cluster.KMeans(n_clusters=k-1, n_init='auto')
kmeans = model.fit(df[topic_names])
df['cluster_opt'] = kmeans.labels_
Using optimal k = 11
umap_color(df[topic_names], df.cluster_opt.astype("category"))
<Axes: >
df[df.cluster_opt==1][['industry']].sample(n=10)
|  | industry |
|---|---|
| 8960 | Manufacturing |
| 12079 | Manufacturing |
| 3138 | Utilities |
| 2412 | Utilities |
| 430 | Utilities |
| 11712 | Wholesale Trade |
| 12080 | Manufacturing |
| 14076 | Manufacturing |
| 4646 | Utilities |
| 14169 | Utilities |
df[df.cluster_opt==2][['industry']].sample(n=10)
|  | industry |
|---|---|
| 8745 | Wholesale Trade |
| 2602 | Utilities |
| 961 | Utilities |
| 8054 | Utilities |
| 1511 | Mining |
| 8547 | Mining |
| 13230 | Mining |
| 10413 | Mining |
| 1506 | Utilities |
| 3076 | Utilities |
df[df.cluster_opt==3][['industry']].sample(n=10)
|  | industry |
|---|---|
| 12990 | Services |
| 8393 | Services |
| 8327 | Manufacturing |
| 9484 | Manufacturing |
| 4191 | Services |
| 12558 | Manufacturing |
| 13834 | Services |
| 5216 | Manufacturing |
| 4133 | Manufacturing |
| 11513 | Manufacturing |
tables = [df[df.cluster_opt==1][['industry']].sample(n=10), df[df.cluster_opt==2][['industry']].sample(n=10), df[df.cluster_opt==3][['industry']].sample(n=10)]
with open('../Results/S3_cluster_tables.pkl', 'wb') as f:
    pickle.dump(tables, f, pickle.HIGHEST_PROTOCOL)
KNN for multiclass prediction¶
To set up this problem, we need to apply our industry classification to the testing data as well.
testing = models['test_pd']
testing['industry'] = np.select(
[
testing['sic'].between(0, 999, inclusive="both"),
testing['sic'].between(1000, 1499, inclusive="both"),
testing['sic'].between(1500, 1799, inclusive="both"),
testing['sic'].between(2000, 3999, inclusive="both"),
testing['sic'].between(4000, 4999, inclusive="both"),
testing['sic'].between(5000, 5199, inclusive="both"),
testing['sic'].between(5200, 5999, inclusive="both"),
testing['sic'].between(6000, 6799, inclusive="both"),
testing['sic'].between(7000, 8999, inclusive="both"),
testing['sic'].between(9100, 9999, inclusive="both")
],
[
"Agriculture",
"Mining",
"Construction",
"Manufacturing",
"Utilities",
"Wholesale Trade",
"Retail Trade",
"Finance",
"Services",
"Public Admin"
],
default='Unknown'
)
To set up a KNN model, we will use the KNeighborsClassifier() function from Scikit Learn. The most important parameter is n_neighbors, which corresponds to the k in KNN -- how many neighbors to use for classification. There are other parameters for the Scikit Learn implementation as well, which can be seen in the documentation.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(df[topic_names], df['industry'])
KNeighborsClassifier()
To see how well the algorithm works, we can make multiclass predictions using .predict(). We can then pass the predictions to sklearn.metrics.accuracy_score() to get accuracy as a percentage.
in_pred = knn.predict(df[topic_names])
out_pred = knn.predict(testing[topic_names])
print('In sample: {},\nOut of sample: {}'.format(metrics.accuracy_score(df['industry'], in_pred),
metrics.accuracy_score(testing['industry'], out_pred)))
In sample: 0.9123540686530754,
Out of sample: 0.8597236981934112
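The same workflow can be reproduced end-to-end on synthetic data (hypothetical features standing in for the topic variables, not the session's dataset). The gap between in-sample and out-of-sample accuracy below is the usual symptom of KNN memorizing its training neighbors:

```python
import numpy as np
from sklearn import datasets, metrics, model_selection, neighbors

# Hypothetical stand-in for the topic features: 4 classes, 10 features
X, y = datasets.make_classification(n_samples=600, n_features=10,
                                    n_informative=6, n_classes=4,
                                    random_state=0)
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X, y, random_state=0)

knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_tr, y_tr)
acc_in = metrics.accuracy_score(y_tr, knn.predict(X_tr))    # in-sample accuracy
acc_out = metrics.accuracy_score(y_te, knn.predict(X_te))   # out-of-sample accuracy
print('In sample: {:.3f}, Out of sample: {:.3f}'.format(acc_in, acc_out))
```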
To optimize the parameters for KNN, we can use a grid search. KNN is fairly efficient to fit, so a full grid search is feasible.
knn_opt = neighbors.KNeighborsClassifier(algorithm='auto')
knn_param = {
'n_neighbors': [1,2,3,4,5,6,7,8,9,10],
'leaf_size': [10, 20, 30, 40],
'p': [1, 2],
'weights': ['uniform', 'distance'],
}
# with GridSearch
grid_knn = model_selection.GridSearchCV(
estimator=knn_opt,
param_grid=knn_param,
scoring = 'accuracy',
n_jobs = -1,
cv = 5
)
grid_knn.fit(df[topic_names], df['industry'])
GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
param_grid={'leaf_size': [10, 20, 30, 40],
'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'p': [1, 2], 'weights': ['uniform', 'distance']},
scoring='accuracy')
params = grid_knn.best_params_
print(params)
{'leaf_size': 10, 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
knn2 = neighbors.KNeighborsClassifier(**params)
knn2.fit(df[topic_names], df['industry'])
KNeighborsClassifier(leaf_size=10, n_neighbors=10, p=1, weights='distance')
in_pred = knn2.predict(df[topic_names])
out_pred = knn2.predict(testing[topic_names])
print('In sample: {},\nOut of sample: {}'.format(metrics.accuracy_score(df['industry'], in_pred),
metrics.accuracy_score(testing['industry'], out_pred)))
In sample: 1.0,
Out of sample: 0.8852284803400637