Session 1: Workflow and ML Regression

Getting started

First, we will load the packages we need for these exercises.
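A minimal set of imports covering the tools used in this session might look like the following; the exact list in the notebook may differ (e.g., it may also load seaborn or other helpers).

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (Lasso, LassoCV, ElasticNetCV,
                                  LogisticRegression, LogisticRegressionCV)
```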

Next, we need to import the dataset for the session's exercises.

While the original csv file is ~38MB, the gzipped version is only 17MB. Using gzipped (or zipped) files is a good way to save storage space while working with csv files in Python (and in R as well).
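pandas can read a gzipped csv directly, inferring the compression from the file extension. The path below is a placeholder for the session's actual data file.

```python
# pandas infers the compression from the .gz extension, so a gzipped csv
# loads just like an uncompressed one.  The file path is a placeholder.
df = pd.read_csv('data/session_1_data.csv.gz')
df.head()
```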

We will also define some lists for convenience later:

Below I have also defined some custom functions. Don't worry about these until we get to using them.

Splitting samples into training and testing

There are multiple ways to do this. The most robust is to separate the training and testing samples by time. We can do this using the techniques taught in part 1.

The second approach is to randomly assign observations to the training and testing samples. This is a reasonable choice if predictive performance is not the primary goal of the analysis, or if all of the data is from the same point in time. This can be done easily with scikit-learn. Note that scikit-learn expects the DVs and IVs to be in separate dataframes, so we will need to separate them first.
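As a sketch of the random split (the column name and the vars_topic list are stand-ins for the actual DV and IV names):

```python
# Separate the DV from the IVs, then randomly assign 80% of observations
# to training and 20% to testing.  Names here are placeholders.
y = df['sdvol1']         # DV: future stock return volatility
X = df[vars_topic]       # IVs: e.g., a list of topic measure columns
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
```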

Regressions with statsmodels

Linear regression

First, we will try a simple linear regression using statsmodels to replicate Brown, Crowley and Elliott's (2020) (henceforth BCE) replication of Bao and Datta (2014). This is simply a linear regression of future stock return volatility, sdvol1, on the topic measures from the BCE study.

We will use this as a stepping stone to get to the LASSO model.

Like with R, statsmodels' equation interface is quite flexible. For instance, we can apply functions like np.log() inside the equation, we can store the equation in a variable and simply pass the variable instead, or we can build the equation programmatically on the fly (as we will see with the last model).

Here we define the formula programmatically so as to keep things both succinct and easy to extend.
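A sketch of how this might look, assuming a train dataframe and a vars_topic list of IV names like those defined earlier:

```python
# Build the formula string from the list of topic measures, then fit OLS.
formula = 'sdvol1 ~ ' + ' + '.join(vars_topic)
fit_ols = smf.ols(formula=formula, data=train).fit()
fit_ols.summary()
```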

Logistic regression

Statsmodels supports many other types of regression, including logistic, probit, and Poisson, among others. A full list of supported types for your installed version of statsmodels can be found by running the below code:
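One simple way to do this is to inspect the public attributes of the formula API:

```python
# List the model constructors exposed by statsmodels' formula interface
# (ols, logit, probit, poisson, glm, ...) in your installed version.
import statsmodels.formula.api as smf
print([f for f in dir(smf) if not f.startswith('_')])
```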

For the logistic regression example, we will run a single window of the intentional restatement prediction test from BCE. All we need to do is use a different formula function from statsmodels and update our formula to match our desired equation.
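For example, assuming a binary intentional-restatement indicator (here called Restate_Int as a placeholder) and a list of IV names, the logistic model might look like:

```python
# Same pattern as the OLS model, but with smf.logit() and a binary DV.
formula_logit = 'Restate_Int ~ ' + ' + '.join(vars_bce)
fit_logit = smf.logit(formula=formula_logit, data=train).fit()
fit_logit.summary()
```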

Estimating how well a model works

For in-sample prediction, we can apply the .predict() method to our current data. For out-of-sample prediction, we can likewise apply .predict() to our testing data.
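For instance, with the fitted models from above (and train/test dataframes as placeholders):

```python
# In-sample predictions use the data the model was fit on;
# out-of-sample predictions use the held-out testing data.
Y_hat_train = fit_ols.predict(train)   # in-sample
Y_hat_test  = fit_ols.predict(test)    # out-of-sample
```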

What we really want are quantitative measures of how well a model works. For that, scikit-learn is very handy. In sklearn.metrics there are many measures available, including Mean Squared Error (MSE, the square of RMSE), Mean Absolute Error (MAE), NDCG@k, and ROC AUC.
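As an illustration, again using the placeholder names from above:

```python
# Continuous DV: MSE and MAE; binary DV: ROC AUC on predicted probabilities.
mse = metrics.mean_squared_error(test['sdvol1'], Y_hat_test)
mae = metrics.mean_absolute_error(test['sdvol1'], Y_hat_test)
auc = metrics.roc_auc_score(test['Restate_Int'], fit_logit.predict(test))
print(f'MSE: {mse:.4f}  MAE: {mae:.4f}  ROC AUC: {auc:.4f}')
```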

We will check in-sample and out-of-sample scores for the following:

As seen above, performance out-of-sample is quite similar to performance in-sample for predicting stock market volatility. This is a good sign for our model.

We can also plot out an ROC curve by calculating the True Positive rate and False Positive Rate. We can use metrics.roc_curve() to calculate these. Then we can display the curve using metrics.RocCurveDisplay(). Note that the function metrics.plot_roc_curve() used to exist, and worked with models run in scikit-learn itself. This is now rolled into metrics.RocCurveDisplay().
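A sketch of that workflow, using the logistic model's predicted probabilities from above:

```python
# Compute the FPR/TPR pairs, then hand them to RocCurveDisplay for plotting.
fpr, tpr, thresholds = metrics.roc_curve(test['Restate_Int'],
                                         fit_logit.predict(test))
disp = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics.auc(fpr, tpr))
disp.plot()
plt.show()
```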

Using machine learning for regression

One of the simplest machine learning approaches to implement is LASSO. LASSO is available in scikit-learn as part of sklearn.linear_model. LASSO is not far removed from the OLS and logistic regressions we have seen -- the only difference is that LASSO adds an L1-norm penalty on the model coefficients. More specifically, the penalty is the sum of the absolute values of the coefficients multiplied by a penalty parameter.
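Concretely, in scikit-learn's parameterization the linear LASSO solves

$$\min_{\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\left(y_i - x_i'\beta\right)^2 \;+\; \alpha\sum_{j=1}^{p}\left|\beta_j\right|,$$

where $\alpha$ is the penalty parameter; setting $\alpha=0$ recovers OLS. The logistic version instead uses an inverse penalty strength $C$, so a smaller $C$ means heavier shrinkage.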

Because of this new penalty parameter, we can no longer solve simply by minimizing the sum of squared errors! Instead, we also need to provide a value for the parameter, or ideally optimize it.

We will start by re-running our above analyses with pre-specified penalty values: $\alpha=0.1$ for the linear model and $C=0.1$ for the logistic model.
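A minimal sketch of those two fits, assuming the training data from earlier (y_train_binary is a placeholder for the restatement indicator):

```python
# Linear LASSO with a fixed penalty alpha.
reg_lasso = Lasso(alpha=0.1)
reg_lasso.fit(X_train, y_train)

# L1-penalized logistic regression; note that C is an inverse penalty,
# so C=0.1 is a fairly heavy penalty.  liblinear supports the L1 penalty.
reg_logit_l1 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
reg_logit_l1.fit(X_train, y_train_binary)
```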

Linear regression with LASSO

Unfortunately, scikit-learn's functions lack a tidy summary of the results, so we need to code our own. Below are two possibilities. The first simply lists the variables and their respective coefficients. The second summarizes the same output, but plots the coefficients from most negative to most positive, aiming to make interpretation a bit easier. This is more or less a reworking of the glmnet integration in R's coefplot library. The code for this function is at the top of this file.
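A rough stand-in for those two summaries (the notebook's own coefplot-style function is more polished) might be:

```python
# 1. Plain listing of each IV and its LASSO coefficient.
coefs = pd.Series(reg_lasso.coef_, index=X_train.columns)
print(coefs)

# 2. Plot only the surviving (non-zero) coefficients, sorted from most
#    negative to most positive, for easier interpretation.
coefs[coefs != 0].sort_values().plot.barh(figsize=(6, 8))
plt.xlabel('Coefficient')
plt.show()
```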

Logistic Regression with LASSO

As with the linear LASSO, scikit-learn lacks a tidy summary for the logistic LASSO, so we reuse the same two approaches from above: a simple listing of the coefficients and the sorted coefficient plot.

Note how many variables were dropped in the above regression! This is a manifestation of the "Selection" part of LASSO.

Optimizing the LASSO parameter -- Linear regression

Cross-validation is built right into scikit-learn. We can simply call the correspondingly tweaked function, tell it how many folds we want, and let it optimize!
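For the linear case this is LassoCV; a minimal example with 5 folds:

```python
# 5-fold cross-validated LASSO; the alpha grid is chosen automatically.
reg_lasso_cv = LassoCV(cv=5)
reg_lasso_cv.fit(X_train, y_train)
print('Optimal alpha:', reg_lasso_cv.alpha_)
```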

For linear LASSO, the cross-validation approach optimizes $R^2$ (equivalently, it minimizes mean squared error). If this is not your target, then you would need to build your own CV routine (e.g., using scikit-learn's built-in GridSearchCV).

To see what exactly was going on in the CV routine above, we can replicate the LASSO output at each alpha it tested. The below code does this and then plots a graph showing how, as the penalty alpha increases, coefficients converge to 0.
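A simplified sketch of that replication (the notebook's own code and plots are more complete):

```python
# Refit the LASSO at each alpha from the CV grid and collect the coefficients.
coef_path = pd.DataFrame(
    [Lasso(alpha=a).fit(X_train, y_train).coef_ for a in reg_lasso_cv.alphas_],
    index=reg_lasso_cv.alphas_, columns=X_train.columns)

# Each line is one coefficient shrinking toward zero as the penalty grows.
coef_path.plot(legend=False, logx=True, figsize=(8, 5))
plt.xlabel('alpha (log scale)')
plt.ylabel('Coefficient value')
plt.show()
```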

To this end, I have provided two custom functions in this notebook: lasso_coefpath() and lasso_scorepath(). The first function allows you to visualize how the coefficients on each IV change as the penalty changes. The second function allows you to see how the optimized score changes as the penalty changes. The functions work for both linear and logistic CV LASSO.

Note: The below code takes quite a while to run. This is because the CV routine gains from running the computation in parallel, a benefit we do not get when fitting individual alphas one at a time. The warm_start=True parameter in the lasso_coefpath() definition attempts to reuse some of that overlap, saving roughly half the computation time.

Optimizing the LASSO parameter -- Logistic regression

The CV LASSO for logistic regression in scikit-learn is a bit more flexible than the linear LASSO. In particular, this function allows us to specify which score we would like to optimize. If the scoring= parameter is left off, it defaults to accuracy. However, since we are using ROC AUC to measure performance, we can specify that instead using scoring="roc_auc". A full list of scoring options is available in the scikit-learn documentation.
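For instance (again with the placeholder binary target):

```python
# CV L1-penalized logistic regression, selecting C by out-of-fold ROC AUC.
reg_logit_cv = LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear',
                                    scoring='roc_auc')
reg_logit_cv.fit(X_train, y_train_binary)
print('Optimal C:', reg_logit_cv.C_)
```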

Elastic nets with cross validation

Once you have the hang of LASSO, branching out to elastic net or ridge is very straightforward, as the functions for these follow more-or-less the same format. The only addition is needing to specify levels for l1_ratio: 1 is LASSO, 0 is Ridge, and any value in-between is elastic net. You can specify a list of values and let the cross-validation figure out which is best.
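A sketch for the linear case with ElasticNetCV (the l1_ratio grid here is just an illustrative choice):

```python
# Let cross-validation pick both the penalty strength and the LASSO/ridge mix.
reg_en_cv = ElasticNetCV(cv=5, l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1])
reg_en_cv.fit(X_train, y_train)
print('alpha:', reg_en_cv.alpha_, '  l1_ratio:', reg_en_cv.l1_ratio_)
```

For the logistic case, LogisticRegressionCV takes penalty='elasticnet' with solver='saga' and an l1_ratios= list instead.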

CV elastic net for OLS regression

CV elastic net for logistic regression

Final step: Comparing out of sample performance -- Logistic regression

From the graphs, we can see that both elastic net and LASSO outperform the plain logistic regression. The two are comparable to one another, each with an AUC of 0.66.