import rpy2.rinterface
import rpy2.robjects as robjects
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smgr
from sklearn.metrics import confusion_matrix, accuracy_score
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 75
plt.rcParams['font.size'] = 12
sns.set_theme(style="whitegrid", color_codes=True)
sns.set_context("poster")
%load_ext rpy2.ipython
%%R -o df
load('sem4.RData')
df = uk
# Selecting only usable data and removing NA's
df = df[['prvote', 'age', 'sex', 'income1', 'edu']].dropna()
# Recoding category to numeric
df['prvote_code'] = df.prvote.cat.codes
print(pd.DataFrame({'code': [0, 1, 2, 3], 'name': df.prvote.cat.categories}).set_index('code'))
                  name
code
0         Conservative
1               Labour
2     Liberal Democrat
3                Other
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1265 entries, 3 to 2422
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   prvote       1265 non-null   category
 1   age          1265 non-null   float64
 2   sex          1265 non-null   category
 3   income1      1265 non-null   float64
 4   edu          1265 non-null   float64
 5   prvote_code  1265 non-null   int8
dtypes: category(2), float64(3), int8(1)
memory usage: 43.6+ KB
model = smf.mnlogit(formula='prvote_code ~ age + sex + income1 + edu', data=df).fit()
print(model.summary())
Optimization terminated successfully.
         Current function value: 1.269979
         Iterations 6
                          MNLogit Regression Results
==============================================================================
Dep. Variable:            prvote_code   No. Observations:                 1265
Model:                        MNLogit   Df Residuals:                     1250
Method:                           MLE   Df Model:                           12
Date:                Sun, 07 Feb 2021   Pseudo R-squ.:                 0.02096
Time:                        18:43:08   Log-Likelihood:                -1606.5
converged:                       True   LL-Null:                       -1640.9
Covariance Type:            nonrobust   LLR p-value:                 5.427e-10
=================================================================================
  prvote_code=1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          1.7062      0.468      3.645      0.000       0.789       2.624
sex[T.Female]     -0.0715      0.142     -0.505      0.614      -0.349       0.206
age               -0.0185      0.005     -3.900      0.000      -0.028      -0.009
income1           -0.1619      0.027     -5.896      0.000      -0.216      -0.108
edu                0.0041      0.021      0.202      0.840      -0.036       0.044
---------------------------------------------------------------------------------
  prvote_code=2       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          0.5013      0.498      1.007      0.314      -0.474       1.477
sex[T.Female]      0.0239      0.154      0.156      0.876      -0.277       0.325
age               -0.0181      0.005     -3.544      0.000      -0.028      -0.008
income1           -0.1000      0.029     -3.411      0.001      -0.158      -0.043
edu                0.0424      0.021      1.985      0.047       0.001       0.084
---------------------------------------------------------------------------------
  prvote_code=3       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          1.3495      0.673      2.004      0.045       0.030       2.669
sex[T.Female]     -0.3555      0.208     -1.711      0.087      -0.763       0.052
age               -0.0231      0.007     -3.409      0.001      -0.036      -0.010
income1           -0.2040      0.041     -5.034      0.000      -0.283      -0.125
edu               -0.0134      0.031     -0.427      0.669      -0.075       0.048
=================================================================================
print(model.pvalues < 0.05)
                   0      1      2
Intercept       True  False   True
sex[T.Female]  False  False  False
age             True   True   True
income1         True   True   True
edu            False   True  False
So the model is only 'partly significant' in terms of its individual effects: age and income1 are significant in every submodel, while sex is never significant and edu only reaches significance for the Liberal Democrat contrast.
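The model as a whole is still significant, though: the likelihood-ratio test against the intercept-only model can be pulled straight from the fitted results (a small sketch, using the standard statsmodels result attributes llf, llnull, llr and llr_pvalue):
# Overall likelihood-ratio test (full vs. intercept-only model)
print('LL (full): ', round(model.llf, 1))
print('LL (null): ', round(model.llnull, 1))
print('LR statistic:', round(model.llr, 2))
print('LLR p-value: ', model.llr_pvalue)
The LLR p-value of 5.427e-10 in the summary table above is this same test.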
# Odds Ratio
odds = round(np.exp(model.params), 3)
odds.columns = df.prvote.cat.categories[1:]
print(odds)
               Labour  Liberal Democrat  Other
Intercept       5.508             1.651  3.855
sex[T.Female]   0.931             1.024  0.701
age             0.982             0.982  0.977
income1         0.851             0.905  0.815
edu             1.004             1.043  0.987
'Conservative' is the reference group, so read each value as the change in the odds of voting for that party rather than for 'Conservative':
- Sex (going from Male to Female) is not significant in any submodel.
- Age has practically the same small negative effect in each submodel.
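Odds ratios can also be read as percentage changes in the odds per one-unit increase of a predictor; a quick transformation of the same coefficients (a sketch reusing the fitted model from above):
# Percent change in odds per one-unit increase, relative to 'Conservative'
pct_change = round((np.exp(model.params) - 1) * 100, 1)
pct_change.columns = df.prvote.cat.categories[1:]
print(pct_change)
For example, the odds ratio of 0.851 for income1 in the Labour submodel corresponds to roughly a 15% drop in the odds per unit of income.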
# Average Marginal Effects
print(model.get_margeff().summary().tables[1])
=================================================================================
  prvote_code=0      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]      0.0171      0.027      0.642      0.521      -0.035       0.069
age                0.0042      0.001      4.770      0.000       0.002       0.006
income1            0.0321      0.005      6.545      0.000       0.022       0.042
edu               -0.0035      0.004     -0.910      0.363      -0.011       0.004
---------------------------------------------------------------------------------
  prvote_code=1      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]     -0.0054      0.026     -0.208      0.835      -0.057       0.046
age               -0.0019      0.001     -2.206      0.027      -0.003      -0.000
income1           -0.0205      0.005     -4.234      0.000      -0.030      -0.011
edu               -0.0018      0.004     -0.465      0.642      -0.009       0.006
---------------------------------------------------------------------------------
  prvote_code=2      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]      0.0173      0.024      0.724      0.469      -0.030       0.064
age               -0.0013      0.001     -1.753      0.080      -0.003       0.000
income1           -0.0014      0.004     -0.310      0.757      -0.010       0.007
edu                0.0075      0.003      2.256      0.024       0.001       0.014
---------------------------------------------------------------------------------
  prvote_code=3      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]     -0.0290      0.017     -1.736      0.083      -0.062       0.004
age               -0.0010      0.001     -1.894      0.058      -0.002     3.5e-05
income1           -0.0102      0.003     -3.200      0.001      -0.017      -0.004
edu               -0.0022      0.003     -0.883      0.377      -0.007       0.003
=================================================================================
Well, there is a lot to describe here:
- Age has nearly the same effect in each submodel.
- All of these are really small effect sizes.
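The table above reports average marginal effects (averaged over all observations); if a 'typical' respondent is of more interest, the effects can instead be evaluated at the regressors' means (a sketch with the same get_margeff call, only changing the at argument):
# Marginal effects evaluated at the means of the regressors
print(model.get_margeff(at='mean').summary().tables[1])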
# Pseudo R^2
print('Pseudo R^2:', round(model.prsquared, 5))
Pseudo R^2: 0.02096
The pseudo R-squared is well below 0.1, which is far too low to consider the model a good one.
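For reference, this is McFadden's pseudo R², which is just one minus the ratio of the model log-likelihood to the null (intercept-only) log-likelihood, so it can be recomputed by hand (a small check using the llf/llnull attributes):
# McFadden's pseudo R^2 from the log-likelihoods: 1 - LL(full) / LL(null)
print('McFadden pseudo R^2:', round(1 - model.llf / model.llnull, 5))
With LL(full) = -1606.5 and LL(null) = -1640.9 this gives the same 0.02096 as above.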
# PCP
predicted = np.asarray(model.predict()).argmax(axis=1)  # most probable category for each observation
print('Percentage of Correct Predictions (PCP): ', (round(sum(predicted == df.prvote_code)/len(df)*100, 1)), '%', sep='')
# or
print('Accuracy (aka PCP): ', round(accuracy_score(df['prvote_code'], predicted), 3)*100, '%', sep='')
Percentage of Correct Predictions (PCP): 40.8%
Accuracy (aka PCP): 40.8%
Meh, it could be better, but it is still OK, because this is not a binary prediction task (for such a task it might be more suitable to use PCA, CFA or some clustering techniques).
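A simple way to judge the 40.8% is to compare it with the naive baseline of always predicting the most frequent party (a quick sketch on the same data):
# Baseline accuracy: always predict the modal (most frequent) category
print('Modal-category baseline: ',
      round(df.prvote_code.value_counts(normalize=True).max() * 100, 1), '%', sep='')
From the class sizes in the confusion matrix below (451 Conservative voters out of 1265) that baseline is about 35.7%, so the model improves on it by only a few percentage points.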
# Real x Predicted
pd.DataFrame(np.array(confusion_matrix(df['prvote_code'], predicted)),
index=df.prvote.cat.categories, columns=df.prvote.cat.categories+'_pred')
|                  | Conservative_pred | Labour_pred | Liberal Democrat_pred | Other_pred |
|------------------|-------------------|-------------|-----------------------|------------|
| Conservative     | 328               | 120         | 3                     | 0          |
| Labour           | 205               | 182         | 10                    | 0          |
| Liberal Democrat | 168               | 120         | 6                     | 0          |
| Other            | 64                | 58          | 1                     | 0          |
Well, 'Other' is really underpredicted: the model never predicts it at all, so its diagonal cell is zero.
Overall, the model just loves Conservatives and tries to assign almost every observation to that group.
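Per-party precision and recall tell the same story in one table; sklearn's classification_report is a convenient way to get them (a sketch reusing the predictions above; zero_division=0 silences the warning for the never-predicted 'Other' class):
# Per-class precision and recall for the same predictions
from sklearn.metrics import classification_report
print(classification_report(df['prvote_code'], predicted,
                            target_names=list(df.prvote.cat.categories),
                            zero_division=0))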
sm.graphics.plot_partregress(endog='prvote_code', exog_i='edu', exog_others=['age', 'sex', 'income1'], data=df, obs_labels=False);
As already seen from the coefficients, years of education slightly increase the probability of voting 'Liberal Democrat' in comparison with voting 'Conservative'.
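The same effect can be shown on the probability scale by predicting over a grid of education values while holding the other predictors fixed (a minimal sketch; the mean-age, mean-income, 'Male' profile is an arbitrary illustrative choice):
# Predicted party probabilities across the observed range of education
# for a hypothetical respondent at mean age and income (sex = 'Male')
edu_grid = pd.DataFrame({'edu': np.linspace(df.edu.min(), df.edu.max(), 50),
                         'age': df.age.mean(),
                         'income1': df.income1.mean(),
                         'sex': 'Male'})
probs = pd.DataFrame(np.asarray(model.predict(edu_grid)),
                     columns=df.prvote.cat.categories,
                     index=edu_grid.edu)
probs.plot(ylabel='predicted probability');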
Great, but how am I going to do that when I don't have any theory, research questions or hypotheses?
OK, firstly: people become more conservative with years of age, but more liberal with years of education. Such results are kind of obvious: elderly people are more conservative, educated ones more liberal.
Secondly, people with higher income tend to be more conservative. I suppose it would be useful to look at the interaction of income and age in a further elaboration of the model (sketched below).
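That extension is easy to specify with the formula interface (a sketch only; the interaction term is an assumption about a possible next modelling step, not something estimated here):
# Extended specification with an age x income interaction
model_int = smf.mnlogit('prvote_code ~ age * income1 + sex + edu', data=df).fit()
print(model_int.summary())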
Finally, multinomial logistic regression is not the best choice for such data, and it might be better to use some clustering techniques (although logistic regression is itself a classification method).