import rpy2.rinterface
import rpy2.robjects as robjects
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smgr
from sklearn.metrics import confusion_matrix, accuracy_score
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 75
plt.rcParams['font.size'] = 12
sns.set_theme(style="whitegrid", color_codes=True)
sns.set_context("poster")
%load_ext rpy2.ipython
%%R -o df
load('sem4.RData')
df = uk
# Selecting only usable data and removing NA's
df = df[['prvote', 'age', 'sex', 'income1', 'edu']].dropna()
# Recoding category to numeric
df['prvote_code'] = df.prvote.cat.codes
print(pd.DataFrame({'code': [0, 1, 2, 3], 'name': df.prvote.cat.categories}).set_index('code'))
                  name
code
0         Conservative
1               Labour
2     Liberal Democrat
3                Other
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 1265 entries, 3 to 2422
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   prvote       1265 non-null   category
 1   age          1265 non-null   float64
 2   sex          1265 non-null   category
 3   income1      1265 non-null   float64
 4   edu          1265 non-null   float64
 5   prvote_code  1265 non-null   int8
dtypes: category(2), float64(3), int8(1)
memory usage: 43.6+ KB
model = smf.mnlogit(formula='prvote_code ~ age + sex + income1 + edu', data=df).fit()
print(model.summary())
Optimization terminated successfully.
         Current function value: 1.269979
         Iterations 6
                          MNLogit Regression Results
==============================================================================
Dep. Variable:            prvote_code   No. Observations:                 1265
Model:                        MNLogit   Df Residuals:                     1250
Method:                           MLE   Df Model:                           12
Date:                Sun, 07 Feb 2021   Pseudo R-squ.:                 0.02096
Time:                        18:43:08   Log-Likelihood:                -1606.5
converged:                       True   LL-Null:                       -1640.9
Covariance Type:            nonrobust   LLR p-value:                 5.427e-10
=================================================================================
  prvote_code=1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          1.7062      0.468      3.645      0.000       0.789       2.624
sex[T.Female]     -0.0715      0.142     -0.505      0.614      -0.349       0.206
age               -0.0185      0.005     -3.900      0.000      -0.028      -0.009
income1           -0.1619      0.027     -5.896      0.000      -0.216      -0.108
edu                0.0041      0.021      0.202      0.840      -0.036       0.044
---------------------------------------------------------------------------------
  prvote_code=2       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          0.5013      0.498      1.007      0.314      -0.474       1.477
sex[T.Female]      0.0239      0.154      0.156      0.876      -0.277       0.325
age               -0.0181      0.005     -3.544      0.000      -0.028      -0.008
income1           -0.1000      0.029     -3.411      0.001      -0.158      -0.043
edu                0.0424      0.021      1.985      0.047       0.001       0.084
---------------------------------------------------------------------------------
  prvote_code=3       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept          1.3495      0.673      2.004      0.045       0.030       2.669
sex[T.Female]     -0.3555      0.208     -1.711      0.087      -0.763       0.052
age               -0.0231      0.007     -3.409      0.001      -0.036      -0.010
income1           -0.2040      0.041     -5.034      0.000      -0.283      -0.125
edu               -0.0134      0.031     -0.427      0.669      -0.075       0.048
=================================================================================
print(model.pvalues < 0.05)
                   0      1      2
Intercept       True  False   True
sex[T.Female]  False  False  False
age             True   True   True
income1         True   True   True
edu            False   True  False
So the model is only 'partly significant' in terms of its individual effects: age and income1 are significant in every submodel, while sex is never significant and edu only reaches significance for the Liberal Democrat contrast.
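The model as a whole is still significant, though: the likelihood-ratio test against the intercept-only model can be pulled straight from the fitted results (a small sketch, using the standard statsmodels result attributes llf, llnull, llr and llr_pvalue):
# Overall likelihood-ratio test (full vs. intercept-only model)
print('LL (full): ', round(model.llf, 1))
print('LL (null): ', round(model.llnull, 1))
print('LR statistic:', round(model.llr, 2))
print('LLR p-value: ', model.llr_pvalue)
The LLR p-value of 5.427e-10 in the summary table above is this same test.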
# Odds Ratio
odds = round(np.exp(model.params), 3)
odds.columns = df.prvote.cat.categories[1:]
print(odds)
               Labour  Liberal Democrat  Other
Intercept       5.508             1.651  3.855
sex[T.Female]   0.931             1.024  0.701
age             0.982             0.982  0.977
income1         0.851             0.905  0.815
edu             1.004             1.043  0.987
'Conservative' is the reference group, so read each value as the change in the odds of voting for that party rather than for 'Conservative':
- Sex (going from Male to Female) is not significant in any submodel.
- Age has practically the same small negative effect in each submodel.
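Odds ratios can also be read as percentage changes in the odds per one-unit increase of a predictor; a quick transformation of the same coefficients (a sketch reusing the fitted model from above):
# Percent change in odds per one-unit increase, relative to 'Conservative'
pct_change = round((np.exp(model.params) - 1) * 100, 1)
pct_change.columns = df.prvote.cat.categories[1:]
print(pct_change)
For example, the odds ratio of 0.851 for income1 in the Labour submodel corresponds to roughly a 15% drop in the odds per unit of income.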
# Average Marginal Effects
print(model.get_margeff().summary().tables[1])
=================================================================================
  prvote_code=0      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]      0.0171      0.027      0.642      0.521      -0.035       0.069
age                0.0042      0.001      4.770      0.000       0.002       0.006
income1            0.0321      0.005      6.545      0.000       0.022       0.042
edu               -0.0035      0.004     -0.910      0.363      -0.011       0.004
---------------------------------------------------------------------------------
  prvote_code=1      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]     -0.0054      0.026     -0.208      0.835      -0.057       0.046
age               -0.0019      0.001     -2.206      0.027      -0.003      -0.000
income1           -0.0205      0.005     -4.234      0.000      -0.030      -0.011
edu               -0.0018      0.004     -0.465      0.642      -0.009       0.006
---------------------------------------------------------------------------------
  prvote_code=2      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]      0.0173      0.024      0.724      0.469      -0.030       0.064
age               -0.0013      0.001     -1.753      0.080      -0.003       0.000
income1           -0.0014      0.004     -0.310      0.757      -0.010       0.007
edu                0.0075      0.003      2.256      0.024       0.001       0.014
---------------------------------------------------------------------------------
  prvote_code=3      dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
sex[T.Female]     -0.0290      0.017     -1.736      0.083      -0.062       0.004
age               -0.0010      0.001     -1.894      0.058      -0.002     3.5e-05
income1           -0.0102      0.003     -3.200      0.001      -0.017      -0.004
edu               -0.0022      0.003     -0.883      0.377      -0.007       0.003
=================================================================================
Well, there is a lot to describe here:
- Age has nearly the same effect in each submodel.
- All of these are really small effect sizes.
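The table above reports average marginal effects (averaged over all observations); if a 'typical' respondent is of more interest, the effects can instead be evaluated at the regressors' means (a sketch with the same get_margeff call, only changing the at argument):
# Marginal effects evaluated at the means of the regressors
print(model.get_margeff(at='mean').summary().tables[1])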
# Pseudo R^2
print('Pseudo R^2:', round(model.prsquared, 5))
Pseudo R^2: 0.02096
The pseudo R-squared is well below 0.1, which is far too low to consider the model a good one.
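For reference, this is McFadden's pseudo R², which is just one minus the ratio of the model log-likelihood to the null (intercept-only) log-likelihood, so it can be recomputed by hand (a small check using the llf/llnull attributes):
# McFadden's pseudo R^2 from the log-likelihoods: 1 - LL(full) / LL(null)
print('McFadden pseudo R^2:', round(1 - model.llf / model.llnull, 5))
With LL(full) = -1606.5 and LL(null) = -1640.9 this gives the same 0.02096 as above.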
# PCP
predicted = np.asarray(model.predict()).argmax(axis=1)  # most probable category for each observation
print('Percentage of Correct Predictions (PCP): ', (round(sum(predicted == df.prvote_code)/len(df)*100, 1)), '%', sep='')
# or
print('Accuracy (aka PCP): ', round(accuracy_score(df['prvote_code'], predicted), 3)*100, '%', sep='')
Percentage of Correct Predictions (PCP): 40.8%
Accuracy (aka PCP): 40.8%
Meh, it could be better, but it is still OK, because this is not a binary prediction task (for such a task it might be more suitable to use PCA, CFA or some clustering techniques).
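A simple way to judge the 40.8% is to compare it with the naive baseline of always predicting the most frequent party (a quick sketch on the same data):
# Baseline accuracy: always predict the modal (most frequent) category
print('Modal-category baseline: ',
      round(df.prvote_code.value_counts(normalize=True).max() * 100, 1), '%', sep='')
From the class sizes in the confusion matrix below (451 Conservative voters out of 1265) that baseline is about 35.7%, so the model improves on it by only a few percentage points.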
# Real x Predicted
pd.DataFrame(np.array(confusion_matrix(df['prvote_code'], predicted)),
index=df.prvote.cat.categories, columns=df.prvote.cat.categories+'_pred')
|                  | Conservative_pred | Labour_pred | Liberal Democrat_pred | Other_pred |
|------------------|-------------------|-------------|-----------------------|------------|
| Conservative     | 328               | 120         | 3                     | 0          |
| Labour           | 205               | 182         | 10                    | 0          |
| Liberal Democrat | 168               | 120         | 6                     | 0          |
| Other            | 64                | 58          | 1                     | 0          |
Well, 'Other' is really underpredicted: the model never predicts it at all, so its diagonal cell is zero.
Overall, the model just loves Conservatives and tries to assign almost every observation to that group.
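Per-party precision and recall tell the same story in one table; sklearn's classification_report is a convenient way to get them (a sketch reusing the predictions above; zero_division=0 silences the warning for the never-predicted 'Other' class):
# Per-class precision and recall for the same predictions
from sklearn.metrics import classification_report
print(classification_report(df['prvote_code'], predicted,
                            target_names=list(df.prvote.cat.categories),
                            zero_division=0))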
sm.graphics.plot_partregress(endog='prvote_code', exog_i='edu', exog_others=['age', 'sex', 'income1'], data=df, obs_labels=False);
As already seen from the coefficients, years of education slightly increase the probability of voting 'Liberal Democrat' in comparison with voting 'Conservative'.
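The same effect can be shown on the probability scale by predicting over a grid of education values while holding the other predictors fixed (a minimal sketch; the mean-age, mean-income, 'Male' profile is an arbitrary illustrative choice):
# Predicted party probabilities across the observed range of education
# for a hypothetical respondent at mean age and income (sex = 'Male')
edu_grid = pd.DataFrame({'edu': np.linspace(df.edu.min(), df.edu.max(), 50),
                         'age': df.age.mean(),
                         'income1': df.income1.mean(),
                         'sex': 'Male'})
probs = pd.DataFrame(np.asarray(model.predict(edu_grid)),
                     columns=df.prvote.cat.categories,
                     index=edu_grid.edu)
probs.plot(ylabel='predicted probability');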
Great, but how am I going to do that when I don't have any theory, research questions or hypotheses?
OK, firstly: people become more conservative with years of age, but more liberal with years of education. Such results are kind of obvious: elderly people are more conservative, educated ones more liberal.
Secondly, people with higher income tend to be more conservative. I suppose it would be useful to look at the interaction of income and age in a further elaboration of the model (sketched below).
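That extension is easy to specify with the formula interface (a sketch only; the interaction term is an assumption about a possible next modelling step, not something estimated here):
# Extended specification with an age x income interaction
model_int = smf.mnlogit('prvote_code ~ age * income1 + sex + edu', data=df).fit()
print(model_int.summary())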
Finally, multinomial logistic regression is not the best choice for such data, and it might be better to use some clustering techniques (although logistic regression is itself a classification method).