import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 75
plt.rcParams['font.size'] = 12
sns.set_theme(style="whitegrid")
sns.set_theme(color_codes=True)
sns.set_context("poster")
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import rpy2.rinterface
import rpy2.robjects as robjects
%load_ext rpy2.ipython
%%R -o culture,values
load('test1.RData')
Select only respondents from Estonia.
# Constructing a model
model = smf.logit(formula='concert1 ~ age + gender + commun + bill + edu', data=culture).fit()
print(model.summary())
Optimization terminated successfully.
Current function value: 0.598241
Iterations 5
Logit Regression Results
==============================================================================
Dep. Variable: concert1 No. Observations: 23915
Model: Logit Df Residuals: 23908
Method: MLE Df Model: 6
Date: Пт, 12 фев 2021 Pseudo R-squ.: 0.08617
Time: 20:46:02 Log-Likelihood: -14307.
converged: True LL-Null: -15656.
Covariance Type: nonrobust LLR p-value: 0.000
===============================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------
Intercept -2.5217 0.088 -28.552 0.000 -2.695 -2.349
gender[T.Female] 0.0724 0.029 2.523 0.012 0.016 0.129
commun[T.Small/middle town] 0.2003 0.034 5.932 0.000 0.134 0.267
commun[T.Large town] 0.2479 0.037 6.722 0.000 0.176 0.320
bill[T.no] 0.5992 0.030 19.730 0.000 0.540 0.659
age -0.0181 0.001 -20.144 0.000 -0.020 -0.016
edu 0.1212 0.004 33.489 0.000 0.114 0.128
===============================================================================================
model.pvalues < 0.05
Intercept True gender[T.Female] True commun[T.Small/middle town] True commun[T.Large town] True bill[T.no] True age True edu True dtype: bool
All coefficients are significant!
# Odds Ratio
round(np.exp(model.params), 3)
Intercept 0.080 gender[T.Female] 1.075 commun[T.Small/middle town] 1.222 commun[T.Large town] 1.281 bill[T.no] 1.821 age 0.982 edu 1.129 dtype: float64
# Average Marginal Effects
print(model.get_margeff().summary())
Logit Marginal Effects
=====================================
Dep. Variable: concert1
Method: dydx
At: overall
===============================================================================================
dy/dx std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------
gender[T.Female] 0.0149 0.006 2.524 0.012 0.003 0.026
commun[T.Small/middle town] 0.0412 0.007 5.946 0.000 0.028 0.055
commun[T.Large town] 0.0510 0.008 6.744 0.000 0.036 0.066
bill[T.no] 0.1233 0.006 20.319 0.000 0.111 0.135
age -0.0037 0.000 -20.756 0.000 -0.004 -0.003
edu 0.0249 0.001 36.677 0.000 0.024 0.026
===============================================================================================
gender doesn't have strong effect
the larger community type the bigger probability to attend concert (I suppose just because of bigger amount of concerts in larger towns)
bill (or having difficulties of paying bills) have quite strong at least among other variables effect on concert attendance: for having problems with bills probability increases by 12%
age seems to have no strong effect on probability of concert attendance
and more educated ones will have bigger chances of concert attendance
# Check for quadratic relationships
# Constructing a model
model0 = smf.logit(formula='concert1 ~ np.power(age, 2) + gender + commun + bill + edu', data=culture).fit()
print(model0.summary())
Optimization terminated successfully.
Current function value: 0.598536
Iterations 5
Logit Regression Results
==============================================================================
Dep. Variable: concert1 No. Observations: 23915
Model: Logit Df Residuals: 23908
Method: MLE Df Model: 6
Date: Пт, 12 фев 2021 Pseudo R-squ.: 0.08571
Time: 20:46:03 Log-Likelihood: -14314.
converged: True LL-Null: -15656.
Covariance Type: nonrobust LLR p-value: 0.000
===============================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------
Intercept -2.9373 0.079 -37.120 0.000 -3.092 -2.782
gender[T.Female] 0.0717 0.029 2.499 0.012 0.015 0.128
commun[T.Small/middle town] 0.2024 0.034 5.994 0.000 0.136 0.269
commun[T.Large town] 0.2518 0.037 6.829 0.000 0.180 0.324
bill[T.no] 0.6007 0.030 19.777 0.000 0.541 0.660
np.power(age, 2) -0.0002 8.85e-06 -19.656 0.000 -0.000 -0.000
edu 0.1206 0.004 33.256 0.000 0.113 0.128
===============================================================================================
Looks like there is no any significant and/or strong relationships between those two variables - neither confidence intervals are reliable (-0 is not goot CI at all) or the coefficient value almost at 0 too.
sm.graphics.plot_partregress(endog='concert1', exog_i='age', exog_others=['gender', 'commun', 'bill', 'edu'], data=culture, obs_labels=False);
From those two observations can be derived simple assumption (which is already confirmed by the model) that with more years of age probability of attending on concerts decreasing very slowly.
# Pseudo R^2
print('Pseudo R^2:', model.summary().tables[0][3][3])
Pseudo R^2: 0.08617
# PCP
predicted = [np.where(i == i.max())[0][0] for i in model.predict()]
print('Percentage of Correct Predictions (PCP): ', (round(sum(predicted == culture.concert1)/len(culture)*100, 1)), '%', sep='')
Percentage of Correct Predictions (PCP): 63.8%
Select only respondents from Finland.
values = values.dropna()
# Constructing a model
model = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + gndr + age + eduy + domicil', data=values).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: achiv R-squared: 0.295
Model: OLS Adj. R-squared: 0.294
Method: Least Squares F-statistic: 470.1
Date: Пт, 12 фев 2021 Prob (F-statistic): 0.00
Time: 20:46:03 Log-Likelihood: -57447.
No. Observations: 40468 AIC: 1.150e+05
Df Residuals: 40431 BIC: 1.153e+05
Df Model: 36
Covariance Type: nonrobust
===============================================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------
Intercept 1.7189 0.055 31.176 0.000 1.611 1.827
cntry[T.Belgium] -0.3112 0.032 -9.790 0.000 -0.374 -0.249
cntry[T.Bulgaria] 2.832e-15 1.57e-16 18.075 0.000 2.53e-15 3.14e-15
cntry[T.Switzerland] -0.4020 0.034 -11.907 0.000 -0.468 -0.336
cntry[T.Cyprus] -0.2071 0.042 -4.925 0.000 -0.290 -0.125
cntry[T.Czechia] -0.2594 0.030 -8.574 0.000 -0.319 -0.200
cntry[T.Germany] -0.5478 0.030 -18.416 0.000 -0.606 -0.490
cntry[T.Estonia] -0.5139 0.031 -16.401 0.000 -0.575 -0.452
cntry[T.Spain] -0.6515 0.033 -19.732 0.000 -0.716 -0.587
cntry[T.Finland] -0.9915 0.032 -30.633 0.000 -1.055 -0.928
cntry[T.France] -0.9998 0.031 -32.216 0.000 -1.061 -0.939
cntry[T.United Kingdom] -0.4503 0.030 -14.796 0.000 -0.510 -0.391
cntry[T.Croatia] 0.0215 0.035 0.621 0.534 -0.046 0.089
cntry[T.Hungary] 0.0378 0.033 1.152 0.249 -0.027 0.102
cntry[T.Ireland] -0.3325 0.031 -10.600 0.000 -0.394 -0.271
cntry[T.Italy] 0.4803 0.029 16.388 0.000 0.423 0.538
cntry[T.Lithuania] 0.0780 0.034 2.297 0.022 0.011 0.144
cntry[T.Latvia] -0.3672 0.040 -9.087 0.000 -0.446 -0.288
cntry[T.Montenegro] 4.456e-16 1.98e-17 22.525 0.000 4.07e-16 4.84e-16
cntry[T.Netherlands] -0.4066 0.032 -12.530 0.000 -0.470 -0.343
cntry[T.Norway] -0.6440 0.035 -18.337 0.000 -0.713 -0.575
cntry[T.Poland] 0.0138 0.035 0.393 0.694 -0.055 0.083
cntry[T.Portugal] -0.1344 0.039 -3.489 0.000 -0.210 -0.059
cntry[T.Serbia] 0.2358 0.032 7.257 0.000 0.172 0.299
cntry[T.Sweden] -0.9094 0.034 -27.002 0.000 -0.975 -0.843
cntry[T.Slovenia] 0.3095 0.035 8.783 0.000 0.240 0.379
cntry[T.Slovakia] 0.0087 0.038 0.227 0.820 -0.066 0.084
intgndr[T.Female] -0.0465 0.011 -4.223 0.000 -0.068 -0.025
gndr[T.Female] -0.1561 0.010 -15.357 0.000 -0.176 -0.136
domicil[T.Suburbs or outskirts of big city] -0.0502 0.019 -2.649 0.008 -0.087 -0.013
domicil[T.Town or small city] -0.0514 0.015 -3.512 0.000 -0.080 -0.023
domicil[T.Country village] -0.0766 0.015 -5.237 0.000 -0.105 -0.048
domicil[T.Farm or home in countryside] -0.1028 0.025 -4.158 0.000 -0.151 -0.054
stim 0.2381 0.005 46.561 0.000 0.228 0.248
hedon 0.1876 0.005 34.642 0.000 0.177 0.198
benev 0.1892 0.007 27.918 0.000 0.176 0.202
intage1 0.0014 0.000 3.198 0.001 0.001 0.002
age -0.0060 0.000 -20.502 0.000 -0.007 -0.005
eduy 0.0080 0.001 6.040 0.000 0.005 0.011
==============================================================================
Omnibus: 317.565 Durbin-Watson: 1.989
Prob(Omnibus): 0.000 Jarque-Bera (JB): 324.987
Skew: -0.220 Prob(JB): 2.69e-71
Kurtosis: 2.997 Cond. No. 1.03e+16
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.79e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
model.pvalues[model.pvalues > 0.05]
cntry[T.Croatia] 0.534437 cntry[T.Hungary] 0.249233 cntry[T.Poland] 0.694398 cntry[T.Slovakia] 0.820231 dtype: float64
All coefficients are significant except:
Listed countries will be not interpreted in the further model exploration!
# Interaction effect between respondent’s age and respondent’s gender
model0 = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + age*gndr + eduy + domicil', data=values).fit()
print('Age & Gender interaction with coef = ' + model0.summary().tables[1][-2][1].data.strip() + ' and p-value =' + model0.summary().tables[1][-2][4].data)
Age & Gender interaction with coef = -0.0051 and p-value = 0.000
# Visualize the interaction effect, interpret the results
sns.lmplot(x="age", y="achiv", hue="gndr", data=values,
truncate=0, scatter=0, line_kws={'linewidth':10},
height=9, aspect=12/8).set(yscale="log");
Interaction effect truly exists and gender significantly moderates effect of age - as can be seen overall trend with more years of age to values achievements lower but! for males that downtrend is elaborated with much lower slope than for females
# Look at model fit, interpret the results
model.summary().tables[0]
| Dep. Variable: | achiv | R-squared: | 0.295 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.294 |
| Method: | Least Squares | F-statistic: | 470.1 |
| Date: | Пт, 12 фев 2021 | Prob (F-statistic): | 0.00 |
| Time: | 20:46:06 | Log-Likelihood: | -57447. |
| No. Observations: | 40468 | AIC: | 1.150e+05 |
| Df Residuals: | 40431 | BIC: | 1.153e+05 |
| Df Model: | 36 | ||
| Covariance Type: | nonrobust |
R-sqaured and Adjusted R-squared are equal (which seems strange to me because I await for lower adjusted R-squared with such amount of regressors)
They both equals to 29.5-4% of explained variance which is not really cool with that amount of regressors.
# outliers
(c, p) = model.get_influence().cooks_distance
with mpl.rc_context():
mpl.rc("figure", figsize=(20, 15))
plt.stem(np.arange(len(c)), c, markerfmt=",");
Almost any observation looks to be influential - but that highly affected by number of observations.
# multicollinearity
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = add_constant(values[['achiv', 'age']].dropna())
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
index=X.columns)
const 18.658150 achiv 1.055939 age 1.055939 dtype: float64
As can be seen by VIF value for constant variable (intercept) there is multicollinearity for the outcome (but not for numeric regressors).
stim 0.2381 0.005 46.561 0.000 0.228 0.248 hedon 0.1876 0.005 34.642 0.000 0.177 0.198 benev 0.1892 0.007 27.918 0.000 0.176 0.202 intage1 0.0014 0.000 3.198 0.001 0.001 0.002 age -0.0060 0.000 -20.502 0.000 -0.007 -0.005 eduy
### non-linear effects
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'stim')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'hedon')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'benev')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'intage1')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'age')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'eduy')
# non-constant error variance
print(sms.het_breuschpagan(model0.resid, values[['achiv', 'age']].dropna())[1::2])
print(sms.het_goldfeldquandt(model0.resid, values[['achiv', 'age']].dropna()))
(0.0, 0.0) (0.9177795585794114, 0.9999999994726807, 'increasing')
Both tests indicate violation of homoscedasticity.