import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 75
plt.rcParams['font.size'] = 12
sns.set_theme(style="whitegrid")
sns.set_theme(color_codes=True)
sns.set_context("poster")
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import rpy2.rinterface
import rpy2.robjects as robjects
%load_ext rpy2.ipython
%%R -o culture,values
load('test1.RData')
Select only respondents from Estonia.
# Constructing a model
model = smf.logit(formula='concert1 ~ age + gender + commun + bill + edu', data=culture).fit()
print(model.summary())
Optimization terminated successfully. Current function value: 0.598241 Iterations 5 Logit Regression Results ============================================================================== Dep. Variable: concert1 No. Observations: 23915 Model: Logit Df Residuals: 23908 Method: MLE Df Model: 6 Date: Пт, 12 фев 2021 Pseudo R-squ.: 0.08617 Time: 20:46:02 Log-Likelihood: -14307. converged: True LL-Null: -15656. Covariance Type: nonrobust LLR p-value: 0.000 =============================================================================================== coef std err z P>|z| [0.025 0.975] ----------------------------------------------------------------------------------------------- Intercept -2.5217 0.088 -28.552 0.000 -2.695 -2.349 gender[T.Female] 0.0724 0.029 2.523 0.012 0.016 0.129 commun[T.Small/middle town] 0.2003 0.034 5.932 0.000 0.134 0.267 commun[T.Large town] 0.2479 0.037 6.722 0.000 0.176 0.320 bill[T.no] 0.5992 0.030 19.730 0.000 0.540 0.659 age -0.0181 0.001 -20.144 0.000 -0.020 -0.016 edu 0.1212 0.004 33.489 0.000 0.114 0.128 ===============================================================================================
model.pvalues < 0.05
Intercept True gender[T.Female] True commun[T.Small/middle town] True commun[T.Large town] True bill[T.no] True age True edu True dtype: bool
All coefficients are significant!
# Odds Ratio
round(np.exp(model.params), 3)
Intercept 0.080 gender[T.Female] 1.075 commun[T.Small/middle town] 1.222 commun[T.Large town] 1.281 bill[T.no] 1.821 age 0.982 edu 1.129 dtype: float64
# Average Marginal Effects
print(model.get_margeff().summary())
Logit Marginal Effects ===================================== Dep. Variable: concert1 Method: dydx At: overall =============================================================================================== dy/dx std err z P>|z| [0.025 0.975] ----------------------------------------------------------------------------------------------- gender[T.Female] 0.0149 0.006 2.524 0.012 0.003 0.026 commun[T.Small/middle town] 0.0412 0.007 5.946 0.000 0.028 0.055 commun[T.Large town] 0.0510 0.008 6.744 0.000 0.036 0.066 bill[T.no] 0.1233 0.006 20.319 0.000 0.111 0.135 age -0.0037 0.000 -20.756 0.000 -0.004 -0.003 edu 0.0249 0.001 36.677 0.000 0.024 0.026 ===============================================================================================
gender doesn't have strong effect
the larger community type the bigger probability to attend concert (I suppose just because of bigger amount of concerts in larger towns)
bill (or having difficulties of paying bills) have quite strong at least among other variables effect on concert attendance: for having problems with bills probability increases by 12%
age seems to have no strong effect on probability of concert attendance
and more educated ones will have bigger chances of concert attendance
# Check for quadratic relationships
# Constructing a model
model0 = smf.logit(formula='concert1 ~ np.power(age, 2) + gender + commun + bill + edu', data=culture).fit()
print(model0.summary())
Optimization terminated successfully. Current function value: 0.598536 Iterations 5 Logit Regression Results ============================================================================== Dep. Variable: concert1 No. Observations: 23915 Model: Logit Df Residuals: 23908 Method: MLE Df Model: 6 Date: Пт, 12 фев 2021 Pseudo R-squ.: 0.08571 Time: 20:46:03 Log-Likelihood: -14314. converged: True LL-Null: -15656. Covariance Type: nonrobust LLR p-value: 0.000 =============================================================================================== coef std err z P>|z| [0.025 0.975] ----------------------------------------------------------------------------------------------- Intercept -2.9373 0.079 -37.120 0.000 -3.092 -2.782 gender[T.Female] 0.0717 0.029 2.499 0.012 0.015 0.128 commun[T.Small/middle town] 0.2024 0.034 5.994 0.000 0.136 0.269 commun[T.Large town] 0.2518 0.037 6.829 0.000 0.180 0.324 bill[T.no] 0.6007 0.030 19.777 0.000 0.541 0.660 np.power(age, 2) -0.0002 8.85e-06 -19.656 0.000 -0.000 -0.000 edu 0.1206 0.004 33.256 0.000 0.113 0.128 ===============================================================================================
Looks like there is no any significant and/or strong relationships between those two variables - neither confidence intervals are reliable (-0 is not goot CI at all) or the coefficient value almost at 0 too.
sm.graphics.plot_partregress(endog='concert1', exog_i='age', exog_others=['gender', 'commun', 'bill', 'edu'], data=culture, obs_labels=False);
From those two observations can be derived simple assumption (which is already confirmed by the model) that with more years of age probability of attending on concerts decreasing very slowly.
# Pseudo R^2
print('Pseudo R^2:', model.summary().tables[0][3][3])
Pseudo R^2: 0.08617
# PCP
predicted = [np.where(i == i.max())[0][0] for i in model.predict()]
print('Percentage of Correct Predictions (PCP): ', (round(sum(predicted == culture.concert1)/len(culture)*100, 1)), '%', sep='')
Percentage of Correct Predictions (PCP): 63.8%
Select only respondents from Finland.
values = values.dropna()
# Constructing a model
model = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + gndr + age + eduy + domicil', data=values).fit()
print(model.summary())
OLS Regression Results ============================================================================== Dep. Variable: achiv R-squared: 0.295 Model: OLS Adj. R-squared: 0.294 Method: Least Squares F-statistic: 470.1 Date: Пт, 12 фев 2021 Prob (F-statistic): 0.00 Time: 20:46:03 Log-Likelihood: -57447. No. Observations: 40468 AIC: 1.150e+05 Df Residuals: 40431 BIC: 1.153e+05 Df Model: 36 Covariance Type: nonrobust =============================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------------------- Intercept 1.7189 0.055 31.176 0.000 1.611 1.827 cntry[T.Belgium] -0.3112 0.032 -9.790 0.000 -0.374 -0.249 cntry[T.Bulgaria] 2.832e-15 1.57e-16 18.075 0.000 2.53e-15 3.14e-15 cntry[T.Switzerland] -0.4020 0.034 -11.907 0.000 -0.468 -0.336 cntry[T.Cyprus] -0.2071 0.042 -4.925 0.000 -0.290 -0.125 cntry[T.Czechia] -0.2594 0.030 -8.574 0.000 -0.319 -0.200 cntry[T.Germany] -0.5478 0.030 -18.416 0.000 -0.606 -0.490 cntry[T.Estonia] -0.5139 0.031 -16.401 0.000 -0.575 -0.452 cntry[T.Spain] -0.6515 0.033 -19.732 0.000 -0.716 -0.587 cntry[T.Finland] -0.9915 0.032 -30.633 0.000 -1.055 -0.928 cntry[T.France] -0.9998 0.031 -32.216 0.000 -1.061 -0.939 cntry[T.United Kingdom] -0.4503 0.030 -14.796 0.000 -0.510 -0.391 cntry[T.Croatia] 0.0215 0.035 0.621 0.534 -0.046 0.089 cntry[T.Hungary] 0.0378 0.033 1.152 0.249 -0.027 0.102 cntry[T.Ireland] -0.3325 0.031 -10.600 0.000 -0.394 -0.271 cntry[T.Italy] 0.4803 0.029 16.388 0.000 0.423 0.538 cntry[T.Lithuania] 0.0780 0.034 2.297 0.022 0.011 0.144 cntry[T.Latvia] -0.3672 0.040 -9.087 0.000 -0.446 -0.288 cntry[T.Montenegro] 4.456e-16 1.98e-17 22.525 0.000 4.07e-16 4.84e-16 cntry[T.Netherlands] -0.4066 0.032 -12.530 0.000 -0.470 -0.343 cntry[T.Norway] -0.6440 0.035 -18.337 0.000 -0.713 -0.575 cntry[T.Poland] 0.0138 0.035 0.393 0.694 -0.055 0.083 cntry[T.Portugal] -0.1344 0.039 -3.489 0.000 -0.210 -0.059 cntry[T.Serbia] 0.2358 0.032 7.257 0.000 0.172 0.299 cntry[T.Sweden] -0.9094 0.034 -27.002 0.000 -0.975 -0.843 cntry[T.Slovenia] 0.3095 0.035 8.783 0.000 0.240 0.379 cntry[T.Slovakia] 0.0087 0.038 0.227 0.820 -0.066 0.084 intgndr[T.Female] -0.0465 0.011 -4.223 0.000 -0.068 -0.025 gndr[T.Female] -0.1561 0.010 -15.357 0.000 -0.176 -0.136 domicil[T.Suburbs or outskirts of big city] -0.0502 0.019 -2.649 0.008 -0.087 -0.013 domicil[T.Town or small city] -0.0514 0.015 -3.512 0.000 -0.080 -0.023 domicil[T.Country village] -0.0766 0.015 -5.237 0.000 -0.105 -0.048 domicil[T.Farm or home in countryside] -0.1028 0.025 -4.158 0.000 -0.151 -0.054 stim 0.2381 0.005 46.561 0.000 0.228 0.248 hedon 0.1876 0.005 34.642 0.000 0.177 0.198 benev 0.1892 0.007 27.918 0.000 0.176 0.202 intage1 0.0014 0.000 3.198 0.001 0.001 0.002 age -0.0060 0.000 -20.502 0.000 -0.007 -0.005 eduy 0.0080 0.001 6.040 0.000 0.005 0.011 ============================================================================== Omnibus: 317.565 Durbin-Watson: 1.989 Prob(Omnibus): 0.000 Jarque-Bera (JB): 324.987 Skew: -0.220 Prob(JB): 2.69e-71 Kurtosis: 2.997 Cond. No. 1.03e+16 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 1.79e-24. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
model.pvalues[model.pvalues > 0.05]
cntry[T.Croatia] 0.534437 cntry[T.Hungary] 0.249233 cntry[T.Poland] 0.694398 cntry[T.Slovakia] 0.820231 dtype: float64
All coefficients are significant except:
Listed countries will be not interpreted in the further model exploration!
# Interaction effect between respondent’s age and respondent’s gender
model0 = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + age*gndr + eduy + domicil', data=values).fit()
print('Age & Gender interaction with coef = ' + model0.summary().tables[1][-2][1].data.strip() + ' and p-value =' + model0.summary().tables[1][-2][4].data)
Age & Gender interaction with coef = -0.0051 and p-value = 0.000
# Visualize the interaction effect, interpret the results
sns.lmplot(x="age", y="achiv", hue="gndr", data=values,
truncate=0, scatter=0, line_kws={'linewidth':10},
height=9, aspect=12/8).set(yscale="log");
Interaction effect truly exists and gender significantly moderates effect of age - as can be seen overall trend with more years of age to values achievements lower but! for males that downtrend is elaborated with much lower slope than for females
# Look at model fit, interpret the results
model.summary().tables[0]
Dep. Variable: | achiv | R-squared: | 0.295 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.294 |
Method: | Least Squares | F-statistic: | 470.1 |
Date: | Пт, 12 фев 2021 | Prob (F-statistic): | 0.00 |
Time: | 20:46:06 | Log-Likelihood: | -57447. |
No. Observations: | 40468 | AIC: | 1.150e+05 |
Df Residuals: | 40431 | BIC: | 1.153e+05 |
Df Model: | 36 | ||
Covariance Type: | nonrobust |
R-sqaured and Adjusted R-squared are equal (which seems strange to me because I await for lower adjusted R-squared with such amount of regressors)
They both equals to 29.5-4% of explained variance which is not really cool with that amount of regressors.
# outliers
(c, p) = model.get_influence().cooks_distance
with mpl.rc_context():
mpl.rc("figure", figsize=(20, 15))
plt.stem(np.arange(len(c)), c, markerfmt=",");
Almost any observation looks to be influential - but that highly affected by number of observations.
# multicollinearity
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = add_constant(values[['achiv', 'age']].dropna())
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
index=X.columns)
const 18.658150 achiv 1.055939 age 1.055939 dtype: float64
As can be seen by VIF value for constant variable (intercept) there is multicollinearity for the outcome (but not for numeric regressors).
stim 0.2381 0.005 46.561 0.000 0.228 0.248 hedon 0.1876 0.005 34.642 0.000 0.177 0.198 benev 0.1892 0.007 27.918 0.000 0.176 0.202 intage1 0.0014 0.000 3.198 0.001 0.001 0.002 age -0.0060 0.000 -20.502 0.000 -0.007 -0.005 eduy
### non-linear effects
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'stim')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'hedon')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'benev')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'intage1')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'age')
with mpl.rc_context():
mpl.rc("figure", figsize=(10, 7))
smg.plot_ccpr(model, 'eduy')
# non-constant error variance
print(sms.het_breuschpagan(model0.resid, values[['achiv', 'age']].dropna())[1::2])
print(sms.het_goldfeldquandt(model0.resid, values[['achiv', 'age']].dropna()))
(0.0, 0.0) (0.9177795585794114, 0.9999999994726807, 'increasing')
Both tests indicate violation of homoscedasticity.