import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 75
plt.rcParams['font.size'] = 12
sns.set_theme(style="whitegrid")
sns.set_theme(color_codes=True)
sns.set_context("poster")

import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg

import rpy2.rinterface
import rpy2.robjects as robjects
%load_ext rpy2.ipython


%%R -o culture,values
load('test1.RData')


# Constructing a model
model = smf.logit(formula='concert1 ~ age + gender + commun + bill + edu', data=culture).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.598241
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:               concert1   No. Observations:                23915
Model:                          Logit   Df Residuals:                    23908
Method:                           MLE   Df Model:                            6
Date:                 Пт, 12 фев 2021   Pseudo R-squ.:                 0.08617
Time:                        20:46:02   Log-Likelihood:                -14307.
converged:                       True   LL-Null:                       -15656.
Covariance Type:            nonrobust   LLR p-value:                     0.000
===============================================================================================
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -2.5217      0.088    -28.552      0.000      -2.695      -2.349
gender[T.Female]                0.0724      0.029      2.523      0.012       0.016       0.129
commun[T.Small/middle town]     0.2003      0.034      5.932      0.000       0.134       0.267
commun[T.Large town]            0.2479      0.037      6.722      0.000       0.176       0.320
bill[T.no]                      0.5992      0.030     19.730      0.000       0.540       0.659
age                            -0.0181      0.001    -20.144      0.000      -0.020      -0.016
edu                             0.1212      0.004     33.489      0.000       0.114       0.128
===============================================================================================


model.pvalues < 0.05

Intercept                      True
gender[T.Female]               True
commun[T.Small/middle town]    True
commun[T.Large town]           True
bill[T.no]                     True
age                            True
edu                            True
dtype: bool


# Odds Ratio
round(np.exp(model.params), 3)

Intercept                      0.080
gender[T.Female]               1.075
commun[T.Small/middle town]    1.222
commun[T.Large town]           1.281
bill[T.no]                     1.821
age                            0.982
edu                            1.129
dtype: float64


# Average Marginal Effects
print(model.get_margeff().summary())

        Logit Marginal Effects       
=====================================
Dep. Variable:               concert1
Method:                          dydx
At:                           overall
===============================================================================================
                                 dy/dx    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
gender[T.Female]                0.0149      0.006      2.524      0.012       0.003       0.026
commun[T.Small/middle town]     0.0412      0.007      5.946      0.000       0.028       0.055
commun[T.Large town]            0.0510      0.008      6.744      0.000       0.036       0.066
bill[T.no]                      0.1233      0.006     20.319      0.000       0.111       0.135
age                            -0.0037      0.000    -20.756      0.000      -0.004      -0.003
edu                             0.0249      0.001     36.677      0.000       0.024       0.026
===============================================================================================


# Check for quadratic relationships
# Constructing a model
model0 = smf.logit(formula='concert1 ~ np.power(age, 2) + gender + commun + bill + edu', data=culture).fit()
print(model0.summary())

Optimization terminated successfully.
         Current function value: 0.598536
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:               concert1   No. Observations:                23915
Model:                          Logit   Df Residuals:                    23908
Method:                           MLE   Df Model:                            6
Date:                 Пт, 12 фев 2021   Pseudo R-squ.:                 0.08571
Time:                        20:46:03   Log-Likelihood:                -14314.
converged:                       True   LL-Null:                       -15656.
Covariance Type:            nonrobust   LLR p-value:                     0.000
===============================================================================================
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -2.9373      0.079    -37.120      0.000      -3.092      -2.782
gender[T.Female]                0.0717      0.029      2.499      0.012       0.015       0.128
commun[T.Small/middle town]     0.2024      0.034      5.994      0.000       0.136       0.269
commun[T.Large town]            0.2518      0.037      6.829      0.000       0.180       0.324
bill[T.no]                      0.6007      0.030     19.777      0.000       0.541       0.660
np.power(age, 2)               -0.0002   8.85e-06    -19.656      0.000      -0.000      -0.000
edu                             0.1206      0.004     33.256      0.000       0.113       0.128
===============================================================================================


sm.graphics.plot_partregress(endog='concert1', exog_i='age', exog_others=['gender', 'commun', 'bill', 'edu'], data=culture, obs_labels=False);


# Pseudo R^2
print('Pseudo R^2:', model.summary().tables[0][3][3])

Pseudo R^2: 0.08617


# PCP
predicted = [np.where(i == i.max())[0][0] for i in model.predict()]
print('Percentage of Correct Predictions (PCP): ', (round(sum(predicted == culture.concert1)/len(culture)*100, 1)), '%', sep='')

Percentage of Correct Predictions (PCP): 63.8%


values = values.dropna()
# Constructing a model
model = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + gndr + age + eduy + domicil', data=values).fit()
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  achiv   R-squared:                       0.295
Model:                            OLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     470.1
Date:                 Пт, 12 фев 2021   Prob (F-statistic):               0.00
Time:                        20:46:03   Log-Likelihood:                -57447.
No. Observations:               40468   AIC:                         1.150e+05
Df Residuals:                   40431   BIC:                         1.153e+05
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
===============================================================================================================
                                                  coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
Intercept                                       1.7189      0.055     31.176      0.000       1.611       1.827
cntry[T.Belgium]                               -0.3112      0.032     -9.790      0.000      -0.374      -0.249
cntry[T.Bulgaria]                            2.832e-15   1.57e-16     18.075      0.000    2.53e-15    3.14e-15
cntry[T.Switzerland]                           -0.4020      0.034    -11.907      0.000      -0.468      -0.336
cntry[T.Cyprus]                                -0.2071      0.042     -4.925      0.000      -0.290      -0.125
cntry[T.Czechia]                               -0.2594      0.030     -8.574      0.000      -0.319      -0.200
cntry[T.Germany]                               -0.5478      0.030    -18.416      0.000      -0.606      -0.490
cntry[T.Estonia]                               -0.5139      0.031    -16.401      0.000      -0.575      -0.452
cntry[T.Spain]                                 -0.6515      0.033    -19.732      0.000      -0.716      -0.587
cntry[T.Finland]                               -0.9915      0.032    -30.633      0.000      -1.055      -0.928
cntry[T.France]                                -0.9998      0.031    -32.216      0.000      -1.061      -0.939
cntry[T.United Kingdom]                        -0.4503      0.030    -14.796      0.000      -0.510      -0.391
cntry[T.Croatia]                                0.0215      0.035      0.621      0.534      -0.046       0.089
cntry[T.Hungary]                                0.0378      0.033      1.152      0.249      -0.027       0.102
cntry[T.Ireland]                               -0.3325      0.031    -10.600      0.000      -0.394      -0.271
cntry[T.Italy]                                  0.4803      0.029     16.388      0.000       0.423       0.538
cntry[T.Lithuania]                              0.0780      0.034      2.297      0.022       0.011       0.144
cntry[T.Latvia]                                -0.3672      0.040     -9.087      0.000      -0.446      -0.288
cntry[T.Montenegro]                          4.456e-16   1.98e-17     22.525      0.000    4.07e-16    4.84e-16
cntry[T.Netherlands]                           -0.4066      0.032    -12.530      0.000      -0.470      -0.343
cntry[T.Norway]                                -0.6440      0.035    -18.337      0.000      -0.713      -0.575
cntry[T.Poland]                                 0.0138      0.035      0.393      0.694      -0.055       0.083
cntry[T.Portugal]                              -0.1344      0.039     -3.489      0.000      -0.210      -0.059
cntry[T.Serbia]                                 0.2358      0.032      7.257      0.000       0.172       0.299
cntry[T.Sweden]                                -0.9094      0.034    -27.002      0.000      -0.975      -0.843
cntry[T.Slovenia]                               0.3095      0.035      8.783      0.000       0.240       0.379
cntry[T.Slovakia]                               0.0087      0.038      0.227      0.820      -0.066       0.084
intgndr[T.Female]                              -0.0465      0.011     -4.223      0.000      -0.068      -0.025
gndr[T.Female]                                 -0.1561      0.010    -15.357      0.000      -0.176      -0.136
domicil[T.Suburbs or outskirts of big city]    -0.0502      0.019     -2.649      0.008      -0.087      -0.013
domicil[T.Town or small city]                  -0.0514      0.015     -3.512      0.000      -0.080      -0.023
domicil[T.Country village]                     -0.0766      0.015     -5.237      0.000      -0.105      -0.048
domicil[T.Farm or home in countryside]         -0.1028      0.025     -4.158      0.000      -0.151      -0.054
stim                                            0.2381      0.005     46.561      0.000       0.228       0.248
hedon                                           0.1876      0.005     34.642      0.000       0.177       0.198
benev                                           0.1892      0.007     27.918      0.000       0.176       0.202
intage1                                         0.0014      0.000      3.198      0.001       0.001       0.002
age                                            -0.0060      0.000    -20.502      0.000      -0.007      -0.005
eduy                                            0.0080      0.001      6.040      0.000       0.005       0.011
==============================================================================
Omnibus:                      317.565   Durbin-Watson:                   1.989
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              324.987
Skew:                          -0.220   Prob(JB):                     2.69e-71
Kurtosis:                       2.997   Cond. No.                     1.03e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.79e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.


model.pvalues[model.pvalues > 0.05]

cntry[T.Croatia]     0.534437
cntry[T.Hungary]     0.249233
cntry[T.Poland]      0.694398
cntry[T.Slovakia]    0.820231
dtype: float64


# Interaction effect between respondent’s age and respondent’s gender
model0 = smf.ols(formula='achiv ~ cntry + stim + hedon + benev + intgndr + intage1 + age*gndr + eduy + domicil', data=values).fit()
print('Age & Gender interaction with coef = ' + model0.summary().tables[1][-2][1].data.strip() + ' and p-value =' + model0.summary().tables[1][-2][4].data)

Age & Gender interaction with coef = -0.0051 and p-value = 0.000


# Visualize the interaction effect, interpret the results
sns.lmplot(x="age", y="achiv", hue="gndr", data=values, 
           truncate=0, scatter=0, line_kws={'linewidth':10},
           height=9, aspect=12/8).set(yscale="log");


# Look at model fit, interpret the results
model.summary().tables[0]


# outliers
(c, p) = model.get_influence().cooks_distance
with mpl.rc_context():
    mpl.rc("figure", figsize=(20, 15))
    plt.stem(np.arange(len(c)), c, markerfmt=",");


# multicollinearity
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = add_constant(values[['achiv', 'age']].dropna())
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], 
          index=X.columns)

const    18.658150
achiv     1.055939
age       1.055939
dtype: float64


### non-linear effects
with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'stim')


with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'hedon')


with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'benev')


with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'intage1')


with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'age')


with mpl.rc_context():
    mpl.rc("figure", figsize=(10, 7))
    smg.plot_ccpr(model, 'eduy')


# non-constant error variance
print(sms.het_breuschpagan(model0.resid, values[['achiv', 'age']].dropna())[1::2])

print(sms.het_goldfeldquandt(model0.resid, values[['achiv', 'age']].dropna()))

(0.0, 0.0)
(0.9177795585794114, 0.9999999994726807, 'increasing')

Task 1: culture¶

Results interpretation:¶

Overall¶

Note on quadratic relationship of age & concert attendance¶

Plot interpretation¶

Model fit:¶

Task 2: values¶

Results interpretation:¶

Assumptions¶

No any non-linear relationships for numeric regressors are found.¶

Dep. Variable:	achiv	R-squared:	0.295
Model:	OLS	Adj. R-squared:	0.294
Method:	Least Squares	F-statistic:	470.1
Date:	Пт, 12 фев 2021	Prob (F-statistic):	0.00
Time:	20:46:06	Log-Likelihood:	-57447.
No. Observations:	40468	AIC:	1.150e+05
Df Residuals:	40431	BIC:	1.153e+05
Df Model:	36
Covariance Type:	nonrobust