1. Get data from read.dta("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta":));

    • gender – gender of the respondent (female, male)
    • math – score on math test
    • daysabs – number of days absent
    • prog – type of program
      (use as factor; 1 – vocational, 2 – general, 3 – academic)
  2. Predict math test scored by gender, number of missed days and the program type

  3. Test for overdispersion and zero-inflation.
    Choose the appropriate model

  4. Interpret the coefficients

  5. Look at the model fit

knitr::opts_chunk$set(message=FALSE, warning=FALSE)
library(foreign)
df <- read.dta("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")
df$prog <- as.factor(df$prog)
df <- na.omit(df)

Predict math test scored by gender, number of missed days and the program type.

mpois = glm(math ~ gender + daysabs + prog, data = df, family = poisson)
summary(mpois)
## 
## Call:
## glm(formula = math ~ gender + daysabs + prog, family = poisson, 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9.2435  -2.8949   0.1444   2.3245   8.4114  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.97042    0.02871 138.290  < 2e-16 ***
## gendermale  -0.03307    0.01633  -2.025  0.04282 *  
## daysabs     -0.01497    0.00142 -10.545  < 2e-16 ***
## prog2       -0.10405    0.02691  -3.867  0.00011 ***
## prog3        0.15157    0.02871   5.279  1.3e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4823.4  on 313  degrees of freedom
## Residual deviance: 4364.7  on 309  degrees of freedom
## AIC: 6095.3
## 
## Number of Fisher Scoring iterations: 5

All p-values for regressors show significant influence - that’s great

Test for overdispersion and zero-inflation.

library(AER)
# linear
dispersiontest(mpois, trafo = 1)
## 
##  Overdispersion test
## 
## data:  mpois
## z = 13.768, p-value < 2.2e-16
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##    alpha 
## 11.44417

Mean and variance are not equal and overdispersion is definitely exists in the model.

# quadratic
dispersiontest(mpois, trafo = 2)
## 
##  Overdispersion test
## 
## data:  mpois
## z = 12.423, p-value < 2.2e-16
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##     alpha 
## 0.2185793

Looks like better to choose quadratic relationship model.

Basing on overdispersion and zero-inflation tests I prefer to choose quadratic model.

fm_qpois <- glm(math ~ gender + daysabs + prog, data = df, family = quasipoisson)
fm_nb <- MASS::glm.nb(math ~ gender + daysabs + prog, data = df)
library(texreg)
screenreg(list(mpois, fm_qpois, fm_nb))
## 
## =======================================================
##                 Model 1       Model 2      Model 3     
## -------------------------------------------------------
## (Intercept)         3.97 ***     3.97 ***      3.98 ***
##                    (0.03)       (0.10)        (0.12)   
## gendermale         -0.03 *      -0.03         -0.05    
##                    (0.02)       (0.06)        (0.07)   
## daysabs            -0.01 ***    -0.01 **      -0.02 ** 
##                    (0.00)       (0.01)        (0.01)   
## prog2              -0.10 ***    -0.10         -0.10    
##                    (0.03)       (0.10)        (0.11)   
## prog3               0.15 ***     0.15          0.16    
##                    (0.03)       (0.10)        (0.12)   
## -------------------------------------------------------
## AIC              6095.25                    2949.36    
## BIC              6114.00                    2971.86    
## Log Likelihood  -3042.63                   -1468.68    
## Deviance         4364.73      4364.73        343.01    
## Num. obs.         314          314           314       
## =======================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

But now looks like that NB is even better than quadratic!

library(vcdExtra)
zero.test(table(df$math))
## Score test for zero inflation
## 
##      Chi-square = 0 
##      df = 1
##      pvalue: 1

Zero-inflation test looks very strange 🤔

sum(df$math==0)
## [1] 0

So my dataframe does not contain any zeros, I have zero of zeros in dependent variable - zero-inflation is not applicable.

Interpreting coefficients

library(mfx)
negbinmfx(math ~ gender + daysabs + prog, data = df)
## Call:
## negbinmfx(formula = math ~ gender + daysabs + prog, data = df)
## 
## Marginal Effects:
##               dF/dx Std. Err.       z    P>|z|   
## gendermale -2.23012   3.34163 -0.6674 0.504534   
## daysabs    -0.75298   0.25989 -2.8973 0.003764 **
## prog2      -4.85972   5.32292 -0.9130 0.361253   
## prog3       7.55840   6.13921  1.2312 0.218260   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## dF/dx is for discrete change for the following variables:
## 
## [1] "gendermale" "prog2"      "prog3"

Model fit

library(pscl)
t(pR2(fm_nb))
## fitting null model for pseudo-r2
##           llh   llhNull       G2    McFadden       r2ML       r2CU
## [1,] -1468.68 -1481.161 24.96166 0.008426382 0.07641803 0.07642413

McFadden’s pseudo R^2 shows that in terms of fitness you have no fitness 😒