Count data models: PhD publications

# data from Long (1997)
# see http://data.princeton.edu/wws509/stata/overdispersion.html for Stata analysis

data("PhdPubs", package="vcdExtra")

library(MASS)
library(countreg)
library(vcdExtra)
library(car)
library(lmtest)    # for coeftest

a basic histogram of articles published

hist(PhdPubs$articles, breaks=0:19, col="pink", xlim=c(0,20),
     xlab="Number of Articles")

Fit the poisson model

phd.pois <- glm(articles ~ ., data=PhdPubs, family=poisson)
library(car)
Anova(phd.pois)

## Analysis of Deviance Table (Type II tests)
## 
## Response: articles
##             LR Chisq Df Pr(>Chisq)    
## female        17.069  1  3.605e-05 ***
## married        6.633  1    0.01001 *  
## kid5          22.102  1  2.586e-06 ***
## phdprestige    1.010  1    0.31500    
## mentor       126.776  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

interpreting coefficients

round(cbind(beta = coef(phd.pois),
            expbeta = exp(coef(phd.pois)),
            pct = 100 * (exp(coef(phd.pois)) - 1)), 3)

##               beta expbeta     pct
## (Intercept)  0.266   1.304  30.425
## female      -0.224   0.799 -20.102
## married      0.157   1.170  17.037
## kid5        -0.185   0.831 -16.882
## phdprestige  0.025   1.026   2.570
## mentor       0.025   1.026   2.555

check for interactions

phd.pois1 <- update(phd.pois, . ~ .^2)
Anova(phd.pois1)

## Analysis of Deviance Table (Type II tests)
## 
## Response: articles
##                     LR Chisq Df Pr(>Chisq)    
## female                14.521  1  0.0001386 ***
## married                6.201  1  0.0127677 *  
## kid5                  19.545  1  9.826e-06 ***
## phdprestige            0.963  1  0.3265451    
## mentor               128.112  1  < 2.2e-16 ***
## female:married         0.260  1  0.6099455    
## female:kid5            0.120  1  0.7292877    
## female:phdprestige     0.224  1  0.6357431    
## female:mentor          0.012  1  0.9126026    
## married:kid5                  0               
## married:phdprestige    1.706  1  0.1915350    
## married:mentor         1.157  1  0.2820254    
## kid5:phdprestige       0.164  1  0.6852251    
## kid5:mentor            2.823  1  0.0929044 .  
## phdprestige:mentor     3.810  1  0.0509367 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(phd.pois, phd.pois1, test="Chisq")

## Analysis of Deviance Table
## 
## Model 1: articles ~ female + married + kid5 + phdprestige + mentor
## Model 2: articles ~ female + married + kid5 + phdprestige + mentor + female:married + 
##     female:kid5 + female:phdprestige + female:mentor + married:kid5 + 
##     married:phdprestige + married:mentor + kid5:phdprestige + 
##     kid5:mentor + phdprestige:mentor
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       909     1633.6                       
## 2       900     1618.4  9   15.192  0.08578 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

LRstats(phd.pois, phd.pois1)

## Likelihood summary table:
##              AIC    BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois  3313.3 3342.3   1633.6 909  < 2.2e-16 ***
## phd.pois1 3316.1 3388.4   1618.4 900  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## model diagnostics

op <- par(mfrow=c(1,2), mar=c(4,4,2,1)+.1)
plot(phd.pois, which=c(1,5))

par(op)

### component-plus-residual plot for non-linearity

crPlot(phd.pois, "mentor", pch=16, lwd=4, id = list(n=2))

## [1] 913 911 328 803

better version

op <- par(mar=c(4,4,2,1)+.1)
crPlot(phd.pois, "mentor", pch=16, lwd=4, id = list(n=2), 
    cex.lab=1.4, xlab="mentor publications" )

## [1] 913 911 328 803

text(70, 2.5, paste("b =",round( coef(phd.pois)["mentor"], 3)), col="red", cex=1.5)

par(op)

### Influence plot

influencePlot(phd.pois, id = list(n=2))

##       StudRes         Hat      CookD
## 328 -3.734244 0.193356161 0.36757530
## 803 -1.034076 0.086962320 0.01490683
## 913  5.382111 0.003560164 0.04339156
## 914  5.542302 0.009995654 0.10975550
## 915  5.150435 0.035504369 0.28331178

Effect plots

library(effects)
plot(allEffects(phd.pois))

Overdispersion

mean and variance

with(PhdPubs, c(mean=mean(articles), 
                var=var(articles), 
                ratio=var(articles)/mean(articles)))

##     mean      var    ratio 
## 1.692896 3.709742 2.191358

estimate of phi = dispersion parameter

with(phd.pois, deviance/df.residual)

## [1] 1.797137

sum(residuals(phd.pois, type = "pearson")^2)/phd.pois$df.residual

## [1] 1.830368

Fit the quasi-poisson model

phd.qpois <- glm(articles ~ ., data=PhdPubs, family=quasipoisson)

Fit the negative-binomial model

phd.nbin  <- glm.nb(articles ~ ., data=PhdPubs)

Estimate of theta

summary(phd.nbin)$theta

## [1] 2.266961

## compare models

LRstats(phd.pois, phd.qpois, phd.nbin)

## Likelihood summary table:
##              AIC    BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois  3313.3 3342.3   1633.6 909     <2e-16 ***
## phd.qpois                        909               
## phd.nbin  3135.4 3169.1   1004.1 909     0.0149 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## compare standard errors

phd.SE <- sqrt(cbind(
  pois = diag(vcov(phd.pois)),
  qpois = diag(vcov(phd.qpois)),
  nbin = diag(vcov(phd.nbin))))
round(phd.SE, 4)

##               pois  qpois   nbin
## (Intercept) 0.0996 0.1348 0.1327
## female      0.0546 0.0738 0.0726
## married     0.0613 0.0829 0.0819
## kid5        0.0401 0.0543 0.0528
## phdprestige 0.0253 0.0342 0.0343
## mentor      0.0020 0.0027 0.0032

phd.coef <- cbind(
  pois = coef(phd.pois),
  qpois = coef(phd.qpois),
  nbin = coef(phd.nbin))
round(phd.coef,3)

##               pois  qpois   nbin
## (Intercept)  0.266  0.266  0.213
## female      -0.224 -0.224 -0.216
## married      0.157  0.157  0.153
## kid5        -0.185 -0.185 -0.176
## phdprestige  0.025  0.025  0.029
## mentor       0.025  0.025  0.029

round(cbind(beta = coef(phd.nbin),
            expbeta = exp(coef(phd.nbin)),
            pct = 100 * (exp(coef(phd.nbin)) - 1)), 3)

##               beta expbeta     pct
## (Intercept)  0.213   1.237  23.732
## female      -0.216   0.806 -19.447
## married      0.153   1.165  16.508
## kid5        -0.176   0.838 -16.167
## phdprestige  0.029   1.030   2.978
## mentor       0.029   1.029   2.909

zero-inflated and hurdle models

if (!require(countreg)) install.packages("countreg", repos="http://R-Forge.R-project.org")
# library(countreg)

plot zero articles

plot(factor(articles==0) ~ mentor, data=PhdPubs, 
    ylevels=2:1, ylab="Zero articles", 
    breaks=quantile(mentor, probs=seq(0,1,.2)), cex.lab=1.25)

Fit ZI and hurdle models

phd.zip <- zeroinfl(articles ~ ., data=PhdPubs, dist="poisson")
phd.znb <- zeroinfl(articles ~ ., data=PhdPubs, dist="negbin")

phd.hp  <- hurdle(articles ~ ., data=PhdPubs, dist="poisson")
phd.hnb <- hurdle(articles ~ ., data=PhdPubs, dist="negbin")

show coefficients

phd.zip

## 
## Call:
## zeroinfl(formula = articles ~ ., data = PhdPubs, dist = "poisson")
## 
## Count model coefficients (poisson with log link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##    0.599179    -0.208790     0.106230    -0.142706     0.006998     0.017846  
## 
## Zero-inflation model coefficients (binomial with logit link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##   -0.563316     0.108162    -0.355580     0.219738    -0.005371    -0.133126

phd.hp

## 
## Call:
## hurdle(formula = articles ~ ., data = PhdPubs, dist = "poisson")
## 
## Count model coefficients (truncated poisson with log link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##     0.63938     -0.22824      0.09899     -0.14173     -0.00279      0.01859  
## 
## Zero hurdle model coefficients (binomial with logit link):
## (Intercept)       female      married         kid5  phdprestige       mentor  
##     0.17844     -0.25077      0.32769     -0.28542      0.04281      0.07916

compare models

LRstats(phd.pois, phd.nbin, phd.zip, phd.znb, phd.hp, phd.hnb, 
        sortby="BIC")

## Likelihood summary table:
##             AIC    BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois 3313.3 3342.3   3301.3 909  < 2.2e-16 ***
## phd.hp   3234.5 3292.4   3210.5 903  < 2.2e-16 ***
## phd.zip  3233.5 3291.3   3209.5 903  < 2.2e-16 ***
## phd.hnb  3130.9 3193.5   3104.9 902  < 2.2e-16 ***
## phd.znb  3125.8 3188.4   3099.8 902  < 2.2e-16 ***
## phd.nbin 3135.4 3169.1   3121.4 909  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

examine the ZIP model in more detail

NB: only mentor is important in the zero model

coeftest(phd.zip)

## 
## t test of coefficients:
## 
##                     Estimate Std. Error t value  Pr(>|t|)    
## count_(Intercept)  0.5991793  0.1186144  5.0515 5.305e-07 ***
## count_female      -0.2087902  0.0635263 -3.2867  0.001053 ** 
## count_married      0.1062298  0.0709663  1.4969  0.134768    
## count_kid5        -0.1427059  0.0474362 -3.0084  0.002699 ** 
## count_phdprestige  0.0069981  0.0298118  0.2347  0.814461    
## count_mentor       0.0178457  0.0023337  7.6471 5.256e-14 ***
## zero_(Intercept)  -0.5633164  0.4940503 -1.1402  0.254505    
## zero_female        0.1081615  0.2817308  0.3839  0.701130    
## zero_married      -0.3555798  0.3179619 -1.1183  0.263732    
## zero_kid5          0.2197383  0.1965801  1.1178  0.263948    
## zero_phdprestige  -0.0053714  0.1411827 -0.0380  0.969659    
## zero_mentor       -0.1331261  0.0464314 -2.8672  0.004238 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

coeftest(phd.znb)

## 
## t test of coefficients:
## 
##                     Estimate Std. Error t value  Pr(>|t|)    
## count_(Intercept)  0.3654957  0.1398736  2.6130  0.009123 ** 
## count_female      -0.1945997  0.0757710 -2.5683  0.010381 *  
## count_married      0.1015024  0.0843342  1.2036  0.229070    
## count_kid5        -0.1516356  0.0542427 -2.7955  0.005292 ** 
## count_phdprestige  0.0156126  0.0345164  0.4523  0.651145    
## count_mentor       0.0243967  0.0034978  6.9749 5.919e-12 ***
## zero_(Intercept)  -0.2942762  1.2827207 -0.2294  0.818598    
## zero_female        0.6579945  0.8556879  0.7690  0.442115    
## zero_married      -1.4740768  0.9176084 -1.6064  0.108529    
## zero_kid5          0.6241456  0.4410421  1.4152  0.157366    
## zero_phdprestige  -0.0112609  0.2888196 -0.0390  0.968907    
## zero_mentor       -0.8764770  0.3152265 -2.7805  0.005541 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

phd.zip1 <- zeroinfl(articles ~ .| mentor, data=PhdPubs, dist="poisson")
phd.znb1 <- zeroinfl(articles ~ .| mentor, data=PhdPubs, dist="negbin")

summary(phd.zip1)

## 
## Call:
## zeroinfl(formula = articles ~ . | mentor, data = PhdPubs, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2952 -0.8713 -0.2620  0.5400  7.2426 
## 
## Count model coefficients (poisson with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.58819    0.10996   5.349 8.83e-08 ***
## female      -0.21791    0.05879  -3.707 0.000210 ***
## married      0.13597    0.06602   2.059 0.039454 *  
## kid5        -0.16262    0.04338  -3.749 0.000178 ***
## phdprestige  0.00671    0.02729   0.246 0.805737    
## mentor       0.01804    0.00229   7.878 3.32e-15 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.68649    0.20592  -3.334 0.000857 ***
## mentor      -0.13027    0.04037  -3.227 0.001250 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 16 
## Log-likelihood: -1606 on 8 Df

LRstats(phd.pois, phd.nbin, phd.zip, phd.znb, phd.hp, phd.hnb, phd.zip1, phd.znb1, 
    sortby="BIC")

## Likelihood summary table:
##             AIC    BIC LR Chisq  Df Pr(>Chisq)    
## phd.pois 3313.3 3342.3   3301.3 909  < 2.2e-16 ***
## phd.hp   3234.5 3292.4   3210.5 903  < 2.2e-16 ***
## phd.zip  3233.5 3291.3   3209.5 903  < 2.2e-16 ***
## phd.zip1 3227.5 3266.0   3211.5 907  < 2.2e-16 ***
## phd.hnb  3130.9 3193.5   3104.9 902  < 2.2e-16 ***
## phd.znb  3125.8 3188.4   3099.8 902  < 2.2e-16 ***
## phd.nbin 3135.4 3169.1   3121.4 909  < 2.2e-16 ***
## phd.znb1 3124.3 3167.7   3106.3 906  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

effect plot for important terms in the NBIN model

plot(allEffects(phd.nbin)[c(1,3,5)], rows=1, cols=3)

plot the zero model

phd.zero <- glm((articles==0) ~ mentor, data=PhdPubs, family=binomial)
plot(allEffects(phd.zero), main="Mentor effect on not publishing")