The dataset below contains data by state, including population, area in square miles, percent urban population, percent below poverty line, whether there are gun registration laws or not, and the number of homicides. The socioeconomic data are from 1990/91. The gun registration indicator is taken from a USA Today article (Tuesday, January 7, 1992, PAGE 5A).1
pop
= population of state (in 1000s of people)area
= area of state (in 1000 square miles)urban
= percent urban populationpoverty
= percent below poverty linegunreg
= whether there are gun registration laws or
nothomicides
= number of homicides in the past yearDo gun registration laws affect the rate of homicides in a state?
"GunReg" <-
structure(.Data = list(
"pop" = c(4089, 2372, 30380, 3291, 598, 13277, 1135, 2795, 11543, 5996,
4860, 9368, 4432, 5158, 6737, 635, 7760, 18058, 10939, 11961, 1004,
3560, 4953, 17349, 1770, 5018, 570, 3750, 3377, 680, 6623,
1039, 5610, 2495, 3713, 4252, 1235, 2592, 808, 1593, 1105,
1548, 1284, 3175, 2922, 703, 6286, 567, 4955, 1801, 460.),
"area" = c(52.4, 53.2, 163.7, 5.5, 0.1, 65.8, 10.9, 56.3, 57.9, 10.6,
12.4, 96.8, 86.9, 69.7, 53.8, 70.7, 8.7, 54.5, 44.8, 46.1, 1.5, 32,
42.1, 268.6, 84.9, 71.3, 656.4, 114, 104.1, 2.5, 59.4, 83.6, 36.4,
82.3, 40.4, 51.8, 35.4, 48.4, 147, 77.4, 9.4, 121.6, 110.6, 69.9,
98.4, 77.1, 42.8, 9.6, 65.5, 24.2, 97.8),
"urban" = c(60, 54, 93, 79, 100, 85, 89, 61, 85, 84, 81, 70, 71, 53,
50, 53, 89, 84, 74, 69, 86, 55, 61, 80, 87, 76, 68, 88, 82,
73, 63, 57, 65, 69, 52, 68, 45, 47, 53, 66, 51, 73, 88,
68, 71, 50, 69, 32, 66, 36, 65.),
"poverty" = c(19, 18.4, 14.2, 5.8, 19.2, 14.1, 10, 10.1, 13.3, 10.2,
9.3, 13.9, 12, 13.6, 13.2, 13.5, 9, 14.1, 11.8, 10.8, 8.2, 16.5,
16.9, 16.8, 9.8, 26.2, 11.2, 14.2, 12.1, 8.1, 16, 13.7, 14.1, 11.1,
17.4, 22, 12.5, 23.8, 15.8, 10.9, 7.1, 20.9, 10.7, 15.8, 11.3, 13.5,
10.6, 7.1, 9.2, 17.2, 10.6),
"gunreg" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.),
"homicides" = c(410, 240, 3710, 170, 489, 1300, 44, 62, 1270, 200, 540,
1020, 100, 550, 730, 11, 350, 2550, 760, 740, 38, 350, 470, 2660,
43, 220, 56, 290, 155, 32, 720, 21, 380, 150, 260, 760,
23, 370, 29, 43, 32, 160, 135, 220, 120, 9, 550, 24, 240,
135, 20.)),
names = c("pop", "area", "urban", "poverty", "gunreg", "homicides"),
row.names = c("AL", "AR", "CA", "CT", "DC", "FL", "HI", "IA", "IL", "MA",
"MD", "MI", "MN", "MO", "NC", "ND", "NJ", "NY", "OH", "PA", "RI", "SC",
"TN", "TX", "UT", "WA", "AK", "AZ", "CO", "DE", "GA", "ID", "IN", "KS",
"KY", "LA", "ME", "MS", "MT", "NE", "NH", "NM", "NV", "OK", "OR", "SD",
"VA", "VT", "WI", "WV", "WY"), class = "data.frame")
Create a “population density” variable
GunReg <- GunReg %>% mutate(pop_density = (pop*1000)/(area*1000))
Scatterplot matrix
plot(GunReg)
scatterplotMatrix(GunReg)
Create homicide rate variable:
GunReg <- GunReg %>% mutate(homicide_rate = homicides/pop)
GunReg <- data.frame(state = row.names(GunReg), GunReg)
Correlations
cor(GunReg[,c(2:8)])
## pop area urban poverty gunreg
## pop 1.00000000 0.09445559 0.38102617 0.080474205 0.433288255
## area 0.09445559 1.00000000 0.06176086 0.061633537 -0.169263080
## urban 0.38102617 0.06176086 1.00000000 -0.138967877 0.381711182
## poverty 0.08047421 0.06163354 -0.13896788 1.000000000 -0.002147141
## gunreg 0.43328825 -0.16926308 0.38171118 -0.002147141 1.000000000
## homicides 0.94933164 0.15569769 0.35523752 0.176080637 0.375505900
## pop_density -0.05754515 -0.17864052 0.37838034 0.104020415 0.230611294
## homicides pop_density
## pop 0.9493316 -0.05754515
## area 0.1556977 -0.17864052
## urban 0.3552375 0.37838034
## poverty 0.1760806 0.10402042
## gunreg 0.3755059 0.23061129
## homicides 1.0000000 0.03171880
## pop_density 0.0317188 1.00000000
GunReg %>% ggplot(aes(x = poverty, y = homicide_rate)) +
geom_point()
plot(GunReg$poverty, GunReg$homicide_rate,
xlab = "Percent poverty", ylab = "Homicide rate per 1000 people")
# Use code below to interactively identify outliers:
# identify(GunReg$poverty, GunReg$homicide_rate)
boxplot(GunReg$homicide_rate ~ GunReg$gunreg,
xlab = "Gun Registration Law (0 = No)",
ylab = "Homicide Rate per 1,000 people")
mod <- glm(homicides ~ gunreg, family = poisson,
data = GunReg, offset = log(pop))
summary(mod)
##
## Call:
## glm(formula = homicides ~ gunreg, family = poisson, data = GunReg,
## offset = log(pop))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -19.827 -8.012 -2.853 2.114 34.513
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.54925 0.01424 -179.07 <2e-16 ***
## gunreg 0.25316 0.01598 15.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 5742.1 on 50 degrees of freedom
## Residual deviance: 5478.5 on 49 degrees of freedom
## AIC: 5843.6
##
## Number of Fisher Scoring iterations: 4
# Incorrect model since no offset is included
mod.wrong <- glm(homicides ~ gunreg, family = poisson, data = GunReg)
summary(mod.wrong)
##
## Call:
## glm(formula = homicides ~ gunreg, family = poisson, data = GunReg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -36.732 -15.944 -7.440 3.592 78.027
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.28503 0.01424 371.24 <2e-16 ***
## gunreg 1.31049 0.01598 82.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 35577 on 50 degrees of freedom
## Residual deviance: 27272 on 49 degrees of freedom
## AIC: 27637
##
## Number of Fisher Scoring iterations: 5
Something strange…
GunReg$x <- rnorm(51)
mod <- glm(homicides ~ gunreg + x, family = poisson,
data = GunReg, offset = log(pop))
summary(mod)
##
## Call:
## glm(formula = homicides ~ gunreg + x, family = poisson, data = GunReg,
## offset = log(pop))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -19.851 -7.986 -2.861 2.088 34.520
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.550845 0.014326 -178.059 <2e-16 ***
## gunreg 0.253389 0.015977 15.859 <2e-16 ***
## x -0.005565 0.005533 -1.006 0.315
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 5742.1 on 50 degrees of freedom
## Residual deviance: 5477.5 on 48 degrees of freedom
## AIC: 5844.6
##
## Number of Fisher Scoring iterations: 4
Quasi-Poisson model
mod_q <- glm(homicides ~ gunreg, family = quasipoisson,
data = GunReg, offset = log(pop))
summary(mod_q)
##
## Call:
## glm(formula = homicides ~ gunreg, family = quasipoisson, data = GunReg,
## offset = log(pop))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -19.827 -8.012 -2.853 2.114 34.513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.5493 0.1695 -15.039 <2e-16 ***
## gunreg 0.2532 0.1902 1.331 0.189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 141.7833)
##
## Null deviance: 5742.1 on 50 degrees of freedom
## Residual deviance: 5478.5 on 49 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
Check calculations:
pres <- residuals(mod, type = "pearson")
phihat <- sum(pres^2)/(51 - 2)
orig_se <- summary(mod)$coefficients[,2]
corrected_se <- orig_se*sqrt(phihat)
Model comparison tests:
mod0 <- glm(homicides ~ gunreg, family = poisson,
data = GunReg, offset = log(pop))
mod0_q <- glm(homicides ~ gunreg, family = quasipoisson,
data = GunReg, offset = log(pop))
mod1 <- glm(homicides ~ gunreg + poverty + urban, family = poisson,
data = GunReg, offset = log(pop))
mod1_q <- glm(homicides ~ gunreg + poverty + urban, family = quasipoisson,
data = GunReg, offset = log(pop))
anova(mod0, mod1, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: homicides ~ gunreg
## Model 2: homicides ~ gunreg + poverty + urban
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 49 5478.5
## 2 47 3919.4 2 1559.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod0_q, mod1_q, test = "LRT")
## Analysis of Deviance Table
##
## Model 1: homicides ~ gunreg
## Model 2: homicides ~ gunreg + poverty + urban
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 49 5478.5
## 2 47 3919.4 2 1559.2 0.0001154 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same deviance, but different p-values!
dev <- anova(mod0, mod1, test = "LRT")$Deviance[2]
# p-values
pchisq(dev, df = 2, lower.tail=FALSE) # p-value in original LRT
## [1] 0
pchisq(dev/phihat, df = 2, lower.tail=FALSE) # corrected p-value...?
## [1] 0.004117905
Other way to deal with overdispersion: negative binomial regression
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
mod_nb <- glm.nb(homicides ~ gunreg + poverty + urban + offset(log(pop)),
data = GunReg)
summary(mod_nb)
##
## Call:
## glm.nb(formula = homicides ~ gunreg + poverty + urban + offset(log(pop)),
## data = GunReg, init.theta = 3.418144423, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.78734 -0.84712 -0.06245 0.28497 2.73334
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.262017 0.478178 -11.004 < 2e-16 ***
## gunreg 0.158352 0.166324 0.952 0.34106
## poverty 0.100595 0.018321 5.491 4.01e-08 ***
## urban 0.017749 0.005608 3.165 0.00155 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(3.4181) family taken to be 1)
##
## Null deviance: 106.179 on 50 degrees of freedom
## Residual deviance: 53.877 on 47 degrees of freedom
## AIC: 634.09
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 3.418
## Std. Err.: 0.677
##
## 2 x log-likelihood: -624.088
Original source: http://people.reed.edu/~jones/141/Guns.html↩︎