Last data update: 2014.03.03
R: Data and Examples from Winkelmann and Boes (2009)
WinkelmannBoes2009 R Documentation
Data and Examples from Winkelmann and Boes (2009)
Description
This manual page collects a list of examples from the book. Some
solutions might not be exact and the list is not complete.
If you have suggestions for improvement (preferably in the form of code),
please contact the package maintainer.
References
Winkelmann, R., and Boes, S. (2009). Analysis of Microdata , 2nd ed. Berlin and Heidelberg: Springer-Verlag.
See Also
GSS7402
, GSOEP9402
, PSID1976
Examples
#########################################
## US General Social Survey 1974--2002 ##
#########################################
## data
data("GSS7402", package = "AER")
## completed fertility subset
gss40 <- subset(GSS7402, age >= 40)
## Chapter 1
## Table 1.1
gss_kids <- table(gss40$kids)
cbind(absolute = gss_kids,
relative = round(prop.table(gss_kids) * 100, digits = 2))
## Table 1.2
sd1 <- function(x) sd(x) / sqrt(length(x))
with(gss40, round(cbind(
"obs" = tapply(kids, year, length),
"av kids" = tapply(kids, year, mean),
" " = tapply(kids, year, sd1),
"prop childless" = tapply(kids, year, function(x) mean(x <= 0)),
" " = tapply(kids, year, function(x) sd1(x <= 0)),
"av schooling" = tapply(education, year, mean),
" " = tapply(education, year, sd1)
), digits = 2))
## Table 1.3
gss40$trend <- gss40$year - 1974
kids_lm1 <- lm(kids ~ factor(year), data = gss40)
kids_lm2 <- lm(kids ~ trend, data = gss40)
kids_lm3 <- lm(kids ~ trend + education, data = gss40)
## Chapter 2
## Table 2.1
kids_tab <- prop.table(xtabs(~ kids + year, data = gss40), 2) * 100
round(kids_tab[,c(4, 8)], digits = 2)
## Figure 2.1
barplot(t(kids_tab[, c(4, 8)]), beside = TRUE, legend = TRUE)
## Chapter 3, Example 3.14
## Table 3.1
gss40$nokids <- factor(gss40$kids <= 0,
levels = c(FALSE, TRUE), labels = c("no", "yes"))
nokids_p1 <- glm(nokids ~ 1, data = gss40, family = binomial(link = "probit"))
nokids_p2 <- glm(nokids ~ trend, data = gss40, family = binomial(link = "probit"))
nokids_p3 <- glm(nokids ~ trend + education + ethnicity + siblings,
data = gss40, family = binomial(link = "probit"))
## p. 87
lrtest(nokids_p1, nokids_p2, nokids_p3)
## Chapter 4, Example 4.1
gss40$nokids01 <- as.numeric(gss40$nokids) - 1
nokids_lm3 <- lm(nokids01 ~ trend + education + ethnicity + siblings, data = gss40)
coeftest(nokids_lm3, vcov = sandwich)
## Example 4.3
## Table 4.1
nokids_l1 <- glm(nokids ~ 1, data = gss40, family = binomial(link = "logit"))
nokids_l3 <- glm(nokids ~ trend + education + ethnicity + siblings,
data = gss40, family = binomial(link = "logit"))
lrtest(nokids_p3)
lrtest(nokids_l3)
## Table 4.2
nokids_xbar <- colMeans(model.matrix(nokids_l3))
sum(coef(nokids_p3) * nokids_xbar)
sum(coef(nokids_l3) * nokids_xbar)
dnorm(sum(coef(nokids_p3) * nokids_xbar))
dlogis(sum(coef(nokids_l3) * nokids_xbar))
dnorm(sum(coef(nokids_p3) * nokids_xbar)) * coef(nokids_p3)[3]
dlogis(sum(coef(nokids_l3) * nokids_xbar)) * coef(nokids_l3)[3]
exp(coef(nokids_l3)[3])
## Figure 4.4
## everything by hand (for ethnicity = "cauc" group)
nokids_xbar <- as.vector(nokids_xbar)
nokids_nd <- data.frame(education = seq(0, 20, by = 0.5), trend = nokids_xbar[2],
ethnicity = "cauc", siblings = nokids_xbar[4])
nokids_p3_fit <- predict(nokids_p3, newdata = nokids_nd,
type = "response", se.fit = TRUE)
plot(nokids_nd$education, nokids_p3_fit$fit, type = "l",
xlab = "education", ylab = "predicted probability", ylim = c(0, 0.3))
polygon(c(nokids_nd$education, rev(nokids_nd$education)),
c(nokids_p3_fit$fit + 1.96 * nokids_p3_fit$se.fit,
rev(nokids_p3_fit$fit - 1.96 * nokids_p3_fit$se.fit)),
col = "lightgray", border = "lightgray")
lines(nokids_nd$education, nokids_p3_fit$fit)
## using "effects" package (for average "ethnicity" variable)
library("effects")
nokids_p3_ef <- effect("education", nokids_p3, xlevels = list(education = 0:20))
plot(nokids_p3_ef, rescale.axis = FALSE, ylim = c(0, 0.3))
## using "effects" plus modification by hand
nokids_p3_ef1 <- as.data.frame(nokids_p3_ef)
plot(pnorm(fit) ~ education, data = nokids_p3_ef1, type = "n", ylim = c(0, 0.3))
polygon(c(0:20, 20:0), pnorm(c(nokids_p3_ef1$upper, rev(nokids_p3_ef1$lower))),
col = "lightgray", border = "lightgray")
lines(pnorm(fit) ~ education, data = nokids_p3_ef1)
## Table 4.6
## McFadden's R^2
1 - as.numeric( logLik(nokids_p3) / logLik(nokids_p1) )
1 - as.numeric( logLik(nokids_l3) / logLik(nokids_l1) )
## McKelvey and Zavoina R^2
r2mz <- function(obj) {
ystar <- predict(obj)
sse <- sum((ystar - mean(ystar))^2)
s2 <- switch(obj$family$link, "probit" = 1, "logit" = pi^2/3, NA)
n <- length(residuals(obj))
sse / (n * s2 + sse)
}
r2mz(nokids_p3)
r2mz(nokids_l3)
## AUC
library("ROCR")
nokids_p3_pred <- prediction(fitted(nokids_p3), gss40$nokids)
nokids_l3_pred <- prediction(fitted(nokids_l3), gss40$nokids)
plot(performance(nokids_p3_pred, "tpr", "fpr"))
abline(0, 1, lty = 2)
performance(nokids_p3_pred, "auc")
plot(performance(nokids_l3_pred, "tpr", "fpr"))
abline(0, 1, lty = 2)
performance(nokids_l3_pred, "auc")@y.values
## Chapter 7
## Table 7.3
## subset selection
gss02 <- subset(GSS7402, year == 2002 & (age < 40 | !is.na(agefirstbirth)))
#Z# This selection conforms with top of page 229. However, there
#Z# are too many observations: 1374. Furthermore, there are six
#Z# observations with agefirstbirth <= 14 which will cause problems in
#Z# taking logs!
## computing time to first birth
gss02$tfb <- with(gss02, ifelse(is.na(agefirstbirth), age - 14, agefirstbirth - 14))
#Z# currently this is still needed before taking logs
gss02$tfb <- pmax(gss02$tfb, 1)
tfb_tobit <- tobit(log(tfb) ~ education + ethnicity + siblings + city16 + immigrant,
data = gss02, left = -Inf, right = log(gss02$age - 14))
tfb_ols <- lm(log(tfb) ~ education + ethnicity + siblings + city16 + immigrant,
data = gss02, subset = !is.na(agefirstbirth))
## Chapter 8
## Example 8.3
gss2002 <- subset(GSS7402, year == 2002 & (agefirstbirth < 40 | age < 40))
gss2002$afb <- with(gss2002, Surv(ifelse(kids > 0, agefirstbirth, age), kids > 0))
afb_km <- survfit(afb ~ 1, data = gss2002)
afb_skm <- summary(afb_km)
print(afb_skm)
with(afb_skm, plot(n.event/n.risk ~ time, type = "s"))
plot(afb_km, xlim = c(10, 40), conf.int = FALSE)
## Example 8.9
library("survival")
afb_ex <- survreg(
afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
data = gss2002, dist = "exponential")
afb_wb <- survreg(
afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
data = gss2002, dist = "weibull")
afb_ln <- survreg(
afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
data = gss2002, dist = "lognormal")
## Example 8.11
kids_pois <- glm(kids ~ education + trend + ethnicity + immigrant + lowincome16 + city16,
data = gss40, family = poisson)
library("MASS")
kids_nb <- glm.nb(kids ~ education + trend + ethnicity + immigrant + lowincome16 + city16,
data = gss40)
lrtest(kids_pois, kids_nb)
############################################
## German Socio-Economic Panel 1994--2002 ##
############################################
## data
data("GSOEP9402", package = "AER")
## some convenience data transformations
gsoep <- GSOEP9402
gsoep$meducation2 <- cut(gsoep$meducation, breaks = c(6, 10.25, 12.25, 18),
labels = c("7-10", "10.5-12", "12.5-18"))
gsoep$year2 <- factor(gsoep$year)
## Chapter 1
## Table 1.4 plus visualizations
gsoep_tab <- xtabs(~ meducation2 + school, data = gsoep)
round(prop.table(gsoep_tab, 1) * 100, digits = 2)
spineplot(gsoep_tab)
plot(school ~ meducation, data = gsoep, breaks = c(7, 10.25, 12.25, 18))
plot(school ~ meducation, data = gsoep, breaks = c(7, 9, 10.5, 11.5, 12.5, 15, 18))
## Chapter 5
## Table 5.1
library("nnet")
gsoep_mnl <- multinom(
school ~ meducation + memployment + log(income) + log(size) + parity + year2,
data = gsoep)
coeftest(gsoep_mnl)[c(1:6, 1:6 + 14),]
## alternatively
if(require("mlogit")) {
gsoep_mnl2 <- mlogit(school ~ 0 | meducation + memployment + log(income) +
log(size) + parity + year2, data = gsoep, shape = "wide", reflevel = "Hauptschule")
coeftest(gsoep_mnl2)[1:12,]
}
## Table 5.2
library("effects")
gsoep_eff <- effect("meducation", gsoep_mnl,
xlevels = list(meducation = sort(unique(gsoep$meducation))))
gsoep_eff$prob
plot(gsoep_eff, confint = FALSE)
## Table 5.3, odds
exp(coef(gsoep_mnl)[, "meducation"])
## all effects
eff_mnl <- allEffects(gsoep_mnl)
plot(eff_mnl, ask = FALSE, confint = FALSE)
plot(eff_mnl, ask = FALSE, style = "stacked", colors = gray.colors(3))
## omit year
gsoep_mnl1 <- multinom(
school ~ meducation + memployment + log(income) + log(size) + parity,
data = gsoep)
lrtest(gsoep_mnl, gsoep_mnl1)
eff_mnl1 <- allEffects(gsoep_mnl1)
plot(eff_mnl1, ask = FALSE, confint = FALSE)
plot(eff_mnl1, ask = FALSE, style = "stacked", colors = gray.colors(3))
## Chapter 6
## Table 6.1
library("MASS")
gsoep$munemp <- factor(gsoep$memployment != "none",
levels = c(FALSE, TRUE), labels = c("no", "yes"))
gsoep_pop <- polr(school ~ meducation + munemp + log(income) + log(size) + parity + year2,
data = gsoep, method = "probit", Hess = TRUE)
gsoep_pol <- polr(school ~ meducation + munemp + log(income) + log(size) + parity + year2,
data = gsoep, Hess = TRUE)
lrtest(gsoep_pop)
lrtest(gsoep_pol)
## Table 6.2
## todo
eff_pol <- allEffects(gsoep_pol)
plot(eff_pol, ask = FALSE, confint = FALSE)
plot(eff_pol, ask = FALSE, style = "stacked", colors = gray.colors(3))
####################################
## Labor Force Participation Data ##
####################################
## Mroz data
data("PSID1976", package = "AER")
PSID1976$nwincome <- with(PSID1976, (fincome - hours * wage)/1000)
## visualizations
plot(hours ~ nwincome, data = PSID1976,
xlab = "Non-wife income (in USD 1000)",
ylab = "Hours of work in 1975")
plot(jitter(hours, 200) ~ jitter(wage, 50), data = PSID1976,
xlab = "Wife's average hourly wage (jittered)",
ylab = "Hours of work in 1975 (jittered)")
## Chapter 1, p. 18
hours_lm <- lm(hours ~ wage + nwincome + youngkids + oldkids, data = PSID1976,
subset = participation == "yes")
## Chapter 7
## Example 7.2, Table 7.1
hours_tobit <- tobit(hours ~ nwincome + education + experience + I(experience^2) +
age + youngkids + oldkids, data = PSID1976)
hours_ols1 <- lm(hours ~ nwincome + education + experience + I(experience^2) +
age + youngkids + oldkids, data = PSID1976)
hours_ols2 <- lm(hours ~ nwincome + education + experience + I(experience^2) +
age + youngkids + oldkids, data = PSID1976, subset = participation == "yes")
## Example 7.10, Table 7.4
wage_ols <- lm(log(wage) ~ education + experience + I(experience^2),
data = PSID1976, subset = participation == "yes")
library("sampleSelection")
wage_ghr <- selection(participation ~ nwincome + age + youngkids + oldkids +
education + experience + I(experience^2),
log(wage) ~ education + experience + I(experience^2), data = PSID1976)
## Exercise 7.13
hours_cragg1 <- glm(participation ~ nwincome + education +
experience + I(experience^2) + age + youngkids + oldkids,
data = PSID1976, family = binomial(link = "probit"))
library("truncreg")
hours_cragg2 <- truncreg(hours ~ nwincome + education +
experience + I(experience^2) + age + youngkids + oldkids,
data = PSID1976, subset = participation == "yes")
## Exercise 7.15
wage_olscoef <- sapply(c(-Inf, 0.5, 1, 1.5, 2), function(censpoint)
coef(lm(log(wage) ~ education + experience + I(experience^2),
data = PSID1976[log(PSID1976$wage) > censpoint,])))
wage_mlcoef <- sapply(c(0.5, 1, 1.5, 2), function(censpoint)
coef(tobit(log(wage) ~ education + experience + I(experience^2),
data = PSID1976, left = censpoint)))
##################################
## Choice of Brand for Crackers ##
##################################
## data
if(require("mlogit")) {
data("Cracker", package = "mlogit")
head(Cracker, 3)
crack <- mlogit.data(Cracker, varying = 2:13, shape = "wide", choice = "choice")
head(crack, 12)
## Table 5.6 (model 3 probably not fully converged in W&B)
crack$price <- crack$price/100
crack_mlogit1 <- mlogit(choice ~ price | 0, data = crack, reflevel = "private")
crack_mlogit2 <- mlogit(choice ~ price | 1, data = crack, reflevel = "private")
crack_mlogit3 <- mlogit(choice ~ price + feat + disp | 1, data = crack,
reflevel = "private")
lrtest(crack_mlogit1, crack_mlogit2, crack_mlogit3)
## IIA test
crack_mlogit_all <- update(crack_mlogit2, reflevel = "nabisco")
crack_mlogit_res <- update(crack_mlogit_all,
alt.subset = c("keebler", "nabisco", "sunshine"))
hmftest(crack_mlogit_all, crack_mlogit_res)
}
Results
R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> library(AER)
Loading required package: car
Loading required package: lmtest
Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
Loading required package: sandwich
Loading required package: survival
> png(filename="/home/ddbj/snapshot/RGM3/R_CC/result/AER/WinkelmannBoes2009.Rd_%03d_medium.png", width=480, height=480)
> ### Name: WinkelmannBoes2009
> ### Title: Data and Examples from Winkelmann and Boes (2009)
> ### Aliases: WinkelmannBoes2009
> ### Keywords: datasets
>
> ### ** Examples
>
> #########################################
> ## US General Social Survey 1974--2002 ##
> #########################################
>
> ## data
> data("GSS7402", package = "AER")
>
> ## completed fertility subset
> gss40 <- subset(GSS7402, age >= 40)
>
>
> ## Chapter 1
> ## Table 1.1
> gss_kids <- table(gss40$kids)
> cbind(absolute = gss_kids,
+ relative = round(prop.table(gss_kids) * 100, digits = 2))
absolute relative
0 744 14.45
1 706 13.71
2 1368 26.56
3 1002 19.46
4 593 11.51
5 309 6.00
6 190 3.69
7 89 1.73
8 149 2.89
>
> ## Table 1.2
> sd1 <- function(x) sd(x) / sqrt(length(x))
> with(gss40, round(cbind(
+ "obs" = tapply(kids, year, length),
+ "av kids" = tapply(kids, year, mean),
+ " " = tapply(kids, year, sd1),
+ "prop childless" = tapply(kids, year, function(x) mean(x <= 0)),
+ " " = tapply(kids, year, function(x) sd1(x <= 0)),
+ "av schooling" = tapply(education, year, mean),
+ " " = tapply(education, year, sd1)
+ ), digits = 2))
obs av kids prop childless av schooling
1974 410 3.17 0.10 0.09 0.01 11.07 0.16
1978 445 2.73 0.09 0.14 0.02 11.00 0.15
1982 577 2.96 0.09 0.14 0.01 11.05 0.14
1986 470 2.70 0.09 0.16 0.02 11.34 0.14
1990 431 2.50 0.08 0.15 0.02 12.41 0.15
1994 989 2.40 0.06 0.15 0.01 12.78 0.10
1998 911 2.42 0.06 0.15 0.01 12.94 0.11
2002 917 2.36 0.06 0.16 0.01 13.25 0.10
>
> ## Table 1.3
> gss40$trend <- gss40$year - 1974
> kids_lm1 <- lm(kids ~ factor(year), data = gss40)
> kids_lm2 <- lm(kids ~ trend, data = gss40)
> kids_lm3 <- lm(kids ~ trend + education, data = gss40)
>
>
> ## Chapter 2
> ## Table 2.1
> kids_tab <- prop.table(xtabs(~ kids + year, data = gss40), 2) * 100
> round(kids_tab[,c(4, 8)], digits = 2)
year
kids 1986 2002
0 16.38 15.59
1 11.28 13.96
2 26.60 28.68
3 15.74 22.14
4 11.49 9.92
5 7.02 3.93
6 6.38 2.62
7 1.70 1.64
8 3.40 1.53
> ## Figure 2.1
> barplot(t(kids_tab[, c(4, 8)]), beside = TRUE, legend = TRUE)
>
>
> ## Chapter 3, Example 3.14
> ## Table 3.1
> gss40$nokids <- factor(gss40$kids <= 0,
+ levels = c(FALSE, TRUE), labels = c("no", "yes"))
> nokids_p1 <- glm(nokids ~ 1, data = gss40, family = binomial(link = "probit"))
> nokids_p2 <- glm(nokids ~ trend, data = gss40, family = binomial(link = "probit"))
> nokids_p3 <- glm(nokids ~ trend + education + ethnicity + siblings,
+ data = gss40, family = binomial(link = "probit"))
>
> ## p. 87
> lrtest(nokids_p1, nokids_p2, nokids_p3)
Likelihood ratio test
Model 1: nokids ~ 1
Model 2: nokids ~ trend
Model 3: nokids ~ trend + education + ethnicity + siblings
#Df LogLik Df Chisq Pr(>Chisq)
1 1 -2126.9
2 2 -2123.6 1 6.5677 0.01038 *
3 5 -2107.1 3 32.9906 3.235e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>
> ## Chapter 4, Example 4.1
> gss40$nokids01 <- as.numeric(gss40$nokids) - 1
> nokids_lm3 <- lm(nokids01 ~ trend + education + ethnicity + siblings, data = gss40)
> coeftest(nokids_lm3, vcov = sandwich)
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.03972864 0.02898580 1.3706 0.1706
trend 0.00060242 0.00053829 1.1191 0.2631
education 0.00757341 0.00187637 4.0362 5.511e-05 ***
ethnicitycauc 0.01424775 0.01314836 1.0836 0.2786
siblings -0.00235388 0.00153770 -1.5308 0.1259
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>
> ## Example 4.3
> ## Table 4.1
> nokids_l1 <- glm(nokids ~ 1, data = gss40, family = binomial(link = "logit"))
> nokids_l3 <- glm(nokids ~ trend + education + ethnicity + siblings,
+ data = gss40, family = binomial(link = "logit"))
> lrtest(nokids_p3)
Likelihood ratio test
Model 1: nokids ~ trend + education + ethnicity + siblings
Model 2: nokids ~ 1
#Df LogLik Df Chisq Pr(>Chisq)
1 5 -2107.1
2 1 -2126.9 -4 39.558 5.341e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> lrtest(nokids_l3)
Likelihood ratio test
Model 1: nokids ~ trend + education + ethnicity + siblings
Model 2: nokids ~ 1
#Df LogLik Df Chisq Pr(>Chisq)
1 5 -2106.0
2 1 -2126.9 -4 41.859 1.784e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>
> ## Table 4.2
> nokids_xbar <- colMeans(model.matrix(nokids_l3))
> sum(coef(nokids_p3) * nokids_xbar)
[1] -1.069858
> sum(coef(nokids_l3) * nokids_xbar)
[1] -1.802965
> dnorm(sum(coef(nokids_p3) * nokids_xbar))
[1] 0.2250941
> dlogis(sum(coef(nokids_l3) * nokids_xbar))
[1] 0.1214709
> dnorm(sum(coef(nokids_p3) * nokids_xbar)) * coef(nokids_p3)[3]
education
0.007072723
> dlogis(sum(coef(nokids_l3) * nokids_xbar)) * coef(nokids_l3)[3]
education
0.007658209
> exp(coef(nokids_l3)[3])
education
1.065075
>
> ## Figure 4.4
> ## everything by hand (for ethnicity = "cauc" group)
> nokids_xbar <- as.vector(nokids_xbar)
> nokids_nd <- data.frame(education = seq(0, 20, by = 0.5), trend = nokids_xbar[2],
+ ethnicity = "cauc", siblings = nokids_xbar[4])
> nokids_p3_fit <- predict(nokids_p3, newdata = nokids_nd,
+ type = "response", se.fit = TRUE)
> plot(nokids_nd$education, nokids_p3_fit$fit, type = "l",
+ xlab = "education", ylab = "predicted probability", ylim = c(0, 0.3))
> polygon(c(nokids_nd$education, rev(nokids_nd$education)),
+ c(nokids_p3_fit$fit + 1.96 * nokids_p3_fit$se.fit,
+ rev(nokids_p3_fit$fit - 1.96 * nokids_p3_fit$se.fit)),
+ col = "lightgray", border = "lightgray")
> lines(nokids_nd$education, nokids_p3_fit$fit)
>
> ## using "effects" package (for average "ethnicity" variable)
> library("effects")
Attaching package: 'effects'
The following object is masked from 'package:car':
Prestige
> nokids_p3_ef <- effect("education", nokids_p3, xlevels = list(education = 0:20))
> plot(nokids_p3_ef, rescale.axis = FALSE, ylim = c(0, 0.3))
NOTE: the rescale.axis argument is deprecated; use type instead
>
> ## using "effects" plus modification by hand
> nokids_p3_ef1 <- as.data.frame(nokids_p3_ef)
> plot(pnorm(fit) ~ education, data = nokids_p3_ef1, type = "n", ylim = c(0, 0.3))
> polygon(c(0:20, 20:0), pnorm(c(nokids_p3_ef1$upper, rev(nokids_p3_ef1$lower))),
+ col = "lightgray", border = "lightgray")
> lines(pnorm(fit) ~ education, data = nokids_p3_ef1)
>
> ## Table 4.6
> ## McFadden's R^2
> 1 - as.numeric( logLik(nokids_p3) / logLik(nokids_p1) )
[1] 0.009299554
> 1 - as.numeric( logLik(nokids_l3) / logLik(nokids_l1) )
[1] 0.009840523
> ## McKelvey and Zavoina R^2
> r2mz <- function(obj) {
+ ystar <- predict(obj)
+ sse <- sum((ystar - mean(ystar))^2)
+ s2 <- switch(obj$family$link, "probit" = 1, "logit" = pi^2/3, NA)
+ n <- length(residuals(obj))
+ sse / (n * s2 + sse)
+ }
> r2mz(nokids_p3)
[1] 0.01784602
> r2mz(nokids_l3)
[1] 0.02076783
> ## AUC
> library("ROCR")
Loading required package: gplots
Attaching package: 'gplots'
The following object is masked from 'package:stats':
lowess
> nokids_p3_pred <- prediction(fitted(nokids_p3), gss40$nokids)
> nokids_l3_pred <- prediction(fitted(nokids_l3), gss40$nokids)
> plot(performance(nokids_p3_pred, "tpr", "fpr"))
> abline(0, 1, lty = 2)
> performance(nokids_p3_pred, "auc")
An object of class "performance"
Slot "x.name":
[1] "None"
Slot "y.name":
[1] "Area under the ROC curve"
Slot "alpha.name":
[1] "none"
Slot "x.values":
list()
Slot "y.values":
[[1]]
[1] 0.5828144
Slot "alpha.values":
list()
> plot(performance(nokids_l3_pred, "tpr", "fpr"))
> abline(0, 1, lty = 2)
> performance(nokids_l3_pred, "auc")@y.values
[[1]]
[1] 0.5828855
>
> ## Chapter 7
> ## Table 7.3
> ## subset selection
> gss02 <- subset(GSS7402, year == 2002 & (age < 40 | !is.na(agefirstbirth)))
> #Z# This selection conforms with top of page 229. However, there
> #Z# are too many observations: 1374. Furthermore, there are six
> #Z# observations with agefirstbirth <= 14 which will cause problems in
> #Z# taking logs!
>
> ## computing time to first birth
> gss02$tfb <- with(gss02, ifelse(is.na(agefirstbirth), age - 14, agefirstbirth - 14))
> #Z# currently this is still needed before taking logs
> gss02$tfb <- pmax(gss02$tfb, 1)
>
> tfb_tobit <- tobit(log(tfb) ~ education + ethnicity + siblings + city16 + immigrant,
+ data = gss02, left = -Inf, right = log(gss02$age - 14))
> tfb_ols <- lm(log(tfb) ~ education + ethnicity + siblings + city16 + immigrant,
+ data = gss02, subset = !is.na(agefirstbirth))
>
> ## Chapter 8
> ## Example 8.3
> gss2002 <- subset(GSS7402, year == 2002 & (agefirstbirth < 40 | age < 40))
> gss2002$afb <- with(gss2002, Surv(ifelse(kids > 0, agefirstbirth, age), kids > 0))
> afb_km <- survfit(afb ~ 1, data = gss2002)
> afb_skm <- summary(afb_km)
> print(afb_skm)
Call: survfit(formula = afb ~ 1, data = gss2002)
time n.risk n.event survival std.err lower 95% CI upper 95% CI
11 1371 1 0.9993 0.000729 0.9978 1.0000
13 1370 3 0.9971 0.001457 0.9942 0.9999
14 1367 2 0.9956 0.001783 0.9921 0.9991
15 1365 14 0.9854 0.003238 0.9791 0.9918
16 1351 40 0.9562 0.005525 0.9455 0.9671
17 1311 66 0.9081 0.007802 0.8929 0.9235
18 1245 97 0.8373 0.009967 0.8180 0.8571
19 1148 106 0.7600 0.011534 0.7378 0.7830
20 1030 122 0.6700 0.012726 0.6455 0.6954
21 898 123 0.5782 0.013406 0.5525 0.6051
22 767 97 0.5051 0.013612 0.4791 0.5325
23 654 72 0.4495 0.013600 0.4236 0.4770
24 563 61 0.4008 0.013480 0.3752 0.4281
25 484 60 0.3511 0.013248 0.3261 0.3781
26 407 45 0.3123 0.012986 0.2878 0.3388
27 351 45 0.2723 0.012618 0.2486 0.2981
28 294 37 0.2380 0.012223 0.2152 0.2632
29 246 32 0.2070 0.011795 0.1852 0.2315
30 209 38 0.1694 0.011119 0.1489 0.1926
31 163 18 0.1507 0.010730 0.1311 0.1733
32 135 19 0.1295 0.010264 0.1108 0.1512
33 108 17 0.1091 0.009766 0.0915 0.1300
34 83 7 0.0999 0.009542 0.0828 0.1205
35 65 13 0.0799 0.009101 0.0639 0.0999
36 45 7 0.0675 0.008815 0.0522 0.0872
37 29 7 0.0512 0.008572 0.0369 0.0711
38 17 3 0.0422 0.008499 0.0284 0.0626
39 12 2 0.0351 0.008411 0.0220 0.0562
> with(afb_skm, plot(n.event/n.risk ~ time, type = "s"))
> plot(afb_km, xlim = c(10, 40), conf.int = FALSE)
>
> ## Example 8.9
> library("survival")
> afb_ex <- survreg(
+ afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
+ data = gss2002, dist = "exponential")
> afb_wb <- survreg(
+ afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
+ data = gss2002, dist = "weibull")
> afb_ln <- survreg(
+ afb ~ education + siblings + ethnicity + immigrant + lowincome16 + city16,
+ data = gss2002, dist = "lognormal")
>
> ## Example 8.11
> kids_pois <- glm(kids ~ education + trend + ethnicity + immigrant + lowincome16 + city16,
+ data = gss40, family = poisson)
> library("MASS")
> kids_nb <- glm.nb(kids ~ education + trend + ethnicity + immigrant + lowincome16 + city16,
+ data = gss40)
> lrtest(kids_pois, kids_nb)
Likelihood ratio test
Model 1: kids ~ education + trend + ethnicity + immigrant + lowincome16 +
city16
Model 2: kids ~ education + trend + ethnicity + immigrant + lowincome16 +
city16
#Df LogLik Df Chisq Pr(>Chisq)
1 7 -10117
2 8 -10014 1 205.17 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>
>
> ############################################
> ## German Socio-Economic Panel 1994--2002 ##
> ############################################
>
> ## data
> data("GSOEP9402", package = "AER")
>
> ## some convenience data transformations
> gsoep <- GSOEP9402
> gsoep$meducation2 <- cut(gsoep$meducation, breaks = c(6, 10.25, 12.25, 18),
+ labels = c("7-10", "10.5-12", "12.5-18"))
> gsoep$year2 <- factor(gsoep$year)
>
> ## Chapter 1
> ## Table 1.4 plus visualizations
> gsoep_tab <- xtabs(~ meducation2 + school, data = gsoep)
> round(prop.table(gsoep_tab, 1) * 100, digits = 2)
school
meducation2 Hauptschule Realschule Gymnasium
7-10 55.12 25.20 19.69
10.5-12 28.09 34.16 37.75
12.5-18 3.88 14.56 81.55
> spineplot(gsoep_tab)
> plot(school ~ meducation, data = gsoep, breaks = c(7, 10.25, 12.25, 18))
> plot(school ~ meducation, data = gsoep, breaks = c(7, 9, 10.5, 11.5, 12.5, 15, 18))
>
>
> ## Chapter 5
> ## Table 5.1
> library("nnet")
> gsoep_mnl <- multinom(
+ school ~ meducation + memployment + log(income) + log(size) + parity + year2,
+ data = gsoep)
# weights: 48 (30 variable)
initial value 741.563295
iter 10 value 655.748279
iter 20 value 624.992858
iter 30 value 618.605354
final value 618.475696
converged
> coeftest(gsoep_mnl)[c(1:6, 1:6 + 14),]
Estimate Std. Error z value Pr(>|z|)
Realschule:(Intercept) -6.3864877 2.36903996 -2.6958126 7.021716e-03
Realschule:meducation 0.3004843 0.07910641 3.7984819 1.455851e-04
Realschule:memploymentparttime 0.4933680 0.32189721 1.5326879 1.253528e-01
Realschule:memploymentnone 0.7526399 0.32884476 2.2887392 2.209451e-02
Realschule:log(income) 0.3934871 0.22539836 1.7457408 8.085601e-02
Realschule:log(size) -1.1921790 0.44641156 -2.6705827 7.571972e-03
Realschule:year22002 0.1922413 0.45158350 0.4257049 6.703229e-01
Gymnasium:(Intercept) -23.6975758 3.01022807 -7.8723523 3.480345e-15
Gymnasium:meducation 0.6597649 0.08144034 8.1012060 5.441700e-16
Gymnasium:memploymentparttime 0.9372429 0.34536421 2.7137813 6.652007e-03
Gymnasium:memploymentnone 1.1007579 0.35842760 3.0710746 2.132898e-03
Gymnasium:log(income) 1.6676745 0.28408439 5.8703492 4.348783e-09
>
> ## alternatively
> if(require("mlogit")) {
+ gsoep_mnl2 <- mlogit(school ~ 0 | meducation + memployment + log(income) +
+ log(size) + parity + year2, data = gsoep, shape = "wide", reflevel = "Hauptschule")
+ coeftest(gsoep_mnl2)[1:12,]
+ }
Loading required package: mlogit
Loading required package: Formula
Loading required package: maxLik
Loading required package: miscTools
Please cite the 'maxLik' package as:
Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
https://r-forge.r-project.org/projects/maxlik/
Estimate Std. Error t value Pr(>|t|)
Gymnasium:(intercept) -23.6982768 3.01026604 -7.872486 1.475202e-14
Realschule:(intercept) -6.3865987 2.36904833 -2.695850 7.204061e-03
Gymnasium:meducation 0.6597829 0.08144157 8.101304 2.726719e-15
Realschule:meducation 0.3004923 0.07910725 3.798543 1.593085e-04
Gymnasium:memploymentparttime 0.9372401 0.34536576 2.713761 6.830145e-03
Realschule:memploymentparttime 0.4933644 0.32189760 1.532675 1.258463e-01
Gymnasium:memploymentnone 1.1007670 0.35842942 3.071084 2.222541e-03
Realschule:memploymentnone 0.7526490 0.32884523 2.288764 2.241551e-02
Gymnasium:log(income) 1.6677258 0.28408738 5.870468 6.954975e-09
Realschule:log(income) 0.3934899 0.22539876 1.745750 8.133056e-02
Gymnasium:log(size) -1.5459256 0.48775919 -3.169444 1.599570e-03
Realschule:log(size) -1.1921835 0.44641174 -2.670592 7.762668e-03
>
> ## Table 5.2
> library("effects")
> gsoep_eff <- effect("meducation", gsoep_mnl,
+ xlevels = list(meducation = sort(unique(gsoep$meducation))))
> gsoep_eff$prob
prob.Hauptschule prob.Realschule prob.Gymnasium
[1,] 0.686724467 0.24514452 0.06813102
[2,] 0.494486442 0.32195219 0.18356137
[3,] 0.385007121 0.33853566 0.27645721
[4,] 0.331068546 0.33830072 0.33063074
[5,] 0.279605513 0.33203209 0.38836239
[6,] 0.231922018 0.32005576 0.44802222
[7,] 0.189019319 0.30313717 0.50784351
[8,] 0.119575666 0.25898492 0.62143941
[9,] 0.093066080 0.23424613 0.67268779
[10,] 0.071541719 0.20926168 0.71919660
[11,] 0.054404764 0.18493393 0.76066131
[12,] 0.040990572 0.16192466 0.79708477
[13,] 0.022753542 0.12138826 0.85585820
[14,] 0.006601958 0.06423885 0.92915919
> plot(gsoep_eff, confint = FALSE)
>
> ## Table 5.3, odds
> exp(coef(gsoep_mnl)[, "meducation"])
Realschule Gymnasium
1.350513 1.934338
>
> ## all effects
> eff_mnl <- allEffects(gsoep_mnl)
> plot(eff_mnl, ask = FALSE, confint = FALSE)
> plot(eff_mnl, ask = FALSE, style = "stacked", colors = gray.colors(3))
>
> ## omit year
> gsoep_mnl1 <- multinom(
+ school ~ meducation + memployment + log(income) + log(size) + parity,
+ data = gsoep)
# weights: 24 (14 variable)
initial value 741.563295
iter 10 value 658.442291
iter 20 value 624.980518
final value 624.957624
converged
> lrtest(gsoep_mnl, gsoep_mnl1)
Likelihood ratio test
Model 1: school ~ meducation + memployment + log(income) + log(size) +
parity + year2
Model 2: school ~ meducation + memployment + log(income) + log(size) +
parity
#Df LogLik Df Chisq Pr(>Chisq)
1 30 -618.48
2 14 -624.96 -16 12.964 0.6754
> eff_mnl1 <- allEffects(gsoep_mnl1)
> plot(eff_mnl1, ask = FALSE, confint = FALSE)
> plot(eff_mnl1, ask = FALSE, style = "stacked", colors = gray.colors(3))
>
>
> ## Chapter 6
> ## Table 6.1
> library("MASS")
> gsoep$munemp <- factor(gsoep$memployment != "none",
+ levels = c(FALSE, TRUE), labels = c("no", "yes"))
> gsoep_pop <- polr(school ~ meducation + munemp + log(income) + log(size) + parity + year2,
+ data = gsoep, method = "probit", Hess = TRUE)
> gsoep_pol <- polr(school ~ meducation + munemp + log(income) + log(size) + parity + year2,
+ data = gsoep, Hess = TRUE)
> lrtest(gsoep_pop)
Likelihood ratio test
Model 1: school ~ meducation + munemp + log(income) + log(size) + parity +
year2
Model 2: school ~ 1
#Df LogLik Df Chisq Pr(>Chisq)
1 15 -631.19
2 2 -732.84 -13 203.31 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> lrtest(gsoep_pol)
Likelihood ratio test
Model 1: school ~ meducation + munemp + log(income) + log(size) + parity +
year2
Model 2: school ~ 1
#Df LogLik Df Chisq Pr(>Chisq)
1 15 -630.15
2 2 -732.84 -13 205.38 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
>
> ## Table 6.2
> ## todo
> eff_pol <- allEffects(gsoep_pol)
> plot(eff_pol, ask = FALSE, confint = FALSE)
> plot(eff_pol, ask = FALSE, style = "stacked", colors = gray.colors(3))
>
>
> ####################################
> ## Labor Force Participation Data ##
> ####################################
>
> ## Mroz data
> data("PSID1976", package = "AER")
> PSID1976$nwincome <- with(PSID1976, (fincome - hours * wage)/1000)
>
> ## visualizations
> plot(hours ~ nwincome, data = PSID1976,
+ xlab = "Non-wife income (in USD 1000)",
+ ylab = "Hours of work in 1975")
>
> plot(jitter(hours, 200) ~ jitter(wage, 50), data = PSID1976,
+ xlab = "Wife's average hourly wage (jittered)",
+ ylab = "Hours of work in 1975 (jittered)")
>
> ## Chapter 1, p. 18
> hours_lm <- lm(hours ~ wage + nwincome + youngkids + oldkids, data = PSID1976,
+ subset = participation == "yes")
>
> ## Chapter 7
> ## Example 7.2, Table 7.1
> hours_tobit <- tobit(hours ~ nwincome + education + experience + I(experience^2) +
+ age + youngkids + oldkids, data = PSID1976)
> hours_ols1 <- lm(hours ~ nwincome + education + experience + I(experience^2) +
+ age + youngkids + oldkids, data = PSID1976)
> hours_ols2 <- lm(hours ~ nwincome + education + experience + I(experience^2) +
+ age + youngkids + oldkids, data = PSID1976, subset = participation == "yes")
>
> ## Example 7.10, Table 7.4
> wage_ols <- lm(log(wage) ~ education + experience + I(experience^2),
+ data = PSID1976, subset = participation == "yes")
>
> library("sampleSelection")
> wage_ghr <- selection(participation ~ nwincome + age + youngkids + oldkids +
+ education + experience + I(experience^2),
+ log(wage) ~ education + experience + I(experience^2), data = PSID1976)
>
> ## Exercise 7.13
> hours_cragg1 <- glm(participation ~ nwincome + education +
+ experience + I(experience^2) + age + youngkids + oldkids,
+ data = PSID1976, family = binomial(link = "probit"))
> library("truncreg")
> hours_cragg2 <- truncreg(hours ~ nwincome + education +
+ experience + I(experience^2) + age + youngkids + oldkids,
+ data = PSID1976, subset = participation == "yes")
>
> ## Exercise 7.15
> wage_olscoef <- sapply(c(-Inf, 0.5, 1, 1.5, 2), function(censpoint)
+ coef(lm(log(wage) ~ education + experience + I(experience^2),
+ data = PSID1976[log(PSID1976$wage) > censpoint,])))
> wage_mlcoef <- sapply(c(0.5, 1, 1.5, 2), function(censpoint)
+ coef(tobit(log(wage) ~ education + experience + I(experience^2),
+ data = PSID1976, left = censpoint)))
>
>
> ##################################
> ## Choice of Brand for Crackers ##
> ##################################
>
> ## data
> if(require("mlogit")) {
+ data("Cracker", package = "mlogit")
+ head(Cracker, 3)
+ crack <- mlogit.data(Cracker, varying = 2:13, shape = "wide", choice = "choice")
+ head(crack, 12)
+
+ ## Table 5.6 (model 3 probably not fully converged in W&B)
+ crack$price <- crack$price/100
+ crack_mlogit1 <- mlogit(choice ~ price | 0, data = crack, reflevel = "private")
+ crack_mlogit2 <- mlogit(choice ~ price | 1, data = crack, reflevel = "private")
+ crack_mlogit3 <- mlogit(choice ~ price + feat + disp | 1, data = crack,
+ reflevel = "private")
+ lrtest(crack_mlogit1, crack_mlogit2, crack_mlogit3)
+
+ ## IIA test
+ crack_mlogit_all <- update(crack_mlogit2, reflevel = "nabisco")
+ crack_mlogit_res <- update(crack_mlogit_all,
+ alt.subset = c("keebler", "nabisco", "sunshine"))
+ hmftest(crack_mlogit_all, crack_mlogit_res)
+ }
Hausman-McFadden test
data: crack
chisq = 51.592, df = 3, p-value = 3.659e-11
alternative hypothesis: IIA is rejected
>
>
>
>
>
> dev.off()
null device
1
>