Consider the following simple example: Y = 1 if x1 + x2 > 0, Y=0, otherwise.
x = matrix(runif(20), 10, 2)
y = ifelse(x[,1]+x[,2]>1, 1, 0);
plot(x[,1], x[,2], type="n", xlab="", ylab="");
text(x[,1], x[,2], y )
myfit=glm(y~x, family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(myfit)
##
## Call:
## glm(formula = y ~ x, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.583e-05 -2.110e-08 -2.110e-08 2.110e-08 1.521e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -192.9 349163.9 -0.001 1
## x1 235.2 439425.2 0.001 1
## x2 160.1 924663.2 0.000 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.3460e+01 on 9 degrees of freedom
## Residual deviance: 4.8565e-10 on 7 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 25
The likelihood for logistic function doesn’t converge when the data are well-separated: the MLE of beta would converge to infinity or negative-infinity. However, the separating line is well-defined and we can still use the fitted model to do prediction.
The Coronary Risk‐Factor Study data involve 462 males between the ages of 15 and 64 from a heart‐disease high‐risk region of the Western Cape, South Africa. The response is “chd”, the presence (chd=1) or absence (chd=0) of coronary heart disease.
There are 9 covariates:
heart = read.table("https://web.stanford.edu/~hastie/ElemStatLearn//datasets/SAheart.data", sep=",",head=T, row.names=1)
head(heart)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 1
## 2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1
## 3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 0
## 4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 1
## 5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 1
## 6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 0
heartfull = glm(chd~., data=heart, family=binomial)
summary(heartfull)
##
## Call:
## glm(formula = chd ~ ., family = binomial, data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7781 -0.8213 -0.4387 0.8889 2.5435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.1507209 1.3082600 -4.701 2.58e-06 ***
## sbp 0.0065040 0.0057304 1.135 0.256374
## tobacco 0.0793764 0.0266028 2.984 0.002847 **
## ldl 0.1739239 0.0596617 2.915 0.003555 **
## adiposity 0.0185866 0.0292894 0.635 0.525700
## famhistPresent 0.9253704 0.2278940 4.061 4.90e-05 ***
## typea 0.0395950 0.0123202 3.214 0.001310 **
## obesity -0.0629099 0.0442477 -1.422 0.155095
## alcohol 0.0001217 0.0044832 0.027 0.978350
## age 0.0452253 0.0121298 3.728 0.000193 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 472.14 on 452 degrees of freedom
## AIC: 492.14
##
## Number of Fisher Scoring iterations: 5
How to interprete the coefficient associated with variable “obesity”? It is strange that having excessive body fat can actually lower your chance of having a heart disease?
round(cor(data.matrix(heart)), dig=2)
## sbp tobacco ldl adiposity famhist typea obesity alcohol
## sbp 1.00 0.21 0.16 0.36 0.09 -0.06 0.24 0.14
## tobacco 0.21 1.00 0.16 0.29 0.09 -0.01 0.12 0.20
## ldl 0.16 0.16 1.00 0.44 0.16 0.04 0.33 -0.03
## adiposity 0.36 0.29 0.44 1.00 0.18 -0.04 0.72 0.10
## famhist 0.09 0.09 0.16 0.18 1.00 0.04 0.12 0.08
## typea -0.06 -0.01 0.04 -0.04 0.04 1.00 0.07 0.04
## obesity 0.24 0.12 0.33 0.72 0.12 0.07 1.00 0.05
## alcohol 0.14 0.20 -0.03 0.10 0.08 0.04 0.05 1.00
## age 0.39 0.45 0.31 0.63 0.24 -0.10 0.29 0.10
## chd 0.19 0.30 0.26 0.25 0.27 0.10 0.10 0.06
## age chd
## sbp 0.39 0.19
## tobacco 0.45 0.30
## ldl 0.31 0.26
## adiposity 0.63 0.25
## famhist 0.24 0.27
## typea -0.10 0.10
## obesity 0.29 0.10
## alcohol 0.10 0.06
## age 1.00 0.37
## chd 0.37 1.00
phat=heartfull$fitted.values;
-2*(sum(log(phat[heart$chd==1])) +
sum(log(1-phat[heart$chd==0])))
## [1] 472.14
table(heart$chd)
##
## 0 1
## 302 160
p0hat = rep(mean(heart$chd), length(heart$chd)) # 160/(160+302)
-2*(sum(log(p0hat[heart$chd==1])) +
sum(log(1-p0hat[heart$chd==0])))
## [1] 596.1084
The two procedures happen to return the same model.
heartstepA = step(heartfull, scope=list(upper=~., lower=~1))
## Start: AIC=492.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity +
## alcohol + age
##
## Df Deviance AIC
## - alcohol 1 472.14 490.14
## - adiposity 1 472.55 490.55
## - sbp 1 473.44 491.44
## <none> 472.14 492.14
## - obesity 1 474.23 492.23
## - ldl 1 481.07 499.07
## - tobacco 1 481.67 499.67
## - typea 1 483.05 501.05
## - age 1 486.53 504.53
## - famhist 1 488.89 506.89
##
## Step: AIC=490.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity +
## age
##
## Df Deviance AIC
## - adiposity 1 472.55 488.55
## - sbp 1 473.47 489.47
## <none> 472.14 490.14
## - obesity 1 474.24 490.24
## + alcohol 1 472.14 492.14
## - ldl 1 481.15 497.15
## - tobacco 1 482.06 498.06
## - typea 1 483.06 499.06
## - age 1 486.64 502.64
## - famhist 1 488.99 504.99
##
## Step: AIC=488.55
## chd ~ sbp + tobacco + ldl + famhist + typea + obesity + age
##
## Df Deviance AIC
## - sbp 1 473.98 487.98
## <none> 472.55 488.55
## - obesity 1 474.65 488.65
## + adiposity 1 472.14 490.14
## + alcohol 1 472.55 490.55
## - tobacco 1 482.54 496.54
## - ldl 1 482.95 496.95
## - typea 1 483.19 497.19
## - famhist 1 489.38 503.38
## - age 1 495.48 509.48
##
## Step: AIC=487.98
## chd ~ tobacco + ldl + famhist + typea + obesity + age
##
## Df Deviance AIC
## - obesity 1 475.69 487.69
## <none> 473.98 487.98
## + sbp 1 472.55 488.55
## + adiposity 1 473.47 489.47
## + alcohol 1 473.94 489.94
## - tobacco 1 484.18 496.18
## - typea 1 484.30 496.30
## - ldl 1 484.53 496.53
## - famhist 1 490.58 502.58
## - age 1 502.11 514.11
##
## Step: AIC=487.69
## chd ~ tobacco + ldl + famhist + typea + age
##
## Df Deviance AIC
## <none> 475.69 487.69
## + obesity 1 473.98 487.98
## + sbp 1 474.65 488.65
## + adiposity 1 475.44 489.44
## + alcohol 1 475.65 489.65
## - ldl 1 484.71 494.71
## - typea 1 485.44 495.44
## - tobacco 1 486.03 496.03
## - famhist 1 492.09 502.09
## - age 1 502.38 512.38
summary(heartstepA)
##
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial,
## data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9165 -0.8054 -0.4430 0.9329 2.6139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.44644 0.92087 -7.000 2.55e-12 ***
## tobacco 0.08038 0.02588 3.106 0.00190 **
## ldl 0.16199 0.05497 2.947 0.00321 **
## famhistPresent 0.90818 0.22576 4.023 5.75e-05 ***
## typea 0.03712 0.01217 3.051 0.00228 **
## age 0.05046 0.01021 4.944 7.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 475.69 on 456 degrees of freedom
## AIC: 487.69
##
## Number of Fisher Scoring iterations: 5
n=dim(heart)[1]
heartstepB = step(heartfull, scope=list(upper=~., lower=~1), trace = 0, k=log(n))
summary(heartstepB)
##
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial,
## data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9165 -0.8054 -0.4430 0.9329 2.6139
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.44644 0.92087 -7.000 2.55e-12 ***
## tobacco 0.08038 0.02588 3.106 0.00190 **
## ldl 0.16199 0.05497 2.947 0.00321 **
## famhistPresent 0.90818 0.22576 4.023 5.75e-05 ***
## typea 0.03712 0.01217 3.051 0.00228 **
## age 0.05046 0.01021 4.944 7.65e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 475.69 on 456 degrees of freedom
## AIC: 487.69
##
## Number of Fisher Scoring iterations: 5
Estimation (probability of being 1) on the training samples.
phat=heartstepA$fitted.values;
mypred = (phat>0.5)
table(heart$chd, mypred)
## mypred
## FALSE TRUE
## 0 256 46
## 1 73 87
Consider the following three males with the same value on all other predictors except age: min, max, and median. What’s the estimated chance of getting heart disease for each of them?
testsamples = heart[c(1, 1, 1),]
testsamples
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
## 1.1 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
## 1.2 160 12 5.73 23.11 Present 49 25.3 97.2 52 1
testsamples$age = c(min(heart$age), median(heart$age), max(heart$age))
testsamples
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12 5.73 23.11 Present 49 25.3 97.2 15 1
## 1.1 160 12 5.73 23.11 Present 49 25.3 97.2 45 1
## 1.2 160 12 5.73 23.11 Present 49 25.3 97.2 64 1
predict(heartstepA, newdata=testsamples)
## 1 1.1 1.2
## -1.0700021 0.4438094 1.4025567
predict(heartstepA, newdata=testsamples, type="response")
## 1 1.1 1.2
## 0.2554027 0.6091664 0.8025893
The motivation for collecting this database was the explosion of the USA Space Shuttle Challenger on 28 January 1986. The explosion was eventually traced to the failure of one of the three field joints on one of the two solid booster rockets. Each of these six field joints includes two O‐rings, designated as primary and secondary, which fail when phenomena called erosion and blowby both occur.
The night before the launch a decision had to be made regarding launch safety. The discussion among engineers and managers leading to this decision included concern that the probability of failure of the O‐rings depended on the temperature at launch, which was forecasted to be 31 degrees F. There are strong engineering reasons based on the composition of O‐rings to support the judgment that failure probability may rise monotonically as temperature drops. One other variable, the pressure at which safety testing for field join leaks was performed, was available, but its relevance to the failure process was unclear.
The data set includes the temperature and the number of O‐ring failures for the 23 shuttle flights previous to the Challenger disaster. No previous liftoff temperature was below 53 degrees F. So tremendous extrapolation must be done from the given data to assess risk at 31 degrees F. However it is obvious (from the plot) the risk is high at 31 degrees F.
The task is to predict the Chance of one of the O‐rings would fail when the launch temperature is below freezing.
orings = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/space-shuttle/o-ring-erosion-or-blowby.data")
orings
## V1 V2 V3 V4 V5
## 1 6 0 66 50 1
## 2 6 1 70 50 2
## 3 6 0 69 50 3
## 4 6 0 68 50 4
## 5 6 0 67 50 5
## 6 6 0 72 50 6
## 7 6 0 73 100 7
## 8 6 0 70 100 8
## 9 6 1 57 200 9
## 10 6 1 63 200 10
## 11 6 1 70 200 11
## 12 6 0 78 200 12
## 13 6 0 67 200 13
## 14 6 2 53 200 14
## 15 6 0 67 200 15
## 16 6 0 75 200 16
## 17 6 0 70 200 17
## 18 6 0 81 200 18
## 19 6 0 76 200 19
## 20 6 0 79 200 20
## 21 6 2 75 200 21
## 22 6 0 76 200 22
## 23 6 1 58 200 23
orings=orings[, c(2:3)]
names(orings)=c("damage", "temp")
orings[order(orings$temp),]
## damage temp
## 14 2 53
## 9 1 57
## 23 1 58
## 10 1 63
## 1 0 66
## 5 0 67
## 13 0 67
## 15 0 67
## 4 0 68
## 3 0 69
## 2 1 70
## 8 0 70
## 11 1 70
## 17 0 70
## 6 0 72
## 7 0 73
## 16 0 75
## 21 2 75
## 19 0 76
## 22 0 76
## 12 0 78
## 20 0 79
## 18 0 81
logitmod=glm(cbind(damage, 6-damage) ~ temp, family=binomial, data=orings)
summary(logitmod)
##
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial,
## data = orings)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.08498 3.05247 1.666 0.0957 .
## temp -0.11560 0.04702 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.230 on 22 degrees of freedom
## Residual deviance: 18.086 on 21 degrees of freedom
## AIC: 35.647
##
## Number of Fisher Scoring iterations: 5
predict(logitmod, data.frame(temp=31), type="response")
## 1
## 0.8177744
min.temp=31; max.temp=max(orings$temp)
plot(orings$temp, orings$damage/6,
xlim=c(min.temp, max.temp), ylim=c(0,1),
xlab="Temp", ylab="Chance of Damage")
newtemp = seq(min.temp, max.temp, length=100)
phat = predict(logitmod, data.frame(temp=newtemp), type="response")
lines(newtemp, phat, col="red")
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
p = dim(heart)[2]
X=data.matrix(heart[,-p]);
Y=heart[,p];
heart.l1=glmnet(X,Y,family="binomial",alpha=1)
plot(heart.l1,label=TRUE)
heart.cv = cv.glmnet(X, Y, family="binomial",alpha=1)
plot(heart.cv)
heart.cv$lambda.min
## [1] 0.006230944
heart.cv$lambda.1se
## [1] 0.04824393
predict(heart.cv, lambda=heart.cv$lambda.1se, type="coefficients")
## 10 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -3.510976639
## sbp .
## tobacco 0.042431208
## ldl 0.077881736
## adiposity .
## famhist 0.484727397
## typea 0.004487505
## obesity .
## alcohol .
## age 0.031404116