Logistic Regression

Convergence Issue for a Simple Example

Consider the following simple example: Y = 1 if x1 + x2 > 0, Y=0, otherwise.

x = matrix(runif(20), 10, 2)
y = ifelse(x[,1]+x[,2]>1, 1, 0); 
plot(x[,1], x[,2], type="n", xlab="", ylab=""); 
text(x[,1], x[,2], y )

myfit=glm(y~x, family=binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(myfit)

## 
## Call:
## glm(formula = y ~ x, family = binomial)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.583e-05  -2.110e-08  -2.110e-08   2.110e-08   1.521e-05  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -192.9   349163.9  -0.001        1
## x1             235.2   439425.2   0.001        1
## x2             160.1   924663.2   0.000        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.3460e+01  on 9  degrees of freedom
## Residual deviance: 4.8565e-10  on 7  degrees of freedom
## AIC: 6
## 
## Number of Fisher Scoring iterations: 25

The likelihood for logistic function doesn’t converge when the data are well-separated: the MLE of beta would converge to infinity or negative-infinity. However, the separating line is well-defined and we can still use the fitted model to do prediction.

South African Heart Disease Data

The Coronary Risk‐Factor Study data involve 462 males between the ages of 15 and 64 from a heart‐disease high‐risk region of the Western Cape, South Africa. The response is “chd”, the presence (chd=1) or absence (chd=0) of coronary heart disease.

There are 9 covariates:

sbp - systolic blood pressure
tobacco - cumulative tobacco (kg)
ldl - low densiity lipoprotein cholesterol
adiposity
famhist - family history of heart disease (Present, Absent)
typea - type‐A behavior
obesity
alcohol - current alcohol consumption
age - age at onset

Fit a logistic model

heart = read.table("https://web.stanford.edu/~hastie/ElemStatLearn//datasets/SAheart.data", sep=",",head=T, row.names=1)
head(heart)

##   sbp tobacco  ldl adiposity famhist typea obesity alcohol age chd
## 1 160   12.00 5.73     23.11 Present    49   25.30   97.20  52   1
## 2 144    0.01 4.41     28.61  Absent    55   28.87    2.06  63   1
## 3 118    0.08 3.48     32.28 Present    52   29.14    3.81  46   0
## 4 170    7.50 6.41     38.03 Present    51   31.99   24.26  58   1
## 5 134   13.60 3.50     27.78 Present    60   25.99   57.34  49   1
## 6 132    6.20 6.47     36.21 Present    62   30.77   14.14  45   0

heartfull = glm(chd~., data=heart, family=binomial)
summary(heartfull)

## 
## Call:
## glm(formula = chd ~ ., family = binomial, data = heart)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7781  -0.8213  -0.4387   0.8889   2.5435  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -6.1507209  1.3082600  -4.701 2.58e-06 ***
## sbp             0.0065040  0.0057304   1.135 0.256374    
## tobacco         0.0793764  0.0266028   2.984 0.002847 ** 
## ldl             0.1739239  0.0596617   2.915 0.003555 ** 
## adiposity       0.0185866  0.0292894   0.635 0.525700    
## famhistPresent  0.9253704  0.2278940   4.061 4.90e-05 ***
## typea           0.0395950  0.0123202   3.214 0.001310 ** 
## obesity        -0.0629099  0.0442477  -1.422 0.155095    
## alcohol         0.0001217  0.0044832   0.027 0.978350    
## age             0.0452253  0.0121298   3.728 0.000193 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 472.14  on 452  degrees of freedom
## AIC: 492.14
## 
## Number of Fisher Scoring iterations: 5

How to interprete the coefficient associated with variable “obesity”? It is strange that having excessive body fat can actually lower your chance of having a heart disease?

round(cor(data.matrix(heart)), dig=2)

##             sbp tobacco   ldl adiposity famhist typea obesity alcohol
## sbp        1.00    0.21  0.16      0.36    0.09 -0.06    0.24    0.14
## tobacco    0.21    1.00  0.16      0.29    0.09 -0.01    0.12    0.20
## ldl        0.16    0.16  1.00      0.44    0.16  0.04    0.33   -0.03
## adiposity  0.36    0.29  0.44      1.00    0.18 -0.04    0.72    0.10
## famhist    0.09    0.09  0.16      0.18    1.00  0.04    0.12    0.08
## typea     -0.06   -0.01  0.04     -0.04    0.04  1.00    0.07    0.04
## obesity    0.24    0.12  0.33      0.72    0.12  0.07    1.00    0.05
## alcohol    0.14    0.20 -0.03      0.10    0.08  0.04    0.05    1.00
## age        0.39    0.45  0.31      0.63    0.24 -0.10    0.29    0.10
## chd        0.19    0.30  0.26      0.25    0.27  0.10    0.10    0.06
##             age  chd
## sbp        0.39 0.19
## tobacco    0.45 0.30
## ldl        0.31 0.26
## adiposity  0.63 0.25
## famhist    0.24 0.27
## typea     -0.10 0.10
## obesity    0.29 0.10
## alcohol    0.10 0.06
## age        1.00 0.37
## chd        0.37 1.00

What’s null deviance?
What’s residual deviance?

phat=heartfull$fitted.values;
-2*(sum(log(phat[heart$chd==1])) + 
  sum(log(1-phat[heart$chd==0])))

## [1] 472.14

table(heart$chd)

## 
##   0   1 
## 302 160

p0hat = rep(mean(heart$chd), length(heart$chd))  # 160/(160+302)
-2*(sum(log(p0hat[heart$chd==1])) + 
  sum(log(1-p0hat[heart$chd==0])))

## [1] 596.1084

Stepwise Mode Selection with AIC and BIC.

The two procedures happen to return the same model.

heartstepA = step(heartfull, scope=list(upper=~., lower=~1))

## Start:  AIC=492.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity + 
##     alcohol + age
## 
##             Df Deviance    AIC
## - alcohol    1   472.14 490.14
## - adiposity  1   472.55 490.55
## - sbp        1   473.44 491.44
## <none>           472.14 492.14
## - obesity    1   474.23 492.23
## - ldl        1   481.07 499.07
## - tobacco    1   481.67 499.67
## - typea      1   483.05 501.05
## - age        1   486.53 504.53
## - famhist    1   488.89 506.89
## 
## Step:  AIC=490.14
## chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity + 
##     age
## 
##             Df Deviance    AIC
## - adiposity  1   472.55 488.55
## - sbp        1   473.47 489.47
## <none>           472.14 490.14
## - obesity    1   474.24 490.24
## + alcohol    1   472.14 492.14
## - ldl        1   481.15 497.15
## - tobacco    1   482.06 498.06
## - typea      1   483.06 499.06
## - age        1   486.64 502.64
## - famhist    1   488.99 504.99
## 
## Step:  AIC=488.55
## chd ~ sbp + tobacco + ldl + famhist + typea + obesity + age
## 
##             Df Deviance    AIC
## - sbp        1   473.98 487.98
## <none>           472.55 488.55
## - obesity    1   474.65 488.65
## + adiposity  1   472.14 490.14
## + alcohol    1   472.55 490.55
## - tobacco    1   482.54 496.54
## - ldl        1   482.95 496.95
## - typea      1   483.19 497.19
## - famhist    1   489.38 503.38
## - age        1   495.48 509.48
## 
## Step:  AIC=487.98
## chd ~ tobacco + ldl + famhist + typea + obesity + age
## 
##             Df Deviance    AIC
## - obesity    1   475.69 487.69
## <none>           473.98 487.98
## + sbp        1   472.55 488.55
## + adiposity  1   473.47 489.47
## + alcohol    1   473.94 489.94
## - tobacco    1   484.18 496.18
## - typea      1   484.30 496.30
## - ldl        1   484.53 496.53
## - famhist    1   490.58 502.58
## - age        1   502.11 514.11
## 
## Step:  AIC=487.69
## chd ~ tobacco + ldl + famhist + typea + age
## 
##             Df Deviance    AIC
## <none>           475.69 487.69
## + obesity    1   473.98 487.98
## + sbp        1   474.65 488.65
## + adiposity  1   475.44 489.44
## + alcohol    1   475.65 489.65
## - ldl        1   484.71 494.71
## - typea      1   485.44 495.44
## - tobacco    1   486.03 496.03
## - famhist    1   492.09 502.09
## - age        1   502.38 512.38

summary(heartstepA)

## 
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial, 
##     data = heart)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9165  -0.8054  -0.4430   0.9329   2.6139  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -6.44644    0.92087  -7.000 2.55e-12 ***
## tobacco         0.08038    0.02588   3.106  0.00190 ** 
## ldl             0.16199    0.05497   2.947  0.00321 ** 
## famhistPresent  0.90818    0.22576   4.023 5.75e-05 ***
## typea           0.03712    0.01217   3.051  0.00228 ** 
## age             0.05046    0.01021   4.944 7.65e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 475.69  on 456  degrees of freedom
## AIC: 487.69
## 
## Number of Fisher Scoring iterations: 5

n=dim(heart)[1]
heartstepB = step(heartfull, scope=list(upper=~., lower=~1), trace = 0, k=log(n))
summary(heartstepB)

## 
## Call:
## glm(formula = chd ~ tobacco + ldl + famhist + typea + age, family = binomial, 
##     data = heart)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9165  -0.8054  -0.4430   0.9329   2.6139  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -6.44644    0.92087  -7.000 2.55e-12 ***
## tobacco         0.08038    0.02588   3.106  0.00190 ** 
## ldl             0.16199    0.05497   2.947  0.00321 ** 
## famhistPresent  0.90818    0.22576   4.023 5.75e-05 ***
## typea           0.03712    0.01217   3.051  0.00228 ** 
## age             0.05046    0.01021   4.944 7.65e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 475.69  on 456  degrees of freedom
## AIC: 487.69
## 
## Number of Fisher Scoring iterations: 5

Estimation/Prediction

Estimation (probability of being 1) on the training samples.

phat=heartstepA$fitted.values;
mypred = (phat>0.5)
table(heart$chd, mypred)

##    mypred
##     FALSE TRUE
##   0   256   46
##   1    73   87

Consider the following three males with the same value on all other predictors except age: min, max, and median. What’s the estimated chance of getting heart disease for each of them?

testsamples = heart[c(1, 1, 1),]
testsamples

##     sbp tobacco  ldl adiposity famhist typea obesity alcohol age chd
## 1   160      12 5.73     23.11 Present    49    25.3    97.2  52   1
## 1.1 160      12 5.73     23.11 Present    49    25.3    97.2  52   1
## 1.2 160      12 5.73     23.11 Present    49    25.3    97.2  52   1

testsamples$age = c(min(heart$age), median(heart$age), max(heart$age))
testsamples

##     sbp tobacco  ldl adiposity famhist typea obesity alcohol age chd
## 1   160      12 5.73     23.11 Present    49    25.3    97.2  15   1
## 1.1 160      12 5.73     23.11 Present    49    25.3    97.2  45   1
## 1.2 160      12 5.73     23.11 Present    49    25.3    97.2  64   1

predict log-odds
predict probabilities

predict(heartstepA, newdata=testsamples)

##          1        1.1        1.2 
## -1.0700021  0.4438094  1.4025567

predict(heartstepA, newdata=testsamples, type="response")

##         1       1.1       1.2 
## 0.2554027 0.6091664 0.8025893

Challenger USA Space Shuttle O-Ring Data Analysis

The motivation for collecting this database was the explosion of the USA Space Shuttle Challenger on 28 January 1986. The explosion was eventually traced to the failure of one of the three field joints on one of the two solid booster rockets. Each of these six field joints includes two O‐rings, designated as primary and secondary, which fail when phenomena called erosion and blowby both occur.

The night before the launch a decision had to be made regarding launch safety. The discussion among engineers and managers leading to this decision included concern that the probability of failure of the O‐rings depended on the temperature at launch, which was forecasted to be 31 degrees F. There are strong engineering reasons based on the composition of O‐rings to support the judgment that failure probability may rise monotonically as temperature drops. One other variable, the pressure at which safety testing for field join leaks was performed, was available, but its relevance to the failure process was unclear.

The data set includes the temperature and the number of O‐ring failures for the 23 shuttle flights previous to the Challenger disaster. No previous liftoff temperature was below 53 degrees F. So tremendous extrapolation must be done from the given data to assess risk at 31 degrees F. However it is obvious (from the plot) the risk is high at 31 degrees F.

The task is to predict the Chance of one of the O‐rings would fail when the launch temperature is below freezing.

orings = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/space-shuttle/o-ring-erosion-or-blowby.data")
orings

##    V1 V2 V3  V4 V5
## 1   6  0 66  50  1
## 2   6  1 70  50  2
## 3   6  0 69  50  3
## 4   6  0 68  50  4
## 5   6  0 67  50  5
## 6   6  0 72  50  6
## 7   6  0 73 100  7
## 8   6  0 70 100  8
## 9   6  1 57 200  9
## 10  6  1 63 200 10
## 11  6  1 70 200 11
## 12  6  0 78 200 12
## 13  6  0 67 200 13
## 14  6  2 53 200 14
## 15  6  0 67 200 15
## 16  6  0 75 200 16
## 17  6  0 70 200 17
## 18  6  0 81 200 18
## 19  6  0 76 200 19
## 20  6  0 79 200 20
## 21  6  2 75 200 21
## 22  6  0 76 200 22
## 23  6  1 58 200 23

orings=orings[, c(2:3)]
names(orings)=c("damage", "temp")
orings[order(orings$temp),]

##    damage temp
## 14      2   53
## 9       1   57
## 23      1   58
## 10      1   63
## 1       0   66
## 5       0   67
## 13      0   67
## 15      0   67
## 4       0   68
## 3       0   69
## 2       1   70
## 8       0   70
## 11      1   70
## 17      0   70
## 6       0   72
## 7       0   73
## 16      0   75
## 21      2   75
## 19      0   76
## 22      0   76
## 12      0   78
## 20      0   79
## 18      0   81

logitmod=glm(cbind(damage, 6-damage) ~ temp, family=binomial, data=orings)
summary(logitmod)

## 
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial, 
##     data = orings)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  5.08498    3.05247   1.666   0.0957 .
## temp        -0.11560    0.04702  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.230  on 22  degrees of freedom
## Residual deviance: 18.086  on 21  degrees of freedom
## AIC: 35.647
## 
## Number of Fisher Scoring iterations: 5

predict(logitmod,  data.frame(temp=31), type="response")

##         1 
## 0.8177744

min.temp=31; max.temp=max(orings$temp)
plot(orings$temp, orings$damage/6, 
     xlim=c(min.temp, max.temp), ylim=c(0,1), 
     xlab="Temp", ylab="Chance of Damage")
newtemp = seq(min.temp, max.temp, length=100)
phat = predict(logitmod, data.frame(temp=newtemp), type="response")
lines(newtemp, phat, col="red")

Logistic with Lasso penalty

library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-5

p = dim(heart)[2]
X=data.matrix(heart[,-p]);
Y=heart[,p];
heart.l1=glmnet(X,Y,family="binomial",alpha=1)
plot(heart.l1,label=TRUE)

heart.cv = cv.glmnet(X, Y, family="binomial",alpha=1)
plot(heart.cv)

heart.cv$lambda.min

## [1] 0.006230944

heart.cv$lambda.1se

## [1] 0.04824393

predict(heart.cv, lambda=heart.cv$lambda.1se, type="coefficients")

## 10 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) -3.510976639
## sbp          .          
## tobacco      0.042431208
## ldl          0.077881736
## adiposity    .          
## famhist      0.484727397
## typea        0.004487505
## obesity      .          
## alcohol      .          
## age          0.031404116