Brief Description of the Diabetes Data (Efron et al, 2004): Ten baseline variables: age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

data = read.table("diabetes.data", header=TRUE)
head(data)
##   AGE SEX  BMI  BP  S1    S2 S3 S4     S5 S6   Y
## 1  59   2 32.1 101 157  93.2 38  4 4.8598 87 151
## 2  48   1 21.6  87 183 103.2 70  3 3.8918 69  75
## 3  72   2 30.5  93 156  93.6 41  4 4.6728 85 141
## 4  24   1 25.3  84 198 131.4 40  5 4.8903 89 206
## 5  50   1 23.0 101 192 125.4 52  4 4.2905 80 135
## 6  23   1 22.6  89 139  64.8 61  2 4.1897 68  97
dim(data)
## [1] 442  11

Lasso

library(glmnet)
set.seed(1234)
X = data.matrix(data[, -11])
Y = data$Y
mylasso = glmnet(X, Y, alpha = 1)

Notice how the model size goes from 10 to 9 and then back to 10

mylasso$df  
##  [1]  0  2  2  2  2  2  2  2  3  3  3  3  4  4  4  4  4  4  4  4  4  4  5
## [24]  5  5  5  6  6  6  7  7  7  7  7  7  7  7  7  7  7  7  7  8  8  8  8
## [47]  8  8  8  8  8  8  8  8  8  8  9 10 10 10 10 10 10 10 10 10 10  9  9
## [70]  9  9  9  9 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10

You can see how the coefficient of “S3” changes from non-zero to zero, and then back to non-zero, which corresponds to that white block in the right part of the matrix (see the image of the matrix).

lasso.coef = as.matrix(coef(mylasso))
# round(lasso.coef[, -c(1:60)], dig=2)
image(coef(mylasso))

lasso.varset: 11-by-88 matrix denoting variable sets over 88 different values of lamnda (denoted by s0, s1, …, s87)

lasso.coef = as.matrix(coef(mylasso))
lasso.varset = ifelse(lasso.coef==0, 0, 1)
lasso.varset
##             s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17
## (Intercept)  1  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1
## AGE          0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
## SEX          0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
## BMI          0  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1
## BP           0  0  0  0  0  0  0  0  1  1   1   1   1   1   1   1   1   1
## S1           0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
## S2           0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
## S3           0  0  0  0  0  0  0  0  0  0   0   0   1   1   1   1   1   1
## S4           0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
## S5           0  1  1  1  1  1  1  1  1  1   1   1   1   1   1   1   1   1
## S6           0  0  0  0  0  0  0  0  0  0   0   0   0   0   0   0   0   0
##             s18 s19 s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s30 s31 s32
## (Intercept)   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## AGE           0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## SEX           0   0   0   0   1   1   1   1   1   1   1   1   1   1   1
## BMI           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BP            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S1            0   0   0   0   0   0   0   0   0   0   0   1   1   1   1
## S2            0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## S3            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S4            0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## S5            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S6            0   0   0   0   0   0   0   0   1   1   1   1   1   1   1
##             s33 s34 s35 s36 s37 s38 s39 s40 s41 s42 s43 s44 s45 s46 s47
## (Intercept)   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## AGE           0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## SEX           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BMI           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BP            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S1            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S2            0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## S3            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S4            0   0   0   0   0   0   0   0   0   1   1   1   1   1   1
## S5            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S6            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##             s48 s49 s50 s51 s52 s53 s54 s55 s56 s57 s58 s59 s60 s61 s62
## (Intercept)   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## AGE           0   0   0   0   0   0   0   0   0   1   1   1   1   1   1
## SEX           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BMI           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BP            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S1            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S2            0   0   0   0   0   0   0   0   1   1   1   1   1   1   1
## S3            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S4            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S5            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S6            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##             s63 s64 s65 s66 s67 s68 s69 s70 s71 s72 s73 s74 s75 s76 s77
## (Intercept)   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## AGE           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## SEX           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BMI           1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## BP            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S1            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S2            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S3            1   1   1   1   0   0   0   0   0   0   1   1   1   1   1
## S4            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S5            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## S6            1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##             s78 s79 s80 s81 s82 s83 s84 s85 s86 s87
## (Intercept)   1   1   1   1   1   1   1   1   1   1
## AGE           1   1   1   1   1   1   1   1   1   1
## SEX           1   1   1   1   1   1   1   1   1   1
## BMI           1   1   1   1   1   1   1   1   1   1
## BP            1   1   1   1   1   1   1   1   1   1
## S1            1   1   1   1   1   1   1   1   1   1
## S2            1   1   1   1   1   1   1   1   1   1
## S3            1   1   1   1   1   1   1   1   1   1
## S4            1   1   1   1   1   1   1   1   1   1
## S5            1   1   1   1   1   1   1   1   1   1
## S6            1   1   1   1   1   1   1   1   1   1

Summary of the models on the path of Lasso:

S3 is an interesting variable; it seems to be related to other S variables. On Lasso path, S3 enters to size 4 model (4 non-intercept predictors), and stays for a long time and only leaves briefly leading to a size 9 model without S3.

library(leaps)
RSSleaps = regsubsets(X, Y, nbest=1, nvmax=10)
sumleaps = summary(RSSleaps, matrix=T)
sumleaps
## Subset selection object
## 10 Variables  (and intercept)
##     Forced in Forced out
## AGE     FALSE      FALSE
## SEX     FALSE      FALSE
## BMI     FALSE      FALSE
## BP      FALSE      FALSE
## S1      FALSE      FALSE
## S2      FALSE      FALSE
## S3      FALSE      FALSE
## S4      FALSE      FALSE
## S5      FALSE      FALSE
## S6      FALSE      FALSE
## 1 subsets of each size up to 10
## Selection Algorithm: exhaustive
##           AGE SEX BMI BP  S1  S2  S3  S4  S5  S6 
## 1  ( 1 )  " " " " "*" " " " " " " " " " " " " " "
## 2  ( 1 )  " " " " "*" " " " " " " " " " " "*" " "
## 3  ( 1 )  " " " " "*" "*" " " " " " " " " "*" " "
## 4  ( 1 )  " " " " "*" "*" "*" " " " " " " "*" " "
## 5  ( 1 )  " " "*" "*" "*" " " " " "*" " " "*" " "
## 6  ( 1 )  " " "*" "*" "*" "*" "*" " " " " "*" " "
## 7  ( 1 )  " " "*" "*" "*" "*" "*" " " "*" "*" " "
## 8  ( 1 )  " " "*" "*" "*" "*" "*" " " "*" "*" "*"
## 9  ( 1 )  " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 10  ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"

But if you look at optimal size 4, 6, 7, 8 models from regsubsets, none of them has S3. That means, if AIC/BIC ends up picking a model with size 4, 6, 7, or 8 model, then the AIC/BIC optimal model is not on the path of Lasso.

You can check which model returned by AIC/BIC.

n = length(Y)
msize=apply(sumleaps$which, 1, sum);
BIC = sumleaps$bic; BIC = BIC - min(BIC); BIC = BIC/max(BIC);
AIC = n*log(sumleaps$rss/n) + 2*msize;
AIC = AIC - min(AIC); AIC = AIC/max(AIC);
cbind(AIC, BIC)
##            AIC         BIC
## 1  1.000000000 1.000000000
## 2  0.322396838 0.230742097
## 3  0.199478409 0.123584530
## 4  0.132534927 0.083241551
## 5  0.029652507 0.000000000
## 6  0.000000000 0.004169454
## 7  0.005806613 0.050664745
## 8  0.013262202 0.099128337
## 9  0.027634213 0.155847729
## 10 0.043603930 0.214474226
plot(range(msize), c(0,1.1), type="n", xlab="Model Size (with Intercept)", ylab="Model Selection Criteria")
points(msize, AIC, col="blue")
points(msize, BIC, col="black")
legend("topright", lty=rep(1,3), col=c("blue", "black"), legend=c("AIC", "BIC"))