Install and load required packages.

glmnet: a package that fits both the lasso (alpha = 1) and ridge regression (alpha = 0). More glmnet examples are available at https://web.stanford.edu/~hastie/glmnet/glmnet_beta.html
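For orientation, here is a minimal sketch of the calls used throughout this note; x and y below are placeholders for a numeric predictor matrix and a response vector, not objects defined yet.

# fit.ridge = glmnet(x, y, alpha = 0)      # ridge: pure L2 penalty
# fit.lasso = glmnet(x, y, alpha = 1)      # lasso: pure L1 penalty
# cv.fit    = cv.glmnet(x, y, alpha = 1)   # 10-fold CV over the lambda path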

mypackages = c("MASS", "glmnet")   # required packages
tmp = setdiff(mypackages, rownames(installed.packages()))  # packages that still need to be installed
if (length(tmp) > 0) install.packages(tmp)
lapply(mypackages, require, character.only = TRUE)
set.seed(2134)

Prepare the Boston Housing Data:

The preparation is the same as what we did before. Recall that the data contain the following columns:

1. MEDV (Y)  Median value of owner-occupied homes in $1000's (response variable)
2. CRIM      per capita crime rate by town
3. ZN        proportion of residential land zoned for lots over 
             25,000 sq.ft.
4. INDUS     proportion of non-retail business acres per town
5. CHAS      Charles River dummy variable (= 1 if tract bounds 
             river; 0 otherwise)
6. NOX       nitric oxides concentration (parts per 10 million)
7. RM        average number of rooms per dwelling
8. AGE       proportion of owner-occupied units built prior to 1940
9. DIS       weighted distances to five Boston employment centres
10. RAD       index of accessibility to radial highways
11. TAX      full-value property-tax rate per $10,000
12. PTRATIO  pupil-teacher ratio by town
13. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
             by town
14. LSTAT    % lower status of the population
myData = Boston
names(myData)[14] = "Y"                          # rename the response (medv) to Y
iLog = c(1, 3, 5, 6, 8, 9, 10, 14);              # crim, indus, nox, rm, dis, rad, tax, Y
myData[, iLog] = log(myData[, iLog]);            # log-transform these columns
myData[, 2] = myData[, 2] / 10;                  # rescale zn
myData[, 7] = myData[, 7]^2.5 / 10^4             # power-transform age
myData[, 11] = exp(0.4 * myData[, 11]) / 1000;   # exponential transform of ptratio
myData[, 12] = myData[, 12] / 100;               # rescale black
myData[, 13] = sqrt(myData[, 13]);               # square-root transform of lstat

# Move the last column of myData, the response Y, to the 1st column.
myData = data.frame(Y = myData[,14], myData[,-14]);
names(myData)[1] = "Y";
names(myData)
##  [1] "Y"       "crim"    "zn"      "indus"   "chas"    "nox"     "rm"     
##  [8] "age"     "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"
n = dim(myData)[1]; 
p = dim(myData)[2]-1;
X = as.matrix(myData[, -1]);  # some algorithms need the matrix/vector 
Y = myData[, 1];              # input (instead of data frame)
Split the data into two parts: 80% for training and 20% for testing.
ntest = round(n*0.2)
ntrain = n - ntest;
test.id = sample(1:n, ntest);
Ytest = myData[test.id, 1];

Full Model

full.model = lm( Y ~ ., data = myData[-test.id, ]);  
Ytest.pred = predict(full.model, newdata= myData[test.id, ]);
sum((Ytest - Ytest.pred)^2)/ntest # mean squared prediction error on the test set
## [1] 0.0467682

Ridge using glmnet; lambda chosen by 10-fold CV

myridge = glmnet(X[-test.id, ], Y[-test.id], alpha = 0)
plot(myridge, label = TRUE, xvar = "lambda")  # coefficient paths against log(lambda)

Check the output from glmnet for ridge regression.
summary(myridge)
##           Length Class     Mode   
## a0         100   -none-    numeric
## beta      1300   dgCMatrix S4     
## df         100   -none-    numeric
## dim          2   -none-    numeric
## lambda     100   -none-    numeric
## dev.ratio  100   -none-    numeric
## nulldev      1   -none-    numeric
## npasses      1   -none-    numeric
## jerr         1   -none-    numeric
## offset       1   -none-    logical
## call         4   -none-    call   
## nobs         1   -none-    numeric
length(myridge$lambda)  # number of lambda values on the path (100 by default)
dim(myridge$beta)       # 13 x 100: one column of coefficients per lambda
length(myridge$a0)      # one intercept per lambda

# the full coefficient matrix (intercept plus the 13 predictors, i.e., 14 rows,
# one column per lambda) can also be retrieved using coef(myridge)
dim(coef(myridge))

# The two coefficient matrices should be the same
sum((coef(myridge) - rbind(myridge$a0, myridge$beta))^2)
Ridge regression coefficients can change sign along the solution path. For example, the coefficient of the 8th predictor (dis) starts near zero at large lambda, rises, and eventually turns negative as lambda decreases:
round(myridge$beta[8, ], dig = 2)
##    s0    s1    s2    s3    s4    s5    s6    s7    s8    s9   s10   s11 
##  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 
##   s12   s13   s14   s15   s16   s17   s18   s19   s20   s21   s22   s23 
##  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00 
##   s24   s25   s26   s27   s28   s29   s30   s31   s32   s33   s34   s35 
##  0.00  0.00  0.00  0.00  0.00  0.00  0.01  0.01  0.01  0.01  0.01  0.01 
##   s36   s37   s38   s39   s40   s41   s42   s43   s44   s45   s46   s47 
##  0.01  0.01  0.01  0.01  0.01  0.01  0.01  0.01  0.01  0.01  0.01  0.01 
##   s48   s49   s50   s51   s52   s53   s54   s55   s56   s57   s58   s59 
##  0.02  0.02  0.02  0.02  0.02  0.02  0.02  0.02  0.02  0.02  0.02  0.01 
##   s60   s61   s62   s63   s64   s65   s66   s67   s68   s69   s70   s71 
##  0.01  0.01  0.01  0.01  0.01  0.01  0.00  0.00  0.00  0.00 -0.01 -0.01 
##   s72   s73   s74   s75   s76   s77   s78   s79   s80   s81   s82   s83 
## -0.01 -0.02 -0.02 -0.02 -0.03 -0.03 -0.04 -0.04 -0.04 -0.05 -0.05 -0.06 
##   s84   s85   s86   s87   s88   s89   s90   s91   s92   s93   s94   s95 
## -0.06 -0.07 -0.07 -0.08 -0.08 -0.09 -0.09 -0.10 -0.10 -0.11 -0.11 -0.12 
##   s96   s97   s98   s99 
## -0.12 -0.13 -0.13 -0.14
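A quick sketch to visualize this sign change, using the myridge fit from above:

plot(log(myridge$lambda), myridge$beta[8, ], type = "l",
     xlab = "log(lambda)", ylab = "coefficient of the 8th predictor")
abline(h = 0, lty = 2)   # the path crosses zero as lambda decreases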
How are the intercepts computed? Since the intercept is not penalized, the intercept at each lambda equals the mean of the training responses minus the inner product of the column means of the training X with the coefficient vector at that lambda, i.e., a0 = mean(Y) - sum(colMeans(X) * beta). This is checked below for the 2nd lambda value and then for the whole path.
k = 2; 
my.mean = apply(X[-test.id, ], 2, mean)  # 13x1 mean vector for training X
mean(Y[-test.id]) - sum(my.mean * myridge$beta[, k])
## [1] 3.024883
myridge$a0[k]  # intercept for lambda = myridge$lambda[k]
##       s1 
## 3.024883
# Check whether our intercept formula is true for all intercepts 
sum((mean(Y[-test.id]) - my.mean %*% myridge$beta  - myridge$a0)^2)
## [1] 2.031317e-28
Select lambda by 10-fold CV. The CV results are stored in
  • cv.out$cvm: the mean CV error for each lambda
  • cv.out$cvsd: the estimated standard error of cvm

Two choices of lambda are reported:

  • lambda.min: the value of lambda that gives the minimum cvm
  • lambda.1se: the largest value of lambda (i.e., the heaviest regularization and the smallest df) whose cvm is within one standard error of the cvm at lambda.min; this is the "one-standard-error rule", and both values are verified by hand below.
cv.out = cv.glmnet(X[-test.id, ], Y[-test.id], alpha = 0) 
plot(cv.out)

# The default lambda sequence may not extend to small enough values for ridge
# on this data, so we also supply our own sequence.
lam.seq = exp(seq(-6, 2, length=100))
cv.out = cv.glmnet(X[-test.id,], Y[-test.id], alpha=0, lambda=lam.seq)
plot(cv.out)

names(cv.out)
##  [1] "lambda"     "cvm"        "cvsd"       "cvup"       "cvlo"      
##  [6] "nzero"      "name"       "glmnet.fit" "lambda.min" "lambda.1se"
cv.out$lambda[which.min(cv.out$cvm)]
## [1] 0.002687371
cv.out$lambda.min
## [1] 0.002687371
tmp.id = which.min(cv.out$cvm)
max(cv.out$lambda[cv.out$cvm < cv.out$cvm[tmp.id] + cv.out$cvsd[tmp.id]])
## [1] 0.1019953
cv.out$lambda.1se
## [1] 0.1019953
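A compact side-by-side comparison of the two CV-selected values, using the cv.out object above (a small sketch):

id.min = which(cv.out$lambda == cv.out$lambda.min)
id.1se = which(cv.out$lambda == cv.out$lambda.1se)
cbind(lambda = cv.out$lambda[c(id.min, id.1se)],
      cvm    = cv.out$cvm[c(id.min, id.1se)],
      cvsd   = cv.out$cvsd[c(id.min, id.1se)])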
Evaluate the prediction performance of ridge regression on the test set, at both lambda.1se and lambda.min.
myridge = glmnet(X[-test.id,], Y[-test.id], alpha=0, lambda = lam.seq)
Ytest.pred = predict(myridge, s = cv.out$lambda.1se, newx=X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.05229618
Ytest.pred=predict(myridge, s = cv.out$lambda.min, newx=X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.0467029
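The ridge coefficients at either value of lambda can be retrieved with predict(..., type = "coefficients"), just as is done for the lasso below (a sketch):

cbind(predict(myridge, s = cv.out$lambda.min, type = "coefficients"),
      predict(myridge, s = cv.out$lambda.1se, type = "coefficients"))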

Lasso using glmnet; lambda chosen by 10-fold CV.

mylasso = glmnet(X[-test.id,], Y[-test.id], alpha = 1)
summary(mylasso)
##           Length Class     Mode   
## a0         75    -none-    numeric
## beta      975    dgCMatrix S4     
## df         75    -none-    numeric
## dim         2    -none-    numeric
## lambda     75    -none-    numeric
## dev.ratio  75    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        4    -none-    call   
## nobs        1    -none-    numeric
par(mfrow = c(1, 2))
plot(mylasso, label=TRUE, xvar = "norm")    # x-axis: L1 norm of the coefficients
plot(mylasso, label=TRUE, xvar = "lambda")  # x-axis: log(lambda)

par(mfrow=c(1,1))
mylasso$df  # number of nonzero (non-intercept) coefficients at each lambda
##  [1]  0  1  1  1  1  1  1  1  1  1  1  1  2  2  3  3  5  5  5  5  5  5  5
## [24]  6  6  6  6  6  6  6  7  7  7  7  7  7  7  8  8  9  9 10 10 10 11 11
## [47] 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
## [70] 12 12 12 12 12 12
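mylasso$df counts the nonzero (non-intercept) coefficients at each lambda; a quick sanity check (sketch):

all(mylasso$df == colSums(as.matrix(mylasso$beta) != 0))  # should be TRUE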
cv.out = cv.glmnet(X[-test.id, ], Y[-test.id], alpha = 1)
plot(cv.out)

Try our own lambda sequences.
# lam.seq = exp(seq(-4, 2, length=100))
# lam.seq = exp(seq(-6, -1, length=100))
# cv.out = cv.glmnet(X[-test.id,], Y[-test.id], alpha = 1, lambda = lam.seq)
# plot(cv.out)
Check how lambda.min and lambda.1se are computed.
cv.out$lambda.min
tmp.id=which.min(cv.out$cvm)
cv.out$lambda[tmp.id]

cv.out$lambda.1se
max(cv.out$lambda[cv.out$cvm < cv.out$cvm[tmp.id] + cv.out$cvsd[tmp.id]])
Retrieve the lasso coefficients at lambda.min and lambda.1se.
mylasso.coef.min = predict(mylasso, s=cv.out$lambda.min, type="coefficients")
mylasso.coef.1se = predict(mylasso, s=cv.out$lambda.1se, type="coefficients")
cbind(mylasso.coef.min, mylasso.coef.1se)

# number of variables selected (including the intercept)
sum(mylasso.coef.1se != 0)
# names of selected non-intercept variables
row.names(mylasso.coef.1se)[nonzeroCoef(mylasso.coef.1se)[-1]]
Apply the fitted model for prediction on the test data.
mylasso = glmnet(X[-test.id, ], Y[-test.id], alpha = 1)
Ytest.pred = predict(mylasso, s = cv.out$lambda.min, newx = X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04656643
Ytest.pred = predict(mylasso, s = cv.out$lambda.1se, newx = X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04702326
Refit, by ordinary least squares, the model containing only the variables selected by lasso (at lambda.1se); this removes the shrinkage bias that the lasso penalty puts on the retained coefficients.
# Variables selected by lambda.1se 
mylasso.coef.1se = predict(mylasso, s = cv.out$lambda.1se, type="coefficients")
var.sel = row.names(mylasso.coef.1se)[nonzeroCoef(mylasso.coef.1se)[-1]]

var.sel; 
## [1] "chas"    "rm"      "dis"     "tax"     "ptratio" "black"   "lstat"
tmp.X = X[, colnames(X) %in% var.sel]
mylasso.refit = coef(lm(Y[-test.id] ~ tmp.X[-test.id, ]))
Ytest.pred = mylasso.refit[1] + tmp.X[test.id,] %*% mylasso.refit[-1]
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04533682
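An equivalent way to do this refit is to build an lm formula from the selected variable names and use the myData data frame directly (a sketch; the test MSE should match the matrix-based computation above):

sel.formula = as.formula(paste("Y ~", paste(var.sel, collapse = " + ")))
refit.lm = lm(sel.formula, data = myData[-test.id, ])
Ytest.pred = predict(refit.lm, newdata = myData[test.id, ])
mean((Ytest.pred - Y[test.id])^2)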