glmnet
: library for both lasso (alpha = 1) and ridge (alpha = 0) regression. More glmnet examples are available at https://web.stanford.edu/~hastie/glmnet/glmnet_beta.html
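For reference, the criterion glmnet minimizes is the elastic-net objective below (see the vignette linked above); ridge and lasso are the special cases alpha = 0 and alpha = 1.

$$
\min_{\beta_0,\,\beta}\ \frac{1}{2n}\sum_{i=1}^{n}\left(y_i-\beta_0-x_i^\top\beta\right)^2+\lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2+\alpha\lVert\beta\rVert_1\right]
$$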
mypackages = c("MASS", "glmnet") # required packages
tmp = setdiff(mypackages, rownames(installed.packages())) # packages that still need to be installed
if (length(tmp) > 0) install.packages(tmp)
lapply(mypackages, require, character.only = TRUE)
set.seed(2134)
The data preparation is the same as before. Recall that the data contain the following columns:
1. MEDV (Y): median value of owner-occupied homes in $1000's (response variable)
2. CRIM: per capita crime rate by town
3. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
4. INDUS: proportion of non-retail business acres per town
5. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
6. NOX: nitric oxides concentration (parts per 10 million)
7. RM: average number of rooms per dwelling
8. AGE: proportion of owner-occupied units built prior to 1940
9. DIS: weighted distances to five Boston employment centres
10. RAD: index of accessibility to radial highways
11. TAX: full-value property-tax rate per $10,000
12. PTRATIO: pupil-teacher ratio by town
13. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
14. LSTAT: % lower status of the population
myData = Boston
names(myData)[14] = "Y"                        # rename the response medv to Y
iLog = c(1, 3, 5, 6, 8, 9, 10, 14)             # log-transform crim, indus, nox,
myData[, iLog] = log(myData[, iLog])           # rm, dis, rad, tax, and Y
myData[, 2] = myData[, 2] / 10                 # rescale zn
myData[, 7] = myData[, 7]^2.5 / 10^4           # transform age
myData[, 11] = exp(0.4 * myData[, 11]) / 1000  # transform ptratio
myData[, 12] = myData[, 12] / 100              # rescale black
myData[, 13] = sqrt(myData[, 13])              # transform lstat
# Move the last column of myData, the response Y, to the 1st column.
myData = data.frame(Y = myData[,14], myData[,-14]);
names(myData)[1] = "Y";
names(myData)
## [1] "Y" "crim" "zn" "indus" "chas" "nox" "rm"
## [8] "age" "dis" "rad" "tax" "ptratio" "black" "lstat"
n = dim(myData)[1]           # sample size
p = dim(myData)[2] - 1       # number of predictors
X = as.matrix(myData[, -1])  # some algorithms need matrix/vector input
Y = myData[, 1]              # (instead of a data frame)
ntest = round(n*0.2)
ntrain = n - ntest;
test.id = sample(1:n, ntest);
Ytest = myData[test.id, 1];
full.model = lm( Y ~ ., data = myData[-test.id, ]);
Ytest.pred = predict(full.model, newdata= myData[test.id, ]);
sum((Ytest - Ytest.pred)^2)/ntest # mean squared error on the test set
## [1] 0.0467682
myridge = glmnet(X[-test.id, ], Y[-test.id], alpha = 0)
plot(myridge, label = TRUE, xvar = "lambda")
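Besides xvar = "lambda", plot.glmnet also accepts xvar = "norm" (the default) and xvar = "dev"; for instance, to view the same paths against the fraction of deviance explained:

# Same ridge coefficient paths, now indexed by the fraction of deviance explained
plot(myridge, label = TRUE, xvar = "dev")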
Check the output of glmnet for ridge regression.

summary(myridge)
## Length Class Mode
## a0 100 -none- numeric
## beta 1300 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
length(myridge$lambda) # number of lambda values (100 by default)
dim(myridge$beta) # 13 x 100: coefficients of the 13 non-intercept predictors, one column per lambda
length(myridge$a0) # one intercept per lambda
# All 14 coefficients (intercept plus 13 predictors) can also be retrieved using
# coef(myridge)
dim(coef(myridge))
# The two coefficient matrices should be the same
sum((coef(myridge) - rbind(myridge$a0, myridge$beta))^2)
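Coefficients at a lambda value not on the fitted grid can be requested through the s argument of coef, which by default interpolates between the fitted solutions; a quick sketch (s = 0.1 is just an illustrative value):

# Coefficients at an arbitrary lambda; glmnet interpolates between
# the solutions on its lambda grid (exact = FALSE is the default)
coef(myridge, s = 0.1)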
round(myridge$beta[8, ], dig = 2) # path of the 8th coefficient (dis) along the lambda sequence
## s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## s12 s13 s14 s15 s16 s17 s18 s19 s20 s21 s22 s23
## 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## s24 s25 s26 s27 s28 s29 s30 s31 s32 s33 s34 s35
## 0.00 0.00 0.00 0.00 0.00 0.00 0.01 0.01 0.01 0.01 0.01 0.01
## s36 s37 s38 s39 s40 s41 s42 s43 s44 s45 s46 s47
## 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
## s48 s49 s50 s51 s52 s53 s54 s55 s56 s57 s58 s59
## 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.01
## s60 s61 s62 s63 s64 s65 s66 s67 s68 s69 s70 s71
## 0.01 0.01 0.01 0.01 0.01 0.01 0.00 0.00 0.00 0.00 -0.01 -0.01
## s72 s73 s74 s75 s76 s77 s78 s79 s80 s81 s82 s83
## -0.01 -0.02 -0.02 -0.02 -0.03 -0.03 -0.04 -0.04 -0.04 -0.05 -0.05 -0.06
## s84 s85 s86 s87 s88 s89 s90 s91 s92 s93 s94 s95
## -0.06 -0.07 -0.07 -0.08 -0.08 -0.09 -0.09 -0.10 -0.10 -0.11 -0.11 -0.12
## s96 s97 s98 s99
## -0.12 -0.13 -0.13 -0.14
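To see this single path graphically, one could plot it against log(lambda); a minimal base-R sketch:

# Plot the path of the 8th coefficient (dis) against log(lambda);
# the dashed line marks zero, which this path crosses along the sequence
plot(log(myridge$lambda), myridge$beta[8, ], type = "l",
     xlab = "log(lambda)", ylab = "coefficient of dis")
abline(h = 0, lty = 2)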
Since the intercept is not penalized, the ridge solution satisfies a0 = mean(Y) - sum(my.mean * beta), where my.mean holds the column means of the training X. Verify this for one lambda value, say the 2nd:

k = 2
my.mean = apply(X[-test.id, ], 2, mean) # 13x1 mean vector for training X
mean(Y[-test.id]) - sum(my.mean * myridge$beta[, k])
## [1] 3.024883
myridge$a0[k] # intercept for lambda = myridge$lambda[k]
## s1
## 3.024883
# Check whether our intercept formula is true for all intercepts
sum((mean(Y[-test.id]) - my.mean %*% myridge$beta - myridge$a0)^2)
## [1] 2.031317e-28
Next, use cv.glmnet to select lambda by cross-validation. Among the components of the returned object:

cv.out$cvm
: mean CV error

cv.out$cvsd
: estimate of the standard error of cvm

Two choices for lambda:

lambda.min
: the value of lambda that gives the minimum cvm

lambda.1se
: the largest value of lambda (i.e., the strongest regularization, the smallest df) whose cvm is within one standard error of the cvm of lambda.min

cv.out = cv.glmnet(X[-test.id, ], Y[-test.id], alpha = 0)
plot(cv.out)
# Rerun CV on a user-specified lambda grid
lam.seq = exp(seq(-6, 2, length = 100))
cv.out = cv.glmnet(X[-test.id, ], Y[-test.id], alpha = 0, lambda = lam.seq)
plot(cv.out)
names(cv.out)
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "name" "glmnet.fit" "lambda.min" "lambda.1se"
# The lambda minimizing cvm agrees with the stored lambda.min
cv.out$lambda[which.min(cv.out$cvm)]
## [1] 0.002687371
cv.out$lambda.min
## [1] 0.002687371
# lambda.1se computed manually: the largest lambda whose cvm is within
# one standard error of the minimum cvm
tmp.id = which.min(cv.out$cvm)
max(cv.out$lambda[cv.out$cvm < cv.out$cvm[tmp.id] + cv.out$cvsd[tmp.id]])
## [1] 0.1019953
cv.out$lambda.1se
## [1] 0.1019953
myridge = glmnet(X[-test.id,], Y[-test.id], alpha=0, lambda = lam.seq)
Ytest.pred = predict(myridge, s = cv.out$lambda.1se, newx=X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.05229618
Ytest.pred=predict(myridge, s = cv.out$lambda.min, newx=X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.0467029
mylasso = glmnet(X[-test.id,], Y[-test.id], alpha = 1)
summary(mylasso)
## Length Class Mode
## a0 75 -none- numeric
## beta 975 dgCMatrix S4
## df 75 -none- numeric
## dim 2 -none- numeric
## lambda 75 -none- numeric
## dev.ratio 75 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
par(mfrow = c(1, 2))
plot(mylasso, label=TRUE, xvar = "norm")
plot(mylasso, label=TRUE, xvar = "lambda")
par(mfrow=c(1,1))
mylasso$df # number of nonzero coefficients at each lambda
## [1] 0 1 1 1 1 1 1 1 1 1 1 1 2 2 3 3 5 5 5 5 5 5 5
## [24] 6 6 6 6 6 6 6 7 7 7 7 7 7 7 8 8 9 9 10 10 10 11 11
## [47] 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
## [70] 12 12 12 12 12 12
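For the lasso, the df component is simply the number of nonzero coefficients at each lambda; a quick sanity check that should print TRUE:

# df reported by glmnet should match the count of nonzero entries
# in each column of the beta matrix
all(mylasso$df == colSums(mylasso$beta != 0))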
cv.out = cv.glmnet(X[-test.id, ], Y[-test.id], alpha = 1)
plot(cv.out)
# lam.seq = exp(seq(-4, 2, length=100))
# lam.seq = exp(seq(-6, -1, length=100))
# cv.out = cv.glmnet(X[-test.id,], Y[-test.id], alpha = 1, lambda = lam.seq)
# plot(cv.out)
cv.out$lambda.min
tmp.id=which.min(cv.out$cvm)
cv.out$lambda[tmp.id]
cv.out$lambda.1se
max(cv.out$lambda[cv.out$cvm < cv.out$cvm[tmp.id] + cv.out$cvsd[tmp.id]])
mylasso.coef.min = predict(mylasso, s=cv.out$lambda.min, type="coefficients")
mylasso.coef.1se = predict(mylasso, s=cv.out$lambda.1se, type="coefficients")
cbind(mylasso.coef.min, mylasso.coef.1se)
# number of nonzero coefficients (including the intercept)
sum(mylasso.coef.1se != 0)
# names of selected non-intercept variables
row.names(mylasso.coef.1se)[nonzeroCoef(mylasso.coef.1se)[-1]]
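The same pattern applies at lambda.min; for comparison, a quick look at how many and which variables it keeps:

# Nonzero coefficients (including the intercept) at lambda.min,
# and the names of the selected non-intercept variables
sum(mylasso.coef.min != 0)
row.names(mylasso.coef.min)[nonzeroCoef(mylasso.coef.min)[-1]]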
mylasso = glmnet(X[-test.id, ], Y[-test.id], alpha = 1)
Ytest.pred = predict(mylasso, s = cv.out$lambda.min, newx = X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04656643
Ytest.pred = predict(mylasso, s = cv.out$lambda.1se, newx = X[test.id,])
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04702326
# Variables selected by lambda.1se
mylasso.coef.1se = predict(mylasso, s = cv.out$lambda.1se, type="coefficients")
var.sel = row.names(mylasso.coef.1se)[nonzeroCoef(mylasso.coef.1se)[-1]]
var.sel;
## [1] "chas" "rm" "dis" "tax" "ptratio" "black" "lstat"
tmp.X = X[, colnames(X) %in% var.sel]
mylasso.refit = coef(lm(Y[-test.id] ~ tmp.X[-test.id, ]))
Ytest.pred = mylasso.refit[1] + tmp.X[test.id,] %*% mylasso.refit[-1]
mean((Ytest.pred - Y[test.id])^2)
## [1] 0.04533682
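The same refit can be expressed with a data frame and predict(), avoiding the manual intercept arithmetic; a minimal sketch reusing Y, X, var.sel, and test.id from above:

# Refit OLS on the lasso-selected variables via the formula interface,
# then predict on the held-out rows; this should reproduce the value above
refit.df = data.frame(Y = Y, X[, var.sel, drop = FALSE])
refit.lm = lm(Y ~ ., data = refit.df[-test.id, ])
Ytest.pred = predict(refit.lm, newdata = refit.df[test.id, ])
mean((Ytest.pred - Y[test.id])^2)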