Use gbm
package in R to fit a gradient boosting model. For more GBM examples, check how to use GBM to analyze the AmesHousing data
Use loss function gaussian
(squared error) for regression.
n.trees
: number of trees (default 100)
shrinkage
: shrinkage factor for the step size (default 0.001)
bag.fraction
: subsampling rate (default 0.5)
cv.folds
: return CV error (default 0, i.e., no CV error returned)
interaction.depth
: depth of trees (default 1)
rm(list=objects())
library(gbm)
load("BostonHousing1.Rdata")
mydata = Housing1
n = nrow(mydata)
ntest = round(n * 0.3)
set.seed(1234)
test.id = sample(1:n, ntest)
myfit1 = gbm(Y ~ . , data = mydata[-test.id, ],
distribution = "gaussian",
n.trees = 100,
shrinkage = 1,
interaction.depth = 3,
bag.fraction = 1,
cv.folds = 5)
myfit1
## gbm(formula = Y ~ ., distribution = "gaussian", data = mydata[-test.id,
## ], n.trees = 100, interaction.depth = 3, shrinkage = 1, bag.fraction = 1,
## cv.folds = 5)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 28.
## There were 15 predictors of which 13 had non-zero influence.
gbm.perf(myfit1)
## Using cv method...
## [1] 28
opt.size = gbm.perf(myfit1)
## Using cv method...
size = 1:myfit1$n.trees
test.err = rep(0, length(size))
for(i in 1:length(size)){
y.pred = predict(myfit1, mydata[test.id, ], n.trees = size[i])
test.err[i] = sum((mydata$Y[test.id] - y.pred)^2)
}
plot(test.err, type = "n")
lines(size, test.err, lwd = 2)
abline(v = opt.size, lwd = 2, lty = 2, col = "blue")
myfit2 = gbm(Y ~ . , data = mydata[-test.id, ],
distribution = "gaussian",
n.trees = 1000,
shrinkage = 0.1,
interaction.depth = 3,
bag.fraction = 0.5,
cv.folds = 5)
gbm.perf(myfit2)
## Using cv method...
## [1] 931
opt.size = gbm.perf(myfit2)
## Using cv method...
size = 1:myfit2$n.trees
test.err = rep(0, length(size))
for(i in 1:length(size)){
y.pred = predict(myfit2, mydata[test.id, ], n.trees = size[i])
test.err[i] = sum((mydata$Y[test.id] - y.pred)^2)
}
plot(test.err, type = "n")
lines(size, test.err, lwd = 2)
abline(v = opt.size, lwd = 2, lty = 2, col = "blue")
Variable importance.
par(mfrow=c(1, 2))
summary(myfit1, cBars = 10,
method = relative.influence,
las = 2)
## var rel.inf
## lstat lstat 54.55954661
## rm rm 17.61402235
## crim crim 14.69199917
## dis dis 3.84716981
## lon lon 2.72932865
## b b 1.49663464
## nox nox 1.34947616
## age age 1.18250370
## ptratio ptratio 0.90179432
## tax tax 0.80956010
## lat lat 0.58140632
## zn zn 0.10850492
## rad rad 0.07485697
## indus indus 0.05319626
## chas chas 0.00000000
summary(myfit1, cBars = 10,
method = permutation.test.gbm,
las = 2)
## var rel.inf
## 1 lstat 45.2770326
## 2 rm 14.0000143
## 3 crim 10.4393911
## 4 dis 8.3214276
## 5 age 5.7788658
## 6 lon 4.1967632
## 7 nox 4.0238256
## 8 tax 2.6741552
## 9 b 2.3376530
## 10 ptratio 1.6785949
## 11 lat 0.7688673
## 12 zn 0.2911208
## 13 rad 0.1103934
## 14 indus 0.1018952
## 15 chas 0.0000000
par(mfrow = c(1,1))