Stat542: GBM for Regression

Use gbm package in R to fit a gradient boosting model. For more GBM examples, check how to use GBM to analyze the AmesHousing data

Use loss function gaussian (squared error) for regression.
n.trees: number of trees (default 100)
shrinkage: shrinkage factor for the step size (default 0.001)
bag.fraction: subsampling rate (default 0.5)
cv.folds: return CV error (default 0, i.e., no CV error returned)
interaction.depth: depth of trees (default 1)

rm(list=objects())
library(gbm)

load("BostonHousing1.Rdata")
mydata = Housing1
n = nrow(mydata)
ntest = round(n * 0.3)
set.seed(1234)
test.id = sample(1:n, ntest)

myfit1 = gbm(Y ~ . , data = mydata[-test.id, ], 
            distribution = "gaussian", 
            n.trees = 100,
            shrinkage = 1, 
            interaction.depth = 3, 
            bag.fraction = 1,
            cv.folds = 5)
myfit1

## gbm(formula = Y ~ ., distribution = "gaussian", data = mydata[-test.id, 
##     ], n.trees = 100, interaction.depth = 3, shrinkage = 1, bag.fraction = 1, 
##     cv.folds = 5)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 28.
## There were 15 predictors of which 13 had non-zero influence.

gbm.perf(myfit1)

## Using cv method...

## [1] 28

opt.size = gbm.perf(myfit1)

## Using cv method...

size = 1:myfit1$n.trees
test.err = rep(0, length(size))
for(i in 1:length(size)){
    y.pred = predict(myfit1, mydata[test.id, ], n.trees = size[i])
    test.err[i] = sum((mydata$Y[test.id] - y.pred)^2)
}    
plot(test.err, type = "n")
lines(size, test.err, lwd = 2)
abline(v = opt.size, lwd = 2, lty = 2, col = "blue")

myfit2 = gbm(Y ~ . , data = mydata[-test.id, ], 
            distribution = "gaussian", 
            n.trees = 1000,
            shrinkage = 0.1, 
            interaction.depth = 3, 
            bag.fraction = 0.5,
            cv.folds = 5)
gbm.perf(myfit2)

## Using cv method...

## [1] 931

opt.size = gbm.perf(myfit2)

## Using cv method...

size = 1:myfit2$n.trees
test.err = rep(0, length(size))
for(i in 1:length(size)){
    y.pred = predict(myfit2, mydata[test.id, ], n.trees = size[i])
    test.err[i] = sum((mydata$Y[test.id] - y.pred)^2)
}    
plot(test.err, type = "n")
lines(size, test.err, lwd = 2)
abline(v = opt.size, lwd = 2, lty = 2, col = "blue")

Variable importance.

par(mfrow=c(1, 2))
summary(myfit1, cBars = 10,
  method = relative.influence, 
  las = 2)

##             var     rel.inf
## lstat     lstat 54.55954661
## rm           rm 17.61402235
## crim       crim 14.69199917
## dis         dis  3.84716981
## lon         lon  2.72932865
## b             b  1.49663464
## nox         nox  1.34947616
## age         age  1.18250370
## ptratio ptratio  0.90179432
## tax         tax  0.80956010
## lat         lat  0.58140632
## zn           zn  0.10850492
## rad         rad  0.07485697
## indus     indus  0.05319626
## chas       chas  0.00000000

summary(myfit1, cBars = 10,
  method = permutation.test.gbm, 
  las = 2)

##        var    rel.inf
## 1    lstat 45.2770326
## 2       rm 14.0000143
## 3     crim 10.4393911
## 4      dis  8.3214276
## 5      age  5.7788658
## 6      lon  4.1967632
## 7      nox  4.0238256
## 8      tax  2.6741552
## 9        b  2.3376530
## 10 ptratio  1.6785949
## 11     lat  0.7688673
## 12      zn  0.2911208
## 13     rad  0.1103934
## 14   indus  0.1018952
## 15    chas  0.0000000

par(mfrow = c(1,1))