Stat542: Ensemble Trees for Regression

Load packages.

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

Split the data into training and test.

load("BostonHousing1.Rdata")
mydata = Housing1
n = nrow(mydata)
ntest = round(n * 0.3)
set.seed(1234)
test.id = sample(1:n, ntest)

Fit a random forest

rfModel = randomForest(Y ~ ., data = mydata[-test.id, ],
                       importance = T, ntree=400); 
names(rfModel)

##  [1] "call"            "type"            "predicted"      
##  [4] "mse"             "rsq"             "oob.times"      
##  [7] "importance"      "importanceSD"    "localImportance"
## [10] "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"              
## [16] "test"            "inbag"           "terms"

## the default value for mtry is p/3 for regression
## p = ncol(mydata) - 1 = 15
## mtry = 15/3 = 5
rfModel$mtry

## [1] 5

Test Error

yhat.test = predict(rfModel, mydata[test.id, ])
sum((mydata$Y[test.id] - yhat.test)^2)/length(test.id)

## [1] 0.01439591

Two Training Errors

yhat.train = predict(rfModel, mydata[-test.id, ])
sum((mydata$Y[-test.id] - yhat.train) ^2)/(n - ntest)

## [1] 0.004060994

sum((mydata$Y[-test.id] - rfModel$predicted) ^2)/(n - ntest)

## [1] 0.02225524

We can evaluate the training error by obtaining the prediction on the training set (same as the regular training error, i.e., may underestimate the realerror). But randomForest provides an estimate of the training error based on OOB samples, which is similar to CV errors, i.e., an unbiased estimate of the real classification error.

rfModel$oob.times[1:5]

## [1] 115 162 164 161 146

length(rfModel$oob)

## [1] 354

## oob.times --> ntree * exp(-1) = ntree * 0.368
rfModel$ntree * exp(-1)

## [1] 147.1518

mean(rfModel$oob.times)

## [1] 147.1949

The plot function for randomForest

tmp = rfModel$mse
par(mfrow=c(1, 2))
plot(rfModel)
plot(c(0, rfModel$ntree), range(tmp), type="n",
     xlab = "Number of trees", ylab="Error")
lines(tmp)

par(mfrow=c(1, 1))

Variable importance

%IncMSE: increase in MSE of predictions if variable j being permuted (for OOB samples) – shuffle the value of variable j, record the change of MSE for each tree, and average over all trees. Can be normalize by standard error with option “scale = TRUE” (default is unscaled).
IncNodePurity: total decrease of RSS from splitting on variable j, averaged over all trees.

## default %IncMSE is normalized
rfModel$importance

##              %IncMSE IncNodePurity
## chas    0.0008257216     0.2000874
## lon     0.0120863904     3.3318724
## lat     0.0036104675     1.1038888
## crim    0.0187398238     7.0187502
## zn      0.0006891212     0.1795601
## indus   0.0058836110     1.9351423
## nox     0.0244205920     5.9940021
## rm      0.0389700994    10.7947829
## age     0.0066059605     1.3542063
## dis     0.0079863880     2.7116109
## rad     0.0019257950     0.4307959
## tax     0.0079054051     2.6530136
## ptratio 0.0071258850     2.4985138
## b       0.0049439470     1.3784467
## lstat   0.0957308705    18.3817402

importance(rfModel, scale = F)

##              %IncMSE IncNodePurity
## chas    0.0008257216     0.2000874
## lon     0.0120863904     3.3318724
## lat     0.0036104675     1.1038888
## crim    0.0187398238     7.0187502
## zn      0.0006891212     0.1795601
## indus   0.0058836110     1.9351423
## nox     0.0244205920     5.9940021
## rm      0.0389700994    10.7947829
## age     0.0066059605     1.3542063
## dis     0.0079863880     2.7116109
## rad     0.0019257950     0.4307959
## tax     0.0079054051     2.6530136
## ptratio 0.0071258850     2.4985138
## b       0.0049439470     1.3784467
## lstat   0.0957308705    18.3817402

cbind(importance(rfModel, scale = TRUE), 
      importance(rfModel, scale = F)[,1]/rfModel$importanceSD)

##           %IncMSE IncNodePurity          
## chas     4.339701     0.2000874  4.339701
## lon     15.944197     3.3318724 15.944197
## lat     10.000114     1.1038888 10.000114
## crim    16.220690     7.0187502 16.220690
## zn       3.607632     0.1795601  3.607632
## indus    7.059829     1.9351423  7.059829
## nox     17.287019     5.9940021 17.287019
## rm      32.500312    10.7947829 32.500312
## age     12.846430     1.3542063 12.846430
## dis     11.507094     2.7116109 11.507094
## rad      5.201417     0.4307959  5.201417
## tax     10.221805     2.6530136 10.221805
## ptratio  9.742296     2.4985138  9.742296
## b       12.123677     1.3784467 12.123677
## lstat   27.105951    18.3817402 27.105951

par(mfrow = c(1,2))
varImpPlot(rfModel, type=1)
varImpPlot(rfModel, type=2)