Load packages.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.

Split the data into training and test.

load("BostonHousing1.Rdata")
mydata = Housing1
n = nrow(mydata)
ntest = round(n * 0.3)
set.seed(1234)
test.id = sample(1:n, ntest)

Fit a random forest

rfModel = randomForest(Y ~ ., data = mydata[-test.id, ],
                       importance = T, ntree=400); 
names(rfModel)
##  [1] "call"            "type"            "predicted"      
##  [4] "mse"             "rsq"             "oob.times"      
##  [7] "importance"      "importanceSD"    "localImportance"
## [10] "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"              
## [16] "test"            "inbag"           "terms"
## the default value for mtry is p/3 for regression
## p = ncol(mydata) - 1 = 15
## mtry = 15/3 = 5
rfModel$mtry
## [1] 5

Test Error

yhat.test = predict(rfModel, mydata[test.id, ])
sum((mydata$Y[test.id] - yhat.test)^2)/length(test.id)
## [1] 0.01439591

Two Training Errors

yhat.train = predict(rfModel, mydata[-test.id, ])
sum((mydata$Y[-test.id] - yhat.train) ^2)/(n - ntest)
## [1] 0.004060994
sum((mydata$Y[-test.id] - rfModel$predicted) ^2)/(n - ntest)
## [1] 0.02225524

We can evaluate the training error by obtaining the prediction on the training set (same as the regular training error, i.e., may underestimate the realerror). But randomForest provides an estimate of the training error based on OOB samples, which is similar to CV errors, i.e., an unbiased estimate of the real classification error.

rfModel$oob.times[1:5]
## [1] 115 162 164 161 146
length(rfModel$oob)
## [1] 354
## oob.times --> ntree * exp(-1) = ntree * 0.368
rfModel$ntree * exp(-1)
## [1] 147.1518
mean(rfModel$oob.times)
## [1] 147.1949

The plot function for randomForest

tmp = rfModel$mse
par(mfrow=c(1, 2))
plot(rfModel)
plot(c(0, rfModel$ntree), range(tmp), type="n",
     xlab = "Number of trees", ylab="Error")
lines(tmp)

par(mfrow=c(1, 1))

Variable importance

## default %IncMSE is normalized
rfModel$importance
##              %IncMSE IncNodePurity
## chas    0.0008257216     0.2000874
## lon     0.0120863904     3.3318724
## lat     0.0036104675     1.1038888
## crim    0.0187398238     7.0187502
## zn      0.0006891212     0.1795601
## indus   0.0058836110     1.9351423
## nox     0.0244205920     5.9940021
## rm      0.0389700994    10.7947829
## age     0.0066059605     1.3542063
## dis     0.0079863880     2.7116109
## rad     0.0019257950     0.4307959
## tax     0.0079054051     2.6530136
## ptratio 0.0071258850     2.4985138
## b       0.0049439470     1.3784467
## lstat   0.0957308705    18.3817402
importance(rfModel, scale = F)
##              %IncMSE IncNodePurity
## chas    0.0008257216     0.2000874
## lon     0.0120863904     3.3318724
## lat     0.0036104675     1.1038888
## crim    0.0187398238     7.0187502
## zn      0.0006891212     0.1795601
## indus   0.0058836110     1.9351423
## nox     0.0244205920     5.9940021
## rm      0.0389700994    10.7947829
## age     0.0066059605     1.3542063
## dis     0.0079863880     2.7116109
## rad     0.0019257950     0.4307959
## tax     0.0079054051     2.6530136
## ptratio 0.0071258850     2.4985138
## b       0.0049439470     1.3784467
## lstat   0.0957308705    18.3817402
cbind(importance(rfModel, scale = TRUE), 
      importance(rfModel, scale = F)[,1]/rfModel$importanceSD)
##           %IncMSE IncNodePurity          
## chas     4.339701     0.2000874  4.339701
## lon     15.944197     3.3318724 15.944197
## lat     10.000114     1.1038888 10.000114
## crim    16.220690     7.0187502 16.220690
## zn       3.607632     0.1795601  3.607632
## indus    7.059829     1.9351423  7.059829
## nox     17.287019     5.9940021 17.287019
## rm      32.500312    10.7947829 32.500312
## age     12.846430     1.3542063 12.846430
## dis     11.507094     2.7116109 11.507094
## rad      5.201417     0.4307959  5.201417
## tax     10.221805     2.6530136 10.221805
## ptratio  9.742296     2.4985138  9.742296
## b       12.123677     1.3784467 12.123677
## lstat   27.105951    18.3817402 27.105951
par(mfrow = c(1,2))
varImpPlot(rfModel, type=1)
varImpPlot(rfModel, type=2)