library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
load("BostonHousing1.Rdata")
mydata = Housing1
n = nrow(mydata)
ntest = round(n * 0.3)
set.seed(1234)
test.id = sample(1:n, ntest)
rfModel = randomForest(Y ~ ., data = mydata[-test.id, ],
importance = T, ntree=400);
names(rfModel)
## [1] "call" "type" "predicted"
## [4] "mse" "rsq" "oob.times"
## [7] "importance" "importanceSD" "localImportance"
## [10] "proximity" "ntree" "mtry"
## [13] "forest" "coefs" "y"
## [16] "test" "inbag" "terms"
## the default value for mtry is p/3 for regression
## p = ncol(mydata) - 1 = 15
## mtry = 15/3 = 5
rfModel$mtry
## [1] 5
yhat.test = predict(rfModel, mydata[test.id, ])
sum((mydata$Y[test.id] - yhat.test)^2)/length(test.id)
## [1] 0.01439591
yhat.train = predict(rfModel, mydata[-test.id, ])
sum((mydata$Y[-test.id] - yhat.train) ^2)/(n - ntest)
## [1] 0.004060994
sum((mydata$Y[-test.id] - rfModel$predicted) ^2)/(n - ntest)
## [1] 0.02225524
We can evaluate the training error by obtaining the prediction on the training set (same as the regular training error, i.e., may underestimate the realerror). But randomForest provides an estimate of the training error based on OOB samples, which is similar to CV errors, i.e., an unbiased estimate of the real classification error.
rfModel$oob.times[1:5]
## [1] 115 162 164 161 146
length(rfModel$oob)
## [1] 354
## oob.times --> ntree * exp(-1) = ntree * 0.368
rfModel$ntree * exp(-1)
## [1] 147.1518
mean(rfModel$oob.times)
## [1] 147.1949
tmp = rfModel$mse
par(mfrow=c(1, 2))
plot(rfModel)
plot(c(0, rfModel$ntree), range(tmp), type="n",
xlab = "Number of trees", ylab="Error")
lines(tmp)
par(mfrow=c(1, 1))
%IncMSE: increase in MSE of predictions if variable j being permuted (for OOB samples) – shuffle the value of variable j, record the change of MSE for each tree, and average over all trees. Can be normalize by standard error with option “scale = TRUE” (default is unscaled).
IncNodePurity: total decrease of RSS from splitting on variable j, averaged over all trees.
## default %IncMSE is normalized
rfModel$importance
## %IncMSE IncNodePurity
## chas 0.0008257216 0.2000874
## lon 0.0120863904 3.3318724
## lat 0.0036104675 1.1038888
## crim 0.0187398238 7.0187502
## zn 0.0006891212 0.1795601
## indus 0.0058836110 1.9351423
## nox 0.0244205920 5.9940021
## rm 0.0389700994 10.7947829
## age 0.0066059605 1.3542063
## dis 0.0079863880 2.7116109
## rad 0.0019257950 0.4307959
## tax 0.0079054051 2.6530136
## ptratio 0.0071258850 2.4985138
## b 0.0049439470 1.3784467
## lstat 0.0957308705 18.3817402
importance(rfModel, scale = F)
## %IncMSE IncNodePurity
## chas 0.0008257216 0.2000874
## lon 0.0120863904 3.3318724
## lat 0.0036104675 1.1038888
## crim 0.0187398238 7.0187502
## zn 0.0006891212 0.1795601
## indus 0.0058836110 1.9351423
## nox 0.0244205920 5.9940021
## rm 0.0389700994 10.7947829
## age 0.0066059605 1.3542063
## dis 0.0079863880 2.7116109
## rad 0.0019257950 0.4307959
## tax 0.0079054051 2.6530136
## ptratio 0.0071258850 2.4985138
## b 0.0049439470 1.3784467
## lstat 0.0957308705 18.3817402
cbind(importance(rfModel, scale = TRUE),
importance(rfModel, scale = F)[,1]/rfModel$importanceSD)
## %IncMSE IncNodePurity
## chas 4.339701 0.2000874 4.339701
## lon 15.944197 3.3318724 15.944197
## lat 10.000114 1.1038888 10.000114
## crim 16.220690 7.0187502 16.220690
## zn 3.607632 0.1795601 3.607632
## indus 7.059829 1.9351423 7.059829
## nox 17.287019 5.9940021 17.287019
## rm 32.500312 10.7947829 32.500312
## age 12.846430 1.3542063 12.846430
## dis 11.507094 2.7116109 11.507094
## rad 5.201417 0.4307959 5.201417
## tax 10.221805 2.6530136 10.221805
## ptratio 9.742296 2.4985138 9.742296
## b 12.123677 1.3784467 12.123677
## lstat 27.105951 18.3817402 27.105951
par(mfrow = c(1,2))
varImpPlot(rfModel, type=1)
varImpPlot(rfModel, type=2)