Prepare the Boston Housing Data:

The preparation is the same as what we did in Part I. Recall the data contain the following columns:

1. MEDV (Y)  Median value of owner-occupied homes in $1000's (response variable)
2. CRIM      per capita crime rate by town
3. ZN        proportion of residential land zoned for lots over 
             25,000 sq.ft.
4. INDUS     proportion of non-retail business acres per town
5. CHAS      Charles River dummy variable (= 1 if tract bounds 
             river; 0 otherwise)
6. NOX       nitric oxides concentration (parts per 10 million)
7. RM        average number of rooms per dwelling
8. AGE       proportion of owner-occupied units built prior to 1940
9. DIS       weighted distances to five Boston employment centres
10. RAD       index of accessibility to radial highways
11. TAX      full-value property-tax rate per $10,000
12. PTRATIO  pupil-teacher ratio by town
13. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
             by town
14. LSTAT    % lower status of the population
library(MASS)
myData = Boston
names(myData)[14] = "Y"
iLog = c( 1,3,5,6,8,9,10,14 );
myData[,iLog] = log( myData[,iLog] );
myData[,2] = myData[,2] / 10;
myData[,7] = myData[,7]^2.5 / 10^4
myData[,11] = exp( 0.4 * myData[,11] ) / 1000;
myData[,12] = myData[,12] / 100;
myData[,13] = sqrt( myData[,13] );

Fit the full model

full.model = lm( Y ~ ., data = myData);  
n = dim(myData)[1];
summary(full.model)
## 
## Call:
## lm(formula = Y ~ ., data = myData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9918 -0.1002 -0.0034  0.1117  0.7640 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.176874   0.379017  11.020  < 2e-16 ***
## crim        -0.014606   0.011650  -1.254 0.210527    
## zn           0.001392   0.005639   0.247 0.805121    
## indus       -0.012709   0.022312  -0.570 0.569195    
## chas         0.109980   0.036634   3.002 0.002817 ** 
## nox         -0.283112   0.105340  -2.688 0.007441 ** 
## rm           0.421108   0.110175   3.822 0.000149 ***
## age          0.006403   0.004863   1.317 0.188536    
## dis         -0.183154   0.036804  -4.977 8.97e-07 ***
## rad          0.068362   0.022473   3.042 0.002476 ** 
## tax         -0.201832   0.048432  -4.167 3.64e-05 ***
## ptratio     -0.040017   0.008091  -4.946 1.04e-06 ***
## black        0.044472   0.011456   3.882 0.000118 ***
## lstat       -0.262615   0.016091 -16.320  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2008 on 492 degrees of freedom
## Multiple R-squared:  0.765,  Adjusted R-squared:  0.7588 
## F-statistic: 123.2 on 13 and 492 DF,  p-value: < 2.2e-16
p = 13

Stepwise AIC

stepAIC = step(full.model, direction="both")
## Start:  AIC=-1611.15
## Y ~ crim + zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - zn       1    0.0025 19.831 -1613.1
## - indus    1    0.0131 19.841 -1612.8
## - crim     1    0.0634 19.892 -1611.5
## - age      1    0.0699 19.898 -1611.4
## <none>                 19.828 -1611.2
## - nox      1    0.2911 20.119 -1605.8
## - chas     1    0.3632 20.192 -1604.0
## - rad      1    0.3729 20.201 -1603.7
## - rm       1    0.5888 20.417 -1598.3
## - black    1    0.6073 20.436 -1597.9
## - tax      1    0.6999 20.528 -1595.6
## - ptratio  1    0.9857 20.814 -1588.6
## - dis      1    0.9981 20.826 -1588.3
## - lstat    1   10.7345 30.563 -1394.2
## 
## Step:  AIC=-1613.08
## Y ~ crim + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - indus    1    0.0211 19.852 -1614.5
## - crim     1    0.0660 19.897 -1613.4
## - age      1    0.0698 19.901 -1613.3
## <none>                 19.831 -1613.1
## + zn       1    0.0025 19.828 -1611.2
## - nox      1    0.2950 20.126 -1607.6
## - chas     1    0.3668 20.198 -1605.8
## - rad      1    0.3705 20.201 -1605.7
## - rm       1    0.5921 20.423 -1600.2
## - black    1    0.6049 20.436 -1599.9
## - tax      1    0.7657 20.597 -1595.9
## - dis      1    1.0049 20.836 -1590.1
## - ptratio  1    1.0241 20.855 -1589.6
## - lstat    1   10.7378 30.569 -1396.1
## 
## Step:  AIC=-1614.55
## Y ~ crim + chas + nox + rm + age + dis + rad + tax + ptratio + 
##     black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - age      1    0.0710 19.923 -1614.7
## - crim     1    0.0773 19.929 -1614.6
## <none>                 19.852 -1614.5
## + indus    1    0.0211 19.831 -1613.1
## + zn       1    0.0105 19.841 -1612.8
## - nox      1    0.3253 20.177 -1608.3
## - chas     1    0.3504 20.202 -1607.7
## - rad      1    0.3806 20.233 -1606.9
## - black    1    0.5986 20.451 -1601.5
## - rm       1    0.6459 20.498 -1600.3
## - tax      1    0.8147 20.667 -1596.2
## - dis      1    0.9879 20.840 -1592.0
## - ptratio  1    1.1043 20.956 -1589.2
## - lstat    1   10.8744 30.726 -1395.5
## 
## Step:  AIC=-1614.74
## Y ~ crim + chas + nox + rm + dis + rad + tax + ptratio + black + 
##     lstat
## 
##           Df Sum of Sq    RSS     AIC
## - crim     1    0.0561 19.979 -1615.3
## <none>                 19.923 -1614.7
## + age      1    0.0710 19.852 -1614.5
## + indus    1    0.0223 19.901 -1613.3
## + zn       1    0.0107 19.912 -1613.0
## - nox      1    0.2798 20.203 -1609.7
## - rad      1    0.3365 20.259 -1608.3
## - chas     1    0.3688 20.292 -1607.5
## - black    1    0.6546 20.578 -1600.4
## - rm       1    0.7829 20.706 -1597.2
## - tax      1    0.8216 20.745 -1596.3
## - ptratio  1    1.0525 20.976 -1590.7
## - dis      1    1.3550 21.278 -1583.5
## - lstat    1   11.6815 31.605 -1383.3
## 
## Step:  AIC=-1615.32
## Y ~ chas + nox + rm + dis + rad + tax + ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 19.979 -1615.3
## + crim     1    0.0561 19.923 -1614.7
## + age      1    0.0497 19.929 -1614.6
## + indus    1    0.0319 19.947 -1614.1
## + zn       1    0.0179 19.961 -1613.8
## - rad      1    0.2914 20.270 -1610.0
## - chas     1    0.3624 20.341 -1608.2
## - nox      1    0.3971 20.376 -1607.4
## - black    1    0.7468 20.726 -1598.8
## - rm       1    0.7824 20.761 -1597.9
## - tax      1    0.8916 20.871 -1595.2
## - ptratio  1    1.1229 21.102 -1589.7
## - dis      1    1.2990 21.278 -1585.4
## - lstat    1   12.1901 32.169 -1376.3
# set "trace=0" if you do not want to see the intermediate results
#stepAIC = step(full.model, trace=0, direction="both")            

Stepwise BIC

n = dim(myData)[1]
stepBIC = step(full.model, direction="both", k=log(n))      
## Start:  AIC=-1551.97
## Y ~ crim + zn + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - zn       1    0.0025 19.831 -1558.1
## - indus    1    0.0131 19.841 -1557.9
## - crim     1    0.0634 19.892 -1556.6
## - age      1    0.0699 19.898 -1556.4
## <none>                 19.828 -1552.0
## - nox      1    0.2911 20.119 -1550.8
## - chas     1    0.3632 20.192 -1549.0
## - rad      1    0.3729 20.201 -1548.8
## - rm       1    0.5888 20.417 -1543.4
## - black    1    0.6073 20.436 -1542.9
## - tax      1    0.6999 20.528 -1540.7
## - ptratio  1    0.9857 20.814 -1533.7
## - dis      1    0.9981 20.826 -1533.3
## - lstat    1   10.7345 30.563 -1339.3
## 
## Step:  AIC=-1558.14
## Y ~ crim + indus + chas + nox + rm + age + dis + rad + tax + 
##     ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - indus    1    0.0211 19.852 -1563.8
## - crim     1    0.0660 19.897 -1562.7
## - age      1    0.0698 19.901 -1562.6
## <none>                 19.831 -1558.1
## - nox      1    0.2950 20.126 -1556.9
## - chas     1    0.3668 20.198 -1555.1
## - rad      1    0.3705 20.201 -1555.0
## + zn       1    0.0025 19.828 -1552.0
## - rm       1    0.5921 20.423 -1549.5
## - black    1    0.6049 20.436 -1549.2
## - tax      1    0.7657 20.597 -1545.2
## - dis      1    1.0049 20.836 -1539.3
## - ptratio  1    1.0241 20.855 -1538.9
## - lstat    1   10.7378 30.569 -1345.4
## 
## Step:  AIC=-1563.83
## Y ~ crim + chas + nox + rm + age + dis + rad + tax + ptratio + 
##     black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## - age      1    0.0710 19.923 -1568.2
## - crim     1    0.0773 19.929 -1568.1
## <none>                 19.852 -1563.8
## - nox      1    0.3253 20.177 -1561.8
## - chas     1    0.3504 20.202 -1561.2
## - rad      1    0.3806 20.233 -1560.4
## + indus    1    0.0211 19.831 -1558.1
## + zn       1    0.0105 19.841 -1557.9
## - black    1    0.5986 20.451 -1555.0
## - rm       1    0.6459 20.498 -1553.8
## - tax      1    0.8147 20.667 -1549.7
## - dis      1    0.9879 20.840 -1545.5
## - ptratio  1    1.1043 20.956 -1542.7
## - lstat    1   10.8744 30.726 -1349.0
## 
## Step:  AIC=-1568.25
## Y ~ crim + chas + nox + rm + dis + rad + tax + ptratio + black + 
##     lstat
## 
##           Df Sum of Sq    RSS     AIC
## - crim     1    0.0561 19.979 -1573.0
## <none>                 19.923 -1568.2
## - nox      1    0.2798 20.203 -1567.4
## - rad      1    0.3365 20.259 -1566.0
## - chas     1    0.3688 20.292 -1565.2
## + age      1    0.0710 19.852 -1563.8
## + indus    1    0.0223 19.901 -1562.6
## + zn       1    0.0107 19.912 -1562.3
## - black    1    0.6546 20.578 -1558.1
## - rm       1    0.7829 20.706 -1555.0
## - tax      1    0.8216 20.745 -1554.0
## - ptratio  1    1.0525 20.976 -1548.4
## - dis      1    1.3550 21.278 -1541.2
## - lstat    1   11.6815 31.605 -1341.0
## 
## Step:  AIC=-1573.05
## Y ~ chas + nox + rm + dis + rad + tax + ptratio + black + lstat
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 19.979 -1573.0
## - rad      1    0.2914 20.270 -1572.0
## - chas     1    0.3624 20.341 -1570.2
## - nox      1    0.3971 20.376 -1569.3
## + crim     1    0.0561 19.923 -1568.2
## + age      1    0.0497 19.929 -1568.1
## + indus    1    0.0319 19.947 -1567.6
## + zn       1    0.0179 19.961 -1567.3
## - black    1    0.7468 20.726 -1560.7
## - rm       1    0.7824 20.761 -1559.8
## - tax      1    0.8916 20.871 -1557.2
## - ptratio  1    1.1229 21.102 -1551.6
## - dis      1    1.2990 21.278 -1547.4
## - lstat    1   12.1901 32.169 -1338.3
sel.var.AIC = attr(stepAIC$terms, "term.labels")
sel.var.BIC = attr(stepBIC$terms, "term.labels")
sel.var.AIC
## [1] "chas"    "nox"     "rm"      "dis"     "rad"     "tax"     "ptratio"
## [8] "black"   "lstat"
length(sel.var.AIC)
## [1] 9
length(sel.var.BIC)
## [1] 9
c("rad", "hello") %in% sel.var.AIC
## [1]  TRUE FALSE
sel.var.BIC %in% sel.var.AIC
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

```