setwd("~/mydoc/Course/datamining/Rcode/")

Next we’ll go through the four examples mentioned in Chapter 1 of ESL. First, load the ElemStatLearn library, which contains all the datasets used in ESL.

library("ElemStatLearn")

Example 1: Prostate Cancer Data

We divide the samples into training and test data, and fit a linear regression model on the training set to predict the outcome “lpsa”. We then use the fitted model to form predictions on both the training and test data, measuring performance by the mean squared error.

The command predict takes a fitted regression model (the output from lm) and a new dataset as input, and returns predictions for the new dataset. R “remembers” features/covariates through their names, not their positions in the data matrix. So to form predictions, you need to provide the new dataset as a data frame with the same column names (for the features, i.e., the x’s) as the data frame used in the call to lm.

data("prostate")
names(prostate)  
## "lpsa" the 2nd last var is the response variable
## "train" is the indicator for training data
prostate$train
table(prostate$train)  # 67 training vs 30 testing

## Fit a linear regression model to predict lpsa
traindata = prostate[prostate$train==TRUE,]
testdata = prostate[prostate$train==FALSE,]

## Remove the "train" indicator column
traindata = within(traindata, rm(train))
testdata = within(testdata, rm(train))

myfit = lm(lpsa ~ . , data=traindata)
myfit  ## display the estimated LS coefficients
summary(myfit)  ## more output

mypredict = predict(myfit, newdata=testdata)

## mean squared error on the training and test sets. 
sum((traindata$lpsa - myfit$fitted)^2)/nrow(traindata)  
sum((testdata$lpsa - mypredict)^2)/nrow(testdata)       
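
As explained above, predict matches features by their column names. As a sketch, we can predict lpsa for one hypothetical patient; the feature values below are made up for illustration, and only the column names (which must match those in traindata) matter:

## Hypothetical new observation: invented values, real column names
newobs = data.frame(lcavol=1.35, lweight=3.6, age=65, lbph=0.1,
                    svi=0, lcp=-0.18, gleason=7, pgg45=15)
predict(myfit, newdata=newobs)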

Example 2: Spam Email

The column names for the spam data are not informative, except for the last column. To get a set of meaningful column names, we read them from the file spam_name.txt. We also rename the last column (the response variable) to “Y” and recode its values to 0 or 1: spam = 1 and email = 0.

data(spam)
?spam
dim(spam)
names(spam)

spam.name = read.table("spam_name.txt")
for(i in 1:(ncol(spam)-1)) names(spam)[i] = as.character(spam.name[i,])
names(spam)[ncol(spam)]="Y"
spam$Y = ifelse(spam$Y=="spam", 1, 0)

head(spam) # display the first 6 obs/rows (the default)
spam[1:2,] # display the first 2 obs/rows

attach(spam)
table(Y)  # counts for Y=1(spam) and 0 (email)

Does the distribution of each feature differ between spam and email? To answer this question, we can compute summary statistics (e.g., mean, sd, median) within spam and within email for each feature.

spam.sum = matrix(0, 57, 6)
colnames(spam.sum) = c("spam.mean", "email.mean", "spam.sd", "email.sd", "spam.med", "email.med")
rownames(spam.sum) = colnames(spam)[1:57]

for(i in 1:57){
  spam.sum[i,1] = mean(spam[Y==1,i])
  spam.sum[i,2] = mean(spam[Y==0,i])

  spam.sum[i,3] = sd(spam[Y==1,i])
  spam.sum[i,4] = sd(spam[Y==0,i])

  spam.sum[i,5] = median(spam[Y==1,i])
  spam.sum[i,6] = median(spam[Y==0,i])  
}

spam.sum  ## too many digits after the decimal point
round(spam.sum, digits=2)
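
The same summary table can be computed without the explicit loop. Here is a compact alternative (it assumes spam is still attached, so Y is visible) that applies the six statistics to each feature column via sapply:

## Loop-free version of spam.sum: one function call per feature column
spam.sum2 = t(sapply(spam[, 1:57], function(x)
  c(spam.mean = mean(x[Y==1]),   email.mean = mean(x[Y==0]),
    spam.sd   = sd(x[Y==1]),     email.sd   = sd(x[Y==0]),
    spam.med  = median(x[Y==1]), email.med  = median(x[Y==0]))))
round(spam.sum2, digits=2)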

spam.sum[1,]  ## summary stats for the 1st feature

## Compare the density of the 1st feature within spam vs email
library(ggplot2)
tmp = as.data.frame(spam[, c(1,58)])
ggplot(tmp, aes(x=Wmake, fill=as.factor(Y))) + 
  geom_density(alpha=0.2)

Next we use linear regression to do classification: if the prediction is bigger than 0.5, classify the sample as 1, and as 0 otherwise.

## Extract a subset from the training data to form a test set.
## If you want to make the sampling step reproducible, 
## use set.seed function. 

set.seed(150)  
n=nrow(spam)
test.id = rep(0, n)
test.id[sample(1:n, 500)] = 1

## I would ignore the p-values/t-stat/F-stat since
## the response Y is binary and clearly not normally distributed

mylm = lm(Y ~ ., data=spam[test.id!=1,])
summary(mylm)

pred.train = ifelse(mylm$fitted > 0.5, 1, 0)
sum(pred.train == Y[test.id==0])  ## number of correct predictions
train.err = 1 - sum(pred.train == Y[test.id==0])/(n-500)
train.err

pred.test = predict(mylm, spam[test.id==1,])
pred.test = ifelse(pred.test > 0.5, 1, 0)
test.err = 1 - sum(pred.test == Y[test.id==1])/500
test.err
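
The overall error rate hides what kind of mistakes the classifier makes. A confusion table on the test set (built from the objects above) separates the two error types:

## Rows = true class, columns = predicted class (1 = spam, 0 = email)
table(truth = Y[test.id==1], predicted = pred.test)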

Example 3: Handwritten Digits

?zip.train
data(zip.train)
dim(zip.train)

## The first column indicates the digit
table(zip.train[,1])
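
To see the relative frequency of each digit rather than raw counts, wrap the table in prop.table:

## Proportion of each digit among the training images
round(prop.table(table(zip.train[,1])), 3)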

Show a random sample of 12 images (arranged in a 3x4 table).

n=nrow(zip.train)
par(mfrow=c(3,4), pty='s', mar=c(1,1,1,1), xaxt='n', yaxt='n')
for(i in 1:12){
  tmp = zip.train[sample(1:n, 1), -1]
  tmp = tmp/max(tmp)
  tmp=matrix(tmp, 16, 16)
  image(tmp[, 16:1])
}

Make the images black-and-white. Define a custom color ramp with colorRampPalette (the RColorBrewer package provides additional ready-made palettes).

install.packages("RColorBrewer")  ## only needs to be run once
library(RColorBrewer)

## Color ramp definition
colors = c('white','black')
cus_col = colorRampPalette(colors=colors)

par(mfrow=c(3,4), pty='s', mar=c(1,1,1,1), xaxt='n', yaxt='n')
for(i in 1:12){
  tmp = zip.train[sample(1:n, 1), -1]
  tmp = tmp/max(tmp)*255
  tmp=matrix(tmp, 16, 16)
  image(tmp[, 16:1], col=cus_col(256))
}

Show the averaged image for each digit.

par(mfrow=c(3,4), pty='s', mar=c(1,1,1,1), xaxt='n', yaxt='n')
for(i in 0:9){
  tmp = zip.train[zip.train[,1]==i, -1]
  
  ## calculate the average image and rescale the value in each
  ## pixel to be between 0 to 255
  tmp=apply(tmp, 2, mean)
  tmp = tmp/max(tmp)*255
  
  tmp=matrix(tmp, 16, 16)
  image(tmp[, 16:1], col=cus_col(256))
}

Try “k-nearest-neighbors” from the library “class”.

library(class)  
?knn   ## check the help file for command "knn"

## For knn, no need to spend extra time on training.
## Training is done on the fly when a test point is given.
## The command below may take some time to run.
data(zip.test)  ## load the test images as well
pred = knn(zip.train[,-1], zip.test[,-1], zip.train[,1])

## display the classification results in a 10x10 confusion table
## (rows = true digits, columns = predicted digits)
mytable = table(zip.test[,1], pred)
mytable

## Compute the 0/1 test error
1 - sum(diag(mytable))/nrow(zip.test)
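
The knn call above uses the default k = 1. As a quick sketch, we can compare a few (arbitrarily chosen) values of k; each call reruns the classifier, so this loop is slow:

## Test error of knn for several values of k
for(k in c(1, 3, 5, 15)){
  pred.k = knn(zip.train[,-1], zip.test[,-1], zip.train[,1], k=k)
  cat("k =", k, " test error =", round(mean(pred.k != zip.test[,1]), 4), "\n")
}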

Example 4: NCI Microarray Data

Background on the NCI60 data is available at http://genome-www.stanford.edu/nci60/.

data(nci)
dim(nci)
?nci

## Create a custom color palette from green to red
cus_col = colorRampPalette(colors=c("green", "black", "red"))

## Show just the first 500 genes
image(nci[1:500,], col=cus_col(64))
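
The 64 columns of nci correspond to the cell lines. Assuming, as described in the help file (?nci), that the column names carry each line’s cancer type, we can tabulate how many lines of each type there are:

## Number of cell lines per cancer type (labels stored as column names)
table(colnames(nci))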