Set the model parameters, such as the dimension and the standard error, and store the two centers (each a two-dimensional vector) in m1 and m0.
## Model set-up: dimension, noise sd, and the two class centers
## (each center is a point in R^2).
p <- 2         # dimension of each observation
s <- 1         # sd of the noise added around each center
m1 <- c(1, 0)  # center of class 1
m0 <- c(0, 1)  # center of class 0
Generate n samples from each normal component. First, we generate a (2n)-by-p matrix with entries being iid samples from a normal distribution with mean 0 and variance s-squared. Then we form a (2n)-by-p matrix with the first n rows being the two-dimensional vector m1 (the center of the first normal component) and the next n rows being m0. We use the command rep
to generate repeated rows and use command rbind
to stack two matrices vertically together.
## Training data: n points per class, drawn as N(center, s^2 * I).
n <- 100
## Stack the two class centers, one row per observation: the first n
## rows are m1, the next n rows are m0; then add Gaussian noise.
centers <- rbind(matrix(m1, nrow = n, ncol = p, byrow = TRUE),
                 matrix(m0, nrow = n, ncol = p, byrow = TRUE))
traindata <- s * matrix(rnorm(2 * n * p), 2 * n, p) + centers
dim(traindata)  # should be (2n) x p
Ytrain <- factor(c(rep(1, n), rep(0, n)))
Generate 2N test samples similarly.
## Test data: N points per class, generated exactly like the training set.
N <- 5000
centersN <- rbind(matrix(m1, nrow = N, ncol = p, byrow = TRUE),
                  matrix(m0, nrow = N, ncol = p, byrow = TRUE))
testdata <- s * matrix(rnorm(2 * N * p), 2 * N, p) + centersN
Ytest <- factor(c(rep(1, N), rep(0, N)))
Let’s take a look at the data. In the plot generated by the code below, points from the two groups are colored in blue and red, respectively; the two centers are plotted as +, and a legend is added to explain the association of each color.
The option type=n
tells R not to print anything, just to set the plotting area, then we add points group by group.
## Scatter plot of the training data.
## type = "n" sets up the plotting region without drawing anything;
## points are then added group by group.
plot(traindata[, 1], traindata[, 2], type = "n", xlab = "", ylab = "")
# add points from class 1 (the first n rows) in blue
points(traindata[1:n, 1], traindata[1:n, 2], col = "blue");
# add points from class 0 (the last n rows) in red
points(traindata[(n + 1):(2 * n), 1], traindata[(n + 1):(2 * n), 2],
       col = "red");
# add the single center of class 1
points(m1[1], m1[2], pch = "+", cex = 2, col = "blue");
# add the single center of class 0
points(m0[1], m0[2], pch = "+", cex = 2, col = "red");
## BUG FIX: the legend previously paired "class 1" with red and
## "class 0" with blue -- the opposite of the colors used for the
## points above. Colors now match the plotted groups.
legend("bottomright", pch = c(1, 1), col = c("blue", "red"),
       legend = c("class 1", "class 0"))
(Feel free to skip this part if you’ve known how to use ggplot2.)
Previously I plotted the data in the old “Artist Palette” way, in the sense that I start with a canvas (plotting region) and then add items one by one: first points, then color, then a legend. Here is the drawback of this approach: if I want to add a point outside the current plotting region, I have to tear off the canvas and restart everything with a larger plotting region (to reserve space for the points I’ll add later).
A better graphical package for R is ggplot2
; gg stands for grammar of graphics http://ggplot2.org/
Two major commands: qplot
and ggplot
. You can find many examples and tutorials online.
Run the following code (each line gives you a slightly different plot) to learn how to set various arguments in qplot
.
## ggplot2 works with data frames, so bundle the training matrix and
## the class labels into a single data frame.
library(ggplot2)
mytraindata <- data.frame(traindata)
names(mytraindata) <- c("X1", "X2")
mytraindata$Y <- Ytrain
## Demonstrations of qplot() with various arguments.
## NOTE(review): qplot() is deprecated in ggplot2 >= 3.4 and may emit a
## warning there; the ggplot() interface used below is preferred.
## Plot the data, coloring points by the Y label.
qplot(X1, X2, data = mytraindata, colour = Y)
## Change the shape and size of the points. I() marks the values as
## literal settings rather than data to be mapped to an aesthetic.
qplot(X1, X2, data = mytraindata, colour = Y,
shape = I(1), size = I(3))
## Change the x/y labels and add a title.
qplot(X1, X2, data = mytraindata, colour = Y,
shape = I(1), size = I(3),
main = "Scatter Plot (Training Data)",
xlab="", ylab="")
## Want to remove the gray background? Change the theme.
qplot(X1, X2, data = mytraindata, colour = Y,
shape = I(1), size = I(3)) + theme_bw()
## More theme options. A single command would be a little lengthy, so
## save the output of qplot into an object "myplot" and add layers.
myplot = qplot(X1, X2, data = mytraindata, colour = Y,
shape = I(1), size = I(3))
myplot = myplot + theme_bw();
## Move the legend to the left side of the plot.
myplot + theme(legend.position="left")
?theme
## To add the two centers on top of an existing plot, we use ggplot()
## and build the figure layer by layer.
?ggplot
myplot = ggplot(mytraindata, aes(X1, X2))
## Use a user-specified color scale. Note: Y has levels "0" and "1", so
## the first color ("red") goes to class 0 and "blue" to class 1.
myplot + geom_point(aes(color = Y), shape = 1, size = 3) + scale_color_manual(values = c("red", "blue"))
## Add the two centers as extra geom_point layers, each with its own
## one-row data frame; change shape and size, and use user-specified
## colors matching the two classes.
myplot = ggplot(mytraindata,aes(X1, X2)) +
geom_point(aes(colour = Y), shape = 3, size = 2) +
scale_color_manual(values = c("red", "blue"))
myplot +
geom_point(data = data.frame(X1 = m1[1], X2 = m1[2]),
aes(X1, X2), colour = "blue", shape = 2, size = 5) +
geom_point(data = data.frame(X1 = m0[1], X2 = m0[2]),
aes(X1, X2), colour = "red", shape = 2, size = 5)
## kNN classification over a grid of neighborhood sizes
## (the k values are taken from the textbook).
library(class)
myk <- c(151, 101, 69, 45, 31, 21, 11, 7, 5, 3, 1)
m <- length(myk)
train.err.knn <- rep(0, m)
test.err.knn <- rep(0, m)
for (j in seq_len(m)) {
  kj <- myk[j]
  ## training error: predict the training points from themselves
  fit.train <- knn(traindata, traindata, Ytrain, k = kj)
  train.err.knn[j] <- sum(Ytrain != fit.train) / (2 * n)
  ## test error
  fit.test <- knn(traindata, testdata, Ytrain, k = kj)
  test.err.knn[j] <- sum(Ytest != fit.test) / (2 * N)
}
## Least-squares classifier: regress a 0/1 response on the two features.
## Ytrain has levels "0" and "1", so as.numeric(Ytrain) gives 1/2 and
## subtracting 1 recodes it to 0/1.
RegModel = lm(as.numeric(Ytrain) - 1 ~ traindata)
## classify as 1 when the fitted value exceeds 1/2
Ytrain_pred_LS = as.numeric(RegModel$fitted > 0.5)
## test predictions: evaluate the fitted linear function by hand
## (intercept + the two slope coefficients)
Ytest_pred_LS = RegModel$coef[1] + RegModel$coef[2] * testdata[,1] +
RegModel$coef[3] * testdata[,2]
Ytest_pred_LS = as.numeric(Ytest_pred_LS > 0.5)
## cross tab for training data and training error
## (the factor-vs-numeric comparison works because the factor levels
## are the strings "0" and "1" -- both sides compare as characters)
table(Ytrain, Ytrain_pred_LS);
train.err.LS = sum(Ytrain != Ytrain_pred_LS) / (2*n);
## cross tab for test data and test error
table(Ytest, Ytest_pred_LS);
test.err.LS = sum(Ytest != Ytest_pred_LS) / (2*N);
## Bayes rule for two equal-covariance Gaussians with equal priors:
## predict class 1 iff x'(m1 - m0) > (||m1||^2 - ||m0||^2) / 2.
score <- drop(testdata %*% (m1 - m0))
Ytest_pred_Bayes <- as.numeric(score > (sum(m1^2) - sum(m0^2)) / 2)
test.err.Bayes <- sum(Ytest != Ytest_pred_Bayes) / (2 * N)
Test errors are in red and training errors are in blue. The upper x-coordinate indicates the K values, and the lower x-coordinate indicates the degree-of-freedom of the KNN procedures so the labels are reciprocally related to K.
The training and test errors for linear regression are plotted at df=3, since the linear model has 3 parameters, i.e., 3 dfs.
## Error curves: test errors in red, training errors in blue.
## Bottom axis shows df = (2n)/k for each kNN fit; top axis shows k.
plot(c(0.5,m), range(test.err.LS, train.err.LS, test.err.knn, train.err.knn),
type="n", xlab="Degree of Freedom", ylab="Error", xaxt="n")
# effective degrees of freedom of kNN: (training size) / k
df = round((2*n)/myk)
axis(1, at = 1:m, labels = df)
axis(3, at = 1:m, labels = myk)
points(1:m, test.err.knn, col="red", pch=1);
lines(1:m, test.err.knn, col="red", lty=1);
points(1:m, train.err.knn, col="blue", pch=1);
lines(1:m, train.err.knn, col="blue", lty=2);
# linear regression has 3 parameters (3 dfs); plot its errors at the
# x-position whose df label is 3
points(3, train.err.LS, pch=2, cex=2, col="blue")
points(3, test.err.LS, pch=2, cex=2, col="red")
# horizontal line (intercept = Bayes error, slope = 0)
abline(test.err.Bayes, 0, col="purple")
Generate the 20 centers, 10 for each group.
## Example II: each class is a mixture of csize Gaussian clusters.
## Cluster centers are drawn around (1,0) for class 1 and (0,1) for
## class 0, with sd s.
csize <- 10  # number of centers per class
p <- 2
s <- 1       # sd for generating the centers within each class
m1 <- s * matrix(rnorm(csize * p), csize, p) +
  matrix(c(1, 0), csize, p, byrow = TRUE)
m0 <- s * matrix(rnorm(csize * p), csize, p) +
  matrix(c(0, 1), csize, p, byrow = TRUE)
Generate training data.
## Example II training data: n samples per class. Each sample picks one
## of the csize cluster centers uniformly at random, then adds
## N(0, s^2 * I) noise around it.
n=100;
# Randomly allocate the n samples for class 1 to the 10 clusters
id1 = sample(1:csize, n, replace = TRUE);
# Randomly allocate the n samples for class 0 to the 10 clusters
id0 = sample(1:csize, n, replace = TRUE);
# NOTE: this overwrites the earlier s; the within-cluster sd is sqrt(1/5)
s= sqrt(1/5); # sd for generating x.
traindata = matrix(rnorm(2*n*p), 2*n, p)*s + rbind(m1[id1,], m0[id0,])
Ytrain = factor(c(rep(1,n), rep(0,n)))
Generate test data.
## Example II test data, generated the same way as the training data.
N <- 5000
id1 <- sample(seq_len(csize), N, replace = TRUE)
id0 <- sample(seq_len(csize), N, replace = TRUE)
testdata <- s * matrix(rnorm(2 * N * p), 2 * N, p) +
  rbind(m1[id1, ], m0[id0, ])
Ytest <- factor(c(rep(1, N), rep(0, N)))
## Scatter plot of the Example II training data, with the 10 centers of
## each class marked by "+".
plot(traindata[, 1], traindata[, 2], type = "n", xlab = "", ylab = "")
# class 1 points (first n rows) in blue
points(traindata[1:n, 1], traindata[1:n, 2], col = "blue");
# class 0 points (last n rows) in red
points(traindata[(n+1):(2*n), 1], traindata[(n+1):(2*n), 2], col="red");
# the csize centers of each class
points(m1[1:csize, 1], m1[1:csize, 2], pch="+", cex=1.5, col="blue");
points(m0[1:csize, 1], m0[1:csize, 2], pch="+", cex=1.5, col="red");
## BUG FIX: the legend colors were swapped relative to the points
## (class 1 is plotted in blue, class 0 in red).
legend("bottomleft", pch = c(1, 1), col = c("blue", "red"),
       legend = c("class 1", "class 0"))
The calculation for the Bayes error for Example II is different from the one for Example I since the underlying data generating processes are different.
Since we need to repeatedly evaluate a mixture of normals with 10 components at each test point, I first wrote a function mixnorm
, and then use the command apply
to apply the same function on each row of the test data.
mixnorm <- function(x) {
  ## Density ratio f1(x)/f0(x) at a single point x, where each class
  ## density is an equal-weight mixture of Gaussians (rows of the global
  ## matrices m1 / m0 are the centers) with covariance (1/5) * I.
  ## Normalizing constants are common to both classes and cancel.
  d1 <- colSums((t(m1) - x)^2)  # squared distances to class-1 centers
  d0 <- colSums((t(m0) - x)^2)  # squared distances to class-0 centers
  sum(exp(-2.5 * d1)) / sum(exp(-2.5 * d0))
}
## Bayes rule for Example II: with equal priors, predict class 1 when
## the density ratio f1(x)/f0(x) exceeds 1. Evaluate mixnorm on each
## row of the test data.
ratio <- apply(testdata, 1, mixnorm)
Ytest_pred_Bayes <- as.numeric(ratio > 1)
table(Ytest, Ytest_pred_Bayes)
test.err.Bayes <- sum(Ytest != Ytest_pred_Bayes) / (2 * N)