Letter Recognition and Confusion. The table below is from Wolford and Hollingsworth (1974), where each entry shows the frequency with which each letter was mistakenly called something else.
D = matrix(0, 8, 8)
letters = c("C", "D", "G", "H", "M", "N", "Q", "W")
colnames(D) = letters
rownames(D) = letters
D[2:8, 1] = c(5, 12, 2, 2, 2, 9, 1)
D[3:8, 2] = c(2, 4, 3, 4, 20, 5)
D[4:8, 3] = c(3, 2, 1, 9, 2)
D[5:8, 4] = c(19, 18, 1, 5)
D[6:8, 5] = c(16, 2, 18)
D[7:8, 6] = c(8, 13)
D[8, 7] = 4
D = (D+t(D))
D
## C D G H M N Q W
## C 0 5 12 2 2 2 9 1
## D 5 0 2 4 3 4 20 5
## G 12 2 0 3 2 1 9 2
## H 2 4 3 0 19 18 1 5
## M 2 3 2 19 0 16 2 18
## N 2 4 1 18 16 0 8 13
## Q 9 20 9 1 2 8 0 4
## W 1 5 2 5 18 13 4 0
Change the similarity matrix to distance matrix.
D0 = 21 - D
diag(D0) = 0
tmp = cmdscale(D0)
par(mfrow=c(1,2))
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="",
xlim = c(-15, 15), ylim=c(-15, 15))
text(tmp[, 1], tmp[, 2], label = letters)
D1 = 41 - D
diag(D1) = 0
tmp = cmdscale(D1)
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="",
xlim = c(-20, 20), ylim=c(-20, 20))
text(tmp[, 1], tmp[, 2], label = letters)
source("sportsranks.txt");
head(sportsranks)
## Baseball Football Basketball Tennis Cycling Swimming Jogging
## 1 3 7 2 4 5 6
## 1 3 2 5 4 7 6
## 1 3 2 5 4 7 6
## 4 7 3 1 5 6 2
## 2 3 1 7 6 5 4
## 6 3 4 7 2 1 5
km2=kmeans(sportsranks, centers=2, nstart=10);
km3=kmeans(sportsranks, centers=3, nstart=10);
D=dist(sportsranks)
Xmds=cmdscale(D);
par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="")
points(Xmds[,1], Xmds[,2], pch=km2$cluster, col=km2$cluster);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="");
points(Xmds[,1], Xmds[,2], pch=km3$cluster, col=km3$cluster);
title("MDS plot for K=3")
How is cMDS computed?
D2 = as.matrix(D^2)
n = dim(D2)[1]
tmp = D2 - matrix(colMeans(D2), nrow=n, ncol=n, byrow=TRUE) -
matrix(rowMeans(D2), nrow=n, ncol=n, byrow=FALSE)
tmp = -(tmp + mean(D2))/2
tmp.svd = svd(tmp)
F = tmp.svd$u[, 1:2] %*% diag(sqrt(tmp.svd$d[1:2]))
cor(F, Xmds)
## [,1] [,2]
## [1,] 1.000000e+00 3.584861e-17
## [2,] 1.193066e-16 -1.000000e+00
You can use clusGap
from cluster
package to compute gap statistic.
library(cluster)
set.seed(123)
gap_stat = clusGap(sportsranks, FUN = kmeans, nstart = 25,
K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sportsranks, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 2
## logW E.logW gap SE.sim
## [1,] 5.429866 5.642931 0.2130645 0.01577661
## [2,] 5.229886 5.540230 0.3103434 0.01659846
## [3,] 5.157134 5.466098 0.3089634 0.01669885
## [4,] 5.099607 5.406242 0.3066354 0.01676325
## [5,] 5.045671 5.352771 0.3070994 0.01728555
## [6,] 4.992883 5.306855 0.3139720 0.01701668
## [7,] 4.945869 5.264815 0.3189463 0.01763576
## [8,] 4.909225 5.225939 0.3167138 0.01720501
## [9,] 4.870728 5.190961 0.3202335 0.01755400
## [10,] 4.842217 5.157341 0.3151240 0.01707943
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 2, lty = 2)
Next we compute gap statistic step by step.
n=130;
SS=rep(0,10)
SS[1]=sum(diag(var(sportsranks)))*(n-1);
for(k in 2:10){
kms=kmeans(sportsranks, centers=k, nstart=10);
SS[k]=sum(kms$withinss);
}
lss=log(SS);
n = dim(sportsranks)[1]
m = 7 # each review is a permutation of numbers from 1 to 7
B = 50 # number of iterations
lss0 = matrix(0,B,10)
for(b in 1:B){
xstar=NULL
for(i in 1:n){
xstar = rbind(xstar, sample(m))
}
lss0[b,1] = log(sum(diag(var(xstar)))*(n-1))
for(k in 2:10){
kms = kmeans(xstar, centers=k, nstart=10)
lss0[b,k] = log(sum(kms$withinss))
}
}
plot(1:10, lss, type="l", col="red", xlab="k",
ylab="log(Within SS)");
for(b in 1:B){
points(1:10, lss0[b,], type="l", col=8)
}
lss0m = apply(lss0, 2, mean);
lss0sd = sqrt(apply(lss0, 2, var))*sqrt(1+1/B);
diff = lss0m-lss;
matplot(1:10, cbind(diff, diff+lss0sd, diff-lss0sd), type="l",
xlab="k", ylab="Gap")
points(1:10, diff);
points(1:10, diff-lss0sd, pch="+", col="green");
sil2=silhouette(km2$cluster,D);
sil3=silhouette(km3$cluster,D);
par(mfrow=c(1,2))
plot(sil2)
plot(sil3)
Compute and plot average silhouette over a range of K values.
k.max = 10
sil = rep(0, k.max)
# Compute the average silhouette width for
# k = 2 to k = 15
for(i in 2:k.max){
tmp = kmeans(sportsranks, centers = i, nstart = 10)
ss <- silhouette(tmp$cluster, dist(sportsranks))
sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width
plot(1:k.max, sil, type = "b", pch = 19,
frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)
library(fpc)
ps = prediction.strength(sportsranks, Gmax=10,
clustermethod=kmeansCBI)
plot(1:10, ps$mean.pred, type='b', ylim=c(0,1),
xlab='Number of clusters', ylab='Prediction strength')
abline(h=ps$cutoff, col="red", lty=2, lwd=2)
pam2=pam(sportsranks, k=2);
plot(pam2)
pam3=pam(sportsranks, k=3);
plot(pam3)
par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam2$clustering, col=pam2$clustering);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam3$clustering, col=pam3$clustering);
title("MDS plot for K=3")
# compare with Kmeans
table(pam2$clustering, km2$cluster)
##
## 1 2
## 1 60 0
## 2 2 68
table(pam3$clustering, km3$cluster)
##
## 1 2 3
## 1 1 57 0
## 2 29 1 0
## 3 1 0 41
plot(hclust(dist(sportsranks)), xlab="Complete Linkage", sub="");
plot(hclust(dist(sportsranks), method="single"), xlab="Single Linkage", sub="");
plot(hclust(dist(sportsranks), method="average"), xlab="Average Linkage", sub="");
clusters = hclust(dist(sportsranks), method="average")
clusterCut = cutree(clusters, 3)
table(clusterCut)
## clusterCut
## 1 2 3
## 63 8 59
Check how hclust
results differ from the ones from K-means.
table(cutree(clusters, 2), km2$cluster)
##
## 1 2
## 1 61 10
## 2 1 58
table(cutree(clusters, 3), km3$cluster)
##
## 1 2 3
## 1 1 54 8
## 2 4 4 0
## 3 26 0 33
clusters = hclust(dist(sportsranks))
table(cutree(clusters, 2), km2$cluster)
##
## 1 2
## 1 58 13
## 2 4 55
table(cutree(clusters, 3), km3$cluster)
##
## 1 2 3
## 1 0 50 0
## 2 13 6 2
## 3 18 2 39
The image.txt file is not provided. You can use any gray scale photo to replace image.txt. Of course, you need to change the dimension of the matrix X accordingly.
X = read.table("image.txt");
X = as.matrix(X);
X = t(X[270:1,]);
image(X, col=gray((0:128)/128));
# divide image into 6x6 patches
# save the patches (36x1 vector) into data matrix Z
p = 36
Z = NULL
n = (270*360)/36
for(i in 1:(360/6))
for(j in 1:(270/6)){
k=6*(i-1); l=6*(j-1);
Z=rbind(Z,as.vector(X[(k+1):(k+6), (l+1):(l+6)]));
}
dim(Z) # 2700x36
# run k-means with K=40 centers
mykm = kmeans(Z, centers=40, nstart=10)
# Use the 40 patches to replace 2700 patches in the original image
# The new image uses only 40 disctive patches
newX = matrix(0,360,270)
m = 0
for(i in 1:(360/6))
for(j in 1:(270/6)){
k = 6*(i-1)
l = 6*(j-1)
m = m+1;
tmp = mykm$center[mykm$cluster[m],]
newX[(k+1):(k+6), (l+1):(l+6)] = matrix(tmp,6,6);
}
image(newX, col=gray((0:128)/128))