Multidimensional Scaling

Letter Recognition and Confusion. The table below is from Wolford and Hollingsworth (1974); each off-diagonal entry records how often one letter was mistakenly identified as the other, so larger counts indicate visually more similar letters.

# build the 8x8 confusion matrix: enter the lower triangle
# column by column, then symmetrize
D = matrix(0, 8, 8)
letters = c("C", "D", "G", "H", "M", "N", "Q", "W")
colnames(D) = letters
rownames(D) = letters
D[2:8, 1] = c(5, 12, 2, 2, 2, 9, 1)
D[3:8, 2] = c(2, 4, 3, 4, 20, 5)
D[4:8, 3] = c(3, 2, 1, 9, 2)
D[5:8, 4] = c(19, 18, 1, 5)
D[6:8, 5] = c(16, 2, 18)
D[7:8, 6] = c(8, 13)
D[8, 7] = 4
D = D + t(D)
D
##    C  D  G  H  M  N  Q  W
## C  0  5 12  2  2  2  9  1
## D  5  0  2  4  3  4 20  5
## G 12  2  0  3  2  1  9  2
## H  2  4  3  0 19 18  1  5
## M  2  3  2 19  0 16  2 18
## N  2  4  1 18 16  0  8 13
## Q  9 20  9  1  2  8  0  4
## W  1  5  2  5 18 13  4  0

Convert the similarity matrix into a distance matrix: letters that are confused more often are more similar, so we subtract the counts from a constant. Two choices are compared below, 21 (one more than the largest count, 20) and 41.

D0 = 21 - D    # dissimilarity: large confusion count -> small distance
diag(D0) = 0
tmp = cmdscale(D0)
par(mfrow=c(1,2))
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="", 
     xlim = c(-15, 15), ylim=c(-15, 15))
text(tmp[, 1], tmp[, 2], label = letters)

D1 = 41 - D    # same transformation with a larger constant
diag(D1) = 0
tmp = cmdscale(D1)
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="", 
     xlim = c(-20, 20), ylim=c(-20, 20))
text(tmp[, 1], tmp[, 2], label = letters)
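
Classical MDS treats the numbers in D0 as if they were Euclidean distances, so the arbitrary additive constant matters. A nonmetric alternative that uses only the rank order of the dissimilarities is isoMDS from the MASS package; this is an added aside, not part of the original analysis:

library(MASS)
nm = isoMDS(as.dist(D0), trace=FALSE)  # Kruskal's nonmetric MDS
plot(nm$points, type="n", xlab="", ylab="")
text(nm$points[,1], nm$points[,2], label=letters)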

K-means

Run K-means with K = 2 and K = 3 on the sportsranks data, in which each of the 130 respondents ranks seven sports from 1 to 7.

source("sportsranks.txt");
head(sportsranks)
##   Baseball Football Basketball Tennis Cycling Swimming Jogging
## 1        1        3          7      2       4        5       6
## 2        1        3          2      5       4        7       6
## 3        1        3          2      5       4        7       6
## 4        4        7          3      1       5        6       2
## 5        2        3          1      7       6        5       4
## 6        6        3          4      7       2        1       5
km2=kmeans(sportsranks, centers=2, nstart=10);
km3=kmeans(sportsranks, centers=3, nstart=10);
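
As a quick check (not part of the original output), the total within-cluster sum of squares of each fit can be compared; tot.withinss is a standard component of a kmeans object:

# more clusters always lowers the within-cluster SS, which is
# why we need selection criteria like the gap statistic below
c(K2 = km2$tot.withinss, K3 = km3$tot.withinss)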

View the clusters in MDS plots.

D=dist(sportsranks)
Xmds=cmdscale(D);

par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="")
points(Xmds[,1], Xmds[,2], pch=km2$cluster, col=km2$cluster);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="");
points(Xmds[,1], Xmds[,2], pch=km3$cluster, col=km3$cluster);
title("MDS plot for K=3")

How is cMDS computed? Classical MDS starts from the squared distance matrix, removes its row means, column means, and grand mean (double centering), and multiplies by -1/2 to obtain a Gram matrix; the leading eigenvectors, scaled by the square roots of their eigenvalues, give the coordinates. The code below reproduces cmdscale up to sign flips of the columns.

D2 = as.matrix(D^2)   # squared pairwise distances
n = dim(D2)[1]
# double centering: remove row and column means, add back the grand mean
tmp = D2 - matrix(colMeans(D2), nrow=n, ncol=n, byrow=TRUE) -
  matrix(rowMeans(D2), nrow=n, ncol=n, byrow=FALSE)
tmp = -(tmp + mean(D2))/2   # the resulting Gram (inner-product) matrix
# coordinates: top two singular vectors scaled by sqrt of singular values
tmp.svd = svd(tmp)
F = tmp.svd$u[, 1:2] %*% diag(sqrt(tmp.svd$d[1:2]))
cor(F, Xmds)
cor(F, Xmds)
##              [,1]          [,2]
## [1,] 1.000000e+00  3.584861e-17
## [2,] 1.193066e-16 -1.000000e+00
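
The correlations of +/-1 say the two solutions agree up to a sign flip of the second column. A direct check (an added sketch, not in the original notes) compares absolute coordinates:

# columns may differ only by sign, so compare magnitudes;
# the maximum discrepancy should be numerically zero
max(abs(abs(F) - abs(Xmds)))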

Gap Statistics

You can use clusGap from the cluster package to compute the gap statistic.

library(cluster)
set.seed(123)
gap_stat = clusGap(sportsranks, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sportsranks, FUNcluster = kmeans, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW       gap     SE.sim
##  [1,] 5.429866 5.642931 0.2130645 0.01577661
##  [2,] 5.229886 5.540230 0.3103434 0.01659846
##  [3,] 5.157134 5.466098 0.3089634 0.01669885
##  [4,] 5.099607 5.406242 0.3066354 0.01676325
##  [5,] 5.045671 5.352771 0.3070994 0.01728555
##  [6,] 4.992883 5.306855 0.3139720 0.01701668
##  [7,] 4.945869 5.264815 0.3189463 0.01763576
##  [8,] 4.909225 5.225939 0.3167138 0.01720501
##  [9,] 4.870728 5.190961 0.3202335 0.01755400
## [10,] 4.842217 5.157341 0.3151240 0.01707943
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 2, lty = 2)

Next we compute the gap statistic step by step: Gap(K) = E[log SS0(K)] - log SS(K), where SS(K) is the within-cluster sum of squares on the data and SS0(K) is the same quantity on reference data with no cluster structure.

  • Step 1. Calculate the within-cluster sum of squares (SS) for K = 1 to 10.

n = 130
SS = rep(0, 10)
SS[1] = sum(diag(var(sportsranks)))*(n-1)  # K = 1: total SS about the grand mean
for(k in 2:10){
  kms = kmeans(sportsranks, centers=k, nstart=10)
  SS[k] = sum(kms$withinss)
}
lss = log(SS)
  • Step 2. Calculate SS0(K) under the null hypothesis, where each reference data set consists of n independent random rankings.

n = dim(sportsranks)[1]
m = 7  # each ranking is a permutation of the numbers 1 to 7
B = 50 # number of reference data sets
lss0 = matrix(0, B, 10)
for(b in 1:B){
  xstar = NULL
  for(i in 1:n){
    xstar = rbind(xstar, sample(m))  # one random permutation of 1:7
  }
  lss0[b,1] = log(sum(diag(var(xstar)))*(n-1))
  for(k in 2:10){
    kms = kmeans(xstar, centers=k, nstart=10)
    lss0[b,k] = log(sum(kms$withinss))
  }
}

plot(1:10, lss, type="l", col="red", xlab="k",
     ylab="log(Within SS)")
for(b in 1:B){
  points(1:10, lss0[b,], type="l", col=8)
}

  • Step 3. Gap curve. Based on the one-standard-error (1-SE) rule, the optimal K = 2.

lss0m = apply(lss0, 2, mean)
lss0sd = sqrt(apply(lss0, 2, var))*sqrt(1+1/B)  # SE adjusted for simulation error
diff = lss0m - lss  # the gap curve: E[log SS0(K)] - log SS(K)
matplot(1:10, cbind(diff, diff+lss0sd, diff-lss0sd), type="l",
        xlab="k", ylab="Gap")
points(1:10, diff)
points(1:10, diff-lss0sd, pch="+", col="green")
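
To apply the rule programmatically rather than by eye, the helper maxSE from the already-loaded cluster package can be used on the gap values and their standard errors (a small added check; this is the same selection helper clusGap's print method relies on):

# 1-SE rule of Tibshirani et al. (2001): the smallest K with
# Gap(K) >= Gap(K+1) - SE(K+1)
maxSE(f = diff, SE.f = lss0sd, method = "Tibs2001SEmax")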

Silhouette Plots

sil2=silhouette(km2$cluster,D);
sil3=silhouette(km3$cluster,D);
par(mfrow=c(1,2))
plot(sil2)
plot(sil3)

Compute and plot the average silhouette width over a range of K values. For observation i, a(i) is its average distance to the other members of its own cluster and b(i) is its average distance to the members of the nearest other cluster; the silhouette width is s(i) = (b(i) - a(i)) / max(a(i), b(i)), so values near 1 indicate a well-placed observation.

k.max = 10
sil = rep(0, k.max)

# Compute the average silhouette width for
# k = 2 to k.max (the silhouette is undefined for k = 1)
for(i in 2:k.max){
  tmp = kmeans(sportsranks, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(sportsranks))
  sil[i] <- mean(ss[, 3])
}

# Plot the average silhouette width
plot(1:k.max, sil, type = "b", pch = 19, 
     frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)

Prediction Strength

library(fpc)
ps = prediction.strength(sportsranks, Gmax=10,
                          clustermethod=kmeansCBI)
plot(1:10, ps$mean.pred, type='b', ylim=c(0,1), 
     xlab='Number of clusters', ylab='Prediction strength')
abline(h=ps$cutoff, col="red", lty=2, lwd=2)
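
Prediction strength repeatedly splits the data in half, clusters each half, and measures how well the clustering of one half predicts co-membership of pairs in the other half; K is taken as the largest number of clusters whose mean prediction strength stays above the cutoff (0.8 by default). The fitted object reports this choice directly; printing it is a small added check (optimalk is a documented component of the prediction.strength output):

ps$optimalk  # largest K with mean prediction strength above the cutoff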

PAM

pam2=pam(sportsranks, k=2);
plot(pam2)

pam3=pam(sportsranks, k=3);
plot(pam3)
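
Unlike K-means centroids, PAM ("partitioning around medoids") uses actual observations as cluster centers. A quick added look at which rankings serve as centers (medoids and id.med are standard components of a pam object):

pam2$medoids  # the two rankings chosen as cluster centers
pam2$id.med   # their row indices in sportsranks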

par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam2$clustering, col=pam2$clustering);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam3$clustering, col=pam3$clustering);
title("MDS plot for K=3")

# compare with K-means; cluster labels are arbitrary, so agreement
# shows up as one dominant count per row
table(pam2$clustering, km2$cluster)
##    
##      1  2
##   1 60  0
##   2  2 68
table(pam3$clustering, km3$cluster)
##    
##      1  2  3
##   1  1 57  0
##   2 29  1  0
##   3  1  0 41

Hierarchical Clustering (bottom-up)

plot(hclust(dist(sportsranks)), xlab="Complete Linkage", sub="");

plot(hclust(dist(sportsranks), method="single"), xlab="Single Linkage", sub="");

plot(hclust(dist(sportsranks), method="average"), xlab="Average Linkage", sub="");

clusters = hclust(dist(sportsranks), method="average")
clusterCut = cutree(clusters, 3)
table(clusterCut)
## clusterCut
##  1  2  3 
## 63  8 59

Check how the hclust results differ from those of K-means.

table(cutree(clusters, 2), km2$cluster)
##    
##      1  2
##   1 61 10
##   2  1 58
table(cutree(clusters, 3), km3$cluster)
##    
##      1  2  3
##   1  1 54  8
##   2  4  4  0
##   3 26  0 33
clusters = hclust(dist(sportsranks))  # back to complete linkage (the default)
table(cutree(clusters, 2), km2$cluster)
##    
##      1  2
##   1 58 13
##   2  4 55
table(cutree(clusters, 3), km3$cluster)
##    
##      1  2  3
##   1  0 50  0
##   2 13  6  2
##   3 18  2 39
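
The cross-tabulations depend on how labels happen to match up; a label-invariant summary such as the corrected (adjusted) Rand index gives a single agreement score per comparison. A small added sketch using cluster.stats from the fpc package loaded earlier (corrected.rand is a component of its output; 1 means identical partitions, near 0 means chance-level agreement):

d = dist(sportsranks)
cluster.stats(d, cutree(clusters, 2), km2$cluster)$corrected.rand
cluster.stats(d, cutree(clusters, 3), km3$cluster)$corrected.rand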

K-means and VQ Compression

The image.txt file is not provided. You can use any grayscale photo in its place; just change the dimensions of the matrix X accordingly.

X = read.table("image.txt")
X = as.matrix(X)
X = t(X[270:1, ])   # flip rows and transpose so image() draws the photo upright
image(X, col=gray((0:128)/128))

# divide the 360x270 image into non-overlapping 6x6 patches and
# save each patch as a row (a 36x1 vector) of the data matrix Z
p = 36
Z = NULL
n = (270*360)/36  # number of patches
for(i in 1:(360/6))
  for(j in 1:(270/6)){
    k = 6*(i-1); l = 6*(j-1)
    Z = rbind(Z, as.vector(X[(k+1):(k+6), (l+1):(l+6)]))
  }
dim(Z)  # 2700 x 36

# run k-means with K=40 centers
mykm = kmeans(Z, centers=40, nstart=10)

# Replace each of the 2700 patches with its cluster center, so the
# new image uses only 40 distinct patches
newX = matrix(0, 360, 270)
m = 0
for(i in 1:(360/6))
  for(j in 1:(270/6)){
    k = 6*(i-1)
    l = 6*(j-1)
    m = m + 1
    tmp = mykm$centers[mykm$cluster[m], ]  # the center of this patch's cluster
    newX[(k+1):(k+6), (l+1):(l+6)] = matrix(tmp, 6, 6)
  }
image(newX, col=gray((0:128)/128))
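
A rough accounting shows why this is compression (an added back-of-the-envelope calculation; it treats a stored patch index as costing the same as a stored pixel value):

# original image: one value per pixel
original = 2700 * 36
# compressed: a 40 x 36 codebook plus one index per patch
compressed = 40 * 36 + 2700
original / compressed  # about 23 times fewer numbers stored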