Stat542: Clustering

Multidimensional Scaling

Letter Recognition and Confusion. The table below is from Wolford and Hollingsworth (1974), where each entry shows the frequency with which each letter was mistakenly called something else.

D = matrix(0, 8, 8)
letters = c("C", "D", "G", "H", "M", "N", "Q", "W")
colnames(D) = letters
rownames(D) = letters
D[2:8, 1] = c(5, 12, 2, 2, 2, 9, 1)
D[3:8, 2] = c(2, 4, 3, 4, 20, 5)
D[4:8, 3] = c(3, 2, 1, 9, 2)
D[5:8, 4] = c(19, 18, 1, 5)
D[6:8, 5] = c(16, 2, 18)
D[7:8, 6] = c(8, 13)
D[8, 7] = 4
D = (D+t(D))
D

##    C  D  G  H  M  N  Q  W
## C  0  5 12  2  2  2  9  1
## D  5  0  2  4  3  4 20  5
## G 12  2  0  3  2  1  9  2
## H  2  4  3  0 19 18  1  5
## M  2  3  2 19  0 16  2 18
## N  2  4  1 18 16  0  8 13
## Q  9 20  9  1  2  8  0  4
## W  1  5  2  5 18 13  4  0

Change the similarity matrix to distance matrix.

D0 = 21 - D
diag(D0) = 0
tmp = cmdscale(D0)
par(mfrow=c(1,2))
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="", 
     xlim = c(-15, 15), ylim=c(-15, 15))
text(tmp[, 1], tmp[, 2], label = letters)

D1 = 41 - D
diag(D1) = 0
tmp = cmdscale(D1)
plot(tmp[, 1], tmp[, 2], type="n", xlab="", ylab="", 
     xlim = c(-20, 20), ylim=c(-20, 20))
text(tmp[, 1], tmp[, 2], label = letters)

K-means

Run K-means with K=2 and 3.

source("sportsranks.txt");
head(sportsranks)

##  Baseball Football Basketball Tennis Cycling Swimming Jogging
##         1        3          7      2       4        5       6
##         1        3          2      5       4        7       6
##         1        3          2      5       4        7       6
##         4        7          3      1       5        6       2
##         2        3          1      7       6        5       4
##         6        3          4      7       2        1       5

km2=kmeans(sportsranks, centers=2, nstart=10);
km3=kmeans(sportsranks, centers=3, nstart=10);

View the clusters in MDS plots.

D=dist(sportsranks)
Xmds=cmdscale(D);

par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="")
points(Xmds[,1], Xmds[,2], pch=km2$cluster, col=km2$cluster);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n", xlab="", ylab="");
points(Xmds[,1], Xmds[,2], pch=km3$cluster, col=km3$cluster);
title("MDS plot for K=3")

How is cMDS computed?

D2 = as.matrix(D^2)
n = dim(D2)[1]
tmp = D2 - matrix(colMeans(D2), nrow=n, ncol=n, byrow=TRUE) - 
  matrix(rowMeans(D2), nrow=n, ncol=n, byrow=FALSE)
tmp = -(tmp + mean(D2))/2
tmp.svd = svd(tmp)
F = tmp.svd$u[, 1:2]  %*% diag(sqrt(tmp.svd$d[1:2]))
cor(F, Xmds)

##              [,1]          [,2]
## [1,] 1.000000e+00  3.584861e-17
## [2,] 1.193066e-16 -1.000000e+00

Gap Statistics

You can use clusGap from cluster package to compute gap statistic.

library(cluster)
set.seed(123)
gap_stat = clusGap(sportsranks, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
print(gap_stat, method = "firstmax")

## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = sportsranks, FUNcluster = kmeans, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW       gap     SE.sim
##  [1,] 5.429866 5.642931 0.2130645 0.01577661
##  [2,] 5.229886 5.540230 0.3103434 0.01659846
##  [3,] 5.157134 5.466098 0.3089634 0.01669885
##  [4,] 5.099607 5.406242 0.3066354 0.01676325
##  [5,] 5.045671 5.352771 0.3070994 0.01728555
##  [6,] 4.992883 5.306855 0.3139720 0.01701668
##  [7,] 4.945869 5.264815 0.3189463 0.01763576
##  [8,] 4.909225 5.225939 0.3167138 0.01720501
##  [9,] 4.870728 5.190961 0.3202335 0.01755400
## [10,] 4.842217 5.157341 0.3151240 0.01707943

plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 2, lty = 2)

Next we compute gap statistic step by step.

Step 1. Calculate Within-Cluster-SS for different K

n=130;
SS=rep(0,10)
SS[1]=sum(diag(var(sportsranks)))*(n-1);
for(k in 2:10){
      kms=kmeans(sportsranks, centers=k, nstart=10);
      SS[k]=sum(kms$withinss);
}
lss=log(SS);

Step 2. Calculate SS_0(K) under the null hypothesis

n = dim(sportsranks)[1] 
m = 7  # each review is a permutation of numbers from 1 to 7
B = 50 # number of iterations
lss0 = matrix(0,B,10)
for(b in 1:B){
  xstar=NULL
  for(i in 1:n){
    xstar = rbind(xstar, sample(m))
  }
  lss0[b,1] = log(sum(diag(var(xstar)))*(n-1))
  for(k in 2:10){
    kms = kmeans(xstar, centers=k, nstart=10)
    lss0[b,k] = log(sum(kms$withinss))
  }
}

plot(1:10, lss, type="l", col="red", xlab="k", 
     ylab="log(Within SS)");
for(b in 1:B){
      points(1:10, lss0[b,], type="l", col=8)
}

Step 3. Gap curve. Based on the 1se rule, the optimal K = 2.

lss0m = apply(lss0, 2, mean);
lss0sd = sqrt(apply(lss0, 2, var))*sqrt(1+1/B);
diff = lss0m-lss;
matplot(1:10, cbind(diff, diff+lss0sd, diff-lss0sd), type="l",
        xlab="k", ylab="Gap")
points(1:10, diff);
points(1:10, diff-lss0sd, pch="+", col="green");

Silhouette Plots

sil2=silhouette(km2$cluster,D);
sil3=silhouette(km3$cluster,D);
par(mfrow=c(1,2))
plot(sil2)
plot(sil3)

Compute and plot average silhouette over a range of K values.

k.max = 10
sil = rep(0, k.max)

# Compute the average silhouette width for 
# k = 2 to k = 15
for(i in 2:k.max){
  tmp = kmeans(sportsranks, centers = i, nstart = 10)
  ss <- silhouette(tmp$cluster, dist(sportsranks))
  sil[i] <- mean(ss[, 3])
}

# Plot the  average silhouette width
plot(1:k.max, sil, type = "b", pch = 19, 
     frame = FALSE, xlab = "Number of clusters k")
abline(v = which.max(sil), lty = 2)

Prediction Strength

library(fpc)
ps = prediction.strength(sportsranks, Gmax=10,
                          clustermethod=kmeansCBI)
plot(1:10, ps$mean.pred, type='b', ylim=c(0,1), 
     xlab='Number of clusters', ylab='Prediction strength')
abline(h=ps$cutoff, col="red", lty=2, lwd=2)

PAM

pam2=pam(sportsranks, k=2);
plot(pam2)

pam3=pam(sportsranks, k=3);
plot(pam3)

par(mfrow=c(1,2));
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam2$clustering, col=pam2$clustering);
title("MDS plot for K=2")
plot(Xmds[,1], Xmds[,2], type="n");
points(Xmds[,1], Xmds[,2], pch=pam3$clustering, col=pam3$clustering);
title("MDS plot for K=3")

# compare with Kmeans
table(pam2$clustering, km2$cluster)

##    
##      1  2
##   1 60  0
##   2  2 68

table(pam3$clustering, km3$cluster)

##    
##      1  2  3
##   1  1 57  0
##   2 29  1  0
##   3  1  0 41

Hierarchical Clustering (bottom-up)

plot(hclust(dist(sportsranks)), xlab="Complete Linkage", sub="");

plot(hclust(dist(sportsranks), method="single"), xlab="Single Linkage", sub="");

plot(hclust(dist(sportsranks), method="average"), xlab="Average Linkage", sub="");

clusters = hclust(dist(sportsranks), method="average")
clusterCut = cutree(clusters, 3)
table(clusterCut)

## clusterCut
##  1  2  3 
## 63  8 59

Check how hclust results differ from the ones from K-means.

table(cutree(clusters, 2), km2$cluster)

##    
##      1  2
##   1 61 10
##   2  1 58

table(cutree(clusters, 3), km3$cluster)

##    
##      1  2  3
##   1  1 54  8
##   2  4  4  0
##   3 26  0 33

clusters = hclust(dist(sportsranks))
table(cutree(clusters, 2), km2$cluster)

##    
##      1  2
##   1 58 13
##   2  4 55

table(cutree(clusters, 3), km3$cluster)

##    
##      1  2  3
##   1  0 50  0
##   2 13  6  2
##   3 18  2 39

K-means and VQ-compression

The image.txt file is not provided. You can use any gray scale photo to replace image.txt. Of course, you need to change the dimension of the matrix X accordingly.

X = read.table("image.txt"); 
X = as.matrix(X); 
X = t(X[270:1,]); 
image(X, col=gray((0:128)/128)); 

# divide image into 6x6 patches
# save the patches (36x1 vector) into data matrix Z
p = 36
Z = NULL 
n = (270*360)/36
for(i in 1:(360/6))
    for(j in 1:(270/6)){
        k=6*(i-1); l=6*(j-1);
        Z=rbind(Z,as.vector(X[(k+1):(k+6), (l+1):(l+6)]));
        }
dim(Z)  # 2700x36 

# run k-means with K=40 centers
mykm = kmeans(Z, centers=40, nstart=10)

# Use the 40 patches to replace 2700 patches in the original image
# The new image uses only 40 disctive patches
newX = matrix(0,360,270)
m = 0
for(i in 1:(360/6))
    for(j in 1:(270/6)){
        k = 6*(i-1)
        l = 6*(j-1)
        m = m+1;
        tmp = mykm$center[mykm$cluster[m],]
        newX[(k+1):(k+6), (l+1):(l+6)] = matrix(tmp,6,6);
    }
image(newX, col=gray((0:128)/128))