import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.spatial.distance import cdist
from skimage import io, img_as_ubyte, img_as_float64
from sklearn.metrics import silhouette_score
from load_mnist import load_mnist
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import rand_score
from sklearn.manifold import SpectralEmbedding
%matplotlib inline
We begin by loading the Old Faithful dataset. The data includes the waiting time between eruptions and the duration of the eruption for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA. The data can be described as follows:
A data frame with 272 observations on 2 variables:
- eruptions (numeric): eruption time in minutes
- waiting (numeric): waiting time to the next eruption, in minutes
old_faithful_dataset = np.loadtxt("faithful.csv", delimiter=" ", skiprows=0)
plt.scatter(old_faithful_dataset[:, 0], old_faithful_dataset[:, 1])
plt.xlabel('Eruption time (in minutes)', fontsize=12)
plt.ylabel('Waiting time to next eruption (in minutes)', fontsize=12)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16);
We write a function k_means_clustering that performs $k$-means clustering for the input data and a number of clusters no_of_clusters.
Here data is matrix $X = \left( \begin{matrix} x_1^\top \\ \vdots \\ x_s^\top \end{matrix} \right) \in \mathbb{R}^{s \times d}$ of $s$ $d$-dimensional vectors.
Optional arguments are:
- init_centroids: an initial array of centroids. If set to None, we initialise the centroids as a 2D array of standard normally distributed random variables (mean zero, standard deviation one).
- maximum_counter: the maximum number of iterations.
- tolerance: the stopping tolerance. The algorithm stops as soon as $\left(L(z^k, \mu^k) - L(z^{k + 1}, \mu^{k + 1})\right) / L(z^{k + 1}, \mu^{k + 1}) \leq \text{tolerance}$, where $L$ denotes the $k$-means clustering objective $L(z, \mu) = \sum_{i=1}^{s} \sum_{k=1}^{K} z_{ik} \, \lVert x_i - \mu_k \rVert^2$.
- print_output: print progress every print_output iterations.
The function returns the centroids (an array of shape no_of_clusters $\times$ $d$) and the one-hot cluster assignments (an array of shape $s$ $\times$ no_of_clusters).
def k_means_clustering(data, no_of_clusters, init_centroids=None, maximum_counter=300, tolerance=1e-9,
                       print_output=10):
    no_of_samples, dimension = data.shape
    # Initialise the centroids: either copy the ones supplied, or draw them
    # from a standard normal distribution.
    if init_centroids is None:
        centroids = np.random.normal(size=(no_of_clusters, dimension))
    else:
        centroids = np.array(init_centroids, dtype=float)
    print(f'centroids_init: \n {centroids}')
    # Initial assignment: each point goes to its nearest centroid (one-hot encoded).
    distances = cdist(data, centroids, 'euclidean')
    assignments = np.zeros((no_of_samples, no_of_clusters))
    for idx in range(no_of_samples):
        assignments[idx, np.argmin(distances[idx])] = 1
    for counter in range(maximum_counter):
        # Keep copies of the previous iterate so the decrease in the loss can be measured.
        old_centroids = centroids.copy()
        old_assignments = assignments.copy()
        # Update step: each centroid becomes the mean of the points assigned to it.
        for k in range(no_of_clusters):
            centroids[k] = data[assignments[:, k] == 1].mean(axis=0)
        # Assignment step: re-assign each point to its nearest centroid.
        distances = cdist(data, centroids, 'euclidean')
        assignments = np.zeros((no_of_samples, no_of_clusters))
        for idx in range(no_of_samples):
            assignments[idx, np.argmin(distances[idx])] = 1
        # Evaluate the k-means objective before and after the update.
        old_loss = 0
        new_loss = 0
        for i in range(no_of_samples):
            for k in range(no_of_clusters):
                old_loss += old_assignments[i, k] * np.linalg.norm(data[i] - old_centroids[k])**2
                new_loss += assignments[i, k] * np.linalg.norm(data[i] - centroids[k])**2
        loss_ratio = (old_loss - new_loss) / new_loss
        if (counter + 1) % print_output == 0:
            print(f'Iteration {counter + 1}, Loss: {new_loss}')
            print(f'Centroids: \n {centroids}')
        # Stop once the relative decrease in the loss falls below the tolerance.
        if loss_ratio < tolerance:
            print(f'Finished after {counter + 1} iterations.')
            break
    return centroids, assignments
We cluster the Old Faithful dataset into two clusters, initialising our centroids with the two centroid vectors $\mu_1 = \left( \begin{matrix} 3.5 & 50\end{matrix}\right)^\top$ and $\mu_2 = \left( \begin{matrix} 3 & 80\end{matrix}\right)^\top$.
centroids, assignments = k_means_clustering(old_faithful_dataset, no_of_clusters=2,
                                            init_centroids=np.array([[3.5, 50], [3, 80]]),
                                            maximum_counter=300, tolerance=1e-9, print_output=10)
centroids_init: 
 [[ 3.5 50. ]
 [ 3. 80. ]]
Finished after 2 iterations.
We plot the resulting clusters.
colormap_bright = ListedColormap(['#FF0000', '#0000FF'])
plt.scatter(old_faithful_dataset[:, 0], old_faithful_dataset[:, 1], c=assignments[:, 0], cmap=colormap_bright, edgecolors='k')
plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", s=500, color=["purple", "orange"], edgecolors='k')
plt.xlabel('Eruption time (in minutes)', fontsize=12)
plt.ylabel('Waiting time to next eruption (in minutes)', fontsize=12)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout();
labels = [np.argmax(i) for i in assignments]
SC = silhouette_score(old_faithful_dataset, labels)
print('The silhouette score is: %f' % SC)
The silhouette score is: 0.724055
A silhouette score close to 1 indicates compact, well-separated clusters, so a value of about 0.72 suggests the two clusters are fairly well separated.
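The silhouette score can also guide the choice of the number of clusters. As a minimal sketch (using scikit-learn's KMeans here for convenience rather than our own implementation; the exact scores will vary with the random initialisation), we could compare a few candidate values of $k$ and pick the one with the highest score:
# Sketch: compare silhouette scores for several candidate numbers of clusters.
for k in range(2, 6):
    candidate_labels = KMeans(n_clusters=k, n_init=10).fit_predict(old_faithful_dataset)
    score = silhouette_score(old_faithful_dataset, candidate_labels)
    print(f'k = {k}: silhouette score = {score:.3f}')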
We define a helper function load_mnist that reads the MNIST images and labels from gzipped IDX files.
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`."""
    import os
    import gzip
    import numpy as np
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)
    # The IDX label file has an 8-byte header and the image file a 16-byte header;
    # each image is stored as 28 x 28 = 784 unsigned bytes.
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8, offset=16).reshape(len(labels), 784)
    return images, labels
mnist_im, mnist_lb = load_mnist('DigitMNIST/')
The MNIST dataset is a collection of grayscale images of handwritten digits (0,1,...,9).
Here, for simplicity, we only look at the images representing the digits 4, 6 and 8, and to make the algorithms run faster we keep just the first 1000 of those images.
DIGITS = [4, 6, 8]
digit_mask = np.isin(mnist_lb, DIGITS)
mnist_im_sub = mnist_im[digit_mask][:1000]
mnist_lb_sub = mnist_lb[digit_mask][:1000]
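As a quick sanity check (a minimal sketch using the arrays defined above), we can confirm the shape of the subset and which labels remain:
# Sketch: verify that only the digits 4, 6 and 8 remain and that we kept 1000 images.
print(mnist_im_sub.shape)         # expected: (1000, 784)
print(np.unique(mnist_lb_sub))    # expected: [4 6 8]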
Here is a sample of these images.
M = 5
plt.figure(figsize=(10, 10))
for i in range(M):
    for j in range(M):
        t = j + i * M
        plt.subplot(M, M, t + 1)
        plt.imshow(np.reshape(mnist_im_sub[t, :], (28, 28)), cmap='gray')
        plt.axis('off')
We use the KMeans, GaussianMixture and SpectralClustering methods from scikit-learn and compare them.
km_model = KMeans(n_clusters=3).fit(mnist_im_sub)
km_labels = km_model.labels_
km_centers = km_model.cluster_centers_
gmm_model = GaussianMixture(n_components=3).fit(mnist_im_sub)
gm_labels = gmm_model.predict(mnist_im_sub)
gm_centers = gmm_model.means_
sc_model = SpectralClustering(n_clusters=3, affinity='nearest_neighbors').fit(mnist_im_sub)
sp_labels = sc_model.labels_
M = 10
plt.figure(figsize=(15, 5))
for i in range(M):
    plt.subplot(1, M, i + 1)
    plt.imshow(np.reshape(mnist_im_sub[i, :], (28, 28)), cmap='gray')
    plt.axis('off')
print(f'KMeans: {km_labels[:10]}')
print(f'GMM: {gm_labels[:10]}')
print(f'SC: {sp_labels[:10]}')
KMeans: [1 1 0 2 0 1 1 2 0 0]
GMM: [1 1 2 0 2 1 1 0 2 2]
SC: [1 1 2 0 2 1 1 0 2 2]
Up to a relabelling of the clusters, all three methods group these first ten images in the same way.
As a second check, we display the centers of the clusters for KMeans and GMM (the nature of SpectralClustering means it does not produce centroid vectors).
plt.figure(figsize=(10, 5))
K = 3
for counter in range(K):
    # Reshape each 784-dimensional center back into a 28 x 28 image.
    im_show_km = km_centers[counter].reshape(28, 28)
    im_show_gm = gm_centers[counter].reshape(28, 28)
    # Top row: KMeans centers; bottom row: GMM means.
    plt.subplot(2, K, counter + 1)
    plt.imshow(im_show_km, cmap='gray')
    plt.axis('off')
    plt.subplot(2, K, counter + K + 1)
    plt.imshow(im_show_gm, cmap='gray')
    plt.axis('off')
plt.tight_layout();
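Although SpectralClustering does not expose centroid vectors, nothing stops us from computing the mean image of each spectral cluster directly from its labels. A minimal sketch, reusing sp_labels and mnist_im_sub from above:
# Sketch: display the mean image of each spectral cluster.
plt.figure(figsize=(10, 3))
for k in range(3):
    sp_mean = mnist_im_sub[sp_labels == k].mean(axis=0).reshape(28, 28)
    plt.subplot(1, 3, k + 1)
    plt.imshow(sp_mean, cmap='gray')
    plt.axis('off')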
For spectral clustering, we want to examine the embedding that the algorithm produces.
For that we can use sklearn.manifold.SpectralEmbedding.
We apply the embedding $P:\mathbb{R}^{784}\to \mathbb{R}^2$
to transform the array of high dimensional vectors in mnist_im_sub ($1000\times 784$) into a new array mnist_im_emb that is of dimension $1000\times 2$.
embedding = SpectralEmbedding(n_components=2)
mnist_im_emb = embedding.fit_transform(mnist_im_sub)
plt.scatter(mnist_im_emb[:,0], mnist_im_emb[:,1], c=mnist_lb_sub);
Finally, we would like to get a quantitative estimate of the clustering quality of each of the methods.
Since in this example we know what the ground truth is (the labels given by the MNIST database), we can use an external evaluation.
We use rand_score (imported from sklearn.metrics) to compute the Rand Score for each of the three algorithms.
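The Rand index is the fraction of pairs of points on which the clustering and the ground truth agree, i.e. pairs that are either placed together in both labellings or kept apart in both. For illustration only, here is a minimal pair-counting sketch of this definition (quadratic in the number of samples, so much slower than rand_score, which is what we actually use):
# Sketch: direct O(n^2) pair-counting implementation of the Rand index.
def rand_index_naive(labels_true, labels_pred):
    n = len(labels_true)
    agreements = 0
    pairs = 0
    for i in range(n):
        for j in range(i + 1, n):
            same_true = labels_true[i] == labels_true[j]
            same_pred = labels_pred[i] == labels_pred[j]
            agreements += (same_true == same_pred)
            pairs += 1
    return agreements / pairs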
km_RI = rand_score(mnist_lb_sub, km_labels)
gm_RI = rand_score(mnist_lb_sub, gm_labels)
sp_RI = rand_score(mnist_lb_sub, sp_labels)
print("Rand Index for K-Means clustering: %.10f" % km_RI)
print("Rand Index for Gassian Mixture clustering: %.10f" % gm_RI)
print("Rand Index for Spectral clustering: %.10f" % sp_RI)
Rand Index for K-Means clustering: 0.8779839840
Rand Index for Gaussian Mixture clustering: 0.8802842843
Rand Index for Spectral clustering: 0.9549449449
With a Rand index of about 0.95, Spectral Clustering is the most accurate of the three methods on this subset.
We explore another clustering application: compressing colour images.
An 8-bit RGB image uses $256 = 2^8$ possible values per colour-channel.
In other words, each pixel in the image requires $3\times 8 = 24$ bits of storage.
Suppose we would like to approximate such an image with an $n$-bit image, where $n < 24$: each pixel then stores only an $n$-bit index into a palette of $2^n$ RGB colours. How do we choose these $2^n$ palette colours so that the image looks similar to the original image? This can be done with clustering methods.
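To see how much this saves, here is a back-of-the-envelope sketch of the storage cost (assuming the $480 \times 480$ image used below, with the palette itself stored at 24 bits per colour):
# Sketch: storage cost of the original image versus an n-bit palette image.
height, width = 480, 480
for n in [1, 2, 6]:
    original_bits = height * width * 24                  # 24 bits per pixel
    palette_bits = (2 ** n) * 24                         # palette of 2^n RGB colours
    compressed_bits = height * width * n + palette_bits  # n-bit index per pixel + palette
    print(f'n = {n}: {compressed_bits / original_bits:.3f} of the original size')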
image = img_as_float64(io.imread('mandrill.png'))
plt.imshow(image)
plt.axis('off')
plt.tight_layout();
We use sklearn.cluster.KMeans to apply the $k$-means clustering algorithm to the image.
The number of clusters is set to $2^\text{nbits}$, where nbits is the number of bits used to store each pixel.
We repeat the process for $\text{nbits} = 1, 2, 6$. The image is a $480 \times 480$ grid of RGB values, which we reshape into an array of 230,400 RGB triples before clustering.
def compression(nbits):
    # Cluster the 230,400 RGB pixels into 2^nbits representative colours.
    pixels = image.reshape(image.shape[0] * image.shape[1], 3)
    kmeans = KMeans(2**nbits).fit(pixels)
    centers = kmeans.cluster_centers_
    labels = kmeans.labels_
    return centers, labels
centers_1, labels_1 = compression(1)
print('done')
centers_2, labels_2 = compression(2)
print('done')
centers_6, labels_6 = compression(6)
print('done')
done
done
done
image_comp_1 = centers_1[labels_1].reshape(480, 480, 3)
image_comp_2 = centers_2[labels_2].reshape(480, 480, 3)
image_comp_6 = centers_6[labels_6].reshape(480, 480, 3)
fig, axs = plt.subplots(1, 3, figsize=(15,5))
axs = axs.ravel()
axs[0].imshow(image_comp_1)
axs[1].imshow(image_comp_2)
axs[2].imshow(image_comp_6)
<matplotlib.image.AxesImage at 0x7f8d68578820>
The quality improves quickly with the number of bits: the 1-bit and 2-bit images are heavily posterised, while the 6-bit image (a palette of 64 colours) already looks close to the original.
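To go beyond a visual judgement, one could also quantify the approximation error. A minimal sketch (assuming the arrays defined above), computing the mean squared error between the original image and each compressed version:
# Sketch: mean squared error of each compressed image against the original.
for nbits, image_comp in [(1, image_comp_1), (2, image_comp_2), (6, image_comp_6)]:
    mse = np.mean((image - image_comp) ** 2)
    print(f'{nbits}-bit image: MSE = {mse:.5f}')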