import numpy as np
from skimage.data import astronaut
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine
from numpy.linalg import svd
from scipy import io
%matplotlib inline
Every matrix can be factorised in the form $X=U \Sigma V^T$, where the columns of $U$ are the left singular vectors, the rows of $V^T$ are the right singular vectors, and $\Sigma$ is the diagonal matrix of singular values. This decomposition can be used to generate low-rank approximations of matrices, preserving as much information as possible whilst reducing the storage required.
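As a quick sanity check (a minimal sketch on a small random matrix, not part of the compression pipeline below), numpy's SVD reconstructs the original matrix exactly:
X = np.random.rand(4, 3)                      # illustrative example matrix
U, s, Vt = np.linalg.svd(X, full_matrices=False)
print(np.allclose(X, U @ np.diag(s) @ Vt))    # True: U @ diag(s) @ Vt recovers X
print(s)                                      # singular values, in decreasing order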
The following function takes a matrix M and a desired rank k, and returns the best approximation for M by a rank k matrix.
def low_rank(M, k):
    # Best rank-k approximation of M in the Frobenius norm: keep only the k largest singular values.
    u, s, v = np.linalg.svd(M, full_matrices=False)
    s[k:] = 0
    return u @ np.diag(s) @ v
We test on a 512x512 image of Eileen Collins from skimage.data.
image = astronaut()
plt.figure(figsize=(8,8))
plt.imshow(image)
plt.axis('off') ;
We now separate the image into the three colour channels (RGB), making sure that the values are scaled to be between 0 and 1 (instead of 0 to 255).
image_R = image[:, :, 0] / 255.0
image_G = image[:, :, 1] / 255.0
image_B = image[:, :, 2] / 255.0
plt.figure(figsize=(15,10))
plt.subplot(1,3,1)
plt.imshow(image_R, cmap='gray')
plt.axis('off') ;
plt.subplot(1,3,2)
plt.imshow(image_G, cmap='gray')
plt.axis('off') ;
plt.subplot(1,3,3)
plt.imshow(image_B, cmap='gray')
plt.axis('off') ;
We plot the singular values for each separate channel (regular and log-log).
SR = np.linalg.svd(image_R, compute_uv=False)
SG = np.linalg.svd(image_G, compute_uv=False)
SB = np.linalg.svd(image_B, compute_uv=False)
plt.figure(figsize=(15,5))
plt.subplot(1, 2, 1)
plt.plot(SR, 'r')
plt.plot(SG, 'g')
plt.plot(SB, 'b')
plt.xlabel('k')
plt.ylabel(r'$\sigma$')
plt.subplot(1, 2, 2)
plt.loglog(SR, 'r')
plt.loglog(SG, 'g')
plt.loglog(SB, 'b')
plt.xlabel('k')
plt.ylabel(r'$\sigma$');
The Frobenius norm of a matrix $M$ is $$ \|M\|_F = \sqrt{\sum_{i,j} M_{ij}^2} = \sqrt{\sum_i \sigma_i^2},$$ where $\sigma_i$ are the singular values of $M$.
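As a hedged numerical check of this identity, using the red channel and its singular values computed above:
frob_direct = np.sqrt(np.sum(image_R**2))     # Frobenius norm from the matrix entries
frob_from_sv = np.sqrt(np.sum(SR**2))         # Frobenius norm from the singular values
print(np.isclose(frob_direct, frob_from_sv))  # True, up to floating-point error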
Our goal is to compress the image by finding a lower rank approximation. In order to assess the quality of the compressed image we can use the ratio of the Frobenius norm between the compressed image and the original one. Let $M_k$ be the approximation of $M$ by a rank-$k$ matrix.
Define $$ \rho_k(M) = \frac{\| M_k\|_F}{\|M\|_F}.$$
We compute the values of $\rho_k(M)$ for $M =$ image_R, image_G, image_B, and $k=1,\dots,512$.
image_R_frob = np.sqrt(np.sum(SR**2))
image_G_frob = np.sqrt(np.sum(SG**2))
image_B_frob = np.sqrt(np.sum(SB**2))
rho_R = np.sqrt(np.cumsum(SR**2)) / image_R_frob
rho_G = np.sqrt(np.cumsum(SG**2)) / image_G_frob
rho_B = np.sqrt(np.cumsum(SB**2)) / image_B_frob
plt.plot(rho_R, 'r')
plt.plot(rho_G, 'g')
plt.plot(rho_B, 'b');
We want to compress the image while controlling the loss of quality,
i.e., find the smallest rank $k$ required (per channel) to achieve $\rho_k \geq \text{QUALITY}$.
QUALITY = 0.999
k_R = np.where(rho_R >= QUALITY)[0][0]
k_G = np.where(rho_G >= QUALITY)[0][0]
k_B = np.where(rho_B >= QUALITY)[0][0]
We compress each of the channels separately, using the low_rank function above, and using the above k_R,k_G,k_B as the input. We clip the results to ensure every value is between 0 and 1.
low_R = low_rank(image_R, k_R)
low_G = low_rank(image_G, k_G)
low_B = low_rank(image_B, k_B)
low_R = np.clip(low_R, 0, 1)
low_G = np.clip(low_G, 0, 1)
low_B = np.clip(low_B, 0, 1)
Here are our results.
plt.figure(figsize=(20,12))
plt.subplot(1,4,1)
plt.imshow(low_R, cmap='gray')
plt.axis('off') ;
plt.subplot(1,4,2)
plt.imshow(low_G, cmap='gray')
plt.axis('off') ;
plt.subplot(1,4,3)
plt.imshow(low_B, cmap='gray')
plt.axis('off') ;
plt.subplot(1,4,4)
low_im = np.zeros(image.shape)
low_im[:,:,0] = low_R
low_im[:,:,1] = low_G
low_im[:,:,2] = low_B
plt.imshow(low_im)
plt.axis('off') ;
We implement a function compression_rate that computes the compression ratio achieved by the low-rank approximation. The inputs are the image height and width (IMH, IMV) and the ranks used for the red, green, and blue channels (KR, KG, KB).
The output of this function is the ratio $N_{\text{original}}/N_{\text{compressed}}$, where $N_{\text{original}}$ is the total number of values needed to store the original image (flat, no SVD), and $N_{\text{compressed}}$ is the total number of values needed to store our low-rank representation.
def compression_rate(IMH, IMV, KR, KG, KB):
    # A rank-k channel needs k singular values plus k left and k right singular vectors.
    N_original = IMH * IMV * 3
    N_compressed = (KR + KG + KB) * (1 + IMH + IMV)
    return N_original / N_compressed
c_rate = compression_rate(image.shape[0], image.shape[1], k_R, k_G, k_B)
print('The compression rate for k_R = %d, k_G = %d, k_B = %d, is: %0.3f' % (k_R, k_G, k_B, c_rate) )
The compression rate for k_R = 76, k_G = 102, k_B = 111, is: 2.655
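As a sanity check on this number: the original image needs $512 \times 512 \times 3 = 786432$ values, while the rank-$(76, 102, 111)$ representation needs $(76+102+111)\cdot(1+512+512) = 289 \cdot 1025 = 296225$ values, and $786432 / 296225 \approx 2.655$.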
Below we present an assortment of low-rank versions of our original image, for each colour channel and then combined, with the compression rate shown above the full colour image.
KS = [1, 2, 5, 10, 25, 50, 100]
NK = len(KS)
plt.figure(figsize=(15,30))
c = 0
for K in KS:
    low_R = np.clip(low_rank(image_R, K), 0, 1)
    low_G = np.clip(low_rank(image_G, K), 0, 1)
    low_B = np.clip(low_rank(image_B, K), 0, 1)
    c_rate = compression_rate(image.shape[0], image.shape[1], K, K, K)
    plt.subplot(NK,4,c*4+1)
    plt.imshow(low_R, cmap='gray')
    plt.axis('off')
    plt.subplot(NK,4,c*4+2)
    plt.imshow(low_G, cmap='gray')
    plt.axis('off')
    plt.subplot(NK,4,c*4+3)
    plt.imshow(low_B, cmap='gray')
    plt.axis('off')
    plt.subplot(NK,4,c*4+4)
    low_im = np.zeros(image.shape)
    low_im[:,:,0] = low_R
    low_im[:,:,1] = low_G
    low_im[:,:,2] = low_B
    plt.imshow(low_im)
    plt.axis('off')
    plt.title('%0.3f' % c_rate)
    c = c+1
We want to demonstrate PCA on a 13-dimensional dataset, the wine dataset from sklearn.
This dataset contains chemical analyses of N=178 different wines from three different cultivators.
The analysis contains the following measurements:
Alcohol
Malic acid
Ash
Alcalinity of ash
Magnesium
Total phenols
Flavanoids
Nonflavanoid phenols
Proanthocyanins
Colour intensity
Hue
OD280/OD315 of diluted wines
Proline
Overall we have N=178 data points, lying in $\mathbb{R}^{D}$, with D=13. We stack all points together into a matrix X_wine $\in \mathbb{R}^{D\times N}$.
We have labels 0, 1, or 2 for each wine (cultivator). The true labels are given in L_wine.
We want to see whether PCA can be helpful in the unsupervised task of clustering the 178 wines.
We start by loading the dataset, and printing the first 5 data points, just to get a general impression.
X_wine, L_wine = load_wine(return_X_y=True)
X_wine = X_wine.T
np.set_printoptions(suppress=True)
print('First 5 data points:')
print('--------------------')
print(X_wine[:,0:5])
First 5 data points:
--------------------
[[  14.23   13.2    13.16   14.37   13.24]
 [   1.71    1.78    2.36    1.95    2.59]
 [   2.43    2.14    2.67    2.5     2.87]
 [  15.6    11.2    18.6    16.8    21.  ]
 [ 127.    100.    101.    113.    118.  ]
 [   2.8     2.65    2.8     3.85    2.8 ]
 [   3.06    2.76    3.24    3.49    2.69]
 [   0.28    0.26    0.3     0.24    0.39]
 [   2.29    1.28    2.81    2.18    1.82]
 [   5.64    4.38    5.68    7.8     4.32]
 [   1.04    1.05    1.03    0.86    1.04]
 [   3.92    3.4     3.17    3.45    2.93]
 [1065.   1050.   1185.   1480.    735.  ]]
We implement a function called pc_transform. Its inputs are a data matrix X (one data point per column) and a list PCS of principal-component indices.
The function computes the principal components listed in PCS, and then returns the projection of X on these principal components.
For example, pc_transform(X, [0,4]) should return the projection of X on the first and fifth principal components.
def pc_transform(X, PCS):
    # Assumes X is already centred; its left singular vectors are the principal components.
    U, _, _ = np.linalg.svd(X, full_matrices=False)
    return (U[:, PCS]).T @ X
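As a hedged cross-check (a sketch only; sklearn.decomposition.PCA is assumed available alongside the sklearn imports above), the same projection can be recovered with sklearn's PCA, up to an arbitrary sign per component:
from sklearn.decomposition import PCA
Xc = X_wine - X_wine.mean(axis=1, keepdims=True)         # centre each feature (row)
ours = pc_transform(Xc, [0, 1])                          # 2 x N projection
theirs = PCA(n_components=2).fit_transform(X_wine.T).T   # 2 x N (PCA centres internally)
print(np.allclose(np.abs(ours), np.abs(theirs)))         # True, up to the sign ambiguity of the SVD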
We now compute the projection of X_wine along the first 2 principal components, placing the result in a matrix Y $\in \mathbb{R}^{2\times N}$.
We then plot the projected points, and colour them according to the original labels.
MX = np.mean(X_wine,axis=1)
MX = MX[:, np.newaxis]
XZ = X_wine-MX
Y = pc_transform(XZ,[0,1])
plt.scatter(Y[0,:], Y[1,:], c=L_wine);
We will now try to improve the result by normalisation: we take each row of X_wine and standardise it to have mean 0 and variance 1, using StandardScaler from sklearn.preprocessing.
scaler = StandardScaler()
scaler.fit(X_wine.T)
XS = scaler.transform(X_wine.T)
XS = XS.T
YS = pc_transform(XS, [0,1])
plt.scatter(YS[0,:], YS[1,:], c=L_wine);
This has indeed helped: after standardisation, the three cultivators form much clearer clusters in the first two principal components.
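For reference, the StandardScaler step above is equivalent to standardising each row of X_wine by hand (a sketch; StandardScaler uses the population standard deviation, as np.std does by default):
XS_manual = (X_wine - X_wine.mean(axis=1, keepdims=True)) / X_wine.std(axis=1, keepdims=True)
print(np.allclose(XS_manual, XS))   # True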
We now work with a dataset consisting of numerous faces and create a basis of so-called eigenfaces. The images are 192x168, and are stored as vectors of length 32,256.
This utility function converts an image from vector to matrix representation, so it can be shown on the screen.
def vec2img(vec):
    return np.reshape(vec, (168, 192)).T
We load the faces database.
FACES - an array where each entry is a collection of images of a single person (one image per column)
NPEOPLE - number of people in the list (should be 20)
NFACES - number of images per person (should be 64)
f = open('faces.npy','rb')
FACES = np.load(f)
NPEOPLE = len(FACES)
NFACES = 64
NR = int(np.sqrt(NFACES))
f.close()
First, we will examine the photos of a single person.
PI - the index of the person we examine.
X_person - the data matrix, containing all images of person PI, as columns.
We start by presenting all the photos.
PI = 1
X_person = FACES[PI]
plt.figure(figsize=(15,15))
for i in range(NFACES):
    im = vec2img(X_person[:,i])
    plt.subplot(NR,NR,i+1)
    plt.imshow(im,cmap='gray')
    plt.axis('off')
Next, we want to find the "eigenfaces", i.e., the principal components of this collection of images.
When using PCA, we should do all the processing on the centred data, i.e., with mean 0.
Therefore, we take the following steps: compute the mean face M_person, subtract it from every image (column) of X_person to obtain the centred matrix XZ_person, and compute the SVD of XZ_person.
At the end of this process, the columns of U_person represent the eigenfaces.
M_person = np.mean(X_person,1)
M_person = M_person[:,np.newaxis]
XZ_person = X_person-M_person
U_person, S_person , V_person = svd(XZ_person, full_matrices=False)
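As a quick, hedged check on how much structure the leading eigenfaces capture, we can look at the cumulative fraction of the (squared-singular-value) variance they explain:
var_explained = np.cumsum(S_person**2) / np.sum(S_person**2)
print(var_explained[:15])   # fraction of variance captured by the first 15 eigenfaces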
We present the first 15 eigenfaces, along with the "average" face.
plt.figure(figsize=(15,15))
plt.subplot(4,4,1)
plt.imshow(vec2img(M_person), cmap='gray')
plt.axis('off')
plt.title('mean')
for i in range(15):
    plt.subplot(4,4,i+2)
    plt.imshow(vec2img(U_person[:,i]), cmap='gray')
    plt.axis('off')
    plt.title('$u_{%i}$' % (i+1))
We can now try to get a feeling for what the eigenfaces represent. We reconstruct the images using only the first 2 eigenfaces (adding the mean face back).
X12 = U_person[:,:2]@np.diag(S_person[0:2])@V_person[:2,:] + M_person
plt.figure(figsize=(15,15))
for i in range(NFACES):
    im = vec2img(X12[:,i])
    plt.subplot(NR,NR,i+1)
    plt.imshow(im,cmap='gray')
    plt.axis('off')
We do the same for the 3rd and 4th eigenfaces.
X34 = U_person[:,2:4]@np.diag(S_person[2:4])@V_person[2:4,:] + M_person
plt.figure(figsize=(15,15))
for i in range(NFACES):
    im = vec2img(X34[:,i])
    plt.subplot(NR,NR,i+1)
    plt.imshow(im,cmap='gray')
    plt.axis('off')
It is easy to see that the eigenfaces capture different lighting aspects of the images.
We now see the effect of taking not just one person, but all the people in the dataset and repeating this procedure.
X_all = np.concatenate(FACES,1)
plt.figure(figsize=(15,15))
for i in range(NFACES):
    j = np.random.randint(NFACES*NPEOPLE)
    im = vec2img(X_all[:,j])
    plt.subplot(NR,NR,i+1)
    plt.imshow(im,cmap='gray')
    plt.axis('off')
M_all = np.mean(X_all,1)
M_all = M_all[:,np.newaxis]
XZ_all = X_all-M_all
U_all, S_all, _ = svd(XZ_all, full_matrices=False)
plt.figure(figsize=(15,15))
plt.subplot(4,4,1)
plt.imshow(vec2img(M_all), cmap='gray')
plt.axis('off')
plt.title('mean')
for i in range(15):
    plt.subplot(4,4,i+2)
    plt.imshow(vec2img(U_all[:,i]), cmap='gray')
    plt.axis('off')
    plt.title('$u_{%i}$' % (i+1))
We now try to test the effect of the eigenfaces in a different way.
For each eigenface $u_i$ we show the images $M + t\cdot u_i$, where $t$ ranges over $\big[-\frac{\sigma_i}{20},\frac{\sigma_i}{20}\big]$.
The list I indicates which eigenfaces to examine.
I = [0,1,2,3,4,5]
NT = 11
SCALE = 20
NI = len(I)
M = X_all.mean(1)
plt.figure(figsize=(15,10))
for i in range(NI):
    T = np.linspace(-S_all[I[i]]/SCALE, S_all[I[i]]/SCALE, NT)
    for j in range(NT):
        im = M + T[j]*U_all[:,I[i]]
        plt.subplot(NI, NT, i*NT+j+1)
        plt.imshow(vec2img(im), cmap='gray')
        plt.axis('off')
Finally, we would like to see whether the eigenfaces can be useful for clustering the face images. We will only focus on binary clustering here.
The list in PIS contains exactly two indexes of the two people we want to test clustering on.
The list in PCS contains exactly two indexes, for the principal components to be used.
We create a data matrix X_bin that contains all images from these two people.
Next, we use the pc_transform function to project on the two principal components listed in PCS,
and the result is placed in the matrix Y_bin.
These principal components will be plotted, coloured according to the person.
PIS = [5,15]
PCS = [4,6]
cols = np.zeros(NFACES*2)
cols[NFACES:] = 1
X_bin = np.concatenate(FACES[PIS],1)
M_bin = np.mean(X_bin, axis=1)
M_bin = M_bin[:, np.newaxis]
XZ_bin = X_bin-M_bin
Y_bin = pc_transform(XZ_bin, PCS)
plt.scatter(Y_bin[0,:], Y_bin[1,:],c=cols);
PIS = [5,15]
PCS = [0,1]
cols = np.zeros(NFACES*2)
cols[NFACES:] = 1
X_bin = np.concatenate(FACES[PIS],1)
M_bin = np.mean(X_bin, axis=1)
M_bin = M_bin[:, np.newaxis]
XZ_bin = X_bin-M_bin
Y_bin = pc_transform(XZ_bin, PCS)
plt.scatter(Y_bin[0,:], Y_bin[1,:],c=cols);
Some principal components are much more effective at clustering than others, as shown above.
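To make this impression slightly more quantitative, here is a hedged sketch (clustering_accuracy is a hypothetical helper, and KMeans from sklearn.cluster is assumed available; the exact number depends on the random state) that runs 2-means on a 2-D projection and scores it against the true person labels:
from sklearn.cluster import KMeans

def clustering_accuracy(Y, labels):
    # Cluster the 2-D projection (columns of Y) into two groups and compare with the labels.
    pred = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(Y.T)
    agree = np.mean(pred == labels)
    return max(agree, 1 - agree)   # cluster ids are arbitrary, so take the better matching

print(clustering_accuracy(Y_bin, cols))   # Y_bin from the last cell above (PCS = [0, 1])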