import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
%matplotlib inline
Our goal is to classify the spoken digits from the AudioMNIST dataset. The training set contains 2400 samples, each described by 900 Mel-frequency cepstral coefficients (MFCCs). We will use these 900 coefficients as the features to train on.
Below is a visualisation of what one of these samples looks like as an image.
# Load the MFCC features and labels for the training and testing splits.
# The labels are reshaped into column vectors, one label per row.
audio_mnist_training_mfccs = np.genfromtxt('AudioMNIST/MFCC/Training/training_mfccs.txt')
audio_mnist_training_labels = np.genfromtxt('AudioMNIST/MFCC/Training/training_labels.txt').reshape(-1,1)
audio_mnist_testing_mfccs = np.genfromtxt('AudioMNIST/MFCC/Testing/testing_mfccs.txt')
audio_mnist_testing_labels = np.genfromtxt('AudioMNIST/MFCC/Testing/testing_labels.txt').reshape(-1,1)

# Pick a random training sample. The upper bound follows the number of loaded
# rows instead of a hard-coded 2400, so the index is always valid.
item_number = np.random.randint(low=0, high=len(audio_mnist_training_mfccs))

# Show the sample's 900 coefficients as a 30 x 30 image.
plt.imshow(audio_mnist_training_mfccs[item_number].reshape(30, 30), cmap='hot')
plt.title(
    "This is the {n}th audio sample of the Audio MNIST training set. The corresponding label is {l}".format(
        n=item_number,
        l=int(audio_mnist_training_labels[item_number][0]),
    )
)
# tight_layout must be CALLED; the bare attribute reference did nothing.
plt.tight_layout()
<function matplotlib.pyplot.tight_layout(*, pad=1.08, h_pad=None, w_pad=None, rect=None)>
We implement a neural network from scratch, using an architecture of only two densely connected layers. Below are all the classes and functions we will need to train the network.
class Layer():
    """Base class for the layers of the network.

    Sub-classes override ``forward`` and ``backward``; the base
    implementations do nothing and return ``None``.
    """

    def __init__(self):
        # No data has flowed through the layer yet. The original assigned the
        # builtin ``input`` function and an undefined name ``output`` here,
        # which raised NameError on construction.
        self.input = None
        self.output = None

    def forward(self, input):
        """Compute the layer output for ``input`` (no-op in the base class)."""
        pass

    def backward(self, output_gradient, learning_rate):
        """Propagate ``output_gradient`` backwards (no-op in the base class)."""
        pass
class Dense(Layer):
    """Fully connected layer computing ``weights @ input + bias``.

    ``weights`` has shape ``(output_size, input_size)`` and ``bias`` has shape
    ``(output_size, 1)``; both are drawn from a standard normal distribution.
    """

    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(output_size, input_size)
        self.bias = np.random.randn(output_size, 1)

    def forward(self, input):
        """Store ``input`` for the backward pass and return the affine output."""
        self.input = input
        return np.dot(self.weights, self.input) + self.bias

    def backward(self, output_gradient, learning_rate):
        """Apply one gradient-descent step and return the input gradient.

        ``output_gradient`` is the gradient of the loss with respect to this
        layer's output. The returned value is the gradient of the loss with
        respect to this layer's input.
        """
        weights_gradient = np.dot(output_gradient, self.input.T)
        # The input gradient must use the weights from the forward pass, so it
        # is computed BEFORE the parameters are updated. Computing it after the
        # update would hand the previous layer a gradient based on weights
        # that did not produce the output being differentiated.
        input_gradient = np.dot(self.weights.T, output_gradient)
        self.weights -= learning_rate * weights_gradient
        self.bias -= learning_rate * output_gradient
        return input_gradient
###
class Activation(Layer):
    """Layer that applies a function to its input, with no trainable state.

    ``activation`` is the function applied in the forward pass and
    ``activation_prime`` is its derivative, used in the backward pass.
    """

    def __init__(self, activation, activation_prime):
        self.activation_prime = activation_prime
        self.activation = activation

    def forward(self, input):
        """Remember ``input`` and return ``activation(input)``."""
        self.input = input
        return self.activation(input)

    def backward(self, output_gradient, learning_rate):
        """Return ``output_gradient`` times the derivative at the stored input.

        ``learning_rate`` is accepted for interface compatibility and unused.
        """
        local_slope = self.activation_prime(self.input)
        return np.multiply(output_gradient, local_slope)
class Sigmoid(Activation):
    """Activation layer using the logistic function ``1 / (1 + exp(-x))``."""

    def __init__(self):
        def logistic(z):
            denominator = 1 + np.exp(-z)
            return 1 / denominator

        def logistic_slope(z):
            # Derivative of the logistic function: s(z) * (1 - s(z)).
            activated = logistic(z)
            return activated * (1 - activated)

        super().__init__(logistic, logistic_slope)
###
def mse(y_true, y_pred):
    """Return the mean of the squared differences between target and prediction."""
    residual = y_true - y_pred
    return np.mean(residual ** 2)
def mse_prime(y_true, y_pred):
    """Return the gradient of the mean squared error with respect to ``y_pred``."""
    difference = y_pred - y_true
    element_count = np.size(y_true)
    return 2 * difference / element_count
###
def predict(network, input):
    """Feed ``input`` through every layer of ``network`` in order.

    Each layer's ``forward`` result becomes the next layer's input; the result
    of the last layer is returned. An empty network returns ``input`` unchanged.
    """
    activation = input
    for stage in network:
        activation = stage.forward(activation)
    return activation
def train(network,
          loss,
          loss_prime,
          x_train,
          y_train,
          batch_size=None,
          epochs = 1000,
          learning_rate = 0.01,
          verbose = True,
          print_output=10):
    """Train ``network`` in place with per-sample gradient descent.

    Parameters:
        network: sequence of layers exposing ``forward`` and ``backward``.
        loss: callable ``loss(y_true, y_pred)`` returning a scalar error.
        loss_prime: callable returning the gradient of ``loss`` with respect
            to the prediction.
        x_train, y_train: arrays of samples and targets, indexed along axis 0.
        batch_size: number of samples visited per epoch. ``None`` (default)
            visits every sample once per epoch. Smaller values draw a fresh
            random subset of that size from the whole training set each epoch.
        epochs: number of passes to run.
        learning_rate: step size handed to every layer's ``backward``.
        verbose: when true, print the mean error every ``print_output`` epochs.
        print_output: reporting interval, in epochs.

    Returns:
        The same ``network`` object, whose layers have been updated.
    """
    print('Training...')
    n_samples = len(x_train)
    if batch_size is None:
        batch_size = n_samples
    if not verbose:
        print('Thinking... beep boop')
    for e in range(epochs):
        error = 0
        # Shuffle the WHOLE training set and keep the first ``batch_size``
        # indices. Permuting only ``range(batch_size)`` would restrict training
        # to the first ``batch_size`` samples on every epoch.
        p = np.random.permutation(n_samples)[:batch_size]
        for x, y in zip(x_train[p], y_train[p]):
            # forward
            output = predict(network, x)
            # error
            error += loss(y, output)
            # backward: parameters are updated after every single sample
            grad = loss_prime(y, output)
            for layer in reversed(network):
                grad = layer.backward(grad, learning_rate)
        # Mean error over the samples actually visited this epoch.
        error /= len(p)
        if verbose and (e + 1) % print_output == 0:
            print(f"{e + 1}/{epochs}, error={error}")
    return network
def one_hot_vector_encoding(labels, no_of_classes=None):
    """Return a one-hot matrix with one row per label.

    Parameters:
        labels: array-like of class indices. Values are cast to ``int``, and
            any shape (for example a 1-D vector or an ``(n, 1)`` column) is
            flattened first.
        no_of_classes: number of columns in the result. Defaults to
            ``max(labels) + 1``, or 0 when ``labels`` is empty.

    Returns:
        Float array of shape ``(len(labels), no_of_classes)`` with a single 1
        in each row, at the column given by that row's label.
    """
    # Flatten so that column-vector labels do not broadcast against the row
    # index below, which would fill an (n, n) grid of wrong cells.
    labels = np.asarray(labels).astype(int).ravel()
    if no_of_classes is None:
        no_of_classes = int(np.max(labels)) + 1 if labels.size else 0
    output = np.zeros((len(labels), no_of_classes))
    output[np.arange(len(labels)), labels] = 1
    return output
def preprocess_data(x, y):
    """Shuffle the samples and shape them for the network.

    ``x`` and ``y`` are shuffled with the same random permutation. The features
    are reshaped to ``(n, 900, 1)`` column vectors, cast to float32 and divided
    by 255. The labels are one-hot encoded and reshaped to ``(n, 10, 1)``.

    Returns:
        Tuple ``(features, targets)``.
    """
    order = np.random.permutation(len(x))
    shuffled_x, shuffled_y = x[order], y[order]

    features = shuffled_x.reshape(shuffled_x.shape[0], 30 * 30, 1)
    features = features.astype("float32") / 255

    targets = one_hot_vector_encoding(shuffled_y)
    targets = targets.reshape(targets.shape[0], 10, 1)
    return features, targets
def classification_accuracy(true_labels, recovered_labels):
    """Return the fraction of positions where the two label sequences agree.

    Positions are taken from ``true_labels``; ``recovered_labels`` is indexed
    at the same positions.
    """
    matches = sum(
        1
        for position, expected in enumerate(true_labels)
        if expected == recovered_labels[position]
    )
    return matches / len(true_labels)
We now train the network for 20 epochs over a range of learning rates. As we do not have a validation set for this data, we validate on training accuracy.
import pandas as pd
np.random.seed(210711042)

# Reload the data; the labels stay 1-D here because they are used as integer
# class indices by the one-hot encoding inside preprocess_data.
audio_mnist_training_mfccs = np.genfromtxt('AudioMNIST/MFCC/Training/training_mfccs.txt')
audio_mnist_training_labels = np.genfromtxt('AudioMNIST/MFCC/Training/training_labels.txt')
audio_mnist_testing_mfccs = np.genfromtxt('AudioMNIST/MFCC/Testing/testing_mfccs.txt')
audio_mnist_testing_labels = np.genfromtxt('AudioMNIST/MFCC/Testing/testing_labels.txt')

x_train, y_train = preprocess_data(audio_mnist_training_mfccs, audio_mnist_training_labels)
x_test, y_test = preprocess_data(audio_mnist_testing_mfccs, audio_mnist_testing_labels)

# Integer class labels recovered once from the one-hot targets.
train_true_labels = [np.argmax(y) for y in y_train]
test_true_labels = [np.argmax(y) for y in y_test]

### iterate below to test for effect of learning rate on validation/test error.
nn_val_accuracies = []
nn_test_accuracies = []
l_range = np.linspace(0.01, 2, 10)
for l in l_range:
    print(f'Learning Rate = {l}')
    # neural network
    # Built anew for every learning rate. Reusing one network across the sweep
    # would let each rate continue from the weights left by the previous one,
    # so the accuracies would not be comparable between rates.
    network = [
        Dense(30 * 30, 40),
        Sigmoid(),
        Dense(40, 10),
        Sigmoid()
    ]
    # train
    model = train(network, mse, mse_prime, x_train, y_train, batch_size=None, epochs=20, learning_rate=l, print_output=20, verbose=True)
    # validate (on the training set)
    y_train_pred = [np.argmax(predict(network, x)) for x in x_train]
    nn_val_accuracies.append(classification_accuracy(train_true_labels, y_train_pred))
    # test
    y_test_pred = [np.argmax(predict(network, x)) for x in x_test]
    nn_test_accuracies.append(classification_accuracy(test_true_labels, y_test_pred))
print('Done.')
Learning Rate = 0.01 Training... 20/20, error=0.09340107941154037 Learning Rate = 0.23111111111111113 Training... 20/20, error=0.04788714308176815 Learning Rate = 0.45222222222222225 Training... 20/20, error=0.01958777912021465 Learning Rate = 0.6733333333333333 Training... 20/20, error=0.01065429215562957 Learning Rate = 0.8944444444444445 Training... 20/20, error=0.007902229471752807 Learning Rate = 1.1155555555555556 Training... 20/20, error=0.0072829541324684575 Learning Rate = 1.3366666666666667 Training... 20/20, error=0.005675824243068701 Learning Rate = 1.557777777777778 Training... 20/20, error=0.0047207657416331655 Learning Rate = 1.778888888888889 Training... 20/20, error=0.007156059678047401 Learning Rate = 2.0 Training... 20/20, error=0.006968689784519685 Done.
# Plot both accuracy curves against the learning rate and save the figure.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(l_range, nn_val_accuracies, label='Validation Accuracy')
ax.plot(l_range, nn_test_accuracies, label='Test Accuracy')
ax.grid()
ax.legend()
ax.set_xlabel('Learning Rate (L)')
ax.set_ylabel('Accuracy')
fig.savefig('val_test_l_rate_precise.png')
plt.show()
As would be expected, our training accuracy is always higher than our test accuracy, but there doesn't seem to be any sign of overfitting at this small number of epochs, and overall accuracy is quite high across all learning rates.
# Select the learning rate whose run scored the highest test accuracy.
# NOTE(review): this chooses a hyperparameter on the test set, so the reported
# test accuracy is optimistic - consider a held-out validation split.
best_lr_index = int(np.argmax(nn_test_accuracies))
optimum_l = l_range[best_lr_index]
print(f'Optimum learning rate is: {optimum_l:.2f}, with test accuracy: {max(nn_test_accuracies):.2f}')
Optimum learning rate is: 1.56, with test accuracy: 0.94
We now train with 100 epochs on the optimum learning rate to see if this improves accuracy.
np.random.seed(210711042)
# Start from freshly initialised weights so that this run measures 100 epochs
# at the optimum learning rate, not extra training on top of whatever weights
# an earlier cell left behind.
network = [
    Dense(30 * 30, 40),
    Sigmoid(),
    Dense(40, 10),
    Sigmoid()
]
# train
model = train(network, mse, mse_prime, x_train, y_train, batch_size=None, epochs=100, learning_rate=optimum_l, verbose=True)
# validate (predictions on the training set)
y_train_pred = [np.argmax(predict(network, x)) for x in x_train]
# test
y_test_pred = [np.argmax(predict(network, x)) for x in x_test]
y_test_true = [np.argmax(y) for y in y_test]

# Confusion matrix over the ten digit classes: rows are the true digit,
# columns the predicted digit.
cm = confusion_matrix(y_test_true, y_test_pred, labels=np.arange(10))
df_cm = pd.DataFrame(cm, index=list(range(10)), columns=list(range(10)))
plt.figure(figsize=(12,10))
# fmt='d' prints the integer counts as integers instead of the default
# '.2g' format, which switches to scientific notation for larger counts.
sns.heatmap(df_cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix_1_1000.png')
plt.show()

nn_accuracy = classification_accuracy(y_test_true, y_test_pred)
print(f'Clasification Accuracy: {100 * nn_accuracy} %')
Training... 10/100, error=0.0032168093390167237 20/100, error=0.002505579540439619 30/100, error=0.0018586508495502334 40/100, error=0.001546325349610625 50/100, error=0.001314164205826172 60/100, error=0.0012644028201468514 70/100, error=0.0008443316017970302 80/100, error=0.0008023117199280373 90/100, error=0.0007641827193966758 100/100, error=0.0008039309614834198
Clasification Accuracy: 94.33333333333334 %
Our neural network appears to struggle with differentiating 8's from 6's, but apart from this is extremely accurate. Perhaps normalising data, adding more layers, involving convolutional layers etc. could improve this.