Sound classification with deep learning

Our goal is to classify spoken digits from the AudioMNIST dataset. The training set contains 2400 samples, each represented by 900 Mel-frequency cepstral coefficients (MFCCs), and we use these 900 coefficients as the input features.
Below is a visualisation of what each of these samples looks like as an image.
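The plotting code itself is not reproduced here, but a minimal sketch of this kind of visualisation might look like the following. The variable names X_train and y_train and the 30×30 reshape of the 900 coefficients are assumptions for illustration, not details taken from the original.

```python
import matplotlib.pyplot as plt

# Assumed layout: X_train has shape (2400, 900), y_train holds integer digit labels.
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for digit, ax in enumerate(axes.ravel()):
    sample = X_train[y_train == digit][0]          # first example of each digit
    ax.imshow(sample.reshape(30, 30), aspect="auto")  # 30x30 is an assumed reshape
    ax.set_title(str(digit))
    ax.axis("off")
plt.tight_layout()
plt.show()
```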

We implement a neural network from scratch, using an architecture of only two densely connected layers. Below are all the classes and functions we will need to train the network.
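Those classes and functions are not included here, so the sketch below shows what such a two-dense-layer network trained with plain SGD might look like. The hidden-layer width of 64, the He-style initialisation, and the class names are assumptions rather than details of the original implementation.

```python
import numpy as np

class Dense:
    """Fully connected layer: y = xW + b."""
    def __init__(self, n_in, n_out, rng):
        # Assumed initialisation; the original may differ.
        self.W = rng.normal(0.0, np.sqrt(2.0 / n_in), size=(n_in, n_out))
        self.b = np.zeros(n_out)

    def forward(self, x):
        self.x = x                           # cache input for the backward pass
        return x @ self.W + self.b

    def backward(self, grad_out, lr):
        # Gradients w.r.t. parameters and input, followed by a plain SGD update.
        grad_W = self.x.T @ grad_out
        grad_b = grad_out.sum(axis=0)
        grad_in = grad_out @ self.W.T
        self.W -= lr * grad_W
        self.b -= lr * grad_b
        return grad_in

class ReLU:
    def forward(self, x):
        self.mask = x > 0
        return x * self.mask

    def backward(self, grad_out, lr):
        return grad_out * self.mask

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)     # shift for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def cross_entropy(probs, labels):
    """Mean negative log-likelihood; labels are integer class indices."""
    n = labels.shape[0]
    return -np.log(probs[np.arange(n), labels] + 1e-12).mean()

class TwoLayerNet:
    """900 MFCC features -> hidden layer -> 10 digit classes."""
    def __init__(self, n_in=900, n_hidden=64, n_out=10, seed=0):
        rng = np.random.default_rng(seed)
        self.layers = [Dense(n_in, n_hidden, rng), ReLU(), Dense(n_hidden, n_out, rng)]

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return softmax(x)

    def train_step(self, x, labels, lr):
        probs = self.forward(x)
        loss = cross_entropy(probs, labels)
        # Gradient of the cross-entropy loss w.r.t. the pre-softmax logits.
        grad = probs.copy()
        grad[np.arange(labels.shape[0]), labels] -= 1.0
        grad /= labels.shape[0]
        for layer in reversed(self.layers):
            grad = layer.backward(grad, lr)
        return loss

    def predict(self, x):
        return self.forward(x).argmax(axis=1)
```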

We now train the network for 20 epochs over a range of learning rates. Since the data does not come with a separate validation set, we use training accuracy for validation.
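A sketch of such a learning-rate sweep is shown below, building on the TwoLayerNet sketch above. The specific learning rates, the batch size of 32, and the variable names X_train, y_train, X_test, y_test are illustrative assumptions rather than values from the original run.

```python
import numpy as np

learning_rates = [0.001, 0.01, 0.1, 1.0]   # assumed sweep values
n_epochs, batch_size = 20, 32
history = {}

for lr in learning_rates:
    net = TwoLayerNet()                     # fresh network per learning rate
    train_acc, test_acc = [], []
    for epoch in range(n_epochs):
        order = np.random.permutation(len(X_train))
        for start in range(0, len(X_train), batch_size):
            idx = order[start:start + batch_size]
            net.train_step(X_train[idx], y_train[idx], lr)
        # Track accuracy on both splits so the curves can be compared later.
        train_acc.append((net.predict(X_train) == y_train).mean())
        test_acc.append((net.predict(X_test) == y_test).mean())
    history[lr] = (train_acc, test_acc)
```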

As would be expected, our training error is consistently lower than our test error, but there doesn't seem to be any sign of overfitting at this small number of epochs, and accuracy is generally high across all learning rates.

We now train for 100 epochs at the optimal learning rate to see if this improves accuracy.
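A sketch of this longer run, together with a hand-rolled confusion matrix for inspecting per-digit mistakes, might look like the following. The learning rate of 0.1 is a placeholder for whichever rate performed best in the sweep, and the data variable names remain assumptions.

```python
import numpy as np

best_lr = 0.1                               # placeholder for the best rate from the sweep
net = TwoLayerNet()
for epoch in range(100):
    order = np.random.permutation(len(X_train))
    for start in range(0, len(X_train), 32):
        idx = order[start:start + 32]
        net.train_step(X_train[idx], y_train[idx], best_lr)

preds = net.predict(X_test)
print("test accuracy:", (preds == y_test).mean())

# Confusion matrix: rows are true digits, columns are predicted digits.
conf = np.zeros((10, 10), dtype=int)
for t, p in zip(y_test, preds):
    conf[t, p] += 1
print(conf)
```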

Our neural network appears to struggle to differentiate 8s from 6s, but is otherwise highly accurate. Normalising the data, adding more layers, or introducing convolutional layers could perhaps improve this further.