Spotify Music Classification

We want to look at the Spotify© music dataset by Brice Vergnou on Kaggle. The training set consists of $150$ songs, randomly selected from the $195$ available on Kaggle; the testing set consists of the remaining $45$ songs. Each song is characterised by $13$ numbers representing its audio features.

Every song included in the training set is also assigned a binary ($0$ or $1$) label that encodes the dataset author's music preference.

Logistic Regression

We first want a function $\mathtt{linear\_model\_function}$ that implements the linear model function for binary logistic regression defined as $$ f\left(\mathbf{x}, \mathbf{w}\right) = \left\langle \phi\left(\mathbf{x}\right),\mathbf{w}\right\rangle, $$ where $\phi\left(\mathbf{x}\right)$ is an augmented data vector and $\mathbf{w}$ is a weight vector.
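A minimal sketch of such a function is given below; the argument names data_matrix and weights, and the convention that the rows of data_matrix are the augmented vectors $\phi\left(\mathbf{x}^{(i)}\right)$, are our own assumptions.

```python
import numpy as np

def linear_model_function(data_matrix, weights):
    # Linear model f(x, w) = <phi(x), w>, applied row-wise: each row of
    # data_matrix is assumed to already be an augmented data vector phi(x).
    return data_matrix @ weights
```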

Next we want a function $\mathtt{binary\_logistic\_activation\_function}$ that takes a single argument named inputs and returns the sigmoid function $$ \sigma\left(\mathbf{x}\right) = \frac{1}{1+\mathrm{e}^{-\mathbf{x}}} $$ applied elementwise to the NumPy array inputs. Here $\mathbf{x}$ is the mathematical notation for the argument inputs.
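A possible implementation, assuming elementwise application to a NumPy array:

```python
import numpy as np

def binary_logistic_activation_function(inputs):
    # Elementwise sigmoid: sigma(x) = 1 / (1 + exp(-x)).
    return 1.0 / (1.0 + np.exp(-np.asarray(inputs, dtype=float)))
```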

Now we need two functions, $\mathtt{binary\_logistic\_prediction\_function}$ and $\mathtt{classification\_accuracy}$, that turn our predictions into classification results and measure how many labels have been classified correctly. The function $\mathtt{binary\_logistic\_prediction\_function}$ takes the argument logistic_values as its input and returns a vector of class labels with binary values in $\left\{0, 1\right\}$ as its output. The function $\mathtt{classification\_accuracy}$ takes two inputs, true_labels and recovered_labels, and returns the fraction of correctly classified labels, i.e. the percentage divided by $100$.
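A sketch of both functions; the decision threshold of $1/2$ on the sigmoid outputs is an assumption.

```python
import numpy as np

def binary_logistic_prediction_function(logistic_values):
    # Threshold the logistic outputs at 1/2 to obtain class labels in {0, 1}.
    return (np.asarray(logistic_values) >= 0.5).astype(int)

def classification_accuracy(true_labels, recovered_labels):
    # Fraction of labels that agree, i.e. the percentage divided by 100.
    return np.mean(np.asarray(true_labels) == np.asarray(recovered_labels))
```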

We now want two functions that implement the cost function for binary logistic regression as well as its gradient, as defined below: $$ \mathrm{L}\left(\mathbf{w}\right) = \frac{1}{s} \sum\limits_{i=1}^s \left(\log\left[1+\exp\left(f\left(\mathbf{x}^{(i)},\mathbf{w}\right)\right)\right] - y_i\cdot f\left(\mathbf{x}^{(i)},\mathbf{w}\right)\right), $$ where $\phi\left(\mathbf{x}^{(i)}\right)$ is the augmented $i$-th data vector and $f$ is a model function. In the case of the linear model function $f\left(\mathbf{x},\mathbf{w}\right) = \left\langle \phi\left(\mathbf{x}\right),\mathbf{w} \right\rangle$ one has $$ \nabla \mathrm{L}\left(\mathbf{w}\right) = \frac{1}{s} \sum\limits_{i=1}^s \left( \phi\left(\mathbf{x}^{(i)}\right)\cdot\sigma \left(\left\langle \phi\left(\mathbf{x}^{(i)}\right),\mathbf{w} \right\rangle \right) - y_i\cdot \phi\left(\mathbf{x}^{(i)}\right) \right), $$ where $y_i$ are the corresponding data labels.
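A sketch of the two functions; the names binary_logistic_regression_cost_function and binary_logistic_regression_gradient and the argument order (data_matrix, data_labels, weights) are our own assumptions.

```python
import numpy as np

def binary_logistic_regression_cost_function(data_matrix, data_labels, weights):
    # L(w): average over the s rows of data_matrix of log(1 + exp(f)) - y * f,
    # with f(x, w) = <phi(x), w> computed row-wise.
    model_values = data_matrix @ weights
    return np.mean(np.log(1.0 + np.exp(model_values)) - data_labels * model_values)

def binary_logistic_regression_gradient(data_matrix, data_labels, weights):
    # Gradient of L(w): (1/s) * sum_i phi(x_i) * (sigma(<phi(x_i), w>) - y_i).
    sigma = 1.0 / (1.0 + np.exp(-(data_matrix @ weights)))
    return data_matrix.T @ (sigma - data_labels) / data_matrix.shape[0]
```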

We implement the gradient descent algorithm, along with a $\mathtt{gradient\_descent\_v2}$ algorithm that includes a stopping criterion to end the process when:

$$ \left\| \nabla L\left(\mathbf{w}^{(k)}\right)\right\|_2 \leq \mathrm{tolerance}, $$

is satisfied. Here $L$ and $\mathbf{w}^{(k)}$ are the mathematical representations of the objective $\mathtt{objective}$ and the weight vector weights at iteration $k$. The parameter tolerance is a non-negative threshold on the Euclidean norm of the gradient. The function $\mathtt{gradient\_descent\_v2}$ takes the arguments $\mathtt{objective}$, $\mathtt{gradient}$, initial_weights, step_size, no_of_iterations, print_output and tolerance. The arguments $\mathtt{objective}$ and $\mathtt{gradient}$ are functions that take (weight-)arrays as arguments and return the scalar value of the objective and the array representation of the corresponding gradient, respectively. The argument initial_weights specifies the initial value of the variable over which we iterate, step_size is the gradient descent step-size parameter, no_of_iterations specifies the maximum number of iterations, print_output determines after how many iterations the function produces a text output, and tolerance controls the norm of the gradient as described in the equation above.
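A sketch of $\mathtt{gradient\_descent\_v2}$ under the assumptions stated above; the exact form of the printed output is our own choice.

```python
import numpy as np

def gradient_descent_v2(objective, gradient, initial_weights, step_size,
                        no_of_iterations, print_output, tolerance):
    weights = np.array(initial_weights, dtype=float)
    for k in range(no_of_iterations):
        grad = gradient(weights)
        # Stopping criterion: the Euclidean norm of the gradient is small enough.
        if np.linalg.norm(grad) <= tolerance:
            print(f"Converged after {k} iterations.")
            break
        weights = weights - step_size * grad
        # Report progress every print_output iterations.
        if print_output and (k + 1) % print_output == 0:
            print(f"Iteration {k + 1}: objective = {objective(weights):.6f}")
    return weights
```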

The code in the following cell

In the following cell we write a function $\mathtt{standardise}$ that standardises the columns of a two-dimensional NumPy array data_matrix. The function returns a triple: the standardised matrix, the row of column averages and the row of column standard deviations. We also include a function $\mathtt{de\_standardise}$ to reverse the process.
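A minimal sketch of the two functions:

```python
import numpy as np

def standardise(data_matrix):
    # Shift and scale each column to zero mean and unit standard deviation.
    column_means = np.mean(data_matrix, axis=0)
    column_stds = np.std(data_matrix, axis=0)
    standardised_matrix = (data_matrix - column_means) / column_stds
    return standardised_matrix, column_means, column_stds

def de_standardise(standardised_matrix, column_means, column_stds):
    # Reverse the standardisation using the stored column statistics.
    return standardised_matrix * column_stds + column_means
```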

Standardising our data.

In order to prepare our standardised data for analysis we also need to build an augmented data matrix. We implement a function $\mathtt{linear\_regression\_data}$ that computes (and returns) the linear regression data_matrix for a given data_inputs matrix.
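A sketch, assuming the augmentation simply prepends a column of ones (for the bias term) to the inputs:

```python
import numpy as np

def linear_regression_data(data_inputs):
    # Augmented data matrix: a leading column of ones followed by the inputs,
    # so that <phi(x), w> includes a bias term.
    no_of_rows = data_inputs.shape[0]
    return np.hstack((np.ones((no_of_rows, 1)), data_inputs))
```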

We now apply the above to the Spotify dataset:
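A sketch of how the pieces might fit together; the array names spotify_training_inputs and spotify_training_labels, as well as the step size, iteration count and tolerance, are hypothetical.

```python
import numpy as np

# Hypothetical arrays: spotify_training_inputs holds the 13 features per
# training song, spotify_training_labels the 0/1 preference labels.
standardised_inputs, column_means, column_stds = standardise(spotify_training_inputs)
spotify_data_matrix = linear_regression_data(standardised_inputs)

spotify_weights = gradient_descent_v2(
    objective=lambda w: binary_logistic_regression_cost_function(
        spotify_data_matrix, spotify_training_labels, w),
    gradient=lambda w: binary_logistic_regression_gradient(
        spotify_data_matrix, spotify_training_labels, w),
    initial_weights=np.zeros(spotify_data_matrix.shape[1]),
    step_size=0.5,
    no_of_iterations=10000,
    print_output=1000,
    tolerance=1e-6)
```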

We now evaluate classification accuracy.
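For example (hypothetical names for the test arrays; note that the test inputs are standardised with the training means and standard deviations):

```python
# Hypothetical arrays spotify_test_inputs / spotify_test_labels for the 45 test songs.
standardised_test_inputs = (spotify_test_inputs - column_means) / column_stds
spotify_test_matrix = linear_regression_data(standardised_test_inputs)

test_predictions = binary_logistic_prediction_function(
    binary_logistic_activation_function(spotify_test_matrix @ spotify_weights))
print(classification_accuracy(spotify_test_labels, test_predictions))
```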

Ridge Regression

We now attempt to increase the classification accuracy by considering ridge-regularised binary logistic regression.

We define two functions $\mathtt{ridge\_binary\_logistic\_regression\_cost\_function}$ and $\mathtt{ridge\_binary\_logistic\_regression\_gradient}$ that take $4$ arguments: the NumPy arrays data_matrix, data_labels and weights, and a positive float number regularisation_parameter. The modified cost function is defined as $$ \mathrm{L}_{\alpha}\left(\mathbf{w}\right) = \mathrm{L}\left(\mathbf{w}\right) + \frac{\alpha}{2}\left\|\mathbf{w}\right\|^2, $$ where $\mathrm{L}\left(\mathbf{w}\right)$ is the cost function for binary logistic regression defined above, while the corresponding gradient is given by $$ \nabla \mathrm{L}_{\alpha}\left(\mathbf{w}\right) = \nabla \mathrm{L}\left(\mathbf{w}\right) + \alpha \mathbf{w}, $$ where $\alpha$ is the mathematical representation of the regularisation_parameter.
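A sketch of the two ridge-regularised functions, reusing the (assumed-named) plain cost and gradient sketches from above:

```python
import numpy as np

def ridge_binary_logistic_regression_cost_function(data_matrix, data_labels,
                                                   weights, regularisation_parameter):
    # L_alpha(w) = L(w) + (alpha / 2) * ||w||^2.
    cost = binary_logistic_regression_cost_function(data_matrix, data_labels, weights)
    return cost + 0.5 * regularisation_parameter * np.dot(weights, weights)

def ridge_binary_logistic_regression_gradient(data_matrix, data_labels,
                                              weights, regularisation_parameter):
    # grad L_alpha(w) = grad L(w) + alpha * w.
    grad = binary_logistic_regression_gradient(data_matrix, data_labels, weights)
    return grad + regularisation_parameter * weights
```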

We implement a function $\mathtt{grid\_search}$ that searches for the minimum value of a given function over a given grid of points. This takes two parameters: the function to be minimised and the grid of points at which it is evaluated.
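A minimal sketch; the return convention (the minimising grid point together with the corresponding function value) is an assumption.

```python
import numpy as np

def grid_search(function, grid_points):
    # Evaluate the function at every grid point and return the minimiser
    # together with the minimal value.
    values = np.array([function(point) for point in grid_points])
    best_index = np.argmin(values)
    return grid_points[best_index], values[best_index]
```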

The code below finds the optimal value of the hyperparameter regularisation_parameter and then computes the spotify_optimal_weights corresponding to this hyperparameter value.
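One way this could be organised is sketched below; the use of a held-out validation split (hypothetical names spotify_validation_matrix, spotify_validation_labels), the grid of candidate values and the gradient descent settings are all assumptions.

```python
import numpy as np

def negative_validation_accuracy(regularisation_parameter):
    # Train ridge logistic regression with the given alpha and return minus
    # the validation accuracy, so that grid_search can minimise it.
    weights = gradient_descent_v2(
        objective=lambda w: ridge_binary_logistic_regression_cost_function(
            spotify_data_matrix, spotify_training_labels, w, regularisation_parameter),
        gradient=lambda w: ridge_binary_logistic_regression_gradient(
            spotify_data_matrix, spotify_training_labels, w, regularisation_parameter),
        initial_weights=np.zeros(spotify_data_matrix.shape[1]),
        step_size=0.5, no_of_iterations=5000, print_output=0, tolerance=1e-6)
    predictions = binary_logistic_prediction_function(
        binary_logistic_activation_function(spotify_validation_matrix @ weights))
    return -classification_accuracy(spotify_validation_labels, predictions)

# Hypothetical grid of candidate regularisation parameters.
regularisation_grid = np.logspace(-4, 1, 20)
optimal_regularisation_parameter, _ = grid_search(negative_validation_accuracy,
                                                  regularisation_grid)

# Retrain with the chosen hyperparameter to obtain the final weights.
spotify_optimal_weights = gradient_descent_v2(
    objective=lambda w: ridge_binary_logistic_regression_cost_function(
        spotify_data_matrix, spotify_training_labels, w,
        optimal_regularisation_parameter),
    gradient=lambda w: ridge_binary_logistic_regression_gradient(
        spotify_data_matrix, spotify_training_labels, w,
        optimal_regularisation_parameter),
    initial_weights=np.zeros(spotify_data_matrix.shape[1]),
    step_size=0.5, no_of_iterations=5000, print_output=0, tolerance=1e-6)
```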

Ridge regularisation does not seem to improve our model's accuracy.