diff --git a/mlp.py b/mlp.py
index bfc602bed3a8aeb5ee01b800c58acf0daaff20ff..222225cd81538be3802564086da0e179efaafbbc 100644
--- a/mlp.py
+++ b/mlp.py
@@ -11,16 +11,16 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
     # Forward pass
     a0 = data # the data are the input of the first layer
     z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
-    a1 = 1 / (1 + np.exp(-z1)) # output of the hidden layer (sigmoid activation function)
+    a1 = sigmoid(z1) # output of the hidden layer (sigmoid activation function)
     z2 = np.matmul(a1, w2) + b2 # input of the output layer
-    a2 = 1 / (1 + np.exp(-z2)) # output of the output layer (sigmoid activation function)
+    a2 = sigmoid(z2) # output of the output layer (sigmoid activation function)
     predictions = a2 # the predicted values are the outputs of the output layer
 
     # Compute loss (MSE)
     loss = np.mean(np.square(predictions - targets))
 
     # According to the formulas established by theory :
-    d_a2 = 2 / N * (1 - targets)
+    d_a2 = 2 / N * (a2 - targets)
     d_z2 = d_a2 * a2 * (1 - a2)
     d_w2 = np.matmul(a1.T, d_z2)
     d_b2 = d_z2
@@ -46,30 +46,36 @@ def one_hot(labels):
     return one_hot_matrix
 
 
+def softmax(x):
+    e_x = np.exp(x - np.max(x)) # Subtracting the maximum value for numerical stability
+    return e_x / e_x.sum(axis=0)
+
+
 def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
     N = len(labels_train) # number of training examples
 
     # Forward pass
     a0 = data # the data are the input of the first layer
     z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
-    a1 = 1 / (1 + np.exp(-z1)) # output of the hidden layer (sigmoid activation function)
+    a1 = sigmoid(z1) # output of the hidden layer (sigmoid activation function)
     z2 = np.matmul(a1, w2) + b2 # input of the output layer
-    a2 = 1 / (1 + np.exp(-z2)) # output of the output layer (sigmoid activation function)
+    a2 = softmax(z2) # output of the output layer (softmax activation function)
     predictions = a2 # the predicted values are the outputs of the output layer
 
     targets_one_hot = one_hot(labels_train) # target as a one-hot encoding for the desired labels
 
     # Cross-entropy loss
-    loss = -np.sum(targets_one_hot * np.log(predictions)) / N
+    epsilon = 0.00001
+    loss = - np.sum(targets_one_hot * np.log(predictions + epsilon)) / N
 
     # Backpropagation
     d_z2 = a2 - targets_one_hot
     d_w2 = np.dot(a1.T, d_z2) / N
-    d_b2 = d_z2 / N
+    d_b2 = np.sum(d_z2, axis = 0, keepdims = True) / N
     d_a1 = np.dot(d_z2, w2.T)
     d_z1 = d_a1 * z1 * (1 - a1)
     d_w1 = np.dot(a0.T, d_z1) / N
-    d_b1 = d_z1 / N
+    d_b1 = np.sum(d_z1, axis = 0, keepdims = True) / N
 
     # Calculation of the updated weights and biases of the network with gradient descent method
     w1 -= learning_rate * d_w1
@@ -88,9 +94,9 @@ def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
         # Forward pass
         a0 = data_train # the data are the input of the first layer
         z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
-        a1 = 1 / (1 + np.exp(-z1)) # output of the hidden layer (sigmoid activation function)
+        a1 = sigmoid(z1) # output of the hidden layer (sigmoid activation function)
         z2 = np.matmul(a1, w2) + b2 # input of the output layer
-        a2 = 1 / (1 + np.exp(-z2)) # output of the output layer (sigmoid activation function)
+        a2 = softmax(z2) # output of the output layer (softmax activation function)
        predictions = a2 # the predicted values are the outputs of the output layer
 
         # Find the predicted class
@@ -108,9 +114,9 @@ def test_mlp(w1, b1, w2, b2, data_test, labels_test):
     # Forward pass
     a0 = data_test # the data are the input of the first layer
     z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
-    a1 = 1 / (1 + np.exp(-z1)) # output of the hidden layer (sigmoid activation function)
+    a1 = sigmoid(z1) # output of the hidden layer (sigmoid activation function)
     z2 = np.matmul(a1, w2) + b2 # input of the output layer
-    a2 = 1 / (1 + np.exp(-z2)) # output of the output layer (sigmoid activation function)
+    a2 = softmax(z2) # output of the output layer (softmax activation function)
     predictions = a2 # the predicted values are the outputs of the output layer
 
     # Find the predicted label
@@ -128,10 +134,10 @@ def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, lear
     d_in = data_train.shape[1] # number of input neurons
     d_out = len(np.unique(labels_train)) # number of output neurons = number of classes
 
-    # Random initialization of the network weights and biaises
-    w1 = 2 * np.random.rand(d_in, d_h) - 1 # first layer weights
+    # Random initialization of the network weights and biaises with Xavier initialisation
+    w1 = np.random.randn(d_in, d_h) / np.sqrt(d_in) # first layer weights
     b1 = np.zeros((1, d_h)) # first layer biaises
-    w2 = 2 * np.random.rand(d_h, d_out) - 1 # second layer weights
+    w2 = np.random.randn(d_h, d_out) / np.sqrt(d_h) # second layer weights
     b2 = np.zeros((1, d_out)) # second layer biaises
 
     # Training of the MLP classifier with num_epoch steps
@@ -145,26 +151,28 @@
 
 
 if __name__ == "__main__":
-    split_factor = 0.9
+    # Parameters
+    split_factor = 0.1
     d_h = 64
     learning_rate = 0.1
     num_epoch = 100
 
+    # Extraction and formatting of the data from Cifar database
     data, labels = read_cifar("./data/cifar-10-batches-py")
     data_train, labels_train, data_test, labels_test = split_dataset(data, labels, split_factor)
 
+    # Initialisation of the data to plot
    epochs = [i for i in range(1, num_epoch + 1)]
-    learning_accuracy = [0] * num_epoch
-
-    for i in range(num_epoch) :
-        train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, i + 1)
-        learning_accuracy[i] = test_accuracy
+    train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch)
 
-    plt.plot(epochs, learning_accuracy)
+    # Plot the graph
+    plt.close()
+    plt.plot(epochs, train_accuracies)
     plt.title("Evolution of learning accuracy across learning epochs")
     plt.xlabel("number of epochs")
     plt.ylabel("Accuracy")
     plt.grid(True, which='both')
+    plt.show()
     plt.savefig("results/mlp.png")
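
For reference, a minimal sketch (not part of the patch) of how the stable softmax and the epsilon-guarded cross-entropy introduced above behave on a small batch. The patch's `softmax` normalizes along `axis=0`; the version below assumes a row-per-sample layout of shape `(N, d_out)` and therefore uses `axis=1` with `keepdims` — that axis choice, and the names `softmax_rows` / `cross_entropy`, are illustrative assumptions rather than part of mlp.py.

```python
import numpy as np


def softmax_rows(z):
    # Subtract each row's max for numerical stability, then normalize per row
    z_shifted = z - np.max(z, axis=1, keepdims=True)  # assumption: one sample per row
    e_z = np.exp(z_shifted)
    return e_z / e_z.sum(axis=1, keepdims=True)


def cross_entropy(probs, targets_one_hot, epsilon=1e-5):
    # Same epsilon guard as the patch, averaged over the N samples
    return -np.sum(targets_one_hot * np.log(probs + epsilon)) / len(targets_one_hot)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(4, 10))                  # 4 samples, 10 classes (hypothetical shapes)
    one_hot = np.eye(10)[rng.integers(0, 10, size=4)]  # one-hot targets
    probs = softmax_rows(logits)
    print(probs.sum(axis=1))                           # each row sums to 1
    print(cross_entropy(probs, one_hot))
```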