From 5bbb047e1b9dd00e961a5f89ece7b5604077ccb8 Mon Sep 17 00:00:00 2001
From: corentin <corentin.massala@gmail.com>
Date: Thu, 9 Nov 2023 11:16:55 +0100
Subject: [PATCH] Correction of the softmax, graph added, correction on
 backpropagation

---
 mlp.py | 150 +++++++++++++++++++++++++++------------------------------
 1 file changed, 71 insertions(+), 79 deletions(-)

diff --git a/mlp.py b/mlp.py
index 5f449bf..c8822b9 100644
--- a/mlp.py
+++ b/mlp.py
@@ -6,16 +6,14 @@ import matplotlib.pyplot as plt
 
 def sigmoid(x):
     return 1 / (1 + np.exp(-x))
 
-
 def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
-    N_out = len(targets) #number of training examples
-
+    N_out = len(data) #number of training examples
     # Forward pass
     a0 = data # the data are the input of the first layer
-    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    z1 = np.dot(a0, w1) + b1  # input of the hidden layer
     a1 = sigmoid(z1)  # output of the hidden layer (sigmoid activation function)
-    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    z2 = np.dot(a1, w2) + b2  # input of the output layer
     a2 = sigmoid(z2)  # output of the output layer (sigmoid activation function)
     predictions = a2  # the predicted values are the outputs of the output layer
 
@@ -30,50 +28,39 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
     # print('shape w2', w2.shape)
     # print('shape b2', b2.shape)
 
+    # Backpropagation
+
     # Backpropagation
     delta_a2 = 2 / N_out * (a2 - targets)
-    # print('shape delta_a2', delta_a2.shape)
-    delta_z2 = delta_a2 * (a2 * (1 - a2))
-    # print('shape delta_z2', delta_z2.shape)
-    delta_w2 = np.dot(a1.T, delta_z2)
-    # print('shape delta_w2', delta_w2.shape)
-    delta_b2 = delta_z2
+    delta_z2 = delta_a2 * (a2 * (1 - a2))  # We divide by the sample size to have an average of the error and avoid big gradient jumps
+    delta_w2 = np.dot(a1.T, delta_z2)
+    delta_b2 = np.sum(delta_z2, axis = 0, keepdims = True)
     delta_a1 = np.dot(delta_z2, w2.T)
-    # print('shape delta_a1', delta_a1.shape)
-    delta_z1 = delta_a1 * (a1 * (1- a1))
-    # print('shape delta_z1', delta_z1.shape)
-    delta_w1 = np.dot(a0.T, delta_z1)
-    # print('shape delta_w1', delta_w2.shape)
-    delta_b1 = delta_z1
-
-    # Update weights and biases
-    w2 -= learning_rate * delta_w2
-    b2 -= learning_rate * np.sum(delta_b2, axis = 0, keepdims = True)
-
-    w1 -= learning_rate * delta_w1
-    b1 -= learning_rate * np.sum(delta_b1, axis = 0, keepdims = True)
+    delta_z1 = delta_a1 * (a1 * (1 - a1))
+    delta_w1 = np.dot(a0.T, delta_z1)
+    delta_b1 = np.sum(delta_z1, axis = 0, keepdims = True)
+
+    # Update weights and biases
+    w1 -= learning_rate * delta_w1
+    b1 -= learning_rate * delta_b1
+    w2 -= learning_rate * delta_w2
+    b2 -= learning_rate * delta_b2
 
     return w1, b1, w2, b2, loss
 
 
 def one_hot(labels):
-    #num_classes = np.max(labels) + 1 on va le hardcoder ici
-    num_classes = 10
+    num_classes = int(np.max(labels) + 1)  #num_classes = 10
     one_hot_matrix = np.eye(num_classes)[labels]
     return one_hot_matrix
 
 
 def softmax_stable(x):
-    #We use this function to avoid computing too big numbers
-    return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum())
+    #We use this function to avoid computing big numbers
+    return(np.exp(x - np.max(x, axis=1, keepdims=True)) / np.exp(x - np.max(x, axis=1, keepdims=True)).sum(axis=1, keepdims=True))
 
-def cross_entropy_loss(y_pred, y_true):
-    loss = -np.sum(y_true * np.log(y_pred)) / len(y_pred)
+def cross_entropy_loss(y_pred, y_true_one_hot):
+    epsilon = 1e-10
+    loss = -np.sum(y_true_one_hot * np.log(y_pred + epsilon)) / len(y_pred)
     return loss
 
 
 def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
-    N_out = len(labels_train) #number of training examples
+    N_out = len(data) #number of training examples
 
     # Forward pass
     a0 = data # the data are the input of the first layer
@@ -82,31 +69,33 @@ def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
     z2 = np.matmul(a1, w2) + b2  # input of the output layer
     a2 = softmax_stable(z2)  # output of the output layer (sigmoid activation function)
     predictions = a2  # the predicted values are the outputs of the output layer
-
+    # print('a0', a0[:2])
+    # print('w1', w1[:2])
+    # print('z1', z1[:2])
+    # print('a1', a1[:2])
+    # print('z2', z2[:2])
+    # print('a2', a2[:2])
 
     # Compute loss (cross-entropy loss)
     y_true_one_hot = one_hot(labels_train)
     loss = cross_entropy_loss(predictions, y_true_one_hot)
-
     # Backpropagation
-    # delta_a2 = 2 / N_out * (a2 - labels_train) ceci n'est plus nécessaire ici
-    delta_z2 = (a2 - y_true_one_hot)
-    delta_w2 = np.dot(a1.T, delta_z2) / N_out # on divise par N_out pour ne pas faire des saut de gradient trop elevés
-    delta_b2 = delta_z2 / N_out
-
+    delta_z2 = (a2 - y_true_one_hot)  # We divide by the sample size to have an average of the error and avoid big gradient jumps
+    delta_w2 = np.dot(a1.T, delta_z2) / N_out
+    delta_b2 = np.sum(delta_z2, axis = 0, keepdims = True) / N_out
 
-    delta_a1 = np.dot(delta_z2, w2.T)
-    delta_z1 = delta_a1 * (a1 * (1 - a1))
+    delta_a1 = np.dot(delta_z2, w2.T)
+    delta_z1 = delta_a1 * (a1 * (1 - a1))
     delta_w1 = np.dot(a0.T, delta_z1) / N_out
-    delta_b1 = delta_z1 / N_out
-
-    # Update weights and biases
-    w2 -= learning_rate * delta_w2
-    b2 -= learning_rate * np.sum(delta_b2, axis = 0, keepdims = True)
+    delta_b1 = np.sum(delta_z1, axis = 0, keepdims = True) / N_out
+
+    # Update weights and biases
     w1 -= learning_rate * delta_w1
-    b1 -= learning_rate * np.sum(delta_b1, axis = 0, keepdims = True)
+    b1 -= learning_rate * delta_b1
+    w2 -= learning_rate * delta_w2
+    b2 -= learning_rate * delta_b2
 
     return w1, b1, w2, b2, loss
 
@@ -129,13 +118,10 @@ def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
 
         # Compute accuracy
         predictions = forward(w1, b1, w2, b2, data_train)
         predicted_labels = np.argmax(predictions, axis=1)
-        # print(predictions.shape)
-        # print(predicted_labels.shape)
-        # print(labels_train.shape)
         accuracy = np.mean(predicted_labels == labels_train)
         train_accuracies.append(accuracy)
 
-        print(f'Epoch {epoch + 1}/{num_epoch}, Loss: {loss:.3f}, Train Accuracy: {accuracy:.2f}')
+        print(f'Epoch {epoch + 1}/{num_epoch}, Loss: {loss:.3f}, Train Accuracy: {accuracy:.5f}')
 
     return w1, b1, w2, b2, train_accuracies
 
@@ -144,22 +130,20 @@ def test_mlp(w1, b1, w2, b2, data_test, labels_test):
     # Compute accuracy
     predictions = forward(w1, b1, w2, b2, data_test)
     predicted_labels = np.argmax(predictions, axis=1)
-    print(predicted_labels)
     test_accuracy = np.mean(predicted_labels == labels_test)
-    print(f'Train Accuracy: {test_accuracy:.2f}')
+    print(f'Test Accuracy: {test_accuracy:.2f}')
     return test_accuracy
 
 
-def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h,learning_rate, num_epoch):
+def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch):
     d_in = data_train.shape[1]
     d_out = 10 #we can hard code it here or len(np.unique(label_train))
 
     #Random initialisation of weights
-    w1 = np.random.randn(d_in, d_h)
-    b1 = np.random.randn(1, d_h)
-
-    w2 = np.random.randn(d_h, d_out)
-    b2 = np.random.randn(1, d_out)
+    w1 = np.random.randn(d_in, d_h) / np.sqrt(d_in)
+    b1 = np.zeros((1, d_h))
+    w2 = np.random.randn(d_h, d_out) / np.sqrt(d_h)
+    b2 = np.zeros((1, d_out))
 
     # Train MLP
     w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch)
@@ -168,32 +152,40 @@ def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h,learn
     test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
 
     return train_accuracies, test_accuracy
 
+
+def plot_graph(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch):
+    # Run MLP training
+    train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch)
+
+    # Plot and save the learning accuracy graph
+    plt.figure(figsize=(8, 6))
+    epochs = np.arange(1, num_epoch + 1)
+    plt.plot(epochs, train_accuracies, marker='x', color='b', label='Train Accuracy')
+    plt.xlabel('Epochs')
+    plt.ylabel('Accuracy')
+    plt.title('MLP Train Accuracy')
+    plt.legend()
+    plt.grid(True)
+    plt.savefig('image-classification/results/mlp.png')
+    plt.show()
+
+
 if __name__ == '__main__':
     data, labels = read_cifar.read_cifar('image-classification/data/cifar-10-batches-py')
     X_train, X_test, y_train, y_test = read_cifar.split_dataset(data, labels, 0.9)
+    d_in, d_h, d_out = 3072, 64, 10
+    learning_rate = 0.1
+    num_epoch = 100
 
-    d_in, d_h, d_out = 3072, 728, 10
-    w1 = np.random.normal(scale=0.5, size=(d_in, d_h))
-    b1 = np.random.randn(1, d_h)
-    w2 = np.random.normal(scale=0.5, size=(d_h, d_out))
-    b2 = np.random.randn(1, d_out)
-
-    # print(forward(w1, b1, w2, b2,X_train[:1]))
-    # for i in range(100):
-    # learn_once_cross_entropy(w1, b1, w2, b2, X_train[:1000], y_train[:1000], 0.005)
-    train_mlp(w1, b1, w2, b2, X_train[:10000], y_train[:10000], 0.1, 100)
-    # train_mlp_2(w1, w2, X_train[:10000], y_train[:10000], 0.05, 100)
-    # test_mlp(w1, b1, w2, b2, X_test[:50], y_test[:50])
+    # #Initialisation
+    # w1 = np.random.randn(d_in, d_h) / np.sqrt(d_in)
+    # b1 = np.zeros((1, d_h))
+    # w2 = np.random.randn(d_h, d_out) / np.sqrt(d_h)
+    # b2 = np.zeros((1, d_out))
+    # train_mlp(w1, b1, w2, b2, X_train, y_train, 0.1, 100)
+    # test_mlp(w1, b1, w2, b2, X_test[:50], y_test[:50])
+    plot_graph(X_train, y_train, X_test, y_test, d_h, learning_rate, num_epoch)
 
-    # values = [2, 4, 5, 3]
-    # # Output achieved
-    # output = softmax_stable(values)
-    # y_true = [3, 1] # 1 observation
-    # y_true_one_hot = one_hot(y_true)
-    # print(y_true_one_hot)
-    # y_pred = [[0.1, 0.1, 0.1, 0.7],[0.1, 0.1, 0.1, 0.7]]
-    # loss = cross_entropy_loss(y_pred, y_true_one_hot)
-    # print(loss)
\ No newline at end of file
+
\ No newline at end of file
-- 
GitLab
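
For reference, below is a minimal standalone sketch (not part of the patch) of the row-wise stable softmax, one-hot encoding, and epsilon-protected cross-entropy that the patch introduces in mlp.py. The function names mirror the patched ones; the toy inputs are purely illustrative.

import numpy as np

def softmax_stable(x):
    # Subtract the row-wise max before exponentiating so np.exp never overflows,
    # then normalise each row so every row sums to 1.
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / exps.sum(axis=1, keepdims=True)

def one_hot(labels):
    # Build a (n_samples, n_classes) one-hot matrix from integer labels.
    num_classes = int(np.max(labels) + 1)
    return np.eye(num_classes)[labels]

def cross_entropy_loss(y_pred, y_true_one_hot):
    # A small epsilon keeps log() finite when a predicted probability is exactly 0.
    epsilon = 1e-10
    return -np.sum(y_true_one_hot * np.log(y_pred + epsilon)) / len(y_pred)

if __name__ == '__main__':
    # The second row would overflow a naive softmax without the max subtraction.
    logits = np.array([[2.0, 1.0, 0.1],
                       [1000.0, 1001.0, 1002.0]])
    probs = softmax_stable(logits)
    print(probs.sum(axis=1))   # each row sums to 1
    labels = np.array([0, 2])
    print(cross_entropy_loss(probs, one_hot(labels)))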