diff --git a/knn.py b/knn.py
index 1fd7fa48ac3bf08a5583b63d19aa2920f107c265..4dc7d61363961d8093d446d6df5ff8eca607d050 100644
--- a/knn.py
+++ b/knn.py
@@ -52,18 +52,7 @@ def evaluate_knn(data_train, labels_train, data_test, labels_test, k) :
     number_total_prediction = labels_test.shape[0]
     classification_rate = number_true_prediction/number_total_prediction
     
-    return classification_rate
-
-def plot_accuracy(data_train, labels_train, data_test, labels_test, k_max) : 
-    Y = []
-    for k in range(1, k_max+1) : 
-        Y += [evaluate_knn(data_train, labels_train, data_test, labels_test, k)]
-    plt.plot(list(range(1, k_max+1)), Y)
-    plt.xlabel('k (Number of Neighbors)')
-    plt.ylabel('Accuracy')
-    plt.savefig('results/knn.png')
-     
-    
+    return classification_rate
     
 if __name__ == "__main__" :
     t1 = time.time()
diff --git a/mlp.py b/mlp.py
index 825ce394b75b7307eeeace901b920186255b6d9c..3613083a96b6b44db7d536acec5482de738c6558 100644
--- a/mlp.py
+++ b/mlp.py
@@ -6,6 +6,10 @@ Created on Fri Oct 27 16:48:16 2023
 """
 
 import numpy as np
+import read_cifar
+import matplotlib.pyplot as plt
+from scipy.special import expit
+from tqdm import tqdm
 
 
 def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate) : 
@@ -22,9 +26,9 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate) :
     dCdZ2 = dCdA2 * (a2 - a2**2)
     dCdW2 = np.matmul(a1.T, dCdZ2)
     dCdB2 = (1/N) * np.sum(dCdZ2, axis=0, keepdims=True)
-    dCdA1 = np.matmul(dCdZ2, w2.T)
+    dCdA1 = (1/N) * np.matmul(dCdZ2, w2.T)
     dCdZ1 = dCdA1 * (a1 - a1**2)
-    dCdW1 = np.matmul(a0.T, dCdZ1)
+    dCdW1 = (1/N) * np.matmul(a0.T, dCdZ1)
     dCdB1 = (1/N) * np.sum(dCdZ1, axis=0, keepdims=True)
     
     # one gradient descent step
@@ -33,8 +37,132 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate) :
     w2 -= dCdW2 * learning_rate
     b2 -= dCdB2 * learning_rate
     
+    # recompute the forward pass with the updated parameters so the returned loss reflects the post-update predictions
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid activation function)
+    predictions = a2
+    
     loss = np.mean(np.square(predictions - targets))
     
     return w1, b1, w2, b2, loss
     
+
+def one_hot(labels) :
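+    # Build a (len(labels), num_classes) matrix with a 1 at each sample's label index and 0 elsewhere.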
+    D = len(labels)
+    L = max(labels)  # labels are integer class indices, so there are L+1 classes
+    one_hot_matrix = np.zeros((D, L+1))
+    for k in range(len(labels)) : 
+        label = labels[k]
+        one_hot_matrix[k, label] = 1
+    return one_hot_matrix
+
+def softmax(x):
+    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))  # subtract the row-wise max for numerical stability
+    return e_x / e_x.sum(axis=-1, keepdims=True)  # normalise each row so it sums to 1
+
+
+def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate) :
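+    # One training step: forward pass, softmax + cross-entropy against the one-hot targets, backpropagation, and a gradient descent update.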
+    y = one_hot(labels_train)
+    
+    a0 = data # the data are the input of the first layer
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = expit(z1)  # output of the hidden layer (sigmoid activation function)
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    a2 = softmax(z2)  # output of the output layer (softmax activation function)
+    
+    # cross-entropy loss between the softmax predictions and the one-hot targets
+    predictions = a2  # the predicted values are the outputs of the output layer
+    loss = -np.sum(y * np.log(predictions)) / float(predictions.shape[0])
+    
+    N = data.shape[0]
+    
+    # calculation of partial derivates of C
+    dCdZ2 = a2 - y
+    dCdW2 = (1/N) * np.matmul(a1.T, dCdZ2)
+    dCdB2 = (1/N) * np.sum(dCdZ2, axis=0, keepdims=True)
+    dCdA1 = np.matmul(dCdZ2, w2.T)
+    dCdZ1 = dCdA1 * (a1 - np.square(a1))
+    dCdW1 = (1/N) * np.matmul(a0.T, dCdZ1)
+    dCdB1 = (1/N) * np.sum(dCdZ1, axis=0, keepdims=True)
+    
+    # one gradient descent step
+    w1 -= dCdW1 * learning_rate
+    b1 -= dCdB1 * learning_rate
+    w2 -= dCdW2 * learning_rate
+    b2 -= dCdB2 * learning_rate
+    
+    # accuracy calculation
+    predictions_vect = np.argmax(predictions, axis=1)
+    number_true_prediction = np.sum(labels_train == predictions_vect)
+    number_total_prediction = labels_train.shape[0]
+    accuracy = number_true_prediction/number_total_prediction
+    
+    return w1, b1, w2, b2, accuracy
+
+def train_mlp(w1, b1, w2, b2, data, labels_train, learning_rate, num_epoch) : 
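+    # Repeat num_epoch full-batch training steps and record the training accuracy after each one.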
+    train_accuracies = []
+    for k in tqdm(range(num_epoch)) :
+        w1, b1, w2, b2, accuracy = learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate)
+        train_accuracies.append(accuracy)
+        
+    return w1, b1, w2, b2, train_accuracies
+
+
+def test_mlp(w1, b1, w2, b2, data_test, labels_test) :
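+    # Forward pass on the test set; the predicted class of each sample is the index of its highest output score.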
+    # prediction calculation
+    a0 = data_test
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = expit(z1)  # output of the hidden layer (sigmoid activation function)
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    a2 = softmax(z2)  # output of the output layer (softmax activation function)
+    predictions = a2
+    predictions_vect = np.argmax(predictions, axis=1) 
+    
+    # accuracy calculation
+    number_true_prediction = np.sum(labels_test == predictions_vect)
+    number_total_prediction = len(labels_test)
+    test_accuracy = number_true_prediction/number_total_prediction
+
+    return test_accuracy
+
+
+def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch) :
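+    # Initialize a one-hidden-layer network, train it, and return the per-epoch training accuracies together with the final test accuracy.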
+    d_out = np.max(labels_train) + 1  # output dimension (number of neurons of the output layer)
+    d_in = data_train.shape[1]
+    # Random initialization of the network weights and biases
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights, uniform in [-1, 1]
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights, uniform in [-1, 1]
+    b2 = np.zeros((1, d_out))  # second layer biases
+
+    w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch)
+    test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
+    return train_accuracies, test_accuracy
+
+if __name__ == "__main__" :
+    # open, read and split the data using read_cifar script
+    file = "./data/cifar-10-python/"
+    data, labels = read_cifar.read_cifar(file)
+    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9)
+    
+    # train the MLP
+    d_h=64
+    learning_rate=0.1
+    num_epoch=100
+    
+    train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch)
+  
+    # Plot the figure
+    plt.figure(figsize = (12,8))
+    plt.plot(np.array(train_accuracies)*100, color='r', label='split factor : 0.9')
+    plt.title("Multilayer Perceptron training accuracy")
+    plt.legend()
+    plt.xlabel('number of epochs (num_epoch)')
+    plt.ylabel('accuracy (%)')
+    plt.savefig('results/mlp.png')
+    plt.show()
+    
     
\ No newline at end of file
diff --git a/read_cifar.py b/read_cifar.py
index f25324d500f4cdcb148bd6dcb38f9b482cbe749e..3cc5db39208cfa0dc843297e52e0d0659e7ea41f 100644
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -24,7 +24,7 @@ def read_cifar (batch_dir) :
     data_batches = []
     label_batches = []
     
-    for i in range(1,4) :
+    for i in range(1,6) :
         batch_filename = f'data_batch_{i}'
         batch_path = os.path.join(batch_dir, batch_filename)
         data, labels = read_cifar_batch(batch_path)
diff --git a/results/mlp.png b/results/mlp.png
new file mode 100644
index 0000000000000000000000000000000000000000..0615c2f516317665bcec9b40c91015ff8a58715c
Binary files /dev/null and b/results/mlp.png differ