diff --git a/mlp.py b/mlp.py
index 0f3546bc2cc0cd70caa5e924eb97cf5c09ea34b8..086121de19398658c9d33e3440429def69d58694 100644
--- a/mlp.py
+++ b/mlp.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+import read_cifar
+
 
 def learn_once_mse(w1: np.array, b1: int, w2: np.array, b2: int, data: np.array, target: np.array, learning_rate: float):
     """
@@ -76,7 +78,7 @@ def one_hot(labels: np.array):
     return one_hot_matrix
 
 
-def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.array, data: np.array, labels_train: np.array, learning_rate: int):
+def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.array, data: np.array, labels_train: np.array, learning_rate: float):
     """
     Performs one learning step of the MLP with cross-entropy loss
 
@@ -96,24 +98,42 @@ def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.ar
         loss -- loss of the forward pass
     """
     # Forward pass
-    a0 = data  # the data are the input of the first layer
-    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    # the data are the input of the first layer
+    a0 = data
+    # input of the hidden layer
+    z1 = np.matmul(a0, w1) + b1
     # output of the hidden layer (sigmoid activation function)
     a1 = 1 / (1 + np.exp(-z1))
-    z2 = np.matmul(a1, w2) + b2  # input of the output layer
-    # output of the output layer (sigmoid activation function)
-    a2 = 1 / (1 + np.exp(-z2))
-    predictions = a2  # the predicted values are the outputs of the output layer
+    # input of the output layer
+    z2 = np.matmul(a1, w2) + b2
+    # output of the output layer (softmax activation function)
+    a2 = np.exp(z2) / np.sum(np.exp(z2), axis=1, keepdims=True)
+    # the predicted values are the outputs of the output layer
+    predictions = a2
+
+    one_hot_targets = one_hot(labels_train)
 
     # Compute loss (cross-entropy)
-    loss = -np.mean(np.sum(labels_train * np.log(predictions) +
-                    (1 - labels_train) * np.log(1 - predictions), axis=1))
+    loss = -np.mean(np.sum(one_hot_targets * np.log(predictions), axis=1))
 
     # Backward pass
-    # derivative of the loss with respect to the output of the output layer
-    dC_dA2 = -labels_train / predictions + (1 - labels_train) / (1 - predictions)
-    # derivative of the loss with respect to the input of the output layer
-    # dC_dZ2 = a2 -
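+    # derivative of the loss with respect to the input of the output layer
+    # (for a softmax output a2 and one-hot targets y, this simplifies to a2 - y per sample)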
+    dC_dZ2 = predictions - one_hot_targets
+    # derivative of the loss with respect to the weights of the output layer
+    dC_dW2 = np.matmul(a1.T, dC_dZ2)
+    # derivative of the loss with respect to the biases of the output layer
+    dC_dB2 = np.sum(dC_dZ2, axis=0, keepdims=True)
+    # derivative of the loss with respect to the output of the hidden layer
+    dC_dA1 = np.matmul(dC_dZ2, w2.T)
+    # derivative of the loss with respect to the input of the hidden layer
+    dC_dZ1 = dC_dA1 * (1 - a1) * a1
+    # derivative of the loss with respect to the weights of the hidden layer
+    dC_dW1 = np.matmul(a0.T, dC_dZ1)
+    # derivative of the loss with respect to the biases of the hidden layer
+    dC_dB1 = np.sum(dC_dZ1, axis=0, keepdims=True)
 
     # Update weights and biaises
     w1 -= learning_rate * dC_dW1
@@ -124,6 +144,103 @@ def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.ar
     return w1, b1, w2, b2, loss
 
 
+def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs):
+    """
+    Trains the MLP
+
+    Arguments:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    data_train -- training data
+    labels_train -- labels of the training data
+    learning_rate -- learning rate
+    num_epochs -- number of epochs
+    Returns:
+    w1 -- updated weights of the hidden layer
+    b1 -- updated biases of the hidden layer
+    w2 -- updated weights of the output layer
+    b2 -- updated biases of the output layer
+    acc -- list of training accuracies across epochs
+    """
+    acc = []
+    for i in range(num_epochs):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(
+            w1, b1, w2, b2, data_train, labels_train, learning_rate)
+        acc.append(test_mlp(w1, b1, w2, b2, data_train, labels_train))
+    return w1, b1, w2, b2, acc
+
+
+def test_mlp(w1, b1, w2, b2, data_test, labels_test):
+    """
+    Tests the MLP
+
+    Arguments:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    data_test -- test data
+    labels_test -- labels of the test data
+    Returns:
+    acc -- accuracy
+    """
+    # Forward pass
+    # the data are the input of the first layer
+    a0 = data_test
+    # input of the hidden layer
+    z1 = np.matmul(a0, w1) + b1
+    # output of the hidden layer (sigmoid activation function)
+    a1 = 1 / (1 + np.exp(-z1))
+    # input of the output layer
+    z2 = np.matmul(a1, w2) + b2
+    # output of the output layer (softmax activation function)
+    a2 = np.exp(z2) / np.sum(np.exp(z2), axis=1, keepdims=True)
+    # the predicted values are the outputs of the output layer
+    predictions = a2
+
+    # Compute accuracy
+    acc = np.mean(np.argmax(predictions, axis=1) == labels_test)
+
+    return acc
+
+
+def run_mlp_training(data_train, labels_train, data_test, labels_test, learning_rate, num_epochs):
+    """
+    Runs the MLP training
+
+    Arguments:
+    data_train -- training data
+    labels_train -- labels of the training data
+    data_test -- test data
+    labels_test -- labels of the test data
+    learning_rate -- learning rate
+    num_epochs -- number of epochs
+    Returns:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    acc -- list of training accuracies across epochs
+    """
+    N = data_train.shape[0]  # number of training data
+    d_in = data_train.shape[1]  # input dimension
+    d_h = 3  # number of neurons in the hidden layer
+    # output dimension (number of neurons of the output layer)
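+    # (inferred from the labels; the CIFAR-10 labels 0-9 used below give d_out = 10)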
+    d_out = np.max(labels_train) + 1
+
+    # Random initialization of the network weights and biases
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+    b2 = np.zeros((1, d_out))  # second layer biases
+
+    w1, b1, w2, b2, acc = train_mlp(
+        w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs)
+    return w1, b1, w2, b2, acc
+
+
 if __name__ == "__main__":
     N = 30  # number of input data
     d_in = 3  # input dimension
@@ -145,3 +262,28 @@ if __name__ == "__main__":
     print(loss)
 
     print(one_hot(np.array([9, 1, 3, 0, 6, 5, 2, 7, 8, 4])))
+
+    N = 30  # number of input data
+    d_in = 3  # input dimension
+    d_h = 3  # number of neurons in the hidden layer
+    d_out = 5  # output dimension (number of neurons of the output layer)
+
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+    b2 = np.zeros((1, d_out))  # second layer biases
+
+    data = np.random.rand(N, d_in)  # create random data
+    targets = np.random.randint(0, d_out, N)  # create random targets
+
+    for i in range(100):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(
+            w1, b1, w2, b2, data, targets, 0.1)
+        print(loss)
+
+    data, labels = read_cifar.read_cifar("data/cifar-10-batches-py/")
+    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(
+        data, labels, 0.8)
+    w1, b1, w2, b2, acc = run_mlp_training(
+        data_train, labels_train, data_test, labels_test, 0.1, 100)
+    print(acc)
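+
+    # Optional check (sketch): report the accuracy on the held-out test split
+    # using test_mlp defined above.
+    print(test_mlp(w1, b1, w2, b2, data_test, labels_test))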