README.md

This project contains several files and directories. A brief description of each is given below:
- `results/`: This folder contains some of the results generated by the program.
- `knn.py`: Contains functions related to the KNN algorithm.
- `read_cifar.py`: Contains functions related to reading and parsing the CIFAR-10 dataset.
- `mlp.py`: Contains functions related to the Neural Network algorithm.
- `main.ipynb`: Jupyter Notebook containing the main program. It is used to test the algorithms and generate the results. It also contains some descriptions regarding the algorithms, notably a mathematical description of the Neural Network algorithm.
## Usage

The main program is contained in the `main.ipynb` file. It can be run using Jupyter.

## References

- Data Source: The CIFAR-10 Dataset. <https://www.cs.toronto.edu/~kriz/cifar.html>
- Assignment repository: <https://gitlab.ec-lyon.fr/edelland/mod_4_6-td1>

main.ipynb

%% Cell type:markdown id: tags:
# TD 1 : Image Classification
%% Cell type:markdown id: tags:
## Dataset
%% Cell type:markdown id: tags:
### General imports
%% Cell type:code id: tags:
``` python
from importlib import reload
import numpy as np
import matplotlib.pyplot as plt
```
%% Cell type:markdown id: tags:
### Read dataset
%% Cell type:code id: tags:
``` python
import read_cifar
reload(read_cifar)
from read_cifar import read_cifar, split_dataset
```
%% Cell type:code id: tags:
``` python
DATASET_PATH = "./data/cifar-10-batches-py"
```
%% Cell type:code id: tags:
``` python
cifar_data, cifar_labels = read_cifar(DATASET_PATH)
```
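%% Cell type:markdown id: tags:
`read_cifar` is implemented in `read_cifar.py`, which is not shown here. As a rough sketch of what it does, assuming the standard pickled CIFAR-10 python batches with `b'data'` and `b'labels'` keys (the actual implementation may differ):
%% Cell type:code id: tags:
``` python
import os
import pickle

def read_cifar_sketch(path):
    # Load the five training batches plus the test batch and stack them
    batch_names = [f"data_batch_{i}" for i in range(1, 6)] + ["test_batch"]
    data, labels = [], []
    for name in batch_names:
        with open(os.path.join(path, name), "rb") as f:
            batch = pickle.load(f, encoding="bytes")
        data.append(batch[b"data"])  # (10000, 3072) uint8 rows
        labels.extend(batch[b"labels"])
    return np.concatenate(data).astype(np.float64), np.array(labels)
```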
%% Cell type:code id: tags:
``` python
print(cifar_data[0:5])
print(cifar_labels[0:5])
```
%% Output
[[ 59.  43.  50. ... 140.  84.  72.]
 [154. 126. 105. ... 139. 142. 144.]
 [255. 253. 253. ...  83.  83.  84.]
 [ 28.  37.  38. ...  28.  37.  46.]
 [170. 168. 177. ...  82.  78.  80.]]
[6 9 9 4 1]
%% Cell type:markdown id: tags:
### 0.9 Split
%% Cell type:code id: tags:
``` python
# Split dataset into training and testing with 90% training and 10% testing
train_data_09, train_labels_09, test_data_09, test_labels_09 = split_dataset(
    cifar_data,
    cifar_labels,
    0.9
)
```
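%% Cell type:markdown id: tags:
`split_dataset` is also defined in `read_cifar.py` and not shown here. A minimal sketch consistent with how it is used in this notebook (shuffle, then split according to the given factor) could be:
%% Cell type:code id: tags:
``` python
def split_dataset_sketch(data, labels, split):
    # Hypothetical re-implementation for illustration only
    n = data.shape[0]
    idx = np.random.permutation(n)  # shuffle so both subsets are representative
    n_train = int(n * split)        # e.g. split=0.9 keeps 90% for training
    train_idx, test_idx = idx[:n_train], idx[n_train:]
    return data[train_idx], labels[train_idx], data[test_idx], labels[test_idx]
```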
%% Cell type:markdown id: tags:
## K-Nearest Neighbors (KNN) Classification
%% Cell type:code id: tags:
``` python
import knn
reload(knn)
from knn import evaluate_knn, distance_matrix, knn_predict
```
%% Cell type:markdown id: tags:
### Example with split = 0.8 and k = 3
%% Cell type:code id: tags:
``` python
example_split = 0.8
example_k = 3
example_train_data, example_train_labels, example_test_data, example_test_labels = split_dataset(
    cifar_data,
    cifar_labels,
    example_split
)
print(example_train_data.shape)
print(example_train_labels.shape)
print(example_test_data.shape)
print(example_test_labels.shape)
```
%% Output
(48000, 3072)
(48000,)
(12000, 3072)
(12000,)
%% Cell type:code id: tags:
``` python
example_accuracy = evaluate_knn(
    example_train_data,
    example_train_labels,
    example_test_data,
    example_test_labels,
    example_k
)
print(example_accuracy)
```
%% Output
0.3294166666666667
%% Cell type:markdown id: tags:
### split = 0.9 with K from 1 to 20
%% Cell type:code id: tags:
``` python
# The distance matrix won't change, so we only calculate it once
dist_matrix_09 = distance_matrix(test_data_09, train_data_09)
```
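%% Cell type:markdown id: tags:
`knn.py` is likewise not shown here. A common vectorized way to build such a distance matrix (a sketch only, assuming Euclidean distance, via $\|a-b\|^2 = \|a\|^2 + \|b\|^2 - 2\,a \cdot b$) is:
%% Cell type:code id: tags:
``` python
def distance_matrix_sketch(a, b):
    # Pairwise Euclidean distances between rows of a and rows of b
    a2 = np.sum(a**2, axis=1, keepdims=True)    # (n_a, 1)
    b2 = np.sum(b**2, axis=1, keepdims=True).T  # (1, n_b)
    cross = a @ b.T                             # (n_a, n_b)
    # Clip tiny negative values caused by floating-point cancellation
    return np.sqrt(np.maximum(a2 + b2 - 2 * cross, 0.0))
```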
%% Cell type:code id: tags:
``` python
accuracies_09 = []
for k in range(1, 21):
    predicted_labels_09 = knn_predict(dist_matrix_09, train_labels_09, k)
    accuracy = (predicted_labels_09 == test_labels_09).mean()
    accuracies_09.append(accuracy)
    print(f"k = {k}, accuracy = {accuracy}")
```
%% Output
k = 1, accuracy = 0.35033333333333333
k = 2, accuracy = 0.3175
k = 3, accuracy = 0.331
k = 4, accuracy = 0.3418333333333333
k = 5, accuracy = 0.346
k = 6, accuracy = 0.3393333333333333
k = 7, accuracy = 0.3405
k = 8, accuracy = 0.33866666666666667
k = 9, accuracy = 0.346
k = 10, accuracy = 0.3413333333333333
k = 11, accuracy = 0.3368333333333333
k = 12, accuracy = 0.335
k = 13, accuracy = 0.3363333333333333
k = 14, accuracy = 0.3378333333333333
k = 15, accuracy = 0.333
k = 16, accuracy = 0.33116666666666666
k = 17, accuracy = 0.33016666666666666
k = 18, accuracy = 0.3318333333333333
k = 19, accuracy = 0.3293333333333333
k = 20, accuracy = 0.32516666666666666
%% Cell type:code id: tags:
``` python
plt.plot(range(1, 21), accuracies_09)
plt.xlabel("k")
plt.xticks(range(1, 21, 2))
plt.ylabel("Accuracy")
plt.savefig('./results/knn.png')
plt.show()
```
%% Output
%% Cell type:markdown id: tags:
## Artificial Neural Network
%% Cell type:markdown id: tags:
### Mathematical Formulation
#### 1
We have that $\sigma(x) = \frac{1}{1+e^{-x}}$. We can calculate the derivative of $\sigma$ as follows:
$$
\begin{align}
\sigma'(x) &= \frac{d}{dx} \frac{1}{1+e^{-x}} \\
&= \frac{d}{dx} (1+e^{-x})^{-1} \\
&= -(1+e^{-x})^{-2} (-e^{-x}) \\
&= \frac{e^{-x}}{(1+e^{-x})^{2}} \\
&= \frac{1}{1+e^{-x}} \frac{e^{-x}}{1+e^{-x}} \\
&= \frac{1}{1+e^{-x}} \frac{1+e^{-x}-1}{1+e^{-x}} \\
&= \frac{1}{1+e^{-x}} \left( 1 - \frac{1}{1+e^{-x}} \right) \\
&= \sigma(x) (1 - \sigma(x))
\end{align}
$$
Therefore we find that $\sigma'(x) = \sigma(x) (1 - \sigma(x))$.
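This identity is easy to verify numerically with a central finite difference (a quick check added here for illustration; it is not part of the assignment):
%% Cell type:code id: tags:
``` python
# Numerical sanity check of sigma'(x) = sigma(x)(1 - sigma(x))
sigma = lambda x: 1 / (1 + np.exp(-x))
x = np.linspace(-5, 5, 11)
h = 1e-6
numerical = (sigma(x + h) - sigma(x - h)) / (2 * h)  # central difference
analytical = sigma(x) * (1 - sigma(x))
print(np.max(np.abs(numerical - analytical)))  # should be close to zero
```
%% Cell type:markdown id: tags: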
#### 2
We use mean squared error as our loss function. We have that
$$
C = \frac{1}{N_{out}} \sum_{i=1}^{N_{out}} (\hat{y}_i - y_i)^2
$$
where $N_{out}$ is the number of output neurons, $\hat{y}_i$ is the predicted value of the $i$-th output neuron (in our case, $\hat{y}_i = a^{(2)}_i$) and $y_i$ is the true value of the $i$-th output neuron.
We can express $\frac{dC}{dA^{(2)}}$ as follows:
$$
\begin{align}
\frac{dC}{da^{(2)}_i} &= \frac{d}{da^{(2)}_i} \left( \frac{1}{N_{out}} \sum_{j=1}^{N_{out}} (a^{(2)}_j - y_j)^2 \right) \\
&= \frac{d}{da^{(2)}_i} \left( \frac{1}{N_{out}} \left( (a^{(2)}_1 - y_1)^2 + ... + (a^{(2)}_i - y_i)^2 + ... + (a^{(2)}_{N_{out}} - y_{N_{out}})^2 \right) \right) \\
&= \frac{2}{N_{out}} (a^{(2)}_i - y_i)
\end{align}
$$
And in vector form:
$$
\frac{dC}{dA^{(2)}} = \frac{2}{N_{out}} (A^{(2)} - Y)
$$
#### 3
$$
\begin{align}
\frac{dC}{dZ^{(2)}} &= \frac{dC}{dA^{(2)}} \frac{dA^{(2)}}{dZ^{(2)}} \\
&= \frac{dC}{dA^{(2)}} \frac{d}{dZ^{(2)}} \sigma(Z^{(2)}) \\
&= \frac{dC}{dA^{(2)}} \sigma'(Z^{(2)}) \\
&= \frac{dC}{dA^{(2)}} \sigma(Z^{(2)}) (1 - \sigma(Z^{(2)})) \\
&= \frac{dC}{dA^{(2)}} A^{(2)} (1 - A^{(2)})
\end{align}
$$
#### 4
$$
\begin{align}
\frac{dC}{dW^{(2)}} &= \frac{dC}{dZ^{(2)}} \frac{dZ^{(2)}}{dW^{(2)}} \\
&= \frac{dC}{dZ^{(2)}} \frac{d}{dW^{(2)}} \left( W^{(2)} A^{(1)} + B^{(2)} \right) \\
&= \frac{dC}{dZ^{(2)}} A^{(1)}
\end{align}
$$
#### 5
$$
\begin{align}
\frac{dC}{dB^{(2)}} &= \frac{dC}{dZ^{(2)}} \frac{dZ^{(2)}}{dB^{(2)}} \\
&= \frac{dC}{dZ^{(2)}} \frac{d}{dB^{(2)}} \left( W^{(2)} A^{(1)} + B^{(2)} \right) \\
&= \frac{dC}{dZ^{(2)}} 1 \\
&= \frac{dC}{dZ^{(2)}}
\end{align}
$$
#### 6
$$
\begin{align}
\frac{dC}{dA^{(1)}} &= \frac{dC}{dZ^{(2)}} \frac{dZ^{(2)}}{dA^{(1)}} \\
&= \frac{dC}{dZ^{(2)}} \frac{d}{dA^{(1)}} \left( W^{(2)} A^{(1)} + B^{(2)} \right) \\
&= \frac{dC}{dZ^{(2)}} W^{(2)}
\end{align}
$$
#### 7
$$
\begin{align}
\frac{dC}{dZ^{(1)}} &= \frac{dC}{dA^{(1)}} \frac{dA^{(1)}}{dZ^{(1)}} \\
&= \frac{dC}{dA^{(1)}} A^{(1)} (1 - A^{(1)})
\end{align}
$$
#### 8
$$
\begin{align}
\frac{dC}{dW^{(1)}} &= \frac{dC}{dZ^{(1)}} \frac{dZ^{(1)}}{dW^{(1)}} \\
&= \frac{dC}{dZ^{(1)}} A^{(0)}
\end{align}
$$
#### 9
$$
\begin{align}
\frac{dC}{dB^{(1)}} &= \frac{dC}{dZ^{(1)}} \frac{dZ^{(1)}}{dB^{(1)}} \\
&= \frac{dC}{dZ^{(1)}}
\end{align}
$$
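Because hand-derived backpropagation formulas are easy to get subtly wrong, the $W^{(2)}$ gradient above can be checked against a finite difference. The sketch below is an illustrative check, not part of the assignment; note that with a batch of $N$ samples the implementation realizes these formulas as matrix products with transposes inserted so the shapes match (e.g. $\frac{dC}{dW^{(2)}} = (A^{(1)})^T \frac{dC}{dZ^{(2)}}$) and sums bias gradients over the batch.
%% Cell type:code id: tags:
``` python
def forward_mse(w1, b1, w2, b2, data, targets):
    # Same forward pass as the network, returning the mean squared error
    a1 = 1 / (1 + np.exp(-(data @ w1 + b1)))
    a2 = 1 / (1 + np.exp(-(a1 @ w2 + b2)))
    return np.mean((a2 - targets) ** 2)

rng = np.random.default_rng(0)
w1 = rng.uniform(-1, 1, (3, 4)); b1 = np.zeros((1, 4))
w2 = rng.uniform(-1, 1, (4, 2)); b2 = np.zeros((1, 2))
data = rng.uniform(size=(10, 3)); targets = rng.uniform(size=(10, 2))

# Analytic gradient of the mean loss, following the formulas above
a1 = 1 / (1 + np.exp(-(data @ w1 + b1)))
a2 = 1 / (1 + np.exp(-(a1 @ w2 + b2)))
dC_da2 = 2 * (a2 - targets) / a2.size
dC_dz2 = dC_da2 * a2 * (1 - a2)
dC_dw2 = a1.T @ dC_dz2

# Central finite difference for one entry of W2
h = 1e-6
w2_plus, w2_minus = w2.copy(), w2.copy()
w2_plus[0, 0] += h
w2_minus[0, 0] -= h
numerical = (forward_mse(w1, b1, w2_plus, b2, data, targets)
             - forward_mse(w1, b1, w2_minus, b2, data, targets)) / (2 * h)
print(dC_dw2[0, 0], numerical)  # the two values should agree closely
```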
%% Cell type:code id: tags:
``` python
import mlp
reload(mlp)
from mlp import (
    learn_once_mse,
    one_hot,
    learn_once_cross_entropy,
    run_mlp_training
)
```
%% Cell type:markdown id: tags:
### Learning pass example
%% Cell type:code id: tags:
``` python
N = 30 # number of input data
d_in = 3 # input dimension
d_h = 3 # number of neurons in the hidden layer
d_out = 2 # output dimension (number of neurons of the output layer)
# Random initialization of the network weights and biases
w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
b1 = np.zeros((1, d_h))  # first layer biases
w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
b2 = np.zeros((1, d_out))  # second layer biases
random_data = np.random.rand(N, d_in)  # random input data
random_targets = np.random.rand(N, d_out)  # random targets
```
%% Cell type:code id: tags:
``` python
updated_w1, updated_b1, updated_w2, updated_b2, loss = learn_once_mse(
    w1,
    b1,
    w2,
    b2,
    random_data,
    random_targets,
    0.1,
)
print("Loss:", loss)
```
%% Output
Loss: 0.10367831888711801
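%% Cell type:markdown id: tags:
As a quick sanity check (an illustrative addition, with no recorded output), repeated gradient steps on the same batch should make the MSE loss decrease:
%% Cell type:code id: tags:
``` python
for step in range(5):
    updated_w1, updated_b1, updated_w2, updated_b2, loss = learn_once_mse(
        updated_w1, updated_b1, updated_w2, updated_b2,
        random_data, random_targets, 0.1,
    )
    print(f"Step {step}: loss = {loss}")
```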
%% Cell type:markdown id: tags:
### One-hot encoding
%% Cell type:code id: tags:
``` python
one_hot(np.array([1, 2, 0]))
```
%% Output
array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])
%% Cell type:markdown id: tags:
### Cross entropy pass example
%% Cell type:code id: tags:
``` python
N = 30 # number of input data
d_in = 3 # input dimension
d_h = 3 # number of neurons in the hidden layer
d_out = 5 # output dimension (number of neurons of the output layer)
# Random initialization of the network weights and biases
w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
b1 = np.zeros((1, d_h))  # first layer biases
w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
b2 = np.zeros((1, d_out))  # second layer biases
random_data = np.random.rand(N, d_in)  # random input data
random_targets = np.random.randint(1, d_out, N)  # random integer class labels
```
%% Cell type:code id: tags:
``` python
cross_w1, cross_b1, cross_w2, cross_b2, cross_loss = learn_once_cross_entropy(
    w1,
    b1,
    w2,
    b2,
    random_data,
    random_targets,
    0.1,
)
print("Loss:", cross_loss)
```
%% Output
Loss: 0.6940785845571713
%% Cell type:markdown id: tags:
### split = 0.9, d_h = 64, learning_rate = 0.1, 100 epochs
%% Cell type:code id: tags:
``` python
d_h = 64
learning_rate = 0.1
num_epoch = 100
```
%% Cell type:code id: tags:
``` python
training_accuracy_values, test_accuracy = run_mlp_training(
    train_data_09,
    train_labels_09,
    test_data_09,
    test_labels_09,
    d_h,
    learning_rate,
    num_epoch
)
for i, training_accuracy in enumerate(training_accuracy_values):
    print(f"Epoch {i}: {training_accuracy}")
print(f"Test accuracy: {test_accuracy}")
```
%% Output
/home/tracert6/Documents/ECL-S9-DeepLearning/TP/TP1/mlp.py:210: RuntimeWarning: overflow encountered in exp
z1 = np.matmul(a0, w1) + b1 # input of the hidden layer
%% Cell type:code id: tags:
``` python
# Plot the training accuracy values
plt.plot(range(num_epoch + 1), training_accuracy_values)
plt.xlabel("Epoch")
plt.ylabel("Training Accuracy")
plt.savefig('./results/mlp.png')
plt.show()
```

mlp.py

import numpy as np

def learn_once_mse(
    w1: np.ndarray,
    b1: np.ndarray,
    w2: np.ndarray,
    b2: np.ndarray,
    data: np.ndarray,
    targets: np.ndarray,
    learning_rate: float,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]:
    """Perform one step of gradient descent on the given data and targets.

    Args:
        w1 (np.ndarray): The weights of the first layer, of shape (d_in, d_h).
        b1 (np.ndarray): The bias of the first layer, of shape (1, d_h).
        w2 (np.ndarray): The weights of the second layer, of shape (d_h, d_out).
        b2 (np.ndarray): The bias of the second layer, of shape (1, d_out).
        data (np.ndarray): The data, of shape (N, d_in).
        targets (np.ndarray): The targets, of shape (N, d_out).
        learning_rate (float): The learning rate.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]: A tuple containing the updated weights and biases, and the loss.
    """
    # Forward pass
    a0 = data  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid activation function)
    predictions = a2  # the predicted values are the outputs of the output layer

    # Compute loss (MSE)
    loss = np.mean(np.square(predictions - targets))

    # Backward pass
    # Compute gradients
    dC_da2 = 2 * (predictions - targets) / predictions.shape[0]
    dC_dz2 = dC_da2 * a2 * (1 - a2)
    dC_dw2 = np.matmul(a1.T, dC_dz2)
    dC_db2 = np.sum(dC_dz2, axis=0, keepdims=True)
    dC_da1 = np.matmul(dC_dz2, w2.T)
    dC_dz1 = dC_da1 * a1 * (1 - a1)
    dC_dw1 = np.matmul(a0.T, dC_dz1)
    dC_db1 = np.sum(dC_dz1, axis=0, keepdims=True)

    # Update weights and biases
    w1 -= learning_rate * dC_dw1
    b1 -= learning_rate * dC_db1
    w2 -= learning_rate * dC_dw2
    b2 -= learning_rate * dC_db2

    return w1, b1, w2, b2, loss

def one_hot(labels: np.ndarray) -> np.ndarray:
    """Calculate the one-hot matrix of the given labels.

    Args:
        labels (np.ndarray): The labels.

    Returns:
        np.ndarray: The one-hot matrix of the labels.
    """
    return np.eye(labels.max() + 1)[labels]

def learn_once_cross_entropy(
    w1: np.ndarray,
    b1: np.ndarray,
    w2: np.ndarray,
    b2: np.ndarray,
    data: np.ndarray,
    labels_train: np.ndarray,
    learning_rate: float,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]:
    """Perform one step of gradient descent using a binary cross-entropy loss on the given data and labels.

    Args:
        w1 (np.ndarray): The weights of the first layer, of shape (d_in, d_h).
        b1 (np.ndarray): The bias of the first layer, of shape (1, d_h).
        w2 (np.ndarray): The weights of the second layer, of shape (d_h, d_out).
        b2 (np.ndarray): The bias of the second layer, of shape (1, d_out).
        data (np.ndarray): The data, of shape (N, d_in).
        labels_train (np.ndarray): The training labels, of shape (N,).
        learning_rate (float): The learning rate.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, float]: A tuple containing the updated weights and biases, and the loss.
    """
    # Forward pass
    a0 = data  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid activation function)
    predictions = a2  # the predicted values are the outputs of the output layer
    one_hot_targets = one_hot(labels_train)

    # Compute loss (Cross Entropy)
    # https://arize.com/blog-course/binary-cross-entropy-log-loss/
    loss = -np.mean(
        one_hot_targets * np.log(predictions)
        + (1 - one_hot_targets) * np.log(1 - predictions)
    )

    # Backward pass
    # Compute gradients
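    # For a sigmoid output paired with this cross-entropy loss, the sigma'(z2)
    # factor cancels, leaving simply (a2 - y); the mean's normalization
    # constant is effectively absorbed into the learning rate.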
    dC_dz2 = a2 - one_hot_targets
    dC_dw2 = np.matmul(a1.T, dC_dz2)
    dC_db2 = np.sum(dC_dz2, axis=0, keepdims=True)
    dC_da1 = np.matmul(dC_dz2, w2.T)
    dC_dz1 = dC_da1 * a1 * (1 - a1)
    dC_dw1 = np.matmul(a0.T, dC_dz1)
    dC_db1 = np.sum(dC_dz1, axis=0, keepdims=True)

    # Update weights and biases
    w1 -= learning_rate * dC_dw1
    b1 -= learning_rate * dC_db1
    w2 -= learning_rate * dC_dw2
    b2 -= learning_rate * dC_db2

    return w1, b1, w2, b2, loss

def train_mlp(
    w1: np.ndarray,
    b1: np.ndarray,
    w2: np.ndarray,
    b2: np.ndarray,
    data_train: np.ndarray,
    labels_train: np.ndarray,
    learning_rate: float,
    num_epoch: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list[float]]:
    """Perform num_epoch training steps.

    Args:
        w1 (np.ndarray): The weights of the first layer, of shape (d_in, d_h).
        b1 (np.ndarray): The bias of the first layer, of shape (1, d_h).
        w2 (np.ndarray): The weights of the second layer, of shape (d_h, d_out).
        b2 (np.ndarray): The bias of the second layer, of shape (1, d_out).
        data_train (np.ndarray): The training data, of shape (N, d_in).
        labels_train (np.ndarray): The training labels, of shape (N,).
        learning_rate (float): The learning rate.
        num_epoch (int): The number of epochs.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, list[float]]: A tuple containing the resulting weights and biases, and the list of accuracy values of each epoch.
    """
    # Starting accuracy (random weights)
    accuracy = test_mlp(w1, b1, w2, b2, data_train, labels_train)
    accuracies = [accuracy]
    for _ in range(num_epoch):
        # Train once (cross-entropy, since the labels are integer classes)
        w1, b1, w2, b2, _ = learn_once_cross_entropy(
            w1, b1, w2, b2, data_train, labels_train, learning_rate
        )
        # Compute current model training accuracy
        accuracy = test_mlp(w1, b1, w2, b2, data_train, labels_train)
        accuracies.append(accuracy)
    return w1, b1, w2, b2, accuracies

def test_mlp(
    w1: np.ndarray,
    b1: np.ndarray,
    w2: np.ndarray,
    b2: np.ndarray,
    data_test: np.ndarray,
    labels_test: np.ndarray,
) -> float:
    """Test the network on the given test set.

    Args:
        w1 (np.ndarray): The weights of the first layer, of shape (d_in, d_h).
        b1 (np.ndarray): The bias of the first layer, of shape (1, d_h).
        w2 (np.ndarray): The weights of the second layer, of shape (d_h, d_out).
        b2 (np.ndarray): The bias of the second layer, of shape (1, d_out).
        data_test (np.ndarray): The test data, of shape (N, d_in).
        labels_test (np.ndarray): The test labels, of shape (N,).

    Returns:
        float: The testing accuracy of the model on the given data.
    """
    # Forward pass
    a0 = data_test  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid activation function)
    predictions = a2  # the predicted values are the outputs of the output layer

    # Compute accuracy
    accuracy = np.mean(np.argmax(predictions, axis=1) == labels_test)
    return accuracy

def run_mlp_training(
    data_train: np.ndarray,
    labels_train: np.ndarray,
    data_test: np.ndarray,
    labels_test: np.ndarray,
    d_h: int,
    learning_rate: float,
    num_epoch: int,
) -> tuple[list[float], float]:
    """Train an MLP classifier.

    Args:
        data_train (np.ndarray): The training data, of shape (N, d_in).
        labels_train (np.ndarray): The training labels, of shape (N,).
        data_test (np.ndarray): The test data, of shape (N, d_in).
        labels_test (np.ndarray): The test labels, of shape (N,).
        d_h (int): The number of neurons in the hidden layer.
        learning_rate (float): The learning rate.
        num_epoch (int): The number of training epochs.

    Returns:
        tuple[list[float], float]: A tuple containing the list of training accuracy values of each epoch, and the final test accuracy.
    """
    d_in = data_train.shape[1]
    d_out = int(labels_train.max()) + 1  # number of classes, not the number of samples
    # Random initialization of the network weights and biases
    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
    b1 = np.zeros((1, d_h))  # first layer biases
    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
    b2 = np.zeros((1, d_out))  # second layer biases
    # Train the network
    w1, b1, w2, b2, accuracy_values = train_mlp(
        w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
    )
    # Test the network
    accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
    return accuracy_values, accuracy