Commit 6b34bc72 authored by pierre-cau's avatar pierre-cau

readme mlp

parent 523a93f9
@@ -24,7 +24,7 @@ The project is divided into the following directories:
- `data/`: Contains the dataset.
- `src/`: Contains the source code of the project.
- `src/utils/`: Contains utility functions such as `read_cifar` or `evaluate_knn`.
- `assets/`: Contains the main images and linked files of the project.
- `results/`: Contains the main images and linked files of the project.
- `src/main.py`: Main file of the project.
___
@@ -83,3 +83,15 @@ $\frac{\partial C}{\partial Z^{(1)}} = \frac{\partial C}{\partial A^{(1)}} \odot A^{(1)} \odot (1 - A^{(1)})$
8. The gradient of the cost function with respect to the biases of the first layer, $B^{(1)}$, is given by:
$\frac{\partial C}{\partial B^{(1)}} = \frac{\partial C}{\partial Z^{(1)}}$
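For illustration, here is a minimal standalone NumPy sketch of equations 7 and 8. The toy shapes, the random stand-in for $\frac{\partial C}{\partial A^{(1)}}$ and the batch accumulation of the bias gradient are assumptions made for this example, not code taken from `mlp.py`:

```python
import numpy as np

rng = np.random.default_rng(0)
batch, d_in, d_h = 4, 3, 5                    # toy sizes
a0 = rng.normal(size=(batch, d_in))           # input A^(0)
w1 = rng.normal(size=(d_in, d_h)) * 0.01      # small random weights
b1 = np.zeros((1, d_h))

z1 = a0 @ w1 + b1                             # Z^(1)
a1 = 1 / (1 + np.exp(-z1))                    # A^(1) = sigmoid(Z^(1))
d_loss_a1 = rng.normal(size=(batch, d_h))     # stand-in for dC/dA^(1) coming from the layer above

d_loss_z1 = d_loss_a1 * a1 * (1 - a1)              # eq. 7: dC/dZ^(1) = dC/dA^(1) * A^(1) * (1 - A^(1))
d_loss_b1 = d_loss_z1.sum(axis=0, keepdims=True)   # eq. 8, accumulated over the batch
d_loss_w1 = a0.T @ d_loss_z1                       # and dC/dW^(1) = a0.T @ dC/dZ^(1) (up to batch averaging)
```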
____
### Results
Using all these equations, I have coded several methods in the `mlp.py` file to train the neural network, especially `run_mlp_training`.
Thus, for `split_factor=0.9`, `d_h=64`, `learning_rate=0.1` and `num_epoch=100`, we obtain the following curves:
![mlp_split_0.1](results/mlp_2.png)
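For reference, the pipeline behind this figure is essentially the one in `src/main.py`; a condensed sketch, using the function names from this repository and the hyperparameters quoted above, and assuming `data` and `labels` have already been loaded with `read_cifar`:

```python
# Condensed training pipeline (structured like src/main.py)
data_train, labels_train, data_test, labels_test = split_dataset(data, labels, 0.9)
data_train, data_test = Z_score_normalize(data_train, data_test)         # stats fitted on the training set
labels_train, labels_test = one_hot(labels_train), one_hot(labels_test)  # one-hot targets

train_accuracies, test_accuracy, losses = run_mlp_training(
    data_train=data_train, labels_train=labels_train,
    data_test=data_test, labels_test=labels_test,
    d_h=64, learning_rate=0.1, num_epoch=100,
    return_loss=True, verbose=True,
)
```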
Here we observe that the accuracy increases epoch after epoch, but remains modest. At the end, we reach about 23% accuracy on both the training and test sets, which means that the model is neither underfitted nor overfitted. Both the loss and the training accuracy are quite stable at the end, which suggests that the model has finished learning.
Nonetheless, the accuracy is still very low and the algorithm can easily diverge because of exponential values causing overflows. To counter this phenomenon, I chose to initialize the weights with very small, yet still random, values. I also introduced some `np.clip` calls and an epsilon to avoid overflows and division by zero, respectively.
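Concretely, these tricks look like the following minimal standalone sketch on toy data. The ±500 clipping bounds, the 0.01 weight scale and `epsilon = 1e-10` are the values used in `mlp.py`; the single sigmoid layer and the toy shapes are only for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(8, 3072))                   # toy batch of CIFAR-sized inputs
y = np.eye(10)[rng.integers(0, 10, size=8)]      # toy one-hot targets

# 1) tiny random weights, zero biases (same scheme as init_params)
w = (2 * rng.random((3072, 10)) - 1) * 0.01
b = np.zeros((1, 10))

# 2) clip the pre-activations so np.exp cannot overflow
z = np.clip(x @ w + b, -500, 500)
a = 1 / (1 + np.exp(-z))

# 3) keep the predictions away from exactly 0 or 1 so np.log stays finite
epsilon = 1e-10
a = np.clip(a, epsilon, 1 - epsilon)
loss = -np.mean(y * np.log(a) + (1 - y) * np.log(1 - a))
print(f"toy loss: {loss:.4f}")
```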
\ No newline at end of file
File moved
results/mlp.png (59.4 KiB)
results/mlp_1.png (45.1 KiB)
results/mlp_2.png (52.4 KiB)

@@ -46,27 +46,56 @@ if __name__ == "__main__":
# Parameters
split_factor = 0.9
d_h = 64
learning_rate = 0.1
learning_rate = 0.2
num_epoch = 100
# Split the dataset
data_train, labels_train, data_test, labels_test = split_dataset(data, labels, split_factor)
# Normalize the data
data_train, data_test = Z_score_normalize(data_train, data_test)
# Ensure labels are one-hot encoded
labels_train = one_hot(labels_train)
labels_test = one_hot(labels_test)
# Run MLP training
train_accuracies, test_accuracies = run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch)
train_accuracies, test_accuracies, losses = run_mlp_training(
data_train=data_train,
labels_train=labels_train,
data_test=data_test,
labels_test=labels_test,
d_h=d_h,
learning_rate=learning_rate,
num_epoch=num_epoch,
return_loss=True,
verbose=True
)
print(f"Final test accuracy: {test_accuracies}")
print(f"Final train accuracy: {train_accuracies[-1]}")
# Plot the evolution of learning accuracy
plt.figure()
plt.plot(range(num_epoch), train_accuracies, label='Train Accuracy')
plt.plot(range(num_epoch), test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('MLP Training Accuracy Evolution')
plt.legend()
fig, ax1 = plt.subplots()
# Plot train accuracies on the first y-axis
ax1.plot(range(num_epoch), train_accuracies, label='Train Accuracy', color='b')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Train Accuracy', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.xaxis.set_major_locator(plt.MaxNLocator(integer=True)) # Ensure only integer ticks on x-axis
# Create a second y-axis to plot the losses
ax2 = ax1.twinx()
ax2.plot(range(num_epoch), losses, label='Loss', color='r')
ax2.set_ylabel('Loss', color='r')
ax2.tick_params(axis='y', labelcolor='r')
# Add title and grid
plt.title(f'MLP Training Accuracy and Loss Evolution\n(d_h={d_h}, learning_rate={learning_rate}, num_epoch={num_epoch})')
fig.tight_layout()
plt.grid()
# Save and show the plot
plt.savefig('../results/mlp.png')
plt.show()
\ No newline at end of file
import numpy as np
def distance_matrix(matrix1, matrix2):
"""
Compute the L2 Euclidean distance matrix between two matrices.
......
@@ -2,6 +2,25 @@
# Date : 2024
import numpy as np
from tqdm import tqdm
def softmax(x):
"""
Return the softmax function of the x array
Parameters
----------
x : np.ndarray
input vector
Returns
-------
Softmax of x
"""
exp = np.exp(x - np.max(x,axis=1,keepdims=True))
return exp / np.sum(exp,axis=1,keepdims=True)
def one_hot(array):
"""
@@ -63,8 +82,10 @@ def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
# Forward pass
a0 = data
z1 = np.matmul(a0, w1) + b1
z1 = np.clip(z1, -500, 500)
a1 = 1 / (1 + np.exp(-z1))
z2 = np.matmul(a1, w2) + b2
z2 = np.clip(z2, -500, 500)
a2 = 1 / (1 + np.exp(-z2))
predictions = a2
@@ -135,13 +156,31 @@ def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
# Forward pass
a0 = data
z1 = np.matmul(a0, w1) + b1
z1 = np.clip(z1, -500, 500) # Avoid overflow
a1 = 1 / (1 + np.exp(-z1))
z2 = np.matmul(a1, w2) + b2
a2 = 1 / (1 + np.exp(-z2))
z2 = np.clip(z2, -500, 500) # Avoid overflow
a2 = softmax(z2)
# print("\n A0 :",a0.min(), a0.max(), a0.mean(), a0.std())
# print("Z1 :",z1.min(), z1.max(), z1.mean(), z1.std())
# print("A1 :",a1.min(), a1.max(), a1.mean(), a1.std())
# print("Z2 :",z2.min(), z2.max(), z2.mean(), z2.std())
# print("A2 :",a2.min(), a2.max(), a2.mean(), a2.std())
predictions = a2
epsilon = 1e-10
predictions = np.clip(predictions, epsilon, 1 - epsilon) # Avoid log(0)
# Compute loss (binary cross-entropy)
# print(labels_train * np.log(predictions) + (1 - labels_train) * np.log(1 - predictions))
loss = -np.mean(labels_train * np.log(predictions) + (1 - labels_train) * np.log(1 - predictions))
if np.isnan(loss):
# print(labels_train)
# print(predictions)
# print(np.log(predictions))
# print(np.log(1 - predictions))
raise ValueError("Loss is NaN → Try reducing the learning rate or normalizing the data.")
# Backward pass
d_loss_a2 = predictions - labels_train
@@ -163,9 +202,20 @@ def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
w2 -= learning_rate * d_loss_w2
b2 -= learning_rate * d_loss_b2
# print(w1, b1, w2, b2, loss)
return w1, b1, w2, b2, loss
def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch):
def train_mlp(w1,
b1,
w2,
b2,
data_train,
labels_train,
learning_rate,
num_epoch,
return_loss = False,
verbose = False
):
"""
Train a simple MLP for a given number of epochs.
@@ -187,6 +237,8 @@ def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
Learning rate of the optimizer.
num_epoch : int
Number of training epochs.
return_loss : bool
If True, return the loss across epochs.
Returns
-------
@@ -202,10 +254,11 @@ def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
List of training accuracies across epochs.
"""
train_accuracies = []
losses = []
for epoch in range(num_epoch):
for epoch in tqdm(range(num_epoch), desc="Training", leave=False):
w1, b1, w2, b2, loss = learn_once_cross_entropy(w1, b1, w2, b2, data_train, labels_train, learning_rate)
losses.append(loss)
# Compute accuracy
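# Inline forward pass (sigmoid at both layers); the argmax below is identical whether the output layer uses sigmoid or softmax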
predictions = 1 / (1 + np.exp(-np.matmul(1 / (1 + np.exp(-np.matmul(data_train, w1) - b1)), w2) - b2))
predicted_classes = np.argmax(predictions, axis=1)
@@ -213,8 +266,11 @@ def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
accuracy = np.mean(predicted_classes == true_classes)
train_accuracies.append(accuracy)
print(f"Epoch {epoch + 1}/{num_epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")
if verbose:
tqdm.write(f"Epoch {epoch + 1}/{num_epoch} - Loss: {loss:.4f} - Accuracy: {accuracy:.4f}") # We modify the text to display the loss with tqdm.write
if return_loss:
return w1, b1, w2, b2, train_accuracies, losses
return w1, b1, w2, b2, train_accuracies
def test_mlp(w1, b1, w2, b2, data_test, labels_test):
@@ -246,7 +302,7 @@ def test_mlp(w1, b1, w2, b2, data_test, labels_test):
z1 = np.matmul(a0, w1) + b1
a1 = 1 / (1 + np.exp(-z1))
z2 = np.matmul(a1, w2) + b2
a2 = 1 / (1 + np.exp(-z2))
a2 = softmax(z2)
predictions = a2
# Compute accuracy
@@ -256,7 +312,29 @@ def test_mlp(w1, b1, w2, b2, data_test, labels_test):
return test_accuracy
def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch):
def init_params(n_features, n_outputs, d_h):
"""
Initialize the weights (small random values) and the biases (zeros).
"""
w1 = (2 * np.random.rand(n_features, d_h) - 1) * 0.01
b1 = np.zeros((1, d_h))
w2 = (2 * np.random.rand(d_h, n_outputs) - 1) * 0.01
b2 = np.zeros((1, n_outputs))
return w1, b1, w2, b2
def run_mlp_training(data_train,
labels_train,
data_test,
labels_test,
d_h,
learning_rate,
num_epoch,
return_loss = False,
verbose = False
):
"""
Train an MLP classifier and return the training accuracies across epochs and the final testing accuracy.
@@ -276,6 +354,10 @@ def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, lear
Learning rate of the optimizer.
num_epoch : int
Number of training epochs.
return_loss : bool
If True, return the loss across epochs.
verbose : bool
If True, display the loss and accuracy at each epoch.
Returns
-------
@@ -287,23 +369,62 @@ def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, lear
n_features = data_train.shape[1]
n_outputs = labels_train.shape[1]
# Initialize weights and biases
w1 = np.random.randn(n_features, d_h) * 0.01
b1 = np.zeros((1, d_h))
w2 = np.random.randn(d_h, n_outputs) * 0.01
b2 = np.zeros((1, n_outputs))
w1,b1,w2,b2 = init_params(n_features=n_features,
n_outputs=n_outputs,
d_h=d_h)
# Train the MLP
w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch)
if return_loss:
w1, b1, w2, b2, train_accuracies, losses = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch, return_loss, verbose)
else:
w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch, return_loss, verbose)
# Test the MLP
test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
if return_loss:
return train_accuracies, test_accuracy, losses
return train_accuracies, test_accuracy
def Z_score_normalize(data_train, data_test):
"""
Normalize the training and testing data.
Parameters
----------
data_train : np.ndarray
Training data of shape (n_train, d).
data_test : np.ndarray
Testing data of shape (n_test, d).
Returns
-------
data_train_normalized : np.ndarray
Normalized training data of shape (n_train, d).
data_test_normalized : np.ndarray
Normalized testing data of shape (n_test, d).
"""
# Compute the mean and standard deviation of the training data
mean = np.mean(data_train, axis=0)
std = np.std(data_train, axis=0)
# Normalize the training data
data_train_normalized = (data_train - mean) / std
# Normalize the testing data
data_test_normalized = (data_test - mean) / std
return data_train_normalized, data_test_normalized
if __name__ == "__main__":
# Test of one-hot encoding
array = np.array([0, 1, 2, 1, 0])
num_classes = 3
one_hot_matrix = one_hot(array)
print(one_hot_matrix)
\ No newline at end of file