import os

import numpy as np
import matplotlib.pyplot as plt


N = 30  # number of input samples
d_in = 3  # input dimension
d_h = 3  # number of neurons in the hidden layer
d_out = 2  # output dimension (number of neurons in the output layer)
learning_rate = 0.1
num_epochs = 100

# Random initialization of the network weights and biases
def initialization(d_in, d_h, d_out):
    np.random.seed(10)  # Fix the seed for reproducibility
    W1 = 2 * np.random.rand(d_in, d_h) - 1  # first-layer weights, uniform in [-1, 1)
    b1 = np.zeros((1, d_h))  # first-layer biases
    W2 = 2 * np.random.rand(d_h, d_out) - 1  # second-layer weights, uniform in [-1, 1)
    b2 = np.zeros((1, d_out))  # second-layer biases
    return W1, b1, W2, b2

data = np.random.rand(N, d_in)  # random input data
targets = np.random.rand(N, d_out)  # random targets
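
# Quick shape sanity check (illustrative): the parameters returned by
# initialization should match the layer dimensions they were built from.
_W1, _b1, _W2, _b2 = initialization(d_in, d_h, d_out)
assert _W1.shape == (d_in, d_h) and _b1.shape == (1, d_h)
assert _W2.shape == (d_h, d_out) and _b2.shape == (1, d_out)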

# Define the sigmoid activation function
def sigmoid(x, derivate=False):
    """Sigmoid activation. When derivate is True, x is assumed to be the
    sigmoid *output*, and the derivative x * (1 - x) is returned."""
    if not derivate:
        return 1 / (1 + np.exp(-x))
    return x * (1 - x)
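
# Example of the derivative convention (illustrative): the "derivate" branch
# expects the sigmoid *output*, so sigma'(0) = s * (1 - s) with s = sigmoid(0) = 0.5.
_s = sigmoid(0.0, derivate=False)  # -> 0.5
assert np.isclose(sigmoid(_s, derivate=True), 0.25)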
 

# Define the softmax activation function
def softmax(x, derivate=False):
    """Softmax activation. When derivate is True, x is assumed to be the
    softmax *output*, and the element-wise (diagonal) derivative x * (1 - x)
    is returned, matching how it is used in the backward passes below."""
    if not derivate:
        shifted = x - x.max(axis=1, keepdims=True)  # subtract the row max for numerical stability
        exps = np.exp(shifted)
        return exps / exps.sum(axis=1, keepdims=True)
    return x * (1 - x)
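
# Example (illustrative): softmax rows are valid probability distributions.
_p = softmax(np.array([[1.0, 2.0, 3.0]]), derivate=False)
assert np.allclose(_p.sum(axis=1), 1.0)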
    
# Define the loss metrics:
def loss_metrics(predictions, targets, metric, status):
    if metric == "MSE":
        if status == "forward":
            return np.mean((predictions - targets) ** 2)
        elif status == "backward":
            return 2 * (predictions - targets) / len(predictions)  # Gradient of the MSE loss
    elif metric == "BCE":
        # Binary cross-entropy loss
        epsilon = 1e-15  # Small constant to prevent log(0)
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        if status == "forward":
            return -(targets * np.log(predictions) + (1 - targets) * np.log(1 - predictions)).mean()
        elif status == "backward":
            return (predictions - targets) / ((1 - predictions) * predictions)  # Gradient of the BCE loss
    else:
        raise ValueError("Metric not supported: " + metric)
    raise ValueError("Status not supported: " + status)
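
# Worked example (illustrative): a near-perfect prediction gives a near-zero
# loss under both metrics; MSE here is ((0.01)^2 + (0.01)^2) / 2 = 1e-4.
_pred = np.array([[0.99, 0.01]])
_tgt = np.array([[1.0, 0.0]])
assert np.isclose(loss_metrics(_pred, _tgt, metric="MSE", status="forward"), 1e-4)
assert loss_metrics(_pred, _tgt, metric="BCE", status="forward") < 0.02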

def learn_once_mse(W1, b1, W2, b2, data, targets, learning_rate):
    """
    Update the weights and biases of the network for one gradient descent step using Mean Squared Error (MSE) loss.

    Parameters:
    - W1: Weight matrix of the first layer (shape: d_in x d_h).
    - b1: Bias vector of the first layer (shape: 1 x d_h).
    - W2: Weight matrix of the second layer (shape: d_h x d_out).
    - b2: Bias vector of the second layer (shape: 1 x d_out).
    - data: Input data matrix (shape: batch_size x d_in).
    - targets: Target output matrix (shape: batch_size x d_out).
    - learning_rate: Learning rate for gradient descent.

    Returns:
    - W1: Updated weight matrix of the first layer.
    - b1: Updated bias vector of the first layer.
    - W2: Updated weight matrix of the second layer.
    - b2: Updated bias vector of the second layer.
    - loss: Mean Squared Error (MSE) loss for monitoring.
    """
    # Forward pass
    # Calculate the input and output of the hidden layer
    hidden_layer_input = np.matmul(data, W1) + b1
    hidden_layer_output = sigmoid(hidden_layer_input, derivate=False)  # Apply the sigmoid activation

    # Calculate the input and output of the output layer
    output_layer_input = np.matmul(hidden_layer_output, W2) + b2
    output_layer_output = softmax(output_layer_input, derivate=False)  # Apply the softmax activation

    # Backpropagation phase
    # Error at the output layer: gradient of the MSE loss with respect to the
    # predictions (the constant factor is absorbed by the learning rate)
    output_error = output_layer_output - targets

    # Gradients at the output layer, using the element-wise (diagonal)
    # derivative of the activation
    output_layer_gradients = output_error * softmax(output_layer_output, derivate=True)

    # Error at the hidden layer, computed with the *current* W2
    # (before it is updated)
    hidden_layer_error = np.dot(output_layer_gradients, W2.T)

    # Gradients at the hidden layer
    hidden_layer_gradients = hidden_layer_error * sigmoid(hidden_layer_output, derivate=True)

    # Gradient descent updates, averaged over the batch
    batch_size = data.shape[0]
    W2 = W2 - learning_rate * np.dot(hidden_layer_output.T, output_layer_gradients) / batch_size
    b2 = b2 - learning_rate * output_layer_gradients.sum(axis=0, keepdims=True) / batch_size
    W1 = W1 - learning_rate * np.dot(data.T, hidden_layer_gradients) / batch_size
    b1 = b1 - learning_rate * hidden_layer_gradients.sum(axis=0, keepdims=True) / batch_size

    # Compute the MSE loss for monitoring
    loss = loss_metrics(output_layer_output, targets, metric="MSE", status="forward")

    return W1, b1, W2, b2, loss
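
# Example usage (illustrative): one MSE gradient step on the random data and
# targets defined above; parameter shapes are preserved by the update.
_W1, _b1, _W2, _b2, _loss = learn_once_mse(_W1, _b1, _W2, _b2, data, targets, learning_rate)
assert _W1.shape == (d_in, d_h) and _W2.shape == (d_h, d_out)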

# One-hot encoding function:
def one_hot(targets):
    """
    one_hot takes an array of integer target values (assumed to be labelled
    0 .. num_classes-1) and returns the corresponding one-hot encoded matrix.

    Parameters:
    - targets: An array of target values.

    Returns:
    - one_hot_matrix: A one-hot encoded matrix where each row corresponds to a target value.
    """
    num_classes = np.unique(targets).shape[0]  # Number of unique classes in the target array
    num_samples = targets.shape[0]  # Number of samples in the target array

    one_hot_matrix = np.zeros((num_samples, num_classes))  # Initialize a matrix of zeros

    for i in range(num_samples):
        target_class = targets[i]
        one_hot_matrix[i, target_class] = 1  # Set the corresponding class index to 1

    return one_hot_matrix
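
# Example (illustrative, assuming integer labels 0..num_classes-1):
# one_hot(np.array([0, 2, 1])) -> [[1,0,0], [0,0,1], [0,1,0]].
_oh = one_hot(np.array([0, 2, 1]))
assert np.array_equal(_oh, np.array([[1., 0., 0.], [0., 0., 1.], [0., 1., 0.]]))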

#learn_once_cross_entropy 

def learn_once_cross_entropy(W1, b1, W2, b2, data, targets, learning_rate):
    """
    Perform one gradient descent step using binary cross-entropy loss.

    Parameters:
    - W1, b1, W2, b2: Weights and biases of the network.
    - data: Input data matrix of shape (batch_size x d_in).
    - targets: Target output matrix of shape (batch_size x d_out).
    - learning_rate: Learning rate for gradient descent.

    Returns:
    - Updated weights and biases (W1, b1, W2, b2) of the network.
    - Loss value for monitoring.
    """

    # Forward pass
    # Implement feedforward propagation on the hidden layer
    hidden_layer_input = np.matmul(data, W1) + b1
    hidden_layer_output = sigmoid(hidden_layer_input, derivate=False)  # Apply the Sigmoid activation function

    # Implement feedforward propagation on the output layer
    output_layer_input = np.matmul(hidden_layer_output, W2) + b2
    output_layer_output = softmax(output_layer_input, derivate=False)  # Apply the Softmax activation function

    # Backpropagation phase
    # For cross-entropy loss, the derivative of the output activation cancels
    # with the derivative of the loss, so the error at the output layer's
    # pre-activation is simply (predictions - targets).
    output_delta = output_layer_output - targets
    batch_size = data.shape[0]

    # Gradients for W2 and b2
    W2_update = np.dot(hidden_layer_output.T, output_delta) / batch_size
    b2_update = output_delta.sum(axis=0, keepdims=True) / batch_size

    # Gradients for W1 and b1
    hidden_layer_error = np.dot(output_delta, W2.T)
    hidden_delta = hidden_layer_error * sigmoid(hidden_layer_output, derivate=True)
    W1_update = np.dot(data.T, hidden_delta) / batch_size
    b1_update = hidden_delta.sum(axis=0, keepdims=True) / batch_size

    # Gradient descent step
    W2 = W2 - learning_rate * W2_update
    W1 = W1 - learning_rate * W1_update
    b2 = b2 - learning_rate * b2_update
    b1 = b1 - learning_rate * b1_update

    # Compute the binary cross-entropy loss for monitoring
    loss = loss_metrics(output_layer_output, targets, metric="BCE", status="forward")

    return W1, b1, W2, b2, loss
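
# Numerical check of the cancellation used above (illustrative): multiplying
# the BCE gradient dL/dp = (p - t) / (p * (1 - p)) by the activation
# derivative p * (1 - p) recovers the pre-activation error p - t.
_p = np.array([[0.7, 0.3]])
_t = np.array([[1.0, 0.0]])
_dLdp = loss_metrics(_p, _t, metric="BCE", status="backward")
assert np.allclose(_dLdp * _p * (1 - _p), _p - _t)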


def calculate_accuracy(predictions, actual_values):
    """
    calculate_accuracy: Compute the accuracy of the model.

    Parameters:
    - predictions: Predicted scores or probabilities (batch_size x d_out).
    - actual_values: One-hot encoded ground-truth observations (batch_size x d_out).

    Returns:
    - Accuracy as a float (fraction of samples whose argmax matches).
    """
    correct_predictions = predictions.argmax(axis=1) == actual_values.argmax(axis=1)
    accuracy = correct_predictions.mean()
    return accuracy
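
# Example (illustrative): the first sample's argmax matches, the second does not.
_preds = np.array([[0.8, 0.2], [0.3, 0.7]])
_truth = np.array([[1, 0], [1, 0]])
assert calculate_accuracy(_preds, _truth) == 0.5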

def train_mlp(W1, b1, W2, b2, data, targets, learning_rate):
    """
    Perform one training step (forward pass, backpropagation, and gradient
    descent update) using binary cross-entropy loss. The epoch loop is
    handled by run_mlp_training.

    Parameters:
    - W1, b1, W2, b2: Weights and biases of the network.
    - data: Input data matrix of shape (batch_size x d_in).
    - targets: Target output matrix of shape (batch_size x d_out).
    - learning_rate: Learning rate for gradient descent.

    Returns:
    - Updated weights and biases (W1, b1, W2, b2) of the network.
    - loss: Cross-entropy loss for monitoring.
    - train_accuracy: Training accuracy for this step as a float.
    """
    
    # Forward pass
    hidden_layer_input = np.matmul(data, W1) + b1
    hidden_layer_output = sigmoid(hidden_layer_input, derivate=False)

    output_layer_input = np.matmul(hidden_layer_output, W2) + b2
    output_layer_output = softmax(output_layer_input, derivate=False)

    N = data.shape[0]

    # Backpropagation phase (as in learn_once_cross_entropy: the activation
    # derivative cancels with the cross-entropy loss derivative)
    output_delta = output_layer_output - targets

    W2_update = np.dot(hidden_layer_output.T, output_delta) / N
    b2_update = output_delta.sum(axis=0, keepdims=True) / N

    hidden_layer_error = np.dot(output_delta, W2.T)
    hidden_delta = hidden_layer_error * sigmoid(hidden_layer_output, derivate=True)

    W1_update = np.dot(data.T, hidden_delta) / N
    b1_update = hidden_delta.sum(axis=0, keepdims=True) / N

    # Gradient descent step
    W2 = W2 - learning_rate * W2_update
    W1 = W1 - learning_rate * W1_update
    b2 = b2 - learning_rate * b2_update
    b1 = b1 - learning_rate * b1_update

    # Compute loss and accuracy for monitoring
    loss = loss_metrics(output_layer_output, targets, metric="BCE", status="forward")
    train_accuracy = calculate_accuracy(output_layer_output, targets)

    return W1, b1, W2, b2, loss, train_accuracy
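
# Example usage (illustrative): one training step on the random data defined
# at the top, with synthetic integer labels turned into one-hot targets.
_lab = np.random.randint(0, d_out, size=N)
_W1, _b1, _W2, _b2, _loss, _acc = train_mlp(_W1, _b1, _W2, _b2, data, one_hot(_lab), learning_rate)
assert 0.0 <= _acc <= 1.0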

def test_mlp(W1, b1, W2, b2, data_test, labels_test):
    """
     Evaluate the network's performance on the test set.

    Parameters:
    - W1, b1, W2, b2: Weights and biases of the network.
    - data_test: Test data matrix of shape (batch_size x d_in).
    - labels_test: One-hot encoded true labels for the test data (batch_size x d_out).

    Returns:
    - test_accuracy: The testing accuracy as a float.
    """
    # Forward pass
    hidden_layer_input = np.matmul(data_test, W1) + b1
    hidden_layer_output = sigmoid(hidden_layer_input, derivate=False)

    output_layer_input = np.matmul(hidden_layer_output, W2) + b2
    output_layer_output = softmax(output_layer_input, derivate=False)

    # Compute testing accuracy
    test_accuracy = calculate_accuracy(output_layer_output, labels_test)
    return test_accuracy
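
# Example (illustrative): evaluating the current weights on the random data
# with one-hot synthetic labels yields an accuracy in [0, 1].
_acc_test = test_mlp(_W1, _b1, _W2, _b2, data, one_hot(np.random.randint(0, d_out, size=N)))
assert 0.0 <= _acc_test <= 1.0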

def run_mlp_training(X_train, labels_train, data_test, labels_test, num_hidden_units, learning_rate, num_epochs):
    """
    Train an MLP classifier and evaluate its performance.

    Parameters:
    - X_train: Training data matrix of shape (batch_size x input_dimension).
    - labels_train: True labels for the training data.
    - data_test: Test data matrix of shape (batch_size x input_dimension).
    - labels_test: True labels for the test data.
    - num_hidden_units: Number of neurons in the hidden layer.
    - learning_rate: The learning rate for gradient descent.
    - num_epochs: The number of training epochs.

    Returns:
    - train_accuracies: List of training accuracies across epochs.
    - test_accuracy: The final testing accuracy.
    """
    input_dimension = X_train.shape[1]
    output_dimension = np.unique(labels_train).shape[0]  # Number of classes

    # Initialize weights and biases
    W1, b1, W2, b2 = initialization(input_dimension, num_hidden_units, output_dimension)
    
    train_accuracies = []  # List to store training accuracies

    # One-hot encode the labels once, outside the training loop
    targets_train = one_hot(labels_train)
    targets_test = one_hot(labels_test)

    # Training loop
    for epoch in range(num_epochs):
        W1, b1, W2, b2, loss, train_accuracy = train_mlp(W1, b1, W2, b2, X_train, targets_train, learning_rate)
        test_accuracy = test_mlp(W1, b1, W2, b2, data_test, targets_test)
        train_accuracies.append(train_accuracy)

        print("Epoch {}/{}".format(epoch + 1, num_epochs))
        print("Train Accuracy: {:.6f}    Test Accuracy: {:.6f}".format(train_accuracy, test_accuracy))
    
    return train_accuracies, test_accuracy
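
# Example usage (illustrative): a short run on the random data defined at the
# top, with synthetic integer labels; guarded so it only runs as a script.
if __name__ == "__main__":
    _labels = np.random.randint(0, d_out, size=N)
    _train_accs, _test_acc = run_mlp_training(
        data, _labels, data, _labels,
        num_hidden_units=d_h, learning_rate=learning_rate, num_epochs=5)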

# plot_ANN


def plot_ANN(X_train, y_train, X_test, y_test):
    """
    Plot the variation of accuracy in terms of the number of epochs.

    Parameters:
    - X_train: Training data matrix.
    - y_train: True labels for the training data.
    - X_test: Test data matrix.
    - y_test: True labels for the test data.
    """
    # Train an MLP and obtain training accuracies and final test accuracy
    train_accuracies, test_accuracy = run_mlp_training(X_train, y_train, X_test, y_test, num_hidden_units=64, learning_rate=0.1, num_epochs=100)

    # Display the test accuracy
    print("Test Set Accuracy: {}".format(test_accuracy))

    # Create a Matplotlib plot
    plt.plot(list(range(1, len(train_accuracies) + 1)), train_accuracies)
    plt.title('Accuracy Variation Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')

    # Save the figure (optional); create the output folder if it does not exist
    os.makedirs("Results", exist_ok=True)
    plt.savefig("Results/mlp.png")

    # Show the plot (optional)
    plt.show()
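
# Example usage (illustrative): plot training accuracy on the synthetic data;
# guarded so the full 100-epoch training run only happens when executed as a script.
if __name__ == "__main__":
    _plot_labels = np.random.randint(0, d_out, size=N)
    plot_ANN(data, _plot_labels, data, _plot_labels)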