import numpy as np
import math

def learn_once_mse(w1: np.ndarray, b1: np.ndarray, w2: np.ndarray, b2: np.ndarray, data: np.ndarray, targets: np.ndarray, learning_rate: float):
    """
    Perform one gradient-descent step on a two-layer sigmoid network using the MSE loss.

    The inputs are not modified; updated copies of the parameters are returned.

    :w1: weights of the first layer of the network, shape (d_in, d_h).
    :b1: bias of the first layer of the network, broadcastable to (batch, d_h).
    :w2: weights of the second layer of the network, shape (d_h, d_out).
    :b2: bias of the second layer of the network, broadcastable to (batch, d_out).
    :data: input matrix of the network, shape (batch, d_in).
    :targets: values to reach, same number of elements as the network output (batch, d_out).
    :learning_rate: factor for the gradient descent learning (quickness of the descent).
    :return: updated w1, b1, w2, b2 after 1 step of gradient descent, and the loss
             measured before the update.
    """
    # Forward pass (np.exp works element-wise on whole batches, unlike math.exp)
    a0 = np.asarray(data, dtype=float)  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid)
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid) = predictions

    # Align a flat target vector with an (N, 1) output to avoid accidental (N, N) broadcasting
    targets = np.asarray(targets, dtype=float)
    if targets.shape != a2.shape and targets.size == a2.size:
        targets = targets.reshape(a2.shape)

    # Compute loss (MSE over every output element)
    loss = np.mean((a2 - targets) ** 2)

    # Backward pass: exact gradient of the loss above
    da2 = 2 * (a2 - targets) / a2.size  # np.mean divides by the total element count
    dz2 = da2 * a2 * (1 - a2)  # sigmoid derivative
    dw2 = np.matmul(a1.T, dz2)  # (d_h, d_out), summed over the batch
    db2 = np.sum(dz2, axis=0, keepdims=True)
    da1 = np.matmul(dz2, w2.T)  # propagate the error back through w2
    dz1 = da1 * a1 * (1 - a1)
    dw1 = np.matmul(a0.T, dz1)  # (d_in, d_h)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # Out-of-place updates so the caller's arrays are left untouched
    w1 = w1 - learning_rate * dw1
    w2 = w2 - learning_rate * dw2
    b1 = b1 - learning_rate * np.reshape(db1, np.shape(b1))
    b2 = b2 - learning_rate * np.reshape(db2, np.shape(b2))

    return w1, b1, w2, b2, loss


def one_hot(label: np.ndarray, num_classes=None):
    """
    One-hot encode a sequence of integer class labels.

    Example: one_hot(np.array([1, 2, 0])) gives
    [[0, 1, 0], [0, 0, 1], [1, 0, 0]].

    :label: the sequence of non-negative integer labels to encode.
    :num_classes: number of columns of the result; defaults to max(label) + 1.
    :return: matrix of shape (len(label), num_classes) with a single 1 per row.
    """
    indices = np.asarray(label).astype(int).ravel()
    if num_classes is None:
        # Infer the class count from the largest label present
        num_classes = int(indices.max()) + 1 if indices.size else 0

    result = np.zeros((indices.size, num_classes))
    # Row i gets a 1 in column indices[i]
    result[np.arange(indices.size), indices] = 1
    return result

def convert_integer_to_binary(integer, size):
    """
    Build the binary representation of an integer as a vector, most significant bit first.

    :integer: Integer to convert to binary (values <= 0 produce no digits).
    :size: Minimum length of the vector; shorter results are left-padded with zeros.
    :return: The binary digits as a numpy array (longer than size if the value needs more bits).
    """
    # Collect the digits least-significant first
    digits = []
    remaining = integer
    while remaining > 0:
        remaining, bit = divmod(remaining, 2)
        digits.append(bit)

    # Leading zeros needed to reach the requested size
    padding = []
    while len(padding) + len(digits) < size:
        padding.append(0)

    return np.array(padding + digits[::-1])


def learn_once_cross_entropy(w1: np.ndarray, b1: np.ndarray, w2: np.ndarray, b2: np.ndarray, data: np.ndarray, labels_train: np.ndarray, learning_rate: float):
    """
    Perform one gradient-descent step on a two-layer sigmoid network using the
    binary cross-entropy loss against one-hot encoded labels.

    The inputs are not modified; updated copies of the parameters are returned.

    :w1: weights of the first layer of the network, shape (d_in, d_h).
    :b1: bias of the first layer of the network, broadcastable to (batch, d_h).
    :w2: weights of the second layer of the network, shape (d_h, d_out).
    :b2: bias of the second layer of the network, broadcastable to (batch, d_out).
    :data: input matrix of the network, shape (batch, d_in).
    :labels_train: integer class label of each sample, each in [0, d_out).
    :learning_rate: factor for the gradient descent learning (quickness of the descent).
    :return: updated w1, b1, w2, b2 after 1 step of gradient descent, and the loss
             measured before the update.
    """
    # Forward pass
    a0 = np.asarray(data, dtype=float)  # the data are the input of the first layer
    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid)
    z2 = np.matmul(a1, w2) + b2  # input of the output layer
    a2 = 1 / (1 + np.exp(-z2))  # output of the output layer (sigmoid)

    m = a2.shape[0]

    # One-hot encode the labels with exactly as many columns as the network has outputs
    class_ids = np.asarray(labels_train).astype(int).ravel()
    encoded = np.zeros_like(a2)
    encoded[np.arange(m), class_ids] = 1

    # Loss; clip the activations so that log() never sees exactly 0
    tiny = 1e-12
    safe_a2 = np.clip(a2, tiny, 1 - tiny)
    loss = (-1 / m) * np.sum(encoded * np.log(safe_a2) + (1 - encoded) * np.log(1 - safe_a2))

    # Backward pass: with sigmoid + cross-entropy, dLoss/dz2 reduces to (a2 - y) / m
    dz2 = (a2 - encoded) / m
    dw2 = np.matmul(a1.T, dz2)  # (d_h, d_out), summed over the batch
    db2 = np.sum(dz2, axis=0, keepdims=True)
    da1 = np.matmul(dz2, w2.T)  # propagate the error back through w2
    dz1 = da1 * a1 * (1 - a1)  # sigmoid derivative
    dw1 = np.matmul(a0.T, dz1)  # (d_in, d_h)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # Out-of-place updates so the caller's arrays are left untouched
    w1 = w1 - learning_rate * dw1
    w2 = w2 - learning_rate * dw2
    b1 = b1 - learning_rate * np.reshape(db1, np.shape(b1))
    b2 = b2 - learning_rate * np.reshape(db2, np.shape(b2))

    return w1, b1, w2, b2, loss

def train_mlp(w1: np.ndarray, b1: np.ndarray, w2: np.ndarray, b2: np.ndarray, data_train: np.ndarray, labels_train: np.ndarray, learning_rate: float, num_epoch: int):
    """
    Train the network for num_epoch gradient-descent steps with the cross-entropy loss.

    :w1: weights of the first layer of the network.
    :b1: bias of the first layer of the network.
    :w2: weights of the second layer of the network.
    :b2: bias of the second layer of the network.
    :data_train: input training matrix (one sample per row).
    :labels_train: integer class label of each training sample.
    :learning_rate: factor for the gradient descent learning (quickness of the descent).
    :num_epoch: number of training loops (gradient descent).
    :return: updated w1, b1, w2, b2 after num_epoch loops of gradient descent, and the
             list of training accuracies (one float per epoch).
    """
    expected = np.ravel(labels_train)
    accuracies = []
    for _ in range(num_epoch):
        w1, b1, w2, b2, _ = learn_once_cross_entropy(w1, b1, w2, b2, data_train, labels_train, learning_rate)

        # Forward pass with the freshly updated parameters
        a0 = data_train
        z1 = np.matmul(a0, w1) + b1
        a1 = 1 / (1 + np.exp(-z1))
        z2 = np.matmul(a1, w2) + b2
        a2 = 1 / (1 + np.exp(-z2))

        # The predicted class is the output neuron with the highest activation
        predicted = np.argmax(a2, axis=1)
        # Keep one accuracy per epoch instead of overwriting the history
        accuracies.append(compute_accuracy(predicted, expected))
    return w1, b1, w2, b2, accuracies

def compute_accuracy(y_predict, y_target):
    """
    Compute the fraction of predictions that match their target.

    Each argument may be a vector of class labels or a 2-D matrix with one row per
    sample (class scores or one-hot rows), in which case the class of a row is the
    index of its largest entry.

    :y_predict: predicted labels, or per-class scores of shape (n_samples, n_classes).
    :y_target: expected labels, or one-hot rows of shape (n_samples, n_classes).
    :return: accuracy as a float in [0, 1]; 0.0 when there is no sample.
    :raises ValueError: if the numbers of predictions and targets differ.
    """
    predicted = np.asarray(y_predict)
    expected = np.asarray(y_target)

    # Collapse score / one-hot matrices to one class index per sample
    if predicted.ndim == 2 and predicted.shape[1] > 1:
        predicted = np.argmax(predicted, axis=1)
    if expected.ndim == 2 and expected.shape[1] > 1:
        expected = np.argmax(expected, axis=1)
    predicted = predicted.ravel()
    expected = expected.ravel()

    if predicted.size == 0:
        return 0.0  # avoid dividing by zero on empty input
    if predicted.shape != expected.shape:
        raise ValueError("y_predict and y_target must contain the same number of samples")

    # Compare each prediction with the target at the same position
    return float(np.mean(predicted == expected))

def test_mlp(w1: np.ndarray, b1: np.ndarray, w2:np.ndarray, b2:np.ndarray, data_test: np.ndarray, labels_test: np.ndarray):
    """
    Evaluate an already-trained network on a test set (the parameters are not updated).

    :w1: weights of the first layer of the network.
    :b1: bias of the first layer of the network.
    :w2: weights of the second layer of the network.
    :b2: bias of the second layer of the network.
    :data_test: input testing matrix (one sample per row).
    :labels_test: integer class label of each testing sample.
    :return: the accuracy of the test.
    """
    # Forward pass only: evaluation must never train on the test data
    a0 = data_test
    z1 = np.matmul(a0, w1) + b1
    a1 = 1 / (1 + np.exp(-z1))
    z2 = np.matmul(a1, w2) + b2
    a2 = 1 / (1 + np.exp(-z2))

    # The predicted class is the output neuron with the highest activation
    y_predict = np.argmax(a2, axis=1)
    test_accuracy = compute_accuracy(y_predict, np.ravel(labels_test))

    return test_accuracy

def run_mlp_training(data_train:np.ndarray, labels_train:np.ndarray, data_test:np.ndarray, labels_test:np.ndarray, d_h: int, learning_rate: float, num_epoch: int):
    """
    Initialize a two-layer network, train it, then evaluate it on the test set.

    :data_train: input training matrix (one sample per row).
    :labels_train: integer class label of each training sample.
    :data_test: input testing matrix (one sample per row).
    :labels_test: integer class label of each testing sample.
    :d_h: number of neurons on the hidden layer.
    :learning_rate: factor for the gradient descent learning (quickness of the descent).
    :num_epoch: number of training loops (gradient descent).
    :return: the training accuracies across epochs as a list of floats and the final testing accuracy as a float.
    """
    # Number of neurons on the first and the last layer.
    d_in = np.size(data_train, 1)
    # One output neuron per class: labels are class indices starting at 0
    d_out = int(max(np.max(labels_train), np.max(labels_test))) + 1

    # Random initialization of the network weights and biases
    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
    b1 = np.zeros((1, d_h))  # first layer biases
    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
    b2 = np.zeros((1, d_out))  # second layer biases

    w1, b1, w2, b2, list_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch)
    # test_mlp returns a single float, not the parameters
    final_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)

    return list_accuracies, final_accuracy