import matplotlib.pyplot as plt
import numpy as np

def load_data(file_name, delimiter=','):
    """ Reads the file containing the data and returns the matrices corresponding to it

    Parameters
    ----------
    file_name : name of the file containing the data
    delimiter : character that separates the columns in the file (default is ",")

    Returns
    -------
    x : data matrix of dimension [N, nb_var]
    d : matrix containing the target variable values of dimension [N, nb_target]
    N : number of elements
    nb_var : number of predictor variables
    nb_target : number of target variables

    """
    
    data = np.loadtxt(file_name, delimiter=delimiter)
    
    nb_target = 1
    nb_var = data.shape[1] - nb_target
    N = data.shape[0]

    x = data[:, :nb_var]
    d = data[:, nb_var:].reshape(N, nb_target)
    
    return x, d, N, nb_var, nb_target
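# Note: load_data assumes a plain-text file in which each row is one example,
# with the predictor variables in the first columns and a numeric target value
# in the last column, e.g. "5.1,3.5,1.4,0.2,0" (made-up row) for iris.txt.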


def normalization(x):
    """ Normalizes the data by centering and scaling the predictor variables
    
    Parameters
    ----------
    x : data matrix of dimension [N, nb_var]
    
    with N : number of elements and nb_var : number of predictor variables

    Returns
    -------
    x_norm : normalized data matrix of dimension [N, nb_var]
    mu : mean of the variables, dimension [nb_var]
    sigma : standard deviation of the variables, dimension [nb_var]
    
    """
    
    mu = np.mean(x, 0)
    sigma = np.std(x, 0)
    x_norm = (x - mu) / sigma

    return x_norm, mu, sigma
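# Usage sketch (x_new is a hypothetical matrix of unseen examples): the
# statistics returned above can be reused to normalize new data consistently:
#   x_new_norm = (x_new - mu) / sigma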

def split_data(x, d, prop_val=0.2, prop_test=0.2):
    """ Splits the original data into three distinct subsets: training, validation, and test
    
    Parameters
    ----------
    x : data matrix of dimension [N, nb_var]
    d : target values matrix [N, nb_target]
    prop_val : proportion of validation data in the total data (between 0 and 1)
    prop_test : proportion of test data in the total data (between 0 and 1)
    
    with N : number of elements, nb_var : number of predictor variables, nb_target : number of target variables

    Returns
    -------
    x_train : training data matrix
    d_train : target values matrix for training
    x_val : validation data matrix
    d_val : target values matrix for validation
    x_test : test data matrix
    d_test : target values matrix for test

    """
    assert prop_val + prop_test < 1.0, "prop_val + prop_test must be strictly less than 1"

    N = x.shape[0]
    indices = np.arange(N)
    np.random.shuffle(indices)
    nb_val = int(N*prop_val)
    nb_test = int(N*prop_test)
    nb_train = N - nb_val - nb_test

    x = x[indices,:]
    d = d[indices,:]

    x_train = x[:nb_train,:]
    d_train = d[:nb_train,:]

    x_val = x[nb_train:nb_train+nb_val,:]
    d_val = d[nb_train:nb_train+nb_val,:]

    x_test = x[N-nb_test:,:]
    d_test = d[N-nb_test:,:]

    return x_train, d_train, x_val, d_val, x_test, d_test
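# Note: split_data shuffles with NumPy's global random state, so the split
# changes from run to run. For a reproducible split one could call, for
# example, np.random.seed(0) beforehand (the seed value here is arbitrary).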

def compute_cross_entropy_cost(y, d):
    """ Computes the value of the cross-entropy cost function
    
    Parameters
    ----------
    y : predicted output matrix (softmax outputs), dimension [nb_classes, N]
    d : one-hot encoded target matrix, dimension [nb_classes, N]
    
    Returns
    -------
    cost : value corresponding to the cost function

    """

    N = y.shape[1]
    
    cost = - np.sum(d*np.log(y)) / N

    return cost
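# Note: np.log(y) yields -inf if any predicted probability underflows to 0.
# A common safeguard (not used above; the threshold is an arbitrary choice)
# is to clip the predictions first, e.g.:
#   cost = -np.sum(d * np.log(np.clip(y, 1e-12, 1.0))) / N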

def forward_pass(x, W, b, activation):
    """ Performs a forward pass in the neural network
    
    Parameters
    ----------
    x : input matrix, dimension nb_var x N
    W : list containing the weight matrices of the network
    b : list containing the bias matrices of the network
    activation : list containing the activation functions of the network layers

    with N : number of elements, nb_var : number of predictor variables 

    Returns
    -------
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers

    """
    h = [x]
    a = []
    for i in range(len(b)):
        a.append( W[i].dot(h[i]) + b[i] )
        h.append( activation[i](a[i]) ) 

    return a, h
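# Note: forward_pass returns one potential per layer in a and len(b)+1 entries
# in h, where h[0] is the input x and h[-1] is the network's final output.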

def backward_pass(delta_h, a, h, W, activation):
    """ Performs a backward pass in the neural network (backpropagation)
    
    Parameters
    ----------
    delta_h : matrix containing the gradient of the cost with respect to the network output
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers
    W : list containing the weight matrices of the network
    activation : list containing the activation functions of the network layers

    Returns
    -------
    delta_W : list containing the gradient matrices of the network's weight layers
    delta_b : list containing the gradient matrices of the network's bias layers

    """

    delta_b = []
    delta_W = []

    for i in range(len(W)-1,-1,-1):

        delta_a = delta_h * activation[i](a[i], True)
     
        delta_b.append( delta_a.mean(1).reshape(-1,1) )   # bias gradient, averaged over the batch
        delta_W.append( delta_a.dot(h[i].T) )             # weight gradient, summed over the batch

        delta_h = (W[i].T).dot(delta_a) 

    delta_b = delta_b[::-1]
    delta_W = delta_W[::-1]

    return delta_W, delta_b
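# Note: the gradients are accumulated from the last layer back to the first and
# then reversed, so delta_W[i] and delta_b[i] have the same shapes as W[i] and b[i].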

def sigmoid(z, deriv=False):
    """ Computes the value of the sigmoid function or its derivative applied to z
    
    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns the value of the sigmoid function, if True returns its derivative

    Returns
    -------
    s : value of the sigmoid function applied to z or its derivative. Same dimension as z

    """

    s = 1 / (1 + np.exp(-z))
    if deriv:
        return s * (1 - s)
    else :
        return s

def linear(z, deriv=False):
    """ Computes the value of the linear function or its derivative applied to z
    
    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns the value of the linear function, if True returns its derivative


    Returns
    -------
    s : value of the linear function applied to z or its derivative. Same dimension as z

    """
    if deriv:       
        return 1     
    else :
        return z


def relu(z, deriv=False):
    """ Computes the value of the ReLU function or its derivative applied to z
    
    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns the value of the ReLU function, if True returns its derivative

    Returns
    -------
    s : value of the ReLU function applied to z or its derivative. Same dimension as z

    """

    r = np.zeros(z.shape)
    if deriv:
        pos = np.where(z>=0)
        r[pos] = 1.0
        return r
    else :    
        return np.maximum(r,z)

def softmax(z, deriv=False):
    """ Computes the value of the softmax function or its derivative applied to z
    
    Parameters
    ----------
    z : data matrix
    deriv : boolean. If False returns the value of the softmax function, if True returns its derivative

    Returns
    -------
    s : value of the softmax function applied to z or its derivative. Same dimension as z

    """
    
    if deriv:
        # When softmax is paired with the cross-entropy cost, the combined
        # gradient (y - d) is passed directly to backward_pass, so this
        # derivative factor is simply 1.
        return 1
    else :
        # Subtracting the column-wise maximum avoids overflow without changing the result
        z_shifted = z - np.max(z, axis=0, keepdims=True)
        return np.exp(z_shifted) / np.sum(np.exp(z_shifted), axis=0)

def one_hot_encoding(d):
    """ Performs a one-hot encoding: for the output neurons of the network, only 1 will have the value 1, all others will be 0
    
    Parameters
    ----------
    d : matrix containing the values of the target variable (class of the elements) of dimension [N, 1]
    with N : number of elements

    Returns
    -------
    e : encoded data matrix of dimension [N, nb_classes]
    with N : number of elements and nb_classes the number of classes (maximum+1) of the values in d

    """    

    d = d.astype(int).flatten()
    N = d.shape[0]
    nb_classes = d.max() + 1

    e = np.zeros((N,nb_classes))
    e[range(N),d] = 1

    return e
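# Example: d = [[0], [2], [1]] is encoded as
#   [[1, 0, 0],
#    [0, 0, 1],
#    [0, 1, 0]]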

def classification_accuracy(y,d):
    """ Computes the classification accuracy (proportion of correctly classified elements)
    
    Parameters
    ----------
    y : network outputs matrix of dimension [nb_output_neurons x N]
    d : true values matrix [nb_output_neurons x N]
    
    with N : number of elements and nb_output_neurons : number of neurons in the output layer

    Returns
    -------
    t : classification accuracy

    """

    ind_y = np.argmax(y,axis=0)
    ind_d = np.argmax(d,axis=0)
    t = np.mean(ind_y == ind_d)
    
    return t


# ===================== Part 1: Reading and Normalizing the Data =====================
print("Reading the data ...")

x, d, N, nb_var, nb_target = load_data("iris.txt")
# x, d, N, nb_var, nb_target = load_data("scores.txt")

# Display the first 10 examples of the dataset
print("Displaying the first 10 examples of the dataset: ")
for i in range(0, 10):
    print(f"x = {x[i,:]}, d = {d[i]}")
    
# Normalization of the variables (centering and scaling)
print("Normalizing the variables ...")
x, mu, sigma = normalization(x)
d = one_hot_encoding(d)

# Split the data into training, validation, and test subsets
x_train, d_train, x_val, d_val, x_test, d_test = split_data(x,d)

# ===================== Part 2: Training =====================

# Learning rate and number of iterations
alpha = 0.0001
nb_iters = 10000
training_costs = np.zeros(nb_iters)
validation_costs = np.zeros(nb_iters)

# Network dimensions
D_c = [nb_var, 15, 15,  d_train.shape[1]] # list containing the number of neurons for each layer 
activation = [relu, relu, softmax] # list containing the activation functions for hidden layers and output layer

# Random initialization of network weights
W = []
b = []
for i in range(len(D_c)-1):
    W.append(2 * np.random.random((D_c[i+1], D_c[i])) - 1)  # weights drawn uniformly in [-1, 1)
    b.append(np.zeros((D_c[i+1],1)))                         # biases initialized to zero

x_train = x_train.T # Data is presented as column vectors at the network input
d_train = d_train.T 

x_val = x_val.T # Data is presented as column vectors at the network input
d_val = d_val.T 

x_test = x_test.T # Data is presented as column vectors at the network input
d_test = d_test.T 

for t in range(nb_iters):

    #############################################################################
    # Forward pass: calculate predicted output y on validation data #
    #############################################################################
    a, h = forward_pass(x_val, W, b, activation)
    y_val = h[-1] # Predicted output

    ###############################################################################
    # Forward pass: calculate predicted output y on training data #
    ###############################################################################
    a, h = forward_pass(x_train, W, b, activation)
    y_train = h[-1] # Predicted output

    ###########################################
    # Compute the cross-entropy cost function #
    ###########################################
    training_costs[t] = compute_cross_entropy_cost(y_train,d_train)
    validation_costs[t] = compute_cross_entropy_cost(y_val,d_val)

    ####################################
    # Backward pass: backpropagation #
    ####################################
    delta_h = (y_train - d_train) # Gradient for the output layer (softmax combined with cross-entropy)
    delta_W, delta_b = backward_pass(delta_h, a, h, W, activation)
  
    #############################
    # Update weights and biases #
    #############################
    for i in range(len(b)-1,-1,-1):
        b[i] -= alpha * delta_b[i]
        W[i] -= alpha * delta_W[i]

print("Final cost on the training set: ", training_costs[-1])
print("Classification accuracy on the training set: ", classification_accuracy(y_train, d_train))

print("Final cost on the validation set: ", validation_costs[-1])
print("Classification accuracy on the validation set: ", classification_accuracy(y_val, d_val))

# Display cost function evolution during backpropagation
plt.figure(0)
plt.title("Cost function evolution during backpropagation")
plt.plot(np.arange(training_costs.size), training_costs, label="Training")
plt.plot(np.arange(validation_costs.size), validation_costs, label="Validation")
plt.legend(loc="upper left")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

# ===================== Part 3: Evaluation on the test set =====================

#######################################################################
# Forward pass: calculate predicted output y on test data #
#######################################################################
a, h = forward_pass(x_test, W, b, activation)
y_test = h[-1] # Predicted output

cost = compute_cross_entropy_cost(y_test,d_test)
print("Cost on the test set: ", cost)
print("Classification accuracy on the test set: ", classification_accuracy(y_test, d_test))