diff --git a/mlp.py b/mlp.py
index 0f3546bc2cc0cd70caa5e924eb97cf5c09ea34b8..086121de19398658c9d33e3440429def69d58694 100644
--- a/mlp.py
+++ b/mlp.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+import read_cifar
+
 
 def learn_once_mse(w1: np.array, b1: int, w2: np.array, b2: int, data: np.array, target: np.array, learning_rate: float):
     """
@@ -76,7 +78,7 @@ def one_hot(labels: np.array):
     return one_hot_matrix
 
 
-def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.array, data: np.array, labels_train: np.array, learning_rate: int):
+def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.array, data: np.array, labels_train: np.array, learning_rate: float):
     """
     Performs one learning step of the MLP with cross-entropy loss
 
@@ -96,24 +98,42 @@ def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.ar
         loss -- loss of the forward pass
     """
     # Forward pass
-    a0 = data  # the data are the input of the first layer
-    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    # the data are the input of the first layer
+    a0 = data
+    # input of the hidden layer
+    z1 = np.matmul(a0, w1) + b1
     # output of the hidden layer (sigmoid activation function)
     a1 = 1 / (1 + np.exp(-z1))
-    z2 = np.matmul(a1, w2) + b2  # input of the output layer
-    # output of the output layer (sigmoid activation function)
-    a2 = 1 / (1 + np.exp(-z2))
-    predictions = a2  # the predicted values are the outputs of the output layer
+    # input of the output layer
+    z2 = np.matmul(a1, w2) + b2
+    # output of the output layer (softmax activation function)
+    a2 = np.exp(z2) / np.sum(np.exp(z2), axis=1, keepdims=True)
+    # the predicted values are the outputs of the output layer
+    predictions = a2
+
+    one_hot_targets = one_hot(labels_train)
 
     # Compute loss (cross-entropy)
-    loss = -np.mean(np.sum(labels_train * np.log(predictions) +
-                    (1 - labels_train) * np.log(1 - predictions), axis=1))
+    loss = -np.mean(np.sum(one_hot_targets * np.log(predictions), axis=1))
 
     # Backward pass
-    # derivative of the loss with respect to the output of the output layer
-    dC_dA2 = -labels_train / predictions + (1 - labels_train) / (1 - predictions)
-    # derivative of the loss with respect to the input of the output layer
-    # dC_dZ2 = a2 -
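+    # derivative of the loss with respect to the input of the output layer
+    # (for a softmax output a2 and one-hot targets y, this simplifies to a2 - y per sample)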
+    dC_dZ2 = predictions - one_hot_targets
+    # derivative of the loss with respect to the weights of the output layer
+    dC_dW2 = np.matmul(a1.T, dC_dZ2)
+    # derivative of the loss with respect to the biases of the output layer
+    dC_dB2 = np.sum(dC_dZ2, axis=0, keepdims=True)
+    # derivative of the loss with respect to the output of the hidden layer
+    dC_dA1 = np.matmul(dC_dZ2, w2.T)
+    # derivative of the loss with respect to the input of the hidden layer
+    dC_dZ1 = dC_dA1 * (1 - a1) * a1
+    # derivative of the loss with respect to the weights of the hidden layer
+    dC_dW1 = np.matmul(a0.T, dC_dZ1)
+    # derivative of the loss with respect to the biases of the hidden layer
+    dC_dB1 = np.sum(dC_dZ1, axis=0, keepdims=True)
 
     # Update weights and biaises
     w1 -= learning_rate * dC_dW1
@@ -124,6 +144,103 @@ def learn_once_cross_entropy(w1: np.array, b1: np.array, w2: np.array, b2: np.ar
     return w1, b1, w2, b2, loss
 
 
+def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs):
+    """
+    Trains the MLP
+
+    Arguments:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    data_train -- training data
+    labels_train -- labels of the training data
+    learning_rate -- learning rate
+    num_epochs -- number of epochs
+    Returns:
+    w1 -- updated weights of the hidden layer
+    b1 -- updated biases of the hidden layer
+    w2 -- updated weights of the output layer
+    b2 -- updated biases of the output layer
+    acc -- list of training accuracies across epochs
+    """
+    acc = []
+    for i in range(num_epochs):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(
+            w1, b1, w2, b2, data_train, labels_train, learning_rate)
+        acc.append(test_mlp(w1, b1, w2, b2, data_train, labels_train))
+    return w1, b1, w2, b2, acc
+
+
+def test_mlp(w1, b1, w2, b2, data_test, labels_test):
+    """
+    Tests the MLP
+
+    Arguments:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    data_test -- test data
+    labels_test -- labels of the test data
+    Returns:
+    acc -- accuracy
+    """
+    # Forward pass
+    # the data are the input of the first layer
+    a0 = data_test
+    # input of the hidden layer
+    z1 = np.matmul(a0, w1) + b1
+    # output of the hidden layer (sigmoid activation function)
+    a1 = 1 / (1 + np.exp(-z1))
+    # input of the output layer
+    z2 = np.matmul(a1, w2) + b2
+    # output of the output layer (softmax activation function)
+    a2 = np.exp(z2) / np.sum(np.exp(z2), axis=1, keepdims=True)
+    # the predicted values are the outputs of the output layer
+    predictions = a2
+
+    # Compute accuracy
+    acc = np.mean(np.argmax(predictions, axis=1) == labels_test)
+
+    return acc
+
+
+def run_mlp_training(data_train, labels_train, data_test, labels_test, learning_rate, num_epochs):
+    """
+    Runs the MLP training
+
+    Arguments:
+    data_train -- training data
+    labels_train -- labels of the training data
+    data_test -- test data
+    labels_test -- labels of the test data
+    learning_rate -- learning rate
+    num_epochs -- number of epochs
+    Returns:
+    w1 -- weights of the hidden layer
+    b1 -- biases of the hidden layer
+    w2 -- weights of the output layer
+    b2 -- biases of the output layer
+    acc -- list of training accuracies across epochs
+    """
+    N = data_train.shape[0]  # number of training data
+    d_in = data_train.shape[1]  # input dimension
+    d_h = 3  # number of neurons in the hidden layer
+    # output dimension (number of neurons of the output layer)
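+    # (inferred from the labels; the CIFAR-10 labels 0-9 used below give d_out = 10)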
+    d_out = np.max(labels_train) + 1
+
+    # Random initialization of the network weights and biases
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+    b2 = np.zeros((1, d_out))  # second layer biases
+
+    w1, b1, w2, b2, acc = train_mlp(
+        w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epochs)
+    return w1, b1, w2, b2, acc
+
+
 if __name__ == "__main__":
     N = 30  # number of input data
     d_in = 3  # input dimension
@@ -145,3 +262,28 @@ if __name__ == "__main__":
     print(loss)
 
     print(one_hot(np.array([9, 1, 3, 0, 6, 5, 2, 7, 8, 4])))
+
+    N = 30  # number of input data
+    d_in = 3  # input dimension
+    d_h = 3  # number of neurons in the hidden layer
+    d_out = 5  # output dimension (number of neurons of the output layer)
+
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+    b2 = np.zeros((1, d_out))  # second layer biases
+
+    data = np.random.rand(N, d_in)  # create random data
+    targets = np.random.randint(0, d_out, N)  # create random targets
+
+    for i in range(100):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(
+            w1, b1, w2, b2, data, targets, 0.1)
+        print(loss)
+
+    data, labels = read_cifar.read_cifar("data/cifar-10-batches-py/")
+    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(
+        data, labels, 0.8)
+    w1, b1, w2, b2, acc = run_mlp_training(
+        data_train, labels_train, data_test, labels_test, 0.1, 100)
+    print(acc)
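+
+    # Optional check (sketch): report the accuracy on the held-out test split
+    # using test_mlp defined above.
+    print(test_mlp(w1, b1, w2, b2, data_test, labels_test))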