diff --git a/mlp.py b/mlp.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ea3fb1c1dd00b9a733b1a3011529c15f8adf57ce 100644
--- a/mlp.py
+++ b/mlp.py
@@ -0,0 +1,150 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.special as sp
+from tqdm import tqdm
+import read_cifar
+
+
+def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
+    """
+    Perform one iteration of training using Mean Squared Error (MSE) loss.
+    """
+    # Forward propagation
+    a0 = data  # Input layer receives the data
+    z1 = np.matmul(a0, w1) + b1  # Compute hidden layer input
+    a1 = 1 / (1 + np.exp(-z1))  # Apply sigmoid activation in hidden layer
+    z2 = np.matmul(a1, w2) + b2  # Compute output layer input
+    a2 = 1 / (1 + np.exp(-z2))  # Apply sigmoid activation in output layer
+    predictions = a2  # Final predictions from the network
+
+    # Calculate MSE loss
+    loss = np.mean(np.square(predictions - targets))
+
+    # Backward propagation to compute gradients
+    grad_a2 = 2 * (predictions - targets)
+    grad_z2 = grad_a2 * a2 * (1 - a2)
+    grad_w2 = np.matmul(a1.T, grad_z2)
+    grad_b2 = np.sum(grad_z2, axis=0)
+    grad_a1 = np.matmul(grad_z2, w2.T)
+    grad_z1 = grad_a1 * a1 * (1 - a1)
+    grad_w1 = np.matmul(a0.T, grad_z1)
+    grad_b1 = np.sum(grad_z1, axis=0)
+
+    # Update the network parameters
+    w1 -= learning_rate * grad_w1
+    w2 -= learning_rate * grad_w2
+    b1 -= learning_rate * grad_b1
+    b2 -= learning_rate * grad_b2
+
+    return w1, b1, w2, b2, loss
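+
+
+# Illustrative sketch, not part of the original file: a finite-difference check of the
+# analytic gradients used in learn_once_mse. Note that grad_a2 = 2 * (a2 - targets) is
+# the gradient of the *summed* squared error; the 1/(N*K) factor of np.mean only
+# rescales the step and is absorbed by the learning rate. The helper name and the tiny
+# random shapes below are assumptions made for this example only.
+def _check_mse_gradient(eps=1e-5):
+    rng = np.random.default_rng(0)
+    data = rng.normal(size=(4, 3))
+    targets = rng.normal(size=(4, 2))
+    w1, b1 = rng.normal(size=(3, 5)), np.zeros((1, 5))
+    w2, b2 = rng.normal(size=(5, 2)), np.zeros((1, 2))
+
+    def summed_loss(w1_candidate):
+        a1 = 1 / (1 + np.exp(-(np.matmul(data, w1_candidate) + b1)))
+        a2 = 1 / (1 + np.exp(-(np.matmul(a1, w2) + b2)))
+        return np.sum(np.square(a2 - targets))
+
+    # Analytic gradient of w1, using the same chain rule as learn_once_mse
+    a1 = 1 / (1 + np.exp(-(np.matmul(data, w1) + b1)))
+    a2 = 1 / (1 + np.exp(-(np.matmul(a1, w2) + b2)))
+    grad_z2 = 2 * (a2 - targets) * a2 * (1 - a2)
+    grad_z1 = np.matmul(grad_z2, w2.T) * a1 * (1 - a1)
+    grad_w1 = np.matmul(data.T, grad_z1)
+
+    # Central finite difference for a single weight entry
+    w1_plus, w1_minus = w1.copy(), w1.copy()
+    w1_plus[0, 0] += eps
+    w1_minus[0, 0] -= eps
+    numeric = (summed_loss(w1_plus) - summed_loss(w1_minus)) / (2 * eps)
+    print(f"analytic: {grad_w1[0, 0]:.6f}  numeric: {numeric:.6f}")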
+ """ + # Forward propagation + a0 = data + z1 = np.matmul(a0, w1) + b1 + a1 = 1 / (1 + np.exp(-z1)) + z2 = np.matmul(a1, w2) + b2 + a2 = sp.softmax(z2, axis=1) + predictions = a2 + + # Convert targets to one-hot format and calculate accuracy + + pred_labels = a2.argmax(axis=1) + correct_predictions = np.sum(pred_labels == targets) + targets_one_hot = one_hot(targets) + + # Compute Cross-Entropy loss + loss = -np.sum(targets_one_hot * np.log(predictions + 1e-8)) / batch_size + grad_z2 = (predictions - targets_one_hot) / batch_size + grad_w2 = np.matmul(a1.T, grad_z2) + grad_b2 = np.sum(grad_z2, axis=0) + grad_a1 = np.matmul(grad_z2, w2.T) + grad_z1 = grad_a1 * a1 * (1 - a1) + a0 = a0.reshape(-1, batch_size) + grad_w1 = np.matmul(a0, grad_z1) + grad_b1 = np.sum(grad_z1, axis=0) + + # Update weights and biases + w1 -= learning_rate * grad_w1 + w2 -= learning_rate * grad_w2 + b1 -= learning_rate * grad_b1 + b2 -= learning_rate * grad_b2 + + accuracy = correct_predictions / len(pred_labels) + return w1, b1, w2, b2, accuracy + + +def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate=0.01, nb_epochs=100, batch_size=1): + training_accuracies = [] + for epoch in range(nb_epochs): + batch_accuracies = [] + batch_count = len(data_train) // batch_size + for i in tqdm(range(batch_count)): + batch_start, batch_end = i * batch_size, (i + 1) * batch_size + w1, b1, w2, b2, acc = learn_once_cross_entropy(w1, b1, w2, b2, data_train[batch_start:batch_end], labels_train[batch_start:batch_end], learning_rate, batch_size) + batch_accuracies.append(acc) + + # Handling remaining data if total data is not a multiple of batch size + if len(data_train) % batch_size != 0: + remaining = len(data_train) - batch_count * batch_size + w1, b1, w2, b2, acc = learn_once_cross_entropy(w1, b1, w2, b2, data_train[-remaining:], labels_train[-remaining:], learning_rate, remaining) + batch_accuracies.append(acc) + + epoch_accuracy = sum(batch_accuracies) / len(batch_accuracies) + print(f"Epoch {epoch + 1} Accuracy: {epoch_accuracy:.4f}") + training_accuracies.append(epoch_accuracy) + + return w1, b1, w2, b2, training_accuracies + +def test_mlp(w1, b1, w2, b2, data_test, labels_test): + # Forward pass + a0 = data_test + z1 = np.matmul(a0, w1) + b1 + a1 = 1 / (1 + np.exp(-z1)) + z2 = np.matmul(a1, w2) + b2 + a2 = sp.softmax(z2) + + # Compute accuracy + correct_count = np.sum(a2.argmax(axis=1) == labels_test) + return correct_count / len(labels_test) + +def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate=0.1, nb_epochs=100, batch_size=200): + # Initialize network parameters + w1 = np.random.uniform(-1, 1, (3072, d_h)) + b1 = np.zeros((1, d_h)) + w2 = np.random.uniform(-1, 1, (d_h, 10)) + b2 = np.zeros((1, 10)) + + # Training phase + w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, nb_epochs, batch_size) + + # Testing phase + test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test) + print(f"Test Accuracy: {test_accuracy:.4f}") + + return train_accuracies, test_accuracy + +if __name__ == "__main__": + # Load and preprocess data + data, labels = read_cifar.read_cifar('data/cifar-10-batches-py') + data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9) + + # Execute training and testing + train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, 64, 0.1, 100, 100) + plt.plot(train_accuracies, label="Training Accuracy") + plt.legend() + 
+
+
+if __name__ == "__main__":
+    # Load the CIFAR-10 data and split it into training and test sets
+    data, labels = read_cifar.read_cifar('data/cifar-10-batches-py')
+    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9)
+
+    # Execute training and testing, then plot the training accuracy per epoch
+    train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, 64, 0.1, 100, 100)
+    plt.plot(train_accuracies, label="Training Accuracy")
+    plt.xlabel("Epoch")
+    plt.ylabel("Accuracy")
+    plt.legend()
+    plt.show()
\ No newline at end of file