diff --git a/mlp.py b/mlp.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ea3fb1c1dd00b9a733b1a3011529c15f8adf57ce 100644
--- a/mlp.py
+++ b/mlp.py
@@ -0,0 +1,150 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.special as sp
+from tqdm import tqdm
+import read_cifar
+
+
+def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
+    """
+    Perform one iteration of training using Mean Squared Error (MSE) loss.
+    """
+    # Forward propagation
+    a0 = data  # Input layer receives the data
+    z1 = np.matmul(a0, w1) + b1  # Compute hidden layer input
+    a1 = 1 / (1 + np.exp(-z1))  # Apply sigmoid activation in hidden layer
+    z2 = np.matmul(a1, w2) + b2  # Compute output layer input
+    a2 = 1 / (1 + np.exp(-z2))  # Apply sigmoid activation in output layer
+    predictions = a2  # Final predictions from the network
+
+    # Calculate MSE loss
+    loss = np.mean(np.square(predictions - targets))
+
+    # Backward propagation to compute gradients
+    grad_a2 = 2 * (predictions - targets)
+    grad_z2 = grad_a2 * a2 * (1 - a2)
+    grad_w2 = np.matmul(a1.T, grad_z2)
+    grad_b2 = np.sum(grad_z2, axis=0)
+    grad_a1 = np.matmul(grad_z2, w2.T)
+    grad_z1 = grad_a1 * a1 * (1 - a1)
+    grad_w1 = np.matmul(a0.T, grad_z1)
+    grad_b1 = np.sum(grad_z1, axis=0)
+
+    # Update the network parameters
+    w1 -= learning_rate * grad_w1
+    w2 -= learning_rate * grad_w2
+    b1 -= learning_rate * grad_b1
+    b2 -= learning_rate * grad_b2
+
+    return w1, b1, w2, b2, loss
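+
+
+# Illustrative sketch, not part of the original file: a finite-difference check of the
+# analytic gradients used in learn_once_mse. Note that grad_a2 = 2 * (a2 - targets) is
+# the gradient of the *summed* squared error; the 1/(N*K) factor of np.mean only
+# rescales the step and is absorbed by the learning rate. The helper name and the tiny
+# random shapes below are assumptions made for this example only.
+def _check_mse_gradient(eps=1e-5):
+    rng = np.random.default_rng(0)
+    data = rng.normal(size=(4, 3))
+    targets = rng.normal(size=(4, 2))
+    w1, b1 = rng.normal(size=(3, 5)), np.zeros((1, 5))
+    w2, b2 = rng.normal(size=(5, 2)), np.zeros((1, 2))
+
+    def summed_loss(w1_candidate):
+        a1 = 1 / (1 + np.exp(-(np.matmul(data, w1_candidate) + b1)))
+        a2 = 1 / (1 + np.exp(-(np.matmul(a1, w2) + b2)))
+        return np.sum(np.square(a2 - targets))
+
+    # Analytic gradient of w1, using the same chain rule as learn_once_mse
+    a1 = 1 / (1 + np.exp(-(np.matmul(data, w1) + b1)))
+    a2 = 1 / (1 + np.exp(-(np.matmul(a1, w2) + b2)))
+    grad_z2 = 2 * (a2 - targets) * a2 * (1 - a2)
+    grad_z1 = np.matmul(grad_z2, w2.T) * a1 * (1 - a1)
+    grad_w1 = np.matmul(data.T, grad_z1)
+
+    # Central finite difference for a single weight entry
+    w1_plus, w1_minus = w1.copy(), w1.copy()
+    w1_plus[0, 0] += eps
+    w1_minus[0, 0] -= eps
+    numeric = (summed_loss(w1_plus) - summed_loss(w1_minus)) / (2 * eps)
+    print(f"analytic: {grad_w1[0, 0]:.6f}  numeric: {numeric:.6f}")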
+ """ + # Forward propagation + a0 = data + z1 = np.matmul(a0, w1) + b1 + a1 = 1 / (1 + np.exp(-z1)) + z2 = np.matmul(a1, w2) + b2 + a2 = sp.softmax(z2, axis=1) + predictions = a2 + + # Convert targets to one-hot format and calculate accuracy + + pred_labels = a2.argmax(axis=1) + correct_predictions = np.sum(pred_labels == targets) + targets_one_hot = one_hot(targets) + + # Compute Cross-Entropy loss + loss = -np.sum(targets_one_hot * np.log(predictions + 1e-8)) / batch_size + grad_z2 = (predictions - targets_one_hot) / batch_size + grad_w2 = np.matmul(a1.T, grad_z2) + grad_b2 = np.sum(grad_z2, axis=0) + grad_a1 = np.matmul(grad_z2, w2.T) + grad_z1 = grad_a1 * a1 * (1 - a1) + a0 = a0.reshape(-1, batch_size) + grad_w1 = np.matmul(a0, grad_z1) + grad_b1 = np.sum(grad_z1, axis=0) + + # Update weights and biases + w1 -= learning_rate * grad_w1 + w2 -= learning_rate * grad_w2 + b1 -= learning_rate * grad_b1 + b2 -= learning_rate * grad_b2 + + accuracy = correct_predictions / len(pred_labels) + return w1, b1, w2, b2, accuracy + + +def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate=0.01, nb_epochs=100, batch_size=1): + training_accuracies = [] + for epoch in range(nb_epochs): + batch_accuracies = [] + batch_count = len(data_train) // batch_size + for i in tqdm(range(batch_count)): + batch_start, batch_end = i * batch_size, (i + 1) * batch_size + w1, b1, w2, b2, acc = learn_once_cross_entropy(w1, b1, w2, b2, data_train[batch_start:batch_end], labels_train[batch_start:batch_end], learning_rate, batch_size) + batch_accuracies.append(acc) + + # Handling remaining data if total data is not a multiple of batch size + if len(data_train) % batch_size != 0: + remaining = len(data_train) - batch_count * batch_size + w1, b1, w2, b2, acc = learn_once_cross_entropy(w1, b1, w2, b2, data_train[-remaining:], labels_train[-remaining:], learning_rate, remaining) + batch_accuracies.append(acc) + + epoch_accuracy = sum(batch_accuracies) / len(batch_accuracies) + print(f"Epoch {epoch + 1} Accuracy: {epoch_accuracy:.4f}") + training_accuracies.append(epoch_accuracy) + + return w1, b1, w2, b2, training_accuracies + +def test_mlp(w1, b1, w2, b2, data_test, labels_test): + # Forward pass + a0 = data_test + z1 = np.matmul(a0, w1) + b1 + a1 = 1 / (1 + np.exp(-z1)) + z2 = np.matmul(a1, w2) + b2 + a2 = sp.softmax(z2) + + # Compute accuracy + correct_count = np.sum(a2.argmax(axis=1) == labels_test) + return correct_count / len(labels_test) + +def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate=0.1, nb_epochs=100, batch_size=200): + # Initialize network parameters + w1 = np.random.uniform(-1, 1, (3072, d_h)) + b1 = np.zeros((1, d_h)) + w2 = np.random.uniform(-1, 1, (d_h, 10)) + b2 = np.zeros((1, 10)) + + # Training phase + w1, b1, w2, b2, train_accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, nb_epochs, batch_size) + + # Testing phase + test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test) + print(f"Test Accuracy: {test_accuracy:.4f}") + + return train_accuracies, test_accuracy + +if __name__ == "__main__": + # Load and preprocess data + data, labels = read_cifar.read_cifar('data/cifar-10-batches-py') + data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9) + + # Execute training and testing + train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, 64, 0.1, 100, 100) + plt.plot(train_accuracies, label="Training Accuracy") + plt.legend() + 
+
+
+if __name__ == "__main__":
+    # Load the CIFAR-10 data and split it into training and test sets
+    data, labels = read_cifar.read_cifar('data/cifar-10-batches-py')
+    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9)
+
+    # Execute training and testing, then plot the training accuracy per epoch
+    train_accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test, labels_test, 64, 0.1, 100, 100)
+    plt.plot(train_accuracies, label="Training Accuracy")
+    plt.xlabel("Epoch")
+    plt.ylabel("Accuracy")
+    plt.legend()
+    plt.show()
\ No newline at end of file