diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..75cddcbd3f621bd0a2ed9589a446c2085cdd5d2e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/data
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
diff --git a/knn.py b/knn.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa6b34e0172fcff42192b72a3ba041686774bb04
--- /dev/null
+++ b/knn.py
@@ -0,0 +1,58 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def distance_matrix(train, test):
+    print('Computing distance matrix between train and test sets')
+    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2,
+    # clamped at zero to absorb tiny negative values caused by floating-point error.
+    sq_dists = (-2 * np.matmul(train, test.T)
+                + np.sum(train * train, axis=1, keepdims=True)
+                + np.sum(test * test, axis=1, keepdims=True).T)
+    dists = np.sqrt(np.maximum(sq_dists, 0))
+    print('Finished computing distance matrix')
+    return dists
+
+
+def mode(x):
+    vals, counts = np.unique(x, return_counts=True)
+    return vals[np.argmax(counts)]
+
+
+def knn_predict(dists, labels_train, k):
+    # dists has shape [num_train, num_test]
+    indexes_of_knn = np.argsort(dists, axis=0)[0:k, :]
+    nearest_labels_pred = labels_train[indexes_of_knn]
+    labels_pred = np.array([mode(neighbors) for neighbors in nearest_labels_pred.T])
+    return labels_pred
+
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    print(f"Evaluating the k-NN with k = {k}")
+    dists = distance_matrix(data_train, data_test)
+    labels_pred = knn_predict(dists, labels_train, k)
+    accuracy = np.sum(labels_pred == labels_test) / len(labels_test)
+    return accuracy
+
+
+def evaluate_knn_for_k(data_train, labels_train, data_test, labels_test, k_max):
+    print(f"Evaluating the k-NN for k in range [1, {k_max}]")
+    accuracies = [0] * k_max
+    # Compute the distance matrix once and reuse it for every value of k.
+    dists = distance_matrix(data_train, data_test)
+    for k in range(1, k_max + 1):
+        labels_pred = knn_predict(dists, labels_train, k)
+        accuracy = np.sum(labels_pred == labels_test) / len(labels_test)
+        accuracies[k - 1] = accuracy
+    return accuracies
+
+
+def plot_accuracy_versus_k(accuracies):
+    k_max = len(accuracies)
+    fig = plt.figure(figsize=(12, 8))
+    plt.plot(np.arange(1, k_max + 1, 1), accuracies)
+    plt.title("Variation of the accuracy as a function of k")
+    plt.xlabel("k (number of neighbors)")
+    plt.ylabel("Accuracy")
+    # ax = fig.gca()
+    # ax.set_xticks(np.arange(1, k_max + 1, 1))
+    plt.grid(axis='both', which='both')
+    plt.savefig('./results/knn.png')
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ac5b55542b3f6b1833953cfd8e75f5325e08da2
--- /dev/null
+++ b/main.py
@@ -0,0 +1,81 @@
+from read_cifar import read_cifar, split_dataset
+from knn import evaluate_knn_for_k, plot_accuracy_versus_k
+import matplotlib.pyplot as plt
+from mlp import run_mlp_training, plot_accuracy_versus_epoch
+
+
+if __name__ == "__main__":
+    # k-NN experiment (commented out; the accuracies below were recorded from it):
+    # data, labels = read_cifar("data/cifar-10-batches-py")
+    # split = 0.9
+    # data_train, labels_train, data_test, labels_test = split_dataset(data, labels, split)
+    # data_train, data_test = data_train/255.0, data_test/255.0
+    # kmax = 20
+    # accuracies = evaluate_knn_for_k(data_train, labels_train, data_test, labels_test, kmax)
+    # accuracies = [0.351,
+    #               0.31316666666666665,
+    #               0.329,
+    #               0.33666666666666667,
+    #               0.33616666666666667,
+    #               0.3413333333333333,
+    #               0.343,
+    #               0.3428333333333333,
+    #               0.341,
+    #               0.3335,
+    #               0.3325,
+    #               0.3328333333333333,
+    #               0.33016666666666666,
+    #               0.3295,
+    #               0.32766666666666666,
+    #               0.3285,
+    #               0.327,
+    #               0.32716666666666666,
+    #               0.32916666666666666,
+    #               0.3305]
+    # plot_accuracy_versus_k(accuracies)
+    ####################################
+    # Parameters of the MLP:
+    split_factor = 0.9
+    data, labels = read_cifar("data/cifar-10-batches-py")
+    data_train, labels_train, data_test, labels_test = split_dataset(data, labels, split=split_factor)
+    data_train, data_test = data_train/255.0, data_test/255.0  # normalize our data to [0, 1]
+    d_h = 64
+    lr = 0.1
+    num_epoch = 100
+    # run_mlp_training returns the per-epoch training accuracies and the final test accuracy
+    accuracies, test_accuracy = run_mlp_training(data_train, labels_train, data_test,
+                                                 labels_test, d_h, lr, num_epoch)
+    # Per-epoch training accuracies recorded from a previous run:
+    # accuracies = [0.08788888888888889, 0.08990740740740741, 0.09135185185185185, 0.09296296296296297, 0.09514814814814815, 0.09631481481481481, 0.09724074074074074, 0.09787037037037037, 0.09820370370370371, 0.09883333333333333, 0.09844444444444445, 0.09859259259259259, 0.09857407407407408, 0.09885185185185186, 0.09872222222222223, 0.09855555555555555, 0.09872222222222223, 0.09883333333333333, 0.0989074074074074, 0.09881481481481481, 0.0987962962962963, 0.09898148148148148, 0.09916666666666667, 0.09938888888888889, 0.09961111111111111, 0.09975925925925926, 0.09975925925925926, 0.1, 0.10003703703703704, 0.09998148148148148, 0.10007407407407408, 0.10011111111111111, 0.10001851851851852, 0.10014814814814815, 0.10012962962962962, 0.09998148148148148, 0.1000925925925926, 0.1000925925925926, 0.10007407407407408, 0.10005555555555555, 0.10014814814814815, 0.10018518518518518, 0.1002037037037037, 0.10018518518518518, 0.10016666666666667, 0.10011111111111111, 0.10016666666666667, 0.10012962962962962, 0.10007407407407408, 0.10005555555555555, 0.1, 0.1, 0.1, 0.1, 0.1, 0.09998148148148148, 0.09998148148148148, 0.09996296296296296, 0.09996296296296296, 0.09996296296296296, 0.09994444444444445, 0.09994444444444445, 0.09994444444444445, 0.0999074074074074, 0.09994444444444445, 0.09996296296296296, 0.09996296296296296, 0.09996296296296296, 0.09998148148148148, 0.09996296296296296, 0.09998148148148148, 0.1, 0.1, 0.10003703703703704, 0.10003703703703704, 0.10005555555555555, 0.10007407407407408, 0.10007407407407408, 0.10007407407407408, 0.10003703703703704, 0.10001851851851852, 0.10003703703703704, 0.10003703703703704, 0.10003703703703704, 0.10001851851851852, 0.10001851851851852, 0.10003703703703704, 0.10003703703703704, 0.10005555555555555, 0.10007407407407408, 0.10007407407407408, 0.10007407407407408, 0.10007407407407408, 0.10005555555555555, 0.10005555555555555, 0.10005555555555555, 0.10007407407407408, 0.10007407407407408, 0.10007407407407408, 0.10007407407407408]
+    # print(accuracies)
+    plot_accuracy_versus_epoch(accuracies)
+
+
+
+# Result for k = 1
+# Reading data from disk
+# [INFO] Splitting data into train/test with split=70
+# [INFO] Training set has 42000 samples and testing set has 18000 samples.
+# [INFO] Time taken 0
+# Evaluating the k-NN with k = 1
+# Computing distance matrix between train and test sets
+# finished calculating dists
+# Running the prediction using k-NN with k = 1
+# [INFO] computing accuracy of the predictions
+# accuracy = 0.3388888888888889
+
+
+# Result for k = 3
+# Reading data from disk
+# [INFO] Splitting data into train/test with split=70
+# [INFO] Training set has 42000 samples and testing set has 18000 samples.
+# [INFO] Time taken 0
+# Evaluating the k-NN with k = 3
+# Computing distance matrix between train and test sets
+# finished calculating dists
+# Running the prediction using k-NN with k = 3
+# [INFO] computing accuracy of the predictions
+# accuracy = 0.3308333333333333
+
+
+
diff --git a/mlp.py b/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a2fc13b36d57171aeed1f3dd9f38eb301d7005d
--- /dev/null
+++ b/mlp.py
@@ -0,0 +1,131 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import time
+
+
+def learn_once_mse(w1, b1, w2, b2, data, targets, lr):
+    # Forward pass
+    a0 = data  # Input of the first layer
+    z1 = np.matmul(a0, w1) + b1  # Input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # Output of the hidden layer (sigmoid activation)
+    z2 = np.matmul(a1, w2) + b2  # Input of the output layer
+    a2 = 1 / (1 + np.exp(-z2))  # Output of the output layer (sigmoid activation)
+    predictions = a2  # Predicted values are the outputs of the output layer
+    # Compute loss (MSE)
+    loss = np.mean(np.square(predictions - targets))
+    # Compute gradients (constant factors are folded into the learning rate)
+    delta2 = (predictions - targets) * a2 * (1 - a2)  # Gradient at the output layer (includes the sigmoid derivative)
+    delta1 = np.dot(delta2, w2.T) * a1 * (1 - a1)  # Gradient for the hidden layer
+    # Update weights and biases using gradients
+    w2 -= lr * np.dot(a1.T, delta2) / len(data)
+    b2 -= lr * np.sum(delta2, axis=0) / len(data)
+    w1 -= lr * np.dot(a0.T, delta1) / len(data)
+    b1 -= lr * np.sum(delta1, axis=0) / len(data)
+    return w1, b1, w2, b2, loss
+
+
+def one_hot(x):
+    n_classes = 10  # CIFAR-10 has 10 classes
+    return np.eye(n_classes)[x]
+
+
+def softmax(x):
+    # Row-wise softmax: subtract the per-row maximum for numerical stability
+    # and normalize each row so that it sums to 1.
+    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
+    return e_x / e_x.sum(axis=-1, keepdims=True)
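+
+# Example (illustrative): applied row-wise, softmax turns each row of scores into a
+# probability distribution; softmax(np.array([[1., 2., 3.], [1., 1., 1.]])) gives
+# approximately [[0.090, 0.245, 0.665], [0.333, 0.333, 0.333]], each row summing to 1.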
+
+
+def learn_once_cross_entropy(w1, b1, w2, b2, data, targets, learning_rate):
+    N = data.shape[0]
+    # Forward pass
+    a0 = data  # the data are the input of the first layer
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    a2 = softmax(z2)  # output of the output layer (softmax activation function)
+    predictions = a2  # the predicted values are the outputs of the output layer
+    # One-hot encode the targets
+    oh_targets = one_hot(targets)
+    # Compute the cross-entropy loss
+    loss = - np.sum(oh_targets * np.log(predictions + 1e-9)) / N
+    # Backward pass
+    dz2 = predictions - oh_targets
+    dw2 = np.dot(a1.T, dz2) / N
+    db2 = np.sum(dz2, axis=0, keepdims=True) / N
+    da1 = np.dot(dz2, w2.T)
+    dz1 = da1 * a1 * (1 - a1)
+    dw1 = np.dot(a0.T, dz1) / N
+    db1 = np.sum(dz1, axis=0, keepdims=True) / N
+    # One step of gradient descent
+    w1 -= learning_rate * dw1
+    w2 -= learning_rate * dw2
+    b1 -= learning_rate * db1
+    b2 -= learning_rate * db2
+    return w1, b1, w2, b2, loss
+
+
+def predict_mlp(w1, b1, w2, b2, data):
+    # Forward pass
+    a0 = data  # the data are the input of the first layer
+    z1 = np.matmul(a0, w1) + b1  # input of the hidden layer
+    a1 = 1 / (1 + np.exp(-z1))  # output of the hidden layer (sigmoid activation function)
+    z2 = np.matmul(a1, w2) + b2  # input of the output layer
+    a2 = softmax(z2)  # output of the output layer (softmax activation function)
+    predictions = np.argmax(a2, axis=1)
+    return predictions
+
+
+def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch):
+    # Perform num_epoch training steps; the parameters updated in one epoch are
+    # passed on to the next one.
+    losses = []
+    train_accuracies = [0] * num_epoch
+    for epoch in range(num_epoch):
+        w1, b1, w2, b2, loss = learn_once_cross_entropy(w1, b1, w2, b2, data_train, labels_train, learning_rate)
+        losses.append(loss)
+        labels_pred = predict_mlp(w1, b1, w2, b2, data_train)
+        accuracy = np.mean(labels_pred == labels_train)
+        train_accuracies[epoch] = accuracy
+        print(f"Epoch [{epoch+1}/{num_epoch}] --- loss: {loss} --- accuracy: {accuracy}")
+    return w1, b1, w2, b2, train_accuracies
+
+
+def test_mlp(w1, b1, w2, b2, data_test, labels_test):
+    # Test the network on the test set
+    labels_pred = predict_mlp(w1, b1, w2, b2, data_test)
+    test_accuracy = np.mean(labels_pred == labels_test)
+    return test_accuracy
+
+
+def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, lr, num_epoch):
+    """Train an MLP with the given parameters and return the per-epoch training
+    accuracies together with the final test accuracy."""
+    print("Starting Training...")
+    tic = time.time()
+    d_in = data_train.shape[1]
+    d_out = len(set(labels_train))
+    # Random initialization of the network weights and biases
+    w1 = 2 * np.random.rand(d_in, d_h) - 1  # first layer weights
+    b1 = np.zeros((1, d_h))  # first layer biases
+    w2 = 2 * np.random.rand(d_h, d_out) - 1  # second layer weights
+    b2 = np.zeros((1, d_out))  # second layer biases
+    w1, b1, w2, b2, accuracies = train_mlp(w1, b1, w2, b2, data_train, labels_train, lr, num_epoch)
+    toc = time.time()
+    print("Finished Training.")
+    print('Time taken for training: ', toc - tic)
+    print("Starting Testing...")
+    tic = time.time()
+    accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
+    toc = time.time()
+    print("Finished Testing.")
+    print('Time taken for Testing: ', toc - tic)
+    return accuracies, accuracy
+
+
+def plot_accuracy_versus_epoch(accuracies):
+    plt.figure(figsize=(18, 10))
+    plt.plot(accuracies, 'o-b')
+    plt.title("Variation of the accuracy over the epochs")
+    plt.xlabel("Epochs")
+    plt.ylabel("Accuracy")
+    plt.grid(axis='both', which='both')
+    plt.savefig('./results/mlp.png')
diff --git a/read_cifar.py b/read_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..90fadea026b1b669da0f43ed49557a6378f0ec78
--- /dev/null
+++ b/read_cifar.py
@@ -0,0 +1,43 @@
+import numpy as np
+import os
+import pickle
+
+
+def unpickle(file):
+    with open(file, 'rb') as fo:
+        batch_dict = pickle.load(fo, encoding='bytes')
+    return batch_dict
+
+
+def read_cifar_batch(file):
+    batch_dict = unpickle(file)
+    data = batch_dict[b'data'].astype(np.float32)
+    labels = np.array(batch_dict[b'labels'], dtype=np.int64)
+    labels = labels.reshape(labels.shape[0])
+    return data, labels
+
+
+def read_cifar(path):
+    print('Reading data from disk')
+    # Load the five training batches and the test batch, then stack them.
+    data_batches = ["data_batch_" + str(i) for i in range(1, 6)] + ['test_batch']
+    all_data, all_labels = [], []
+    for db in data_batches:
+        data, labels = read_cifar_batch(os.path.join(path, db))
+        all_data.append(data)
+        all_labels.append(labels)
+    DATA = np.concatenate(all_data, axis=0, dtype=np.float32)
+    LABELS = np.concatenate(all_labels, axis=0, dtype=np.int64)
+    return DATA, LABELS
+
+
+def split_dataset(data, labels, split=0.6):
+    print(f"Splitting data into train/test with split={split}")
+    n = data.shape[0]
+    indices = np.random.permutation(n)
+    train_idx, test_idx = indices[:int(split*n)], indices[int(split*n):]
+    data_train, data_test = data[train_idx, :].astype(np.float32), data[test_idx, :].astype(np.float32)
+    labels_train, labels_test = labels[train_idx].astype(np.int64), labels[test_idx].astype(np.int64)
+    return data_train, labels_train, data_test, labels_test
\ No newline at end of file
diff --git a/results/knn.png b/results/knn.png
new file mode 100644
index 0000000000000000000000000000000000000000..d3f54e394bc7e286395893e7f23534491dfb2078
Binary files /dev/null and b/results/knn.png differ
diff --git a/results/mlp.png b/results/mlp.png
new file mode 100644
index 0000000000000000000000000000000000000000..0e768068f0e07661e8f5ec3c03784b6f63c5ea86
Binary files /dev/null and b/results/mlp.png differ
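The vectorized distance computation in knn.py relies on the identity ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2. A minimal standalone sketch (assuming only NumPy and that the knn module above is importable; the array sizes are hypothetical) that cross-checks it against an explicit double loop:

import numpy as np
from knn import distance_matrix

rng = np.random.default_rng(0)
train = rng.random((5, 10))  # 5 hypothetical training vectors with 10 features each
test = rng.random((4, 10))   # 4 hypothetical test vectors

# Reference: explicit pairwise Euclidean distances, shape [num_train, num_test]
reference = np.array([[np.linalg.norm(a - b) for b in test] for a in train])

assert np.allclose(distance_matrix(train, test), reference)
print("Vectorized distances match the explicit double loop.")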