diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..68bc17f9ff2104a9d7b6777058bb4c343ca72609 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/data/cifar-10-batches-py/data_batch_1 b/data/cifar-10-batches-py/data_batch_1 new file mode 100644 index 0000000000000000000000000000000000000000..ab404a5ac32492b807a5c6cd02b83dc4dd5ff980 Binary files /dev/null and b/data/cifar-10-batches-py/data_batch_1 differ diff --git a/data/cifar-10-batches-py/data_batch_2 b/data/cifar-10-batches-py/data_batch_2 new file mode 100644 index 0000000000000000000000000000000000000000..6bf1369a6cacadfdbd2f8c61e354cc7d0c17bbae Binary files /dev/null and b/data/cifar-10-batches-py/data_batch_2 differ diff --git a/data/cifar-10-batches-py/data_batch_3 b/data/cifar-10-batches-py/data_batch_3 new file mode 100644 index 0000000000000000000000000000000000000000..66a0d630a7eb736563b1861ce716bdc489f2113b Binary files /dev/null and b/data/cifar-10-batches-py/data_batch_3 differ diff --git a/data/cifar-10-batches-py/data_batch_4 b/data/cifar-10-batches-py/data_batch_4 new file mode 100644 index 0000000000000000000000000000000000000000..cf8d03d1e80e6d9e440d1764faa85aedd1d6b960 Binary files /dev/null and b/data/cifar-10-batches-py/data_batch_4 differ diff --git a/data/cifar-10-batches-py/data_batch_5 b/data/cifar-10-batches-py/data_batch_5 new file mode 100644 index 0000000000000000000000000000000000000000..468b2aa538c551bc9f590f213b19d96915b85062 Binary files /dev/null and b/data/cifar-10-batches-py/data_batch_5 differ diff --git a/data/cifar-10-batches-py/test_batch b/data/cifar-10-batches-py/test_batch new file mode 100644 index 0000000000000000000000000000000000000000..3e03f1fc5261d102600fc1c130454f1f5cda567b Binary files /dev/null and b/data/cifar-10-batches-py/test_batch differ diff --git a/knn.py b/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf127dfe535a0000c5b171df1a0e23ea9035d4f --- /dev/null +++ b/knn.py @@ -0,0 +1,124 @@ +import numpy as np +import matplotlib.pyplot as plt +import read_cifar +import os + +#La matrice de distance est composé de N lignes et M colonnes, +#avec pour 
def distance_matrix(matrix1, matrix2):
    """Return the pairwise Euclidean (L2) distances between rows of two matrices.

    The squared distance is expanded as ``|a|^2 - 2 a.b + |b|^2`` so the whole
    matrix is obtained from a single matrix product instead of a Python loop.

    Args:
        matrix1: array-like of shape (N, D).
        matrix2: array-like of shape (M, D).

    Returns:
        Array of shape (N, M); entry (i, j) is the distance between
        ``matrix1[i]`` and ``matrix2[j]``.
    """
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    # Integer inputs (e.g. raw uint8 pixels) would silently overflow when
    # squared, so promote anything that is not already floating point.
    if not np.issubdtype(matrix1.dtype, np.floating):
        matrix1 = matrix1.astype(np.float64)
    if not np.issubdtype(matrix2.dtype, np.floating):
        matrix2 = matrix2.astype(np.float64)

    sq_norms1 = np.sum(matrix1 ** 2, axis=1, keepdims=True)    # (N, 1)
    sq_norms2 = np.sum(matrix2 ** 2, axis=1, keepdims=True).T  # (1, M)
    sq_dists = sq_norms1 - 2 * np.dot(matrix1, matrix2.T) + sq_norms2

    # Rounding can leave tiny negative values where the true distance is 0;
    # clamp them so the square root never produces NaN.
    np.maximum(sq_dists, 0, out=sq_dists)
    return np.sqrt(sq_dists)


# Small smoke test kept at module level: distances between two 3x3 matrices.
matrix1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
matrix2 = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]])
dists = distance_matrix(matrix1, matrix2)
def knn_predict(dists, labels_train, k):
    """Predict a label for every test sample by majority vote of its k nearest neighbours.

    Args:
        dists: 2-D array; row i holds the distances from test sample i to
            every training sample.
        labels_train: non-negative integer labels, one per column of ``dists``.
        k: number of neighbours that vote (1 <= k <= number of training samples).

    Returns:
        1-D array with one predicted label per row of ``dists``. On a tie the
        smallest label wins (``np.argmax`` returns the first maximum).

    Raises:
        ValueError: if ``k`` is outside ``[1, dists.shape[1]]``.
    """
    dists = np.asarray(dists)
    labels_train = np.asarray(labels_train)  # lists cannot be fancy-indexed

    n_train = dists.shape[1]
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and {}, got {}".format(n_train, k)
        )

    # ``kth`` is a zero-based position: partitioning around k - 1 guarantees
    # that the first k columns hold the k smallest distances (in any order)
    # and stays in bounds when k equals the number of training samples.
    nearest_indices = np.argpartition(dists, k - 1, axis=1)[:, :k]
    nearest_labels = labels_train[nearest_indices]

    # Majority vote per test sample.
    return np.array([np.argmax(np.bincount(row)) for row in nearest_labels])


def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
    """Return the k-NN classification accuracy, in percent, on the test split.

    The distance matrix between test and training samples is computed with
    ``distance_matrix``, labels are predicted with ``knn_predict`` and the
    share of predictions equal to ``labels_test`` is returned scaled by 100.
    """
    dists = distance_matrix(data_test, data_train)
    predicted_labels = knn_predict(dists, labels_train, k)

    # count_nonzero returns a plain int, so the arithmetic below is the same
    # as counting matches one by one.
    correct_predictions = np.count_nonzero(
        predicted_labels == np.asarray(labels_test)
    )
    return correct_predictions / len(labels_test) * 100
# Dimensions of a small toy problem used to exercise the network code.
N = 30      # number of samples
d_in = 2    # input dimension
d_h = 3     # hidden-layer width
d_out = 1   # output dimension

# Weights are drawn uniformly from [-1, 1); biases start at zero.
w1 = 2 * np.random.rand(d_in, d_h) - 1
b1 = np.zeros((1, d_h))
w2 = 2 * np.random.rand(d_h, d_out) - 1
b2 = np.zeros((1, d_out))

# Random inputs and targets for the toy problem.
data = np.random.rand(N, d_in)
targets = np.random.rand(N, d_out)


def sigmoid(z):
    """Return the logistic function of ``z``, element-wise.

    The argument is clipped to [-30, 30] before exponentiation so that
    ``np.exp`` cannot overflow on large-magnitude inputs.
    """
    clipped = np.clip(z, -30, 30)
    return 1 / (1 + np.exp(-clipped))
def forward_pass(data, w1, b1, w2, b2):
    """Forward pass of the two-layer network with sigmoid on both layers.

    Returns:
        Tuple ``(a0, z1, a1, z2, a2, predictions)`` where ``a0`` is the input,
        ``z*`` are the pre-activations, ``a*`` the activations and
        ``predictions`` is the same array as ``a2``.
    """
    a0 = data                        # the input feeds the first layer
    z1 = np.matmul(a0, w1) + b1      # hidden-layer pre-activation
    a1 = sigmoid(z1)                 # hidden-layer output
    z2 = np.matmul(a1, w2) + b2      # output-layer pre-activation
    a2 = sigmoid(z2)                 # output-layer output
    predictions = a2
    return (a0, z1, a1, z2, a2, predictions)


def mse(predictions, targets):
    """Return the mean squared error over every element of the two arrays."""
    return np.mean(np.square(np.subtract(predictions, targets)))


def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate=0.01):
    """Perform one gradient-descent step on the MSE loss.

    Returns:
        ``(w1, b1, w2, b2, loss)`` with the updated parameters and the loss
        measured *before* the update. The arrays passed in are not modified.
    """
    a0, z1, a1, z2, a2, predictions = forward_pass(data, w1, b1, w2, b2)
    loss = mse(predictions, targets)

    # Backpropagation. ``mse`` averages over every element (samples and
    # outputs), so the gradient is divided by the same element count.
    diff = a2 - targets
    grad_a2 = 2 * diff / diff.size
    grad_z2 = grad_a2 * a2 * (1 - a2)            # sigmoid derivative
    grad_w2 = np.matmul(a1.T, grad_z2)
    grad_b2 = np.sum(grad_z2, axis=0, keepdims=True)
    grad_a1 = np.matmul(grad_z2, w2.T)
    grad_z1 = grad_a1 * a1 * (1 - a1)            # sigmoid derivative
    grad_w1 = np.matmul(a0.T, grad_z1)
    grad_b1 = np.sum(grad_z1, axis=0, keepdims=True)

    # Build new arrays instead of updating in place so the caller's
    # parameters are left untouched.
    w1 = w1 - learning_rate * grad_w1
    b1 = b1 - learning_rate * grad_b1
    w2 = w2 - learning_rate * grad_w2
    b2 = b2 - learning_rate * grad_b2

    return w1, b1, w2, b2, loss


def forward(data, w1, b1, w2, b2):
    """Forward pass with a sigmoid hidden layer and a softmax output layer.

    Returns:
        Tuple ``(a0, z1, a1, z2, a2, predictions)``; ``predictions`` is the
        same array as ``a2``.
    """
    a0 = data                        # the input feeds the first layer
    z1 = np.matmul(a0, w1) + b1      # hidden-layer pre-activation
    a1 = sigmoid(z1)                 # hidden-layer output
    z2 = np.matmul(a1, w2) + b2      # output-layer pre-activation
    a2 = softmax_stable(z2)          # output-layer output (softmax)
    predictions = a2
    return (a0, z1, a1, z2, a2, predictions)


def one_hot(labels, num_classes=None):
    """Return the one-hot encoding of integer ``labels``.

    Args:
        labels: array-like of non-negative integer class indices.
        num_classes: width of the encoding. Defaults to ``max(labels) + 1``;
            pass it explicitly when the labels at hand may not contain the
            highest class.
    """
    labels = np.asarray(labels)
    if num_classes is None:
        num_classes = int(np.max(labels)) + 1
    return np.eye(num_classes)[labels]


def softmax_stable(x):
    """Return the softmax of ``x`` along its last axis.

    The maximum along that axis is subtracted before exponentiating so that
    ``np.exp`` never sees large positive values. For a 2-D input every row is
    normalised independently, so each row sums to 1.
    """
    x = np.asarray(x)
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)
def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
    """Perform one gradient-descent step on the softmax cross-entropy loss.

    Args:
        w1, b1, w2, b2: current network parameters (not modified).
        data: input samples, one per row.
        labels_train: integer class index for every row of ``data``.
        learning_rate: step size of the update.

    Returns:
        ``(w1, b1, w2, b2, loss)`` with the updated parameters and the mean
        cross-entropy measured *before* the update.
    """
    a0, z1, a1, z2, a2, predictions = forward(data, w1, b1, w2, b2)

    labels = np.asarray(labels_train)
    N = len(labels)

    # One-hot targets sized like the network output, so the encoding does not
    # depend on which classes happen to be present in ``labels_train``.
    targets = np.zeros_like(a2)
    targets[np.arange(N), labels] = 1

    # For a softmax output with cross-entropy, dLoss/dz2 = a2 - targets;
    # dividing by N matches the mean loss returned below.
    grad_z2 = (a2 - targets) / N

    # Backpropagation
    grad_w2 = np.matmul(a1.T, grad_z2)
    grad_b2 = np.sum(grad_z2, axis=0, keepdims=True)
    grad_a1 = np.matmul(grad_z2, w2.T)
    grad_z1 = grad_a1 * a1 * (1 - a1)            # sigmoid derivative
    grad_w1 = np.matmul(a0.T, grad_z1)
    grad_b1 = np.sum(grad_z1, axis=0, keepdims=True)

    # Build new arrays instead of updating in place so the caller's
    # parameters are left untouched.
    w1 = w1 - learning_rate * grad_w1
    b1 = b1 - learning_rate * grad_b1
    w2 = w2 - learning_rate * grad_w2
    b2 = b2 - learning_rate * grad_b2

    # Categorical cross-entropy, the loss the gradient above belongs to.
    # A tiny epsilon keeps log() away from zero.
    epsilon = 1e-9
    loss = -np.sum(targets * np.log(predictions + epsilon)) / N

    return w1, b1, w2, b2, loss


def predict_class(predictions):
    """Return, for each prediction vector, the index of its largest entry."""
    return np.argmax(predictions, axis=-1)


def accuracy(y_true, y_pred):
    """Return the fraction of entries of ``y_pred`` equal to ``y_true``.

    Both inputs are flattened first, so lists and column vectors are
    compared element by element.

    Raises:
        ValueError: if the two inputs do not hold the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            "({} != {})".format(y_true.size, y_pred.size)
        )
    return np.mean(y_true == y_pred)


def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch):
    """Train the network for ``num_epoch`` full-batch gradient steps.

    Returns:
        ``(w1, b1, w2, b2, train_accuracies)`` where ``train_accuracies``
        holds the training accuracy measured after every step.
    """
    train_accuracies = []
    for _ in range(num_epoch):
        w1, b1, w2, b2, loss = learn_once_cross_entropy(
            w1, b1, w2, b2, data_train, labels_train, learning_rate
        )
        # Re-run the forward pass so the accuracy reflects the updated weights.
        a0, z1, a1, z2, a2, predictions = forward(data_train, w1, b1, w2, b2)
        train_accuracies.append(accuracy(labels_train, predict_class(a2)))
    return (w1, b1, w2, b2, train_accuracies)
def test_mlp(w1, b1, w2, b2, data_test, labels_test):
    """Return the accuracy of the trained network on the test split."""
    a0, z1, a1, z2, a2, predictions = forward(data_test, w1, b1, w2, b2)
    return accuracy(labels_test, predict_class(a2))


def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch):
    """Initialise, train and evaluate a one-hidden-layer MLP.

    Args:
        data_train, labels_train: training samples (one per row) and labels.
        data_test, labels_test: held-out samples and labels.
        d_h: number of hidden neurons.
        learning_rate: gradient-descent step size.
        num_epoch: number of training steps.

    Returns:
        ``(train_accuracies, test_accuracy)``: the per-epoch training
        accuracies and the final accuracy on the test split.
    """
    d_in = data_train.shape[1]              # input dimension
    d_out = int(np.max(labels_train)) + 1   # one output neuron per class

    # Weights are drawn uniformly from [-1, 1); biases start at zero.
    w1 = 2 * np.random.rand(d_in, d_h) - 1
    b1 = np.zeros((1, d_h))
    w2 = 2 * np.random.rand(d_h, d_out) - 1
    b2 = np.zeros((1, d_out))

    w1, b1, w2, b2, train_accuracies = train_mlp(
        w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch
    )
    test_accuracy = test_mlp(w1, b1, w2, b2, data_test, labels_test)
    return train_accuracies, test_accuracy


if __name__ == "__main__":
    data_folder = 'data/cifar-10-batches-py'
    batch_filename = 'data_batch_1'  # Adjust this to the specific batch file you want to read
    batch_path = os.path.join(data_folder, batch_filename)
    data, labels = read_cifar.read_cifar_batch(batch_path)
    data_train, labels_train, data_test, labels_test = read_cifar.split_dataset(data, labels, 0.9)
    train_accuracies, test_accuracy = run_mlp_training(
        data_train, labels_train, data_test, labels_test, 64, 0.1, 100
    )
    plt.figure(figsize=(12, 4))
    plt.plot(train_accuracies)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training Accuracy')
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/mlp.png')
    plt.show()
def read_cifar_batch(batch_path):
    """Load one pickled CIFAR batch file.

    Args:
        batch_path: path of the batch file.

    Returns:
        ``(data, labels)``: the ``b'data'`` entry as a float32 array and the
        ``b'labels'`` entry as an int64 array.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load batch
    # files that come from a trusted source.
    with open(batch_path, 'rb') as file:
        batch_data = pickle.load(file, encoding='bytes')

    data = np.array(batch_data[b'data'], dtype=np.float32)
    labels = np.array(batch_data[b'labels'], dtype=np.int64)

    return data, labels


def read_cifar(batch_path):
    """Load every CIFAR batch found in a directory as one dataset.

    Only regular files named ``data_batch_*`` or ``test_batch*`` are read
    (other entries such as a metadata file are skipped), in sorted name order
    so the result does not depend on the directory listing order.

    Args:
        batch_path: directory holding the batch files.

    Returns:
        ``(data, labels)``: the batches concatenated along the first axis,
        as a float32 array and an int64 array.

    Raises:
        FileNotFoundError: if the directory holds no batch file.
    """
    batch_names = sorted(
        name for name in os.listdir(batch_path)
        if name.startswith(("data_batch_", "test_batch"))
        and os.path.isfile(os.path.join(batch_path, name))
    )
    if not batch_names:
        raise FileNotFoundError(
            "no CIFAR batch files found in {!r}".format(batch_path)
        )

    data_parts = []
    label_parts = []
    for name in batch_names:
        batch_data, batch_labels = read_cifar_batch(os.path.join(batch_path, name))
        data_parts.append(batch_data)
        label_parts.append(batch_labels)

    # Concatenate so the result can be indexed like a single batch
    # (split_dataset relies on array indexing).
    return np.concatenate(data_parts), np.concatenate(label_parts)


def split_dataset(data, labels, split):
    """Shuffle a dataset and split it into a training and a test part.

    Args:
        data: array-like with one sample per row.
        labels: array-like with one label per sample.
        split: fraction of the samples, between 0 and 1, that goes to the
            training set.

    Returns:
        ``(data_train, labels_train, data_test, labels_test)``.

    Raises:
        ValueError: if ``split`` is outside [0, 1] or if ``data`` and
            ``labels`` do not have the same number of samples.
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= split <= 1:
        raise ValueError("The split parameter must be a float between 0 and 1.")

    data = np.asarray(data)
    labels = np.asarray(labels)

    num_samples = len(data)
    if len(labels) != num_samples:
        raise ValueError("data and labels must contain the same number of samples.")

    num_train_samples = int(num_samples * split)

    # One shared random permutation keeps every sample paired with its label.
    indices = np.random.permutation(num_samples)
    train_indices = indices[:num_train_samples]
    test_indices = indices[num_train_samples:]

    return (data[train_indices], labels[train_indices],
            data[test_indices], labels[test_indices])
+## print("Data train shape:", data_train.shape) +## print("Data test shape:", data_test.shape) diff --git a/results/knn.png b/results/knn.png new file mode 100644 index 0000000000000000000000000000000000000000..587f48dfc222e49ea0d9ccb812fc84f8dbaefd0e Binary files /dev/null and b/results/knn.png differ diff --git a/results/mlp.png b/results/mlp.png new file mode 100644 index 0000000000000000000000000000000000000000..ef972fe86b943ddfe2aa46fcb6eb076549dd0378 Binary files /dev/null and b/results/mlp.png differ