# Consolidated, re-formatted post-change code from the knn.py, read_cifar.py
# and main.py hunks of the original diff (the `+` side).  The non-code hunks
# of that diff (.gitignore rename DATA -> data, committed __pycache__/*.pyc
# and resultats/Knn.png binaries) carry no source code and are not repeated.

import os
import pickle

import numpy as np


# --------------------------------------------------------------------------
# read_cifar.py -- dataset loading and splitting
# --------------------------------------------------------------------------

def unpickle(file):
    """Load one pickled batch file and return the dictionary it contains.

    The file is decoded with ``encoding='bytes'``, so the dictionary keys are
    ``bytes`` objects (for example ``b'data'``).

    NOTE(security): ``pickle.load`` can execute arbitrary code.  Only call
    this on batch files obtained from a trusted source.

    Args:
        file (str): path of the batch file.

    Returns:
        dict: the unpickled batch.
    """
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='bytes')
    return batch


def read_cifar_batch(file):
    """Read a single batch file.

    Args:
        file (str): path of a single batch.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is an ``np.float32`` matrix
        built from the batch's ``b'data'`` entry (one row per sample) and
        ``labels`` is a flat ``np.int64`` vector built from ``b'labels'``.
    """
    batch = unpickle(file)
    data = np.array(batch[b'data'], dtype=np.float32)
    # reshape(-1) guarantees a flat vector whatever the stored layout is.
    labels = np.array(batch[b'labels'], dtype=np.int64).reshape(-1)
    return data, labels


def read_cifar(path):
    """Read every batch of the dataset directory and stack them.

    The six files ``data_batch_1`` ... ``data_batch_5`` and ``test_batch``
    are read, in that order, from ``path``.

    Args:
        path (str): directory containing the six batch files.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is the ``np.float32`` matrix
        of all samples (rows of every batch stacked vertically) and
        ``labels`` is the matching ``np.int64`` vector.
    """
    batch_names = [f"data_batch_{i}" for i in range(1, 6)] + ['test_batch']

    data_parts = []
    label_parts = []
    for name in batch_names:
        data, labels = read_cifar_batch(os.path.join(path, name))
        data_parts.append(data)
        label_parts.append(labels)

    # Concatenate once at the end instead of growing the arrays inside the
    # loop, which re-copies everything read so far at each iteration.
    all_data = np.concatenate(data_parts, axis=0)
    all_labels = np.concatenate(label_parts, axis=0)
    return all_data, all_labels


def split_dataset(data, labels, split=0.6):
    """Randomly split a dataset into a training set and a test set.

    The rows are shuffled with ``np.random.permutation``; seed NumPy's global
    random generator beforehand for a reproducible split.

    Args:
        data (np.ndarray): samples, one per row.
        labels (np.ndarray): labels, same size as ``data`` along axis 0.
        split (float): fraction, between 0 and 1, of the samples assigned to
            the training set.

    Returns:
        tuple: ``(data_train, labels_train, data_test, labels_test)``; data
        arrays are ``np.float32`` and label arrays are ``np.int64``.

    Raises:
        ValueError: if ``split`` is outside ``[0, 1]`` or if ``data`` and
            ``labels`` do not have the same number of entries.
    """
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    n = data.shape[0]
    if labels.shape[0] != n:
        raise ValueError(
            f"data has {n} rows but labels has {labels.shape[0]} entries"
        )

    indices = np.random.permutation(n)
    n_train = int(split * n)
    train_idx, test_idx = indices[:n_train], indices[n_train:]

    data_train = data[train_idx, :].astype(np.float32)
    data_test = data[test_idx, :].astype(np.float32)
    labels_train = labels[train_idx].astype(np.int64)
    labels_test = labels[test_idx].astype(np.int64)

    return data_train, labels_train, data_test, labels_test


# --------------------------------------------------------------------------
# knn.py -- k-nearest-neighbours classifier
# --------------------------------------------------------------------------

def distance_matrix(train, test):
    """Compute the pairwise Euclidean (L2) distances between two sample sets.

    Uses the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole
    matrix is obtained from a single matrix product.

    Args:
        train (np.ndarray): training samples, shape ``(n, m)``.
        test (np.ndarray): test samples, shape ``(p, m)``.

    Returns:
        np.ndarray: distances of shape ``(n, p)``; entry ``[i, j]`` is the
        distance between ``train[i]`` and ``test[j]``.
    """
    train = np.asarray(train)
    test = np.asarray(test)

    train_sq = np.sum(train * train, axis=1, keepdims=True)    # (n, 1)
    test_sq = np.sum(test * test, axis=1, keepdims=True).T     # (1, p)
    sq_dists = train_sq + test_sq - 2 * np.matmul(train, test.T)

    # Rounding can leave tiny negative values for (near-)identical samples;
    # clamp them so that the square root never yields NaN.
    np.maximum(sq_dists, 0, out=sq_dists)
    return np.sqrt(sq_dists)


def mode(x):
    """Return the most frequent value of an array.

    When several values are tied, the smallest one is returned (``np.unique``
    returns sorted values and ``np.argmax`` keeps the first maximum).

    Args:
        x (array-like): the values to inspect; must not be empty.

    Returns:
        The most frequent value of ``x``.
    """
    vals, counts = np.unique(x, return_counts=True)
    return vals[np.argmax(counts)]


def knn_predict(dists, labels_train, k):
    """Predict test labels by majority vote among the k nearest neighbours.

    Args:
        dists (np.ndarray): distance matrix of shape ``(num_train, num_test)``
            as returned by ``distance_matrix``.
        labels_train (np.ndarray): labels of the training samples,
            shape ``(num_train,)``.
        k (int): number of neighbours taking part in the vote (at least 1).

    Returns:
        np.ndarray: predicted labels, shape ``(num_test,)``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    labels_train = np.asarray(labels_train)
    # Row i of the sorted index matrix holds, for every test sample, the
    # index of its (i + 1)-th closest training sample.
    nearest_idx = np.argsort(np.asarray(dists), axis=0)[:k, :]
    nearest_labels = labels_train[nearest_idx]                 # (k, num_test)
    return np.array([mode(votes) for votes in nearest_labels.T])


def evaluate_knn_for_k(data_train, labels_train, data_test, labels_test, K_max):
    """Measure the k-NN accuracy for every k from 1 to ``K_max``.

    Args:
        data_train (np.ndarray): training samples.
        labels_train (np.ndarray): training labels.
        data_test (np.ndarray): test samples.
        labels_test (np.ndarray): test labels.
        K_max (int): largest number of neighbours to evaluate.

    Returns:
        list: ``K_max`` accuracies; element ``k - 1`` is the fraction of test
        samples correctly classified with ``k`` neighbours.
    """
    labels_train = np.asarray(labels_train)
    labels_test = np.asarray(labels_test)
    dists = distance_matrix(data_train, data_test)

    # Sorting is the expensive step and does not depend on k: do it once and
    # keep only the K_max closest neighbours of every test sample.
    neighbour_labels = labels_train[np.argsort(dists, axis=0)[:max(K_max, 0), :]]

    accuracies = []
    for k in range(1, K_max + 1):
        labels_pred = np.array([mode(votes) for votes in neighbour_labels[:k].T])
        accuracies.append(np.sum(labels_pred == labels_test) / len(labels_test))

    return accuracies


def plot_accuracy_versus_k(accuracies, save_path=None):
    """Plot the accuracy as a function of k and save the figure to disk.

    Args:
        accuracies (list): accuracy for k = 1, 2, ..., ``len(accuracies)``.
        save_path (str, optional): destination of the image.  Defaults to
            ``resultats/Knn.png`` next to this file.  The parent directory is
            created when it does not exist.
    """
    # Imported here so that the numerical functions of this module remain
    # usable on machines where matplotlib is not installed.
    import matplotlib.pyplot as plt

    if save_path is None:
        save_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'resultats', 'Knn.png'
        )
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    k_values = np.arange(1, len(accuracies) + 1, 1)

    fig = plt.figure(figsize=(12, 8))
    ax = fig.gca()
    ax.plot(k_values, accuracies, 'o-r')
    ax.set_title("Variation of the accuracy as a function of k")
    ax.set_xlabel("k (number of neighbors)")
    ax.set_ylabel("Accuracy")
    ax.set_xticks(k_values)
    ax.grid(axis='both', which='both')
    fig.savefig(save_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


# --------------------------------------------------------------------------
# main.py -- script entry point
# --------------------------------------------------------------------------

# Dataset directory: the `data` folder located next to this file.
path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

if __name__ == "__main__":
    split_factor = 0.9
    X, y = read_cifar(path)
    X_train, y_train, X_test, y_test = split_dataset(X, y, split=split_factor)

    K_max = 20
    accuracies = evaluate_knn_for_k(X_train, y_train, X_test, y_test, K_max)
    plot_accuracy_versus_k(accuracies)