Partie 1

bde18979 · widad174 · 69411637 · bde18979 · bde18979 · bde18979
Commit bde18979 authored Oct 26, 2023 by widad174
--- a/.gitignore
+++ b/.gitignore
-DATA
+data
 Env_Deep
--- a/__pycache__/knn.cpython-311.pyc
+++ b/__pycache__/knn.cpython-311.pyc
--- a/__pycache__/read_cifar.cpython-311.pyc
+++ b/__pycache__/read_cifar.cpython-311.pyc
--- a/knn.py
+++ b/knn.py
+import numpy as np
+import matplotlib.pyplot as plt
+def distance_matrix(train, test):
+    """Compute the pairwise Euclidean (L2) distances between two sets of row vectors.
+
+    The squared distance is expanded as ||a||^2 - 2 a.b + ||b||^2 so that the
+    whole matrix is obtained from one matrix product instead of a double loop.
+
+    Args:
+        train (np.ndarray): training samples, one per row, shape (n, m).
+        test (np.ndarray): test samples, one per row, shape (p, m).
+
+    Returns:
+        np.ndarray: matrix of shape (n, p) whose entry [i, j] is the L2
+        distance between train[i] and test[j].
+    """
+    squared = (-2 * np.matmul(train, test.T) +
+               np.sum(train * train, axis=1, keepdims=True) +
+               np.sum(test * test, axis=1, keepdims=True).T)
+    # Rounding errors in the expansion can leave tiny negative values where the
+    # true distance is (close to) zero; clamp them so the square root never
+    # produces NaN.
+    dists = np.sqrt(np.maximum(squared, 0))
+    print('finished calculating dists')
+    return dists
+def mode(x):
+    """Return the most frequent value found in an array.
+
+    Args:
+        x (array-like): the values to inspect.
+
+    Returns:
+        The value of x with the highest number of occurrences. When several
+        values are tied, the smallest one is returned (np.unique yields the
+        distinct values sorted and argmax picks the first maximum).
+    """
+    distinct, occurrences = np.unique(x, return_counts=True)
+    winner = occurrences.argmax()
+    return distinct[winner]
+def knn_predict(dists, labels_train, k):
+    """Predict a label for each test sample by majority vote among its k nearest training samples.
+
+    Args:
+        dists (np.ndarray): distance matrix of shape (num_train, num_test);
+            column j holds the distances from every training sample to test
+            sample j.
+        labels_train (np.ndarray): labels of the training samples, indexed
+            like the rows of dists.
+        k (int): number of neighbours taking part in the vote, at least 1.
+            A value larger than num_train uses every training sample.
+
+    Returns:
+        np.ndarray: array of shape (num_test,) with one predicted label per
+        test sample.
+
+    Raises:
+        ValueError: if k is smaller than 1.
+    """
+    if k < 1:
+        raise ValueError(f"k must be a positive integer, got {k}")
+    # Sorting along axis 0 ranks the training samples for each test column;
+    # the first k rows are the indices of the k closest ones.
+    indexes_of_knn = np.argsort(dists, axis=0)[:k, :]
+    nearest_labels_pred = labels_train[indexes_of_knn]
+    # One row per test sample after the transpose; vote inside each row.
+    labels_pred = np.array([mode(label) for label in nearest_labels_pred.T])
+    return labels_pred
+def evaluate_knn_for_k(data_train, labels_train, data_test, labels_test, K_max):
+    """Measure the k-NN accuracy on the test set for every k from 1 to K_max.
+
+    Args:
+        data_train (np.ndarray): training data.
+        labels_train (np.ndarray): training labels.
+        data_test (np.ndarray): test data.
+        labels_test (np.ndarray): test labels.
+        K_max (int): largest number of neighbours to evaluate.
+
+    Returns:
+        list: accuracies of the predictions, where element i is the fraction
+        of test samples correctly labelled with k = i + 1.
+    """
+    # The distance matrix does not depend on k, so it is computed only once.
+    pairwise = distance_matrix(data_train, data_test)
+    scores = []
+    for n_neighbors in range(1, K_max + 1):
+        predicted = knn_predict(pairwise, labels_train, n_neighbors)
+        hits = np.sum(predicted == labels_test)
+        scores.append(hits / len(labels_test))
+    return scores
+def plot_accuracy_versus_k(accuracies, save_path=r'C:\Users\hp\Desktop\BE\image-classification\resultats\Knn.png'):
+    """Plot the accuracy as a function of k and save the figure to disk.
+
+    Args:
+        accuracies (sequence): accuracy values; element i is plotted at
+            k = i + 1.
+        save_path (str): file the figure is written to. The default keeps the
+            location that used to be hard-coded; pass another path to save the
+            plot elsewhere (e.g. on a different machine).
+    """
+    k = len(accuracies)
+    k_values = np.arange(1, k + 1, 1)
+    fig = plt.figure(figsize=(12, 8))
+    plt.plot(k_values, accuracies, 'o-r')
+    plt.title("Variation of the accuracy as a function of k")
+    plt.xlabel("k (number of neighbors)")
+    plt.ylabel("Accuracy")
+    ax = fig.gca()
+    # One tick per evaluated k so every point lines up with a grid line.
+    ax.set_xticks(k_values)
+    plt.grid(axis='both', which='both')
+    plt.savefig(save_path)
\ No newline at end of file
--- a/main.py
+++ b/main.py
+from read_cifar import *
+from knn import *
+# Directory that contains the CIFAR batch files read by read_cifar.
+path = r'C:\Users\hp\Desktop\BE\image-classification\data'
+if __name__ == "__main__":
+    # Fraction of the samples handed to split_dataset as the training share.
+    split_factor = 0.9
+    X, y = read_cifar(path)
+    X_train, y_train, X_test, y_test = split_dataset(X, y, split=split_factor)
+    K_max = 20
+    # Accuracy for every k in 1..K_max, then plotted and saved by the helper.
+    accuracies = evaluate_knn_for_k(X_train, y_train, X_test, y_test, K_max)
+    plot_accuracy_versus_k(accuracies)
\ No newline at end of file
--- a/read_cifar.py
+++ b/read_cifar.py
-import pickle
 import numpy as np
+import os
+import pickle
-#read_cifaar_batch
-def read_cifar_batch(file) :
+def unpickle(file):
    with open(file, 'rb') as fo:
-        dictionnaire = pickle.load(fo, encoding='bytes') 
+        dict = pickle.load(fo, encoding='bytes')
-        data=dictionnaire[b'data'].astype(np.float32)
+    return dict
-        labels=np.array(dictionnaire[b'labels'],np.int64)
-    return data,labels
+# read_cifar takes the path of the directory containing the data and returns a matrix X of size N x D, where N is the number of available samples and D is the dimension of each sample (the number of numeric values describing it), together with a vector Y of size N whose values are the class code of the sample at the same index in X.
-'''
-EXPLICATION DE LA FONCTION:
+# X and Y are NumPy arrays.
+def read_cifar_batch(file):
+    """
    read_cifaar_batch function: read the path of a single batch.
    Arguments:
@@ -18,29 +21,16 @@ Arguments:
    Returns: 
    - Matrix data of size (batch_size x data_size)
    - Vector labels of size batch_size
+    """
+    dict = unpickle(file)
+    data = dict[b'data'].astype(np.float32)
+    labels = np.array(dict[b'labels'], dtype=np.int64)
+    labels = labels.reshape(labels.shape[0])
-The data must be np.float32 array and labels must be np.int64 array.
-'''
-# read_cifar
-def read_cifar(folder):
-    batch_file=["data_batch_1","data_batch_2","data_batch_3","data_batch_4","data_batch_5","test_batch"]
-    for i in range(len(batch_file)):
-        path= folder +'/'+batch_file[i]
-        if i==0:
-            data,labels=read_cifar_batch(path)
-        else:
-            x,y=read_cifar_batch(path)
-            data =np.vstack([data ,x])    # all data for all batches is in variable "data"
-            labels=np.hstack([labels,y])  # All labels for all batches is in variable "labels"
    return data, labels
-'''
+def read_cifar(path):
-EXPLICATION DE LA FONCTION:
+    """
    read_cifaar function: read the path of the directory containing all batches (including test_batch).
    Arguments:
@@ -48,31 +38,25 @@ Arguments:
    Returns:
    - Matrix data of size (batch_size x data_size)
- Vector labels of size batch_size<
+    - Vector labels of size batch_size
+    """
-The data must be np.float32 array and labels must be np.int64 array.
+    data_batches = ["data_batch_" + str(i) for i in range(1, 6)] + ['test_batch']
-'''
+    flag = True
-# split_dataset
+    for db in data_batches:
+        data, labels = read_cifar_batch(os.path.join(path, db))
-def split_dataset(data,labels,split):
+        if flag:
-    labels=labels.reshape(data.shape[0],1)
+                DATA = data
-    # Stack our Data and labels
+                LABELS = labels
-    con = np.hstack((data, labels))
+                flag = False
-    k=int(split*con.shape[0])
+        else:
-    # Shuffle all our Data stack it
+            DATA = np.concatenate((DATA, data), axis=0, dtype=np.float32)
-    np.random.shuffle(con)
+            LABELS = np.concatenate((LABELS, labels), axis=-1, dtype=np.int64)
-    # Train
+    return DATA, LABELS
-    X_train=con[:k,:-1]
-    y_train=np.array(con[:k,-1],np.int64)
+def split_dataset(data, labels, split=0.6):
-    # Test
+    """
-    X_test=con[k:,:-1]
-    y_test=np.array(con[k:,-1],np.int64)
-    return X_train,y_train,X_test,y_test
-'''
-EXPLICATION DE LA FONCTION:
    split_dataset function: splits the dataset into a training set and a test set.
    Arguments:
@@ -84,5 +68,12 @@ Returns:
    - labels_train: the corresponding labels,
    - data_test: the testing data, and
    - labels_test: the corresponding labels.
-'''
+    """
+    n = data.shape[0]
+    indices = np.random.permutation(n)
+    train_idx, test_idx = indices[:int(split*n)], indices[int(split*n):]
+    data_train, data_test = data[train_idx,:].astype(np.float32), data[test_idx,:].astype(np.float32)
+    labels_train, labels_test = labels[train_idx].astype(np.int64), labels[test_idx].astype(np.int64)
+    return data_train, labels_train, data_test, labels_test 
\ No newline at end of file
--- a/resultats/Knn.png
+++ b/resultats/Knn.png