Commit 4eba64b5 authored by Muniz Silva Samuel

final commit

parent cdb0f72c
@@ -9,20 +9,34 @@ from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
#
# ATTENTION: THIS CODE IS A CONCATENATION OF read_cifar.py AND knn.py
#


def unpickle(file):
    """Unpack a CIFAR-10 batch stored as a pickle file. Returns a dictionary
    with the dataset and its labels."""
    import pickle

    with open(file, "rb") as fo:
        dict = pickle.load(fo, encoding="bytes")
    return dict


def read_cifar_batch(path):
    """Take the path of a single batch as a string and return:
    a matrix data of size (batch_size x data_size) and a vector labels of size batch_size."""
    dictionary = unpickle(path)
    data = np.array(dictionary[b"data"], dtype=np.float32)
    labels = np.array(dictionary[b"labels"], dtype=np.int64)
    return data, labels


def read_cifar(path1, path2, path3, path4, path5, path6):
    """Take as parameters the paths of the six batch files
    (five data_batch and one test_batch) as strings, and return:
    a matrix data of shape (batch_size x data_size) and a vector labels of size batch_size."""
    data, labels = read_cifar_batch(path1)
    dataAux, labelsAux = read_cifar_batch(path2)
    data = np.concatenate((data, dataAux), 0)
@@ -43,25 +57,28 @@ def read_cifar(path1,path2,path3,path4,path5,path6):
    return data, labels
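

# Illustrative sketch (not part of this commit): read_cifar repeats the same
# concatenation for all six batches (the middle of the function is elided in
# this diff view); the same pattern can be written as a loop over a list of
# paths. The name read_cifar_from_paths is hypothetical.
def read_cifar_from_paths(paths):
    data, labels = read_cifar_batch(paths[0])
    for path in paths[1:]:
        dataAux, labelsAux = read_cifar_batch(path)
        data = np.concatenate((data, dataAux), 0)
        labels = np.concatenate((labels, labelsAux), 0)
    return data, labels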


def split_dataset(data, labels):
    """Split the dataset into a training set and a test set."""
    data_train, data_test, labels_train, labels_test = train_test_split(
        data, labels, shuffle=True, test_size=0.1
    )
    return data_train, data_test, labels_train, labels_test


def distance_matrix(data_test, data_train):
    """Take the matrices data_test (N, k) and data_train (M, k) and return a 2-D array (N, M)
    such that dists[i, j] is the Euclidean distance between the i-th data_test row
    and the j-th data_train row; in other words, each column holds the distances of one
    training point to every test point.
    """
    dists = np.array([np.sum((data_train - l) ** 2, axis=1) ** 0.5 for l in data_test])
    return dists
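

# Illustrative sketch (not part of this commit): the same (N, M) matrix can be
# computed without the Python-level loop by expanding
# ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2. The name distance_matrix_vectorized
# is hypothetical.
def distance_matrix_vectorized(data_test, data_train):
    test_sq = np.sum(data_test ** 2, axis=1)[:, np.newaxis]    # shape (N, 1)
    train_sq = np.sum(data_train ** 2, axis=1)[np.newaxis, :]  # shape (1, M)
    cross = data_test @ data_train.T                           # shape (N, M)
    # clip tiny negative values caused by floating-point error before the sqrt
    return np.sqrt(np.maximum(test_sq - 2 * cross + train_sq, 0))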


def knn_predict(dists, labels_train, k):
    """Take the matrix of distances dists, the training labels, and the number of
    neighbors k. Return the classification produced by the KNN model.
    """
    # classif = np.array(0)
    print(labels_train[:20])
    print(labels_train.size)
@@ -79,34 +96,36 @@ def knn_predict(dists , labels_train , k):
    return classif
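

# The body of knn_predict is elided in this diff view. For illustration only
# (this is not necessarily the commit's implementation), one common way to
# classify from a distance matrix of shape (n_test, n_train), as described in
# the docstring of distance_matrix, is to take the k smallest distances per row
# and do a majority vote. The name knn_predict_sketch is hypothetical.
def knn_predict_sketch(dists, labels_train, k):
    nearest = np.argsort(dists, axis=1)[:, :k]  # indices of the k closest training points
    votes = labels_train[nearest]               # their labels, shape (n_test, k)
    return np.array([np.bincount(row).argmax() for row in votes])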


def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
    """Take the data and labels for training and test and the number of neighbors k.
    Return the accuracy of the KNN model."""
    classif = np.array(
        knn_predict(distance_matrix(data_train, data_test), labels_train, k)
    )
    result = np.array(classif == labels_test)
    acc = np.count_nonzero(result) / np.size(result)
    return acc * 100


datas, labels = read_cifar_batch("data_batch_1")
dataTrain, dataTest, labelsTrain, labelsTest = split_dataset(datas, labels)
distanceMatrix = distance_matrix(dataTrain, dataTest)
print(distanceMatrix.shape)
print()

result = []
# Evaluate the KNN model for k ranging from 1 to 20
for i in range(1, 21):
    result = np.append(
        result, evaluate_knn(dataTrain, labelsTrain, dataTest, labelsTest, i)
    )
x = np.arange(1, 21)
# plot the graph of accuracy (%) vs. k
plt.title("Plot graph")
plt.xlabel("K neighbors")
plt.ylabel("Accuracy %")
plt.plot(x, result, color="red")
plt.show()