knn finito

6444c4c5 · Danjou Pierre · f2295c1f · 6444c4c5 · 6444c4c5 · 6444c4c5
Commit 6444c4c5 authored 8 months ago by Danjou Pierre
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 #.idea/
 /data
+\data
--- a/knn.py
+++ b/knn.py
-# -*- coding: utf-8 -*-
+import numpy as np
+from read_cifar import *
+import matplotlib.pyplot as plt
+import numpy as np
+def distance_matrix(A, B):
    """
-Created on Thu Nov  7 10:19:23 2024
+    Compute the L2 Euclidean distance matrix between two matrices A and B.
-@author: danjo
+    The distances are obtained from the expansion
+    ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, which needs a single matrix
+    product instead of an explicit loop over all pairs of rows.
+    Integer inputs are promoted to floating point (at least float32) before
+    any arithmetic, because squaring in a small integer dtype such as uint8
+    wraps around and yields wrong distances. float32 and float64 inputs keep
+    their dtype.
+    Parameters:
+        A (numpy.ndarray): Matrix of shape (m, n)
+        B (numpy.ndarray): Matrix of shape (p, n)
+    Returns:
+        numpy.ndarray: Distance matrix of shape (m, p) where the element (i, j) is the
+                       Euclidean distance between A[i] and B[j].
    """
+    A = np.asarray(A)
+    B = np.asarray(B)
+    # Common floating-point dtype: float32 at minimum, wider if an input needs it
+    dtype = np.result_type(A.dtype, B.dtype, np.float32)
+    A = A.astype(dtype, copy=False)
+    B = B.astype(dtype, copy=False)
+    # Squared norms of each row in A and B
+    A_squared = np.sum(A**2, axis=1).reshape(-1, 1)  # Shape (m, 1)
+    B_squared = np.sum(B**2, axis=1).reshape(1, -1)  # Shape (1, p)
+    # Compute the squared L2 distance matrix using the expansion above
+    dists_squared = A_squared + B_squared - 2 * np.dot(A, B.T)
+    # Rounding errors can make tiny squared distances slightly negative;
+    # clamp them to zero before taking the square root
+    dists = np.sqrt(np.maximum(dists_squared, 0))
+    return dists
+def knn_predict(dists, labels_train, k):
+    """
+    Predict a label for every test sample by majority vote among its k
+    nearest training samples.
+    Parameters:
+        dists (numpy.ndarray): Distance matrix with one row per test sample and
+                               one column per training sample.
+        labels_train (array-like of int): Non-negative class label of each
+                                          training sample, indexed like the
+                                          columns of dists.
+        k (int): Number of nearest neighbors that vote; must be >= 1.
+    Returns:
+        numpy.ndarray: Predicted class label for each row of dists.
+    Raises:
+        ValueError: If k is smaller than 1.
+    """
+    if k < 1:
+        raise ValueError("k must be a positive integer")
+    labels_train = np.asarray(labels_train).astype(np.intp, copy=False)
+    output = []
+    # Loop over the test samples (one row of distances each)
+    for row in dists:
+        # Labels of the k closest training samples
+        nearest_labels = labels_train[np.argsort(row)[:k]]
+        # Count the votes per class; bincount sizes the table from the labels
+        # it sees, so the number of classes is not limited to 10
+        votes = np.bincount(nearest_labels, minlength=1)
+        # On a tie np.argmax returns the first maximum, i.e. the smallest label
+        output.append(np.argmax(votes))
+    return np.array(output)
+def evaluate_knn(data_train, labels_train, data_test, labels_tests, k):
+    """
+    Classify the test set with k-nearest neighbors and return the accuracy.
+    Parameters:
+        data_train (numpy.ndarray): Training samples, one per row.
+        labels_train (numpy.ndarray): Labels of the training samples.
+        data_test (numpy.ndarray): Test samples, one per row.
+        labels_tests (numpy.ndarray): Ground-truth labels of the test samples.
+        k (int): Number of neighbors used by knn_predict.
+    Returns:
+        Number of predictions equal to labels_tests divided by
+        labels_tests.shape[0].
+    """
+    # Distances from every test sample (rows) to every training sample (columns)
+    predictions = knn_predict(distance_matrix(data_test, data_train), labels_train, k)
+    n_correct = (labels_tests == predictions).sum()
+    return n_correct / labels_tests.shape[0]
+# def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+#     dists = distance_matrix(data_test, data_train)
+#     # Determine the number of images in data_test
+#     tot = len(data_test)
+#     accurate = 0
+#     predicted_labels = knn_predict(dists, labels_train, k)
+#     # Count the number of images in data_test whose label has been estimated correctly
+#     for i in range(tot):
+#         if predicted_labels[i] == labels_test[i]:
+#             accurate += 1
+#     # Calculate the classification rate
+#     accuracy = accurate/tot
+#     return accuracy
+if __name__ == "__main__":
+    import os
+    # Build the dataset location with os.path.join so the script is not tied
+    # to the Windows path separator.
+    main_path = os.path.join('data', 'cifar-10-batches-py')
+    # Load the dataset once (the former single-batch load was overwritten
+    # immediately) and keep 90% of it for training.
+    data, labels = read_cifar(main_path)
+    data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
+    print(labels_test)
+    dists = distance_matrix(data_test, data_train)
+    #print(dists)
+    # Predictions and accuracy for k = 10 neighbors
+    r = knn_predict(dists, labels_train, 10)
+    accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, 10)
+    print(r)
+    print(accuracy)
+#     data, labels = read_cifar('data\cifar-10-batches-py')
+#     data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
+#     k=3
+#     accurancies = []
+#     accurancy = evaluate_knn(data_train, labels_train, data_test, labels_test, k)
+#     accurancies.append(accurancy)
+#     print(accurancies)
\ No newline at end of file
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -7,7 +7,6 @@ Created on Thu Nov  7 08:45:09 2024
 import numpy as np
 import pickle
-from sklearn.model_selection import train_test_split 
@@ -59,16 +58,16 @@ def split_dataset(data, labels, split):
    data_train, data_test = data[train_idx,:].astype(np.float32), data[test_idx,:].astype(np.float32)
    labels_train, labels_test = labels[train_idx].astype(np.int64), labels[test_idx].astype(np.int64)
-    return (data_train, data_test, labels_train, labels_test) 
+    return data_train, data_test, labels_train, labels_test 
 if __name__ == "__main__":
-    path = 'data\cifar-10-batches-py\data_batch_1'
+   path = r'data\cifar-10-batches-py\data_batch_1'
-    main_path = 'data\cifar-10-batches-py'
+   main_path = r'data\cifar-10-batches-py'
   data, labels = read_cifar_batch(path)
   data, labels = read_cifar(main_path)
-    X_train, X_test, y_train, y_test = split_dataset(data, labels, 0.9)
+   data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
-    print(X_train, X_test, y_train, y_test)
+   #print(data_train, data_test, labels_train, labels_test)
-    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
+   #print(data_train.shape, labels_train.shape, data_test.shape, labels_test.shape)