knn

29db5f9b · pierre-cau · 00d6cf8f · 29db5f9b · 29db5f9b · 29db5f9b
Commit 29db5f9b authored 8 months ago by pierre-cau
--- a/src/main.py
+++ b/src/main.py
@@ -15,11 +15,9 @@ if __name__ == "__main__":
    print(f" - Training data shape: {data_train.shape}, Training labels shape: {labels_train.shape}")
    print(f" - Testing data shape: {data_test.shape}, Testing labels shape: {labels_test.shape}")

-    # We flatten the images
-    data_train = data_train.reshape(data_train.shape[0], -1)
-    data_test = data_test.reshape(data_test.shape[0], -1)

-    # Evaluate the k-NN algorithm
+
+    # Evaluate the k-NN algorithm
    k = 3
    accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, k)
    
\ No newline at end of file
--- a/src/utils/__pycache__/knn.cpython-38.pyc
+++ b/src/utils/__pycache__/knn.cpython-38.pyc
--- a/src/utils/knn.py
+++ b/src/utils/knn.py
@@ -16,9 +16,11 @@ def distance_matrix(matrix1, matrix2):
    dists : np.ndarray
        L2 Euclidean distance matrix of shape (n1, n2).
    """
+    assert matrix1.shape[1] == matrix2.shape[1], "Matrices must have the same number of columns"
+
    # Compute the squared sum of each row in both matrices
-    matrix1_squared = np.sum(np.square(matrix1), axis=1, keepdims=True)
-    matrix2_squared = np.sum(np.square(matrix2), axis=1, keepdims=True).T
+    matrix1_squared = np.sum(matrix1**2, axis=1, keepdims=True)
+    matrix2_squared = np.sum(matrix2**2, axis=1, keepdims=True).T

    # Compute the dot product between the two matrices
    dot_product = np.dot(matrix1, matrix2.T)
@@ -49,25 +51,19 @@ def knn_predict(dists, labels_train, k):
    n_test = dists.shape[0]
    labels_pred = np.empty(n_test, dtype=labels_train.dtype)
    
-    for i in range(n_test):
-
-        # Find the indices of the k nearest neighbors
-        sorted_indices = np.argsort(dists[i])
-        if len(sorted_indices) < k:
-            k = len(sorted_indices)
-            print(f"Warning: k is too large, reducing it to {k} as a maximum value.")
-        nearest_neighbors = sorted_indices[:k]
-
+    # Find the indices of the k nearest neighbors for each test point
+    sorted_indices = np.argsort(dists, axis=1)
+    nearest_neighbors = sorted_indices[:, :k]

    # Retrieve the labels of the k nearest neighbors
    nearest_labels = labels_train[nearest_neighbors]

-        # Determine the most common label among the k nearest neighbors
-        labels_pred[i] = np.bincount(nearest_labels).argmax()
+    # Determine the most common label among the k nearest neighbors for each test point
+    labels_pred = np.array([np.bincount(nearest_labels[i]).argmax() for i in range(nearest_labels.shape[0])])
    
    return labels_pred

-def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k, dists=None):
    """
    Evaluate the k-nearest neighbors algorithm on the given dataset.
    
@@ -83,6 +79,8 @@ def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
        Testing labels of shape (n_test,).
    k : int
        Number of neighbors to consider.
+    dists : np.ndarray, optional
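+        Precomputed distance matrix of shape (n_test, n_train) between the test set and the train set. NOTE: currently ignored; the matrix is always recomputed from data_test and data_train.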
    
    Returns
    -------
@@ -91,7 +89,7 @@ def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
    """
    # Compute the distance matrix between the testing and training data
    dists = distance_matrix(data_test, data_train)
-    
+    print("Distance matrix made successfully")
    # Predict the labels for the test set
    labels_pred = knn_predict(dists, labels_train, k)