distance_matrix + knn_predict

32c1ecfe · pierre-cau · cad0a258 · 32c1ecfe · 32c1ecfe · 32c1ecfe
Commit 32c1ecfe authored 8 months ago by pierre-cau
--- a/data/README.md
+++ b/data/README.md
+> This folder must contain the CIFAR-10 dataset (Python version) in the following structure:
+```bash
+data/
+  cifar-10-batches-py/
+    data_batch_1
+    data_batch_2
+    data_batch_3
+    data_batch_4
+    data_batch_5
+    test_batch
+    batches.meta
+```
+> The data can be downloaded from [here](https://www.cs.toronto.edu/~kriz/cifar.html).
--- a/src/main.py
+++ b/src/main.py
@@ -8,9 +8,10 @@ if __name__ == "__main__":
    print(f"Data shape: {data.shape}, Labels shape: {labels.shape}\n")
    # Split the dataset
-    data_train, labels_train, data_test, labels_test = split_dataset(data, labels, 0.8)
+    coef_split = 0.8
+    data_train, labels_train, data_test, labels_test = split_dataset(data, labels, coef_split)
-    print(f"Split the dataset with a {0.8} split factor:")
+    print(f"Split the dataset with a {coef_split} split factor:")
    print(f" - Training data shape: {data_train.shape}, Training labels shape: {labels_train.shape}")
    print(f" - Testing data shape: {data_test.shape}, Testing labels shape: {labels_test.shape}")
\ No newline at end of file
--- a/src/utils/knn.py
+++ b/src/utils/knn.py
+import numpy as np
def distance_matrix(matrix1, matrix2):
    """
    Compute the L2 Euclidean distance matrix between two matrices.

    The distances are obtained from the expansion
    ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b`` so that the whole
    computation is vectorized.

    Parameters
    ----------
    matrix1 : np.ndarray
        First matrix of shape (n1, d).
    matrix2 : np.ndarray
        Second matrix of shape (n2, d).

    Returns
    -------
    dists : np.ndarray
        L2 Euclidean distance matrix of shape (n1, n2), where
        ``dists[i, j]`` is the distance between ``matrix1[i]`` and
        ``matrix2[j]``.

    Notes
    -----
    Non floating-point inputs (e.g. ``uint8`` image data) are promoted to
    ``float64`` before any arithmetic; squaring or multiplying small integer
    types would otherwise overflow and silently corrupt the distances.
    Floating-point inputs keep their dtype.
    """
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)
    if not np.issubdtype(matrix1.dtype, np.floating):
        matrix1 = matrix1.astype(np.float64)
    if not np.issubdtype(matrix2.dtype, np.floating):
        matrix2 = matrix2.astype(np.float64)

    # Squared norm of each row: column vector (n1, 1) and row vector (1, n2)
    # so that their sum broadcasts to (n1, n2).
    matrix1_squared = np.sum(np.square(matrix1), axis=1, keepdims=True)
    matrix2_squared = np.sum(np.square(matrix2), axis=1, keepdims=True).T

    # Pairwise dot products between the rows of both matrices.
    dot_product = matrix1 @ matrix2.T

    squared_dists = matrix1_squared + matrix2_squared - 2 * dot_product
    # Rounding errors can leave tiny negative values (typically where the
    # true distance is 0); clamp them so that sqrt never produces NaN.
    np.maximum(squared_dists, 0, out=squared_dists)
    return np.sqrt(squared_dists, out=squared_dists)
def knn_predict(dists, labels_train, k):
    """
    Predict the labels for the test set using the k-nearest neighbors algorithm.

    Parameters
    ----------
    dists : np.ndarray
        Distance matrix of shape (n_test, n_train) between the test set and the train set.
    labels_train : np.ndarray
        Training labels of shape (n_train,). Any dtype that ``np.unique`` can
        sort is accepted (including negative integers or strings).
    k : int
        Number of neighbors to consider. Must be at least 1. If it exceeds
        the number of training samples it is reduced to that number and a
        warning is printed.

    Returns
    -------
    labels_pred : np.ndarray
        Predicted labels for the test set of shape (n_test,). When several
        labels are tied for the majority, the smallest label is returned.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would otherwise silently slice "all but the last |k|"
        # neighbors, and k == 0 would fail with an obscure argmax error.
        raise ValueError(f"k must be a positive integer, got {k}.")

    dists = np.asarray(dists)
    labels_train = np.asarray(labels_train)
    n_test, n_train = dists.shape

    # The clamp does not depend on the test sample, so do it once up front.
    if n_train < k:
        k = n_train
        print(f"Warning: k is too large, reducing it to {k} as a maximum value.")

    labels_pred = np.empty(n_test, dtype=labels_train.dtype)
    for i in range(n_test):
        # Indices of the k nearest training samples; the stable sort makes
        # ties on distance resolve deterministically (lowest index first).
        nearest_neighbors = np.argsort(dists[i], kind="stable")[:k]
        nearest_labels = labels_train[nearest_neighbors]
        # Majority vote. np.unique returns the labels sorted, so argmax on
        # the counts picks the smallest label among ties.
        candidates, votes = np.unique(nearest_labels, return_counts=True)
        labels_pred[i] = candidates[votes.argmax()]
    return labels_pred
if __name__ == "__main__":
    # Small self-contained demo: four 2-D training points, two test points.
    data_train = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    labels_train = np.array([0, 1, 0, 1])
    data_test = np.array([[2, 3], [6, 7]])
    print("Training data:", data_train)
    print("Training labels:", labels_train)
    print("Test data:", data_test)
    # knn_predict expects one row per test sample and one column per training
    # sample, i.e. shape (n_test, n_train), so the test set goes first.
    dists = distance_matrix(data_test, data_train)
    # Predict the labels for the test set
    k = 3
    labels_pred = knn_predict(dists, labels_train, k)
    print("Predicted labels:", labels_pred)
\ No newline at end of file