Skip to content
Snippets Groups Projects
Commit 32c1ecfe authored by pierre-cau's avatar pierre-cau
Browse files

distance_matrix + knn_predict

parent cad0a258
No related branches found
No related tags found
No related merge requests found
> This folder must contain the CIFAR-10 dataset in the following structure:
```bash
data/
    cifar-10-batches-py/
        data_batch_1
        data_batch_2
        data_batch_3
        data_batch_4
        data_batch_5
        test_batch
        batches.meta
```
> The data can be downloaded from [here](https://www.cs.toronto.edu/~kriz/cifar.html).
...@@ -8,9 +8,10 @@ if __name__ == "__main__": ...@@ -8,9 +8,10 @@ if __name__ == "__main__":
print(f"Data shape: {data.shape}, Labels shape: {labels.shape}\n") print(f"Data shape: {data.shape}, Labels shape: {labels.shape}\n")
# Split the dataset # Split the dataset
data_train, labels_train, data_test, labels_test = split_dataset(data, labels, 0.8) coef_split = 0.8
data_train, labels_train, data_test, labels_test = split_dataset(data, labels, coef_split)
print(f"Split the dataset with a {0.8} split factor:") print(f"Split the dataset with a {coef_split} split factor:")
print(f" - Training data shape: {data_train.shape}, Training labels shape: {labels_train.shape}") print(f" - Training data shape: {data_train.shape}, Training labels shape: {labels_train.shape}")
print(f" - Testing data shape: {data_test.shape}, Testing labels shape: {labels_test.shape}") print(f" - Testing data shape: {data_test.shape}, Testing labels shape: {labels_test.shape}")
\ No newline at end of file
import numpy as np
def distance_matrix(matrix1, matrix2):
    """
    Compute the L2 Euclidean distance matrix between two matrices.

    Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b`` so the
    whole matrix is obtained from one matrix product.

    Parameters
    ----------
    matrix1 : np.ndarray
        First matrix of shape (n1, d).
    matrix2 : np.ndarray
        Second matrix of shape (n2, d).

    Returns
    -------
    dists : np.ndarray
        L2 Euclidean distance matrix of shape (n1, n2), where
        ``dists[i, j]`` is the distance between ``matrix1[i]`` and
        ``matrix2[j]``. Floating-point inputs keep their precision;
        integer inputs are promoted to float64.
    """
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    # Integer inputs (e.g. uint8 pixel values) would wrap around when squared
    # or multiplied, so do all arithmetic in a floating-point dtype.
    dtype = np.result_type(matrix1, matrix2)
    if not np.issubdtype(dtype, np.floating):
        dtype = np.float64
    matrix1 = matrix1.astype(dtype, copy=False)
    matrix2 = matrix2.astype(dtype, copy=False)

    # Squared norm of each row: column vector (n1, 1) and row vector (1, n2)
    matrix1_squared = np.sum(np.square(matrix1), axis=1, keepdims=True)
    matrix2_squared = np.sum(np.square(matrix2), axis=1, keepdims=True).T

    # Pairwise dot products, shape (n1, n2)
    dot_product = np.dot(matrix1, matrix2.T)

    squared_dists = matrix1_squared + matrix2_squared - 2 * dot_product

    # Rounding error can leave tiny negative values for (near-)identical rows;
    # clamp them so the square root never produces NaN.
    np.maximum(squared_dists, 0, out=squared_dists)

    return np.sqrt(squared_dists)
def knn_predict(dists, labels_train, k):
    """
    Predict the labels for the test set using the k-nearest neighbors algorithm.

    Each test sample receives the label that occurs most often among its
    ``k`` closest training samples. When several labels are tied, the
    smallest label (in sorted order) wins.

    Parameters
    ----------
    dists : np.ndarray
        Distance matrix of shape (n_test, n_train) between the test set and
        the train set (one row per test sample, one column per training
        sample).
    labels_train : np.ndarray
        Training labels of shape (n_train,).
    k : int
        Number of neighbors to consider. Values larger than the number of
        columns in ``dists`` are reduced to that number (a warning is
        printed).

    Returns
    -------
    labels_pred : np.ndarray
        Predicted labels for the test set of shape (n_test,), with the same
        dtype as ``labels_train``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if there are test samples but no
        training columns to pick neighbors from.
    """
    dists = np.asarray(dists)
    labels_train = np.asarray(labels_train)

    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}.")

    n_test, n_candidates = dists.shape
    labels_pred = np.empty(n_test, dtype=labels_train.dtype)
    if n_test == 0:
        return labels_pred
    if n_candidates == 0:
        raise ValueError("dists has no training columns to select neighbors from.")

    # Clamp k once, up front, rather than re-checking it for every test row.
    if k > n_candidates:
        k = n_candidates
        print(f"Warning: k is too large, reducing it to {k} as a maximum value.")

    # Encode labels as 0..n_classes-1 so voting also works for negative or
    # non-integer labels. `classes` is sorted, so argmax keeps the
    # "smallest label wins" tie-break of a plain bincount on integer labels.
    classes, encoded_labels = np.unique(labels_train, return_inverse=True)
    n_classes = len(classes)

    # Rows are sorted one at a time to avoid materializing a second
    # (n_test, n_train) index matrix.
    for i in range(n_test):
        # Find the indices of the k nearest neighbors
        nearest_neighbors = np.argsort(dists[i])[:k]
        # Count the votes of each class among those neighbors
        votes = np.bincount(encoded_labels[nearest_neighbors], minlength=n_classes)
        labels_pred[i] = classes[votes.argmax()]

    return labels_pred
if __name__ == "__main__":
    # Small self-contained demo: four 2-D training points with alternating
    # labels and two test points.
    data_train = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    labels_train = np.array([0, 1, 0, 1])
    data_test = np.array([[2, 3], [6, 7]])

    print("Training data:", data_train)
    print("Training labels:", labels_train)
    print("Test data:", data_test)

    # knn_predict expects one row per test sample and one column per training
    # sample, so the test set must be the first argument here.
    dists = distance_matrix(data_test, data_train)

    # Predict the labels for the test set
    k = 3
    labels_pred = knn_predict(dists, labels_train, k)
    print("Predicted labels:", labels_pred)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment