diff --git a/.gitignore b/.gitignore
index 0f85d8b48c5c645e0d3794e63f9bfa24d34e61f8..5bca2a01a829583e97c640c643d9dbfa4c7b3db1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 #.idea/
 
 /data
+\data
diff --git a/knn.py b/knn.py
index a94a4cae25d526c5d7e15142ad4506912589fe38..672503a7951e9a54915da6f4ed26b8825fe48364 100644
--- a/knn.py
+++ b/knn.py
@@ -1,7 +1,101 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Nov 7 10:19:23 2024
+import numpy as np
+from read_cifar import read_cifar, read_cifar_batch, split_dataset
+import matplotlib.pyplot as plt
 
-@author: danjo
-"""
+
+def distance_matrix(A, B):
+    """
+    Compute the L2 (Euclidean) distance matrix between two matrices A and B.
+
+    Parameters:
+    A (numpy.ndarray): Matrix of shape (m, n)
+    B (numpy.ndarray): Matrix of shape (p, n)
+
+    Returns:
+    numpy.ndarray: Distance matrix of shape (m, p) where the element (i, j) is the
+    Euclidean distance between A[i] and B[j].
+    """
+    # Squared norms of each row in A and B
+    A_squared = np.sum(A**2, axis=1).reshape(-1, 1)  # Shape (m, 1)
+    B_squared = np.sum(B**2, axis=1).reshape(1, -1)  # Shape (1, p)
+
+    # Squared L2 distances via the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
+    dists_squared = A_squared + B_squared - 2 * np.dot(A, B.T)
+
+    # Clamp to zero to absorb potential floating-point errors, then take the square root
+    dists = np.sqrt(np.maximum(dists_squared, 0))
+    return dists
+
+
+def knn_predict(dists, labels_train, k):
+    output = []
+    # Loop over all the test images
+    for i in range(len(dists)):
+        # Initialize a vote table with one slot per class (CIFAR-10 has 10 classes)
+        res = [0] * 10
+        # Get the indices of the k closest training images
+        nearest_idx = np.argsort(dists[i])[:k]
+        for idx in nearest_idx:
+            # Cast a vote for this neighbour's class
+            res[labels_train[idx]] += 1
+        # Pick the class with the most votes
+        # Careful with the logic here: if two or more classes tie, np.argmax returns the first maximum encountered
+        label_temp = np.argmax(res)
+        output.append(label_temp)
+    return np.array(output)
+
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    dists = distance_matrix(data_test, data_train)
+    result_test = knn_predict(dists, labels_train, k)
+
+    # Accuracy: fraction of test images whose label was predicted correctly
+    N = labels_test.shape[0]
+    accuracy = (labels_test == result_test).sum() / N
+    return accuracy
+
+# def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+#     dists = distance_matrix(data_test, data_train)
+#     # Determine the number of images in data_test
+#     tot = len(data_test)
+#     accurate = 0
+#     predicted_labels = knn_predict(dists, labels_train, k)
+#     # Count the number of images in data_test whose label has been estimated correctly
+#     for i in range(tot):
+#         if predicted_labels[i] == labels_test[i]:
+#             accurate += 1
+#     # Calculate the classification rate
+#     accuracy = accurate/tot
+#     return accuracy
+
+
+if __name__ == "__main__":
+    path = r'data\cifar-10-batches-py\data_batch_1'
+    main_path = r'data\cifar-10-batches-py'
+    data, labels = read_cifar_batch(path)
+    data, labels = read_cifar(main_path)
+    data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
+
+    print(labels_test)
+
+    dists = distance_matrix(data_test, data_train)
+    #print(dists)
+
+    r = knn_predict(dists, labels_train, 10)
+    accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, 10)
+    print(r)
+    print(accuracy)
+
+
+# data, labels = read_cifar(r'data\cifar-10-batches-py')
+
+
+# data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
+
+# k = 3
+# accuracies = []
+
+# accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, k)
+# accuracies.append(accuracy)
+
+# print(accuracies)
\ No newline at end of file
diff --git a/read_cifar.py b/read_cifar.py
index 64ccc60a17def3c88704fb6f53fa07c3bac2b29d..c2a03180445946798ed09c380dc89a8170abaf81 100644
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -7,7 +7,6 @@ Created on Thu Nov 7 08:45:09 2024
 
 import numpy as np
 import pickle
-from sklearn.model_selection import train_test_split
 
 
 
@@ -22,7 +21,7 @@ def read_cifar_batch(file):
     #     b'labels',
     #     b'data',
     #     b'filenames']
-    return (np.array(dict[b'data']).astype('float32'), np.array(dict[b'labels']).astype('int64') )
+    return (np.array(dict[b'data']).astype('float32'), np.array(dict[b'labels']).astype('int64'))
 
 def read_cifar(path):
     data = []
@@ -43,7 +42,7 @@ def read_cifar(path):
     data = np.concatenate(data, axis = 0)
     labels = np.concatenate(labels, axis = 0)
 
-    return(data, labels)
+    return (data, labels)
 
 
 
@@ -59,16 +58,16 @@ def split_dataset(data, labels, split):
     data_train, data_test = data[train_idx,:].astype(np.float32), data[test_idx,:].astype(np.float32)
     labels_train, labels_test = labels[train_idx].astype(np.int64), labels[test_idx].astype(np.int64)
 
-    return (data_train, data_test, labels_train, labels_test)
+    return data_train, data_test, labels_train, labels_test
 
 
 if __name__ == "__main__":
-    path = 'data\cifar-10-batches-py\data_batch_1'
-    main_path = 'data\cifar-10-batches-py'
-    data, labels = read_cifar_batch(path)
-    data, labels = read_cifar(main_path)
-    X_train, X_test, y_train, y_test = split_dataset(data, labels, 0.9)
-    print(X_train, X_test, y_train, y_test)
-    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
+    path = r'data\cifar-10-batches-py\data_batch_1'
+    main_path = r'data\cifar-10-batches-py'
+    data, labels = read_cifar_batch(path)
+    data, labels = read_cifar(main_path)
+    data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
+    #print(data_train, data_test, labels_train, labels_test)
+    #print(data_train.shape, labels_train.shape, data_test.shape, labels_test.shape)
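
The commented-out block at the end of knn.py sketches a sweep over k that was never finished. Below is a minimal sketch of that experiment, assuming the functions from this diff and the same on-disk data layout; the k range and the output filename are illustrative placeholders, not part of the change:

```python
import matplotlib.pyplot as plt

from read_cifar import read_cifar, split_dataset
from knn import evaluate_knn

# Load the full CIFAR-10 set and hold out 10% for testing,
# mirroring the 0.9 split used in knn.py's __main__ block.
data, labels = read_cifar(r'data\cifar-10-batches-py')
data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)

# Evaluate k-NN accuracy for a range of k values (range chosen arbitrarily here).
ks = list(range(1, 21))
accuracies = [evaluate_knn(data_train, labels_train, data_test, labels_test, k)
              for k in ks]

plt.plot(ks, accuracies, marker='o')
plt.xlabel('k')
plt.ylabel('accuracy')
plt.title('k-NN accuracy on CIFAR-10 vs. k')
plt.savefig('knn_accuracy.png')  # hypothetical output path
```

Note that each evaluate_knn call recomputes the full test/train distance matrix, which dominates the runtime; a cheaper variant would compute dists = distance_matrix(data_test, data_train) once and call knn_predict(dists, labels_train, k) inside the loop.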