diff --git a/knn.py b/knn.py
index 72ea8fda995de289b7a7e3d7c3071bf73bf09487..0be4178eaf8d12e447b0e66e298ccff85b04f52b 100644
--- a/knn.py
+++ b/knn.py
@@ -4,4 +4,116 @@ Created on Fri Oct 20 17:39:37 2023
 
 @author: oscar
 """
 
+import statistics
+from statistics import mode
+
+import numpy as np
+
+import read_cifar
+
+
+def distance_matrix(A, B):
+    """Return the pairwise Euclidean distances between rows of A and B.
+
+    Uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the whole
+    matrix is obtained from one matrix product instead of nested loops.
+
+    Args:
+        A: NumPy array of shape (n_a, d), one sample per row.
+        B: NumPy array of shape (n_b, d), one sample per row.
+
+    Returns:
+        NumPy array of shape (n_a, n_b); entry (i, j) is the distance
+        between A[i] and B[j].
+    """
+    sum_of_squares_a = np.sum(np.square(A), axis=1)
+    sum_of_squares_b = np.sum(np.square(B), axis=1)
+    dot_product = np.dot(A, B.T)
+
+    # Turn the A norms into a column so they broadcast against the row of
+    # B norms and line up with the (n_a, n_b) dot-product matrix.
+    squared_dists = (
+        sum_of_squares_a[:, np.newaxis] + sum_of_squares_b - 2 * dot_product
+    )
+
+    # Rounding can leave tiny negative values where the true distance is
+    # zero; clamp them so the square root never yields NaN.
+    return np.sqrt(np.maximum(squared_dists, 0))
+
+
+def knn_predict(dists, labels_train, k):
+    """Predict test labels by majority vote among the k nearest neighbours.
+
+    Args:
+        dists: NumPy array of shape (n_test, n_train); row i holds the
+            distances from test sample i to every training sample.
+        labels_train: NumPy array of shape (n_train,) with training labels.
+        k: Number of neighbours that vote; must be at least 1.
+
+    Returns:
+        NumPy array of shape (n_test,) with the predicted labels, using the
+        dtype of labels_train. Ties are resolved by statistics.mode.
+
+    Raises:
+        ValueError: If k is smaller than 1.
+    """
+    if k < 1:
+        raise ValueError("k must be at least 1")
+
+    labels_predicted = np.empty(dists.shape[0], dtype=labels_train.dtype)
+    for i, row in enumerate(dists):
+        knn_indices = np.argsort(row)[:k]
+        labels_predicted[i] = mode(labels_train[knn_indices])
+
+    return labels_predicted
+
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    """Return the fraction of test samples that k-NN classifies correctly.
+
+    Args:
+        data_train: NumPy array of shape (n_train, d) of training samples.
+        labels_train: NumPy array of shape (n_train,) of training labels.
+        data_test: NumPy array of shape (n_test, d) of test samples.
+        labels_test: NumPy array of shape (n_test,) of true test labels.
+        k: Number of neighbours used for the vote.
+
+    Returns:
+        Classification rate in [0, 1].
+    """
+    # Test samples go first so that each row of dists describes one test
+    # sample, which is the layout knn_predict expects.
+    dists = distance_matrix(data_test, data_train)
+    labels_predicted = knn_predict(dists, labels_train, k)
+    number_correct = np.sum(labels_test == labels_predicted)
+    return number_correct / labels_test.shape[0]
+
+
+if __name__ == "__main__":
+    # Tiny smoke test: the only test point is closest to the second
+    # training point, whose label matches, so the expected rate is 1.0.
+    classification_rate = evaluate_knn(
+        np.array([[1, 27], [100, 300]]),
+        np.array([0.002, 9000]),
+        np.array([[25, 350]]),
+        np.array([9000]),
+        1,
+    )
+    print("Classification rate:")
+    print(classification_rate)
+
+    # Stand-alone check of knn_predict on a hand-made distance matrix:
+    # dists = np.array([[1000, 2, 3],
+    #                   [4, 0.1, 6],
+    #                   [7, 8, 0]])
+    # labels_train = np.array([0, 1, 5])
+    # predicted_labels = knn_predict(dists, labels_train, 2)
+
+    # Full CIFAR-10 run:
+    # file = "./data/cifar-10-python/"
+    # data, labels = read_cifar.read_cifar(file)
+    # data_train, labels_train, data_test, labels_test = (
+    #     read_cifar.split_dataset(data, labels, 0.8))
+    # dists = distance_matrix(data_test, data_train)
+    # predicted_labels = knn_predict(dists, labels_train, 2)
diff --git a/read_cifar.py b/read_cifar.py
index cc2befce425e2c86d22f92388ac6dd09700c5a5e..6d5369d0cb4811936d948940ff6bd82e1f671615 100644
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -60,4 +60,4 @@ def split_dataset(data, labels, split) :
 if __name__ == "__main__":
     file = "./data/cifar-10-python/"
     data, labels = read_cifar(file)
-    res = split_dataset(data, labels, 0.8)
+    data_train, labels_train, data_test, labels_test = split_dataset(data, labels, 0.8)