diff --git a/knn.py b/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..5be781595258eadba054e7c5d0cf1e7891eecc95 --- /dev/null +++ b/knn.py @@ -0,0 +1,80 @@ +import numpy as np +import matplotlib.pyplot as plt +import os + +def distance_matrix(A, B): + A_square = np.sum(np.square(A), axis=1) + B_square = np.sum(np.square(B), axis=1) + + A_2 = A_square[:, None] + B_2 = B_square[None, :] + + dists = np.sqrt(A_2 + B_2 - 2 * np.dot(A, B.T)) + + return dists + +def knn_predict(dists, labels_train, k): + + num_test = dists.shape[0] + predicted_labels = np.zeros(num_test, dtype=int) + + for i in range(num_test): + # Find the indices of the k-nearest neighbors for the i-th test example + nearest_neighbor_indices = np.argsort(dists[i])[:k] + + # Get the labels of the k-nearest neighbors + k_nearest_labels = labels_train[nearest_neighbor_indices] + + # Count the occurrences of each label and select the most common one + unique_labels, counts = np.unique(k_nearest_labels, return_counts=True) + most_common_label = unique_labels[np.argmax(counts)] + + predicted_labels[i] = most_common_label + + + return predicted_labels + + +def evaluate_knn(data_train, labels_train, data_test, labels_test, k): + # Compute the distance matrix + dists = distance_matrix(data_train, data_test) + + # Predict labels for the test data using k-nearest neighbors + predicted_labels = knn_predict(dists, labels_train, k) + + # Calculate accuracy + y_pred = knn_predict(dists, labels_train, k) + accuracy = np.mean(y_pred == labels_test) + return accuracy + + +def plot_accuracy_vs_k(data_train, labels_train, data_test, labels_test, split_factor=0.9): + k_values = list(range(1, 21)) + accuracies = [] + + for k in k_values: + accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, k) + accuracies.append(accuracy) + + # Create the "results" directory if it doesn't exist + os.makedirs("results", exist_ok=True) + + plt.plot(k_values, accuracies) + plt.xlabel('k') + plt.ylabel('Accuracy') + plt.title('Accuracy vs. k for KNN') + plt.grid(True) + plt.savefig('results/knn.png') + plt.show() + + +if __name__ == "__main__": + # Load your data and split it into data_train, labels_train, data_test, and labels_test + data_train = np.random.rand(100, 2) # Replace with your actual data + labels_train = np.random.randint(0, 2, 100) # Replace with your actual labels + + # Generate test data and labels with the same number of samples as data_train + data_test = np.random.rand(100, 2) + labels_test = np.random.randint(0, 2, 100) + + plot_accuracy_vs_k(data_train, labels_train, data_test, labels_test)