# knn.py

import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import plotly.graph_objects as go


# Create distance matrix
def distance_matrix(X, Y):
    """Compute the pairwise L2 (Euclidean) distances between the rows of X and Y.

    The computation is done solely through matrix manipulations, using the
    expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * x.y (no Python loops).

    Arguments:
    - X: 2-D array-like, one sample per row.
    - Y: 2-D array-like, one sample per row, with the same number of columns as X.

    Returns:
    - dists: array of shape (X.shape[0], Y.shape[0]) where dists[i, j] is the
      Euclidean distance between X[i] and Y[j].
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    # Integer inputs (e.g. uint8 pixel values) would overflow when squared,
    # so promote anything that is not already floating point.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)
    if not np.issubdtype(Y.dtype, np.floating):
        Y = Y.astype(np.float64)

    XX = np.sum(X ** 2, axis=1, keepdims=True)
    YY = np.sum(Y ** 2, axis=1, keepdims=True)
    sq_dists = XX + YY.T - 2 * (X @ Y.T)

    # Rounding errors can push near-zero squared distances slightly below
    # zero, which would turn into NaN under the square root.
    np.maximum(sq_dists, 0, out=sq_dists)
    return np.sqrt(sq_dists)

# KNN predict
def knn_predict(dists, labels_train, k):
    """Predict a label for every test sample by majority vote of its k nearest neighbors.

    Arguments:
    - dists: 2-D distance matrix with one row per test sample and one column
      per training sample.
    - labels_train: training labels, non-negative integers (required by
      np.bincount); one label per column of dists.
    - k: the number of neighbors (must be >= 1). If k exceeds the number of
      training samples, all of them vote.

    Returns:
    - Array of dtype int64 with one predicted label per row of dists. Ties
      in the vote are resolved in favor of the smallest label.

    Raises:
    - ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    dists = np.asarray(dists)
    # Accept plain lists and column vectors: fancy indexing and np.bincount
    # below both need a flat array.
    labels_train = np.asarray(labels_train).ravel()

    n_test = dists.shape[0]
    y_pred = np.zeros(n_test, dtype=np.int64)
    for i in range(n_test):
        nearest = np.argsort(dists[i])[:k]
        votes = np.bincount(labels_train[nearest])
        # argmax returns the first maximum, i.e. the smallest most-voted label.
        y_pred[i] = np.argmax(votes)
    return y_pred

# evaluate_knn
def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
    """Evaluate a k-nearest-neighbors classifier on a test set.

    Arguments:
    - data_train: training data.
    - labels_train: corresponding labels.
    - data_test: test data.
    - labels_test: corresponding labels.
    - k: the number of neighbors.

    Returns:
    - Accuracy of the KNN model: the fraction of test samples whose predicted
      label equals the label in labels_test.

    Raises:
    - ValueError: if labels_test does not contain exactly one label per
      prediction.
    """
    # Rows of the distance matrix are test samples, columns are training samples.
    dists = distance_matrix(data_test, data_train)
    y_pred = knn_predict(dists, labels_train, k)

    # Flatten so that column-vector labels of shape (n, 1) are compared
    # element-wise instead of being broadcast against y_pred to (n, n),
    # which would silently yield a wrong accuracy.
    labels_test = np.asarray(labels_test).ravel()
    if labels_test.shape != y_pred.shape:
        raise ValueError(
            f"labels_test has {labels_test.shape[0]} entries but "
            f"{y_pred.shape[0]} predictions were made"
        )

    accuracy = np.mean(y_pred == labels_test)
    return accuracy

# Plot accuracy of KNN model
def plot_KNN(X_train, y_train, X_test, y_test, max_k=20):
    """Plot the variation of accuracy with the number of neighbors K.

    Evaluates the KNN model for every k from 1 to max_k (inclusive), draws
    accuracy against k on the current matplotlib figure and saves it to
    "Results/knn.png".

    Arguments:
    - X_train: training data.
    - y_train: training labels.
    - X_test: test data.
    - y_test: test labels.
    - max_k: largest number of neighbors to evaluate.
    """
    neighbors = np.arange(1, max_k + 1)
    accuracies = [
        evaluate_knn(X_train, y_train, X_test, y_test, k) for k in neighbors
    ]

    plt.plot(neighbors, accuracies, 'b-o')
    plt.xlabel('K')
    plt.ylabel('Accuracy')
    plt.title('Variation of Accuracy with K')

    # savefig does not create missing directories; make sure the output
    # folder exists so the call does not fail with FileNotFoundError.
    os.makedirs("Results", exist_ok=True)
    plt.savefig("Results/knn.png")