    # K-nearest-neighbors
    
    # 1. function distance_matrix
    
    import matplotlib.pyplot as plt
    import numpy as np
    
    from read_cifar import read_cifar, split_dataset
    
    path = "data/cifar-10-batches-py/"
    
    
    # first we write the function that computes the squared Euclidean distances
    # between the rows of two matrices; we will use it on two matrices with equal shape
    def equal_shape_distance_matrix(X, V):
        """
        compute the pairwise squared Euclidean distances between the rows of two
        matrices with the same shape

        :param
           X: first matrix, of shape (n, d)
           V: second matrix, of shape (n, d)
        :return:
            the matrix of shape (n, n) whose entry (i, j) is ||X[i] - V[j]||^2,
            using the expansion ||x - v||^2 = ||x||^2 + ||v||^2 - 2 x.v
            (the square root is omitted because it does not change the neighbor ordering)
        """
        # work in float64 so that squaring raw uint8 pixel values cannot overflow
        X = X.astype(np.float64)
        V = V.astype(np.float64)
        return (
            np.sum(X**2, axis=1)[:, np.newaxis]
            + np.sum(V**2, axis=1)[np.newaxis, :]
            - 2 * np.dot(X, V.transpose())
        )
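

    # A small sanity-check sketch (not part of the original script): it compares the
    # vectorised formula above against a naive pair-by-pair computation with
    # np.linalg.norm on random data. The helper name _check_equal_shape_distance_matrix
    # and the array sizes are illustrative choices, not requirements of the assignment.
    def _check_equal_shape_distance_matrix(n=5, d=3, seed=0):
        rng = np.random.default_rng(seed)
        X = rng.random((n, d))
        V = rng.random((n, d))
        # reference: squared Euclidean distances computed one pair at a time
        reference = np.array(
            [[np.linalg.norm(X[i] - V[j]) ** 2 for j in range(n)] for i in range(n)]
        )
        assert np.allclose(equal_shape_distance_matrix(X, V), reference)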
    
    
    # distance_matrix function between two matrices of any shape
    
    
    def distance_matrix(data_train, data_test):
        """
        compute the Euclidean distance matrix between the training and test data

        :param
           data_train: the matrix that contains the training data
           data_test: the matrix that contains the test data
        :return:
            dist: matrix of shape (num_test, num_train) whose entry (i, j) is the
            squared Euclidean distance between data_test[i] and data_train[j]
        """
        # the training set is processed in chunks of data_test.shape[0] rows, so that
        # equal_shape_distance_matrix always receives two matrices of the same shape
        # (this assumes data_train.shape[0] is a multiple of data_test.shape[0])
        dist = equal_shape_distance_matrix(data_test, data_train[: data_test.shape[0]])

        # distances between the test set and the remaining chunks of the training data
        p = int(data_train.shape[0] / data_test.shape[0])
        for i in range(1, p):
            sub_dist = equal_shape_distance_matrix(
                data_test, data_train[data_test.shape[0] * i : data_test.shape[0] * (i + 1)]
            )
            dist = np.concatenate((dist, sub_dist), axis=1)
        return dist
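

    # An illustrative check of the chunked computation (hypothetical helper, not part of
    # the original script): when the training-set size is a multiple of the test-set size,
    # distance_matrix should return a (num_test, num_train) matrix matching a naive loop.
    def _check_distance_matrix(num_train=8, num_test=4, d=3, seed=1):
        rng = np.random.default_rng(seed)
        train = rng.random((num_train, d))
        test = rng.random((num_test, d))
        dists = distance_matrix(train, test)
        assert dists.shape == (num_test, num_train)
        for i in range(num_test):
            for j in range(num_train):
                assert np.isclose(dists[i, j], np.sum((test[i] - train[j]) ** 2))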
    
    
    # 2. the function knn_predict
    
    
    def knn_predict(labels_train, dists, k):
        """
        compute the predicted labels for the test data

        :param
           labels_train: the labels of the training data
           dists: the distance matrix of shape (num_test, num_train) between the test
                  data and the training data
           k: number of neighbors
        :return:
            labels_predicted: the predicted labels for the test data
        """
        # initialize the vector of predicted labels
        num_test = dists.shape[0]
        labels_predicted = np.zeros(num_test, dtype=int)
        for i in range(num_test):
            # indices of the training points sorted by increasing distance to test point i
            sorted_dist = np.argsort(dists[i])

            # labels of the k training points closest to test point i
            closest_labels = labels_train[sorted_dist[0:k]].astype(int)

            # predicted label: majority vote among the k nearest neighbors
            labels_predicted[i] = np.argmax(np.bincount(closest_labels))
        return labels_predicted
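

    # A toy illustration of the majority vote (hypothetical data, not from CIFAR-10):
    # with a single test point, three training points labelled [0, 1, 1] and k = 2, the
    # two nearest neighbors both carry label 1, so knn_predict should return 1.
    def _demo_knn_predict():
        labels_train = np.array([0, 1, 1])
        # distances from the single test point to the three training points
        dists = np.array([[0.9, 0.1, 0.2]])
        assert knn_predict(labels_train, dists, k=2)[0] == 1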
    
    
    # 4. evaluate_knn
    
    
    def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
        """
        evaluate the accuracy of our prediction model
    
        :param
           data_train: the data of the training set
           labels_train: the labels of the data_train
           data_test: the data of the test set
           labels_test: the actual labels (true labels) for the test set
       k: number of neighbors
        :return:
            accuracy: the accuracy of the model
        """
    
        # compute the distance matrix and the predicted labels
        dists = distance_matrix(data_train, data_test)
        y_test_pred = knn_predict(labels_train, dists, k)
    
        # total number of predictions
        num_test = dists.shape[0]
        # number of correct predictions
        correct = np.sum(y_test_pred == labels_test)
        # accuracy
        accuracy = float(correct) / num_test
        print("Got %d / %d correct, accuracy is : %f" % (correct, num_test, accuracy))
        return accuracy
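

    # An end-to-end sketch on synthetic data (the two-cluster layout and the sizes are
    # illustrative assumptions, not CIFAR-10): two well-separated clusters should be
    # classified almost perfectly with k = 1, so the returned accuracy should be close
    # to 1. Note that the training-set size (40) is a multiple of the test-set size (20),
    # as distance_matrix assumes.
    def _demo_evaluate_knn(seed=2):
        rng = np.random.default_rng(seed)
        cluster_0 = rng.normal(loc=0.0, scale=0.1, size=(30, 2))
        cluster_1 = rng.normal(loc=5.0, scale=0.1, size=(30, 2))
        data = np.concatenate((cluster_0, cluster_1))
        labels = np.array([0] * 30 + [1] * 30)
        # first 20 points of each cluster for training, last 10 of each for testing
        train_idx = np.concatenate((np.arange(0, 20), np.arange(30, 50)))
        test_idx = np.concatenate((np.arange(20, 30), np.arange(50, 60)))
        return evaluate_knn(data[train_idx], labels[train_idx], data[test_idx], labels[test_idx], k=1)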
    
    
    if __name__ == "__main__":
    
        # load data and split it into train and test
    
        data, labels = read_cifar(path)
    
        # we choose the split factor 0.9
        data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
    
        print(data_test.shape)
    
        # we reduce the size of the test set to prevent memory issues
    
        num_test = 2000
        mask = list(range(num_test))
        data_test = data_test[mask]
        labels_test = labels_test[mask]
    
        # we compute the accuracy for k from 1 to 20
    
        Ks = []
        accuracies = []
    
        for k in range(1, 21):
            accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, k)
            Ks.append(k)
            accuracies.append(accuracy)
    
        # we plot the variation of the accuracy as a function of k and save it as knn.png
    
        plt.plot(Ks, accuracies, "o")
        plt.title("Accuracy vs K")
        plt.xlabel("k")
        plt.ylabel("accuracy")
        plt.savefig("knn.png", bbox_inches="tight")
        plt.show()