    # K-nearest-neighbors
    
    # 1. function distance_matrix
    
    import matplotlib.pyplot as plt
    import numpy as np
    
    from read_cifar import read_cifar, split_dataset
    
    path = "data/cifar-10-batches-py/"
    
    
    # first we write the function that computes the squared Euclidean distances
    # between the rows of two matrices; we will use it on two matrices with equal shape
    def equal_shape_distance_matrix(X, V):
        """
        compute the pairwise squared Euclidean distances between the rows of two
        matrices with the same shape

        :param
           X: first matrix, of shape (n, d)
           V: second matrix, of shape (n, d)
        :return:
            the matrix of shape (n, n) whose entry (i, j) is ||X[i] - V[j]||^2,
            using the expansion ||x - v||^2 = ||x||^2 + ||v||^2 - 2 x.v
            (the square root is omitted because it does not change the neighbor ordering)
        """
        # work in float64 so that squaring raw uint8 pixel values cannot overflow
        X = X.astype(np.float64)
        V = V.astype(np.float64)
        return (
            np.sum(X**2, axis=1)[:, np.newaxis]
            + np.sum(V**2, axis=1)[np.newaxis, :]
            - 2 * np.dot(X, V.transpose())
        )
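

    # A small sanity-check sketch (not part of the original script): it compares the
    # vectorised formula above against a naive pair-by-pair computation with
    # np.linalg.norm on random data. The helper name _check_equal_shape_distance_matrix
    # and the array sizes are illustrative choices, not requirements of the assignment.
    def _check_equal_shape_distance_matrix(n=5, d=3, seed=0):
        rng = np.random.default_rng(seed)
        X = rng.random((n, d))
        V = rng.random((n, d))
        # reference: squared Euclidean distances computed one pair at a time
        reference = np.array(
            [[np.linalg.norm(X[i] - V[j]) ** 2 for j in range(n)] for i in range(n)]
        )
        assert np.allclose(equal_shape_distance_matrix(X, V), reference)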
    
    
    # distance_matrix function between two matrices of any shape
    
    
    def distance_matrix(data_train, data_test):
        """
        compute the Euclidean distance matrix between the training and test data

        :param
           data_train: the matrix that contains the training data
           data_test: the matrix that contains the test data
        :return:
            dist: matrix of shape (num_test, num_train) whose entry (i, j) is the
            squared Euclidean distance between data_test[i] and data_train[j]
        """
        # the training set is processed in chunks of data_test.shape[0] rows, so that
        # equal_shape_distance_matrix always receives two matrices of the same shape
        # (this assumes data_train.shape[0] is a multiple of data_test.shape[0])
        dist = equal_shape_distance_matrix(data_test, data_train[: data_test.shape[0]])

        # distances between the test set and the remaining chunks of the training data
        p = int(data_train.shape[0] / data_test.shape[0])
        for i in range(1, p):
            sub_dist = equal_shape_distance_matrix(
                data_test, data_train[data_test.shape[0] * i : data_test.shape[0] * (i + 1)]
            )
            dist = np.concatenate((dist, sub_dist), axis=1)
        return dist
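

    # An illustrative check of the chunked computation (hypothetical helper, not part of
    # the original script): when the training-set size is a multiple of the test-set size,
    # distance_matrix should return a (num_test, num_train) matrix matching a naive loop.
    def _check_distance_matrix(num_train=8, num_test=4, d=3, seed=1):
        rng = np.random.default_rng(seed)
        train = rng.random((num_train, d))
        test = rng.random((num_test, d))
        dists = distance_matrix(train, test)
        assert dists.shape == (num_test, num_train)
        for i in range(num_test):
            for j in range(num_train):
                assert np.isclose(dists[i, j], np.sum((test[i] - train[j]) ** 2))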
    
    
    # 2. the function knn_predict
    
    
    def knn_predict(labels_train, dists, k):
        """
        compute the predicted labels for the test data

        :param
           labels_train: the labels of the training data
           dists: the distance matrix of shape (num_test, num_train) between the test
                  data and the training data
           k: number of neighbors
        :return:
            labels_predicted: the predicted labels for the test data
        """
        # initialize the vector of predicted labels
        num_test = dists.shape[0]
        labels_predicted = np.zeros(num_test, dtype=int)
        for i in range(num_test):
            # indices of the training points sorted by increasing distance to test point i
            sorted_dist = np.argsort(dists[i])

            # labels of the k training points closest to test point i
            closest_labels = labels_train[sorted_dist[0:k]].astype(int)

            # predicted label: majority vote among the k nearest neighbors
            labels_predicted[i] = np.argmax(np.bincount(closest_labels))
        return labels_predicted
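

    # A toy illustration of the majority vote (hypothetical data, not from CIFAR-10):
    # with a single test point, three training points labelled [0, 1, 1] and k = 2, the
    # two nearest neighbors both carry label 1, so knn_predict should return 1.
    def _demo_knn_predict():
        labels_train = np.array([0, 1, 1])
        # distances from the single test point to the three training points
        dists = np.array([[0.9, 0.1, 0.2]])
        assert knn_predict(labels_train, dists, k=2)[0] == 1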
    
    
    # 4. evaluate_knn
    
    
    def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
        """
        evaluate the accuracy of our prediction model
    
        :param
           data_train: the data of the training set
           labels_train: the labels of the data_train
           data_test: the data of the test set
           labels_test: the actual labels (true labels) for the test set
       k: number of neighbors
        :return:
            accuracy: the accuracy of the model
        """
    
        # compute the distance matrix and the predicted labels
        dists = distance_matrix(data_train, data_test)
        y_test_pred = knn_predict(labels_train, dists, k)
    
        # total number of predictions
        num_test = dists.shape[0]
        # number of correct predictions
        correct = np.sum(y_test_pred == labels_test)
        # accuracy
        accuracy = float(correct) / num_test
        print("Got %d / %d correct, accuracy is : %f" % (correct, num_test, accuracy))
        return accuracy
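

    # An end-to-end sketch on synthetic data (the two-cluster layout and the sizes are
    # illustrative assumptions, not CIFAR-10): two well-separated clusters should be
    # classified almost perfectly with k = 1, so the returned accuracy should be close
    # to 1. Note that the training-set size (40) is a multiple of the test-set size (20),
    # as distance_matrix assumes.
    def _demo_evaluate_knn(seed=2):
        rng = np.random.default_rng(seed)
        cluster_0 = rng.normal(loc=0.0, scale=0.1, size=(30, 2))
        cluster_1 = rng.normal(loc=5.0, scale=0.1, size=(30, 2))
        data = np.concatenate((cluster_0, cluster_1))
        labels = np.array([0] * 30 + [1] * 30)
        # first 20 points of each cluster for training, last 10 of each for testing
        train_idx = np.concatenate((np.arange(0, 20), np.arange(30, 50)))
        test_idx = np.concatenate((np.arange(20, 30), np.arange(50, 60)))
        return evaluate_knn(data[train_idx], labels[train_idx], data[test_idx], labels[test_idx], k=1)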
    
    
    if __name__ == "__main__":
    
        # load data and split it into train and test
    
        data, labels = read_cifar(path)
    
        # we choose the split factor 0.9
        data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
    
        print(data_test.shape)
    
        # we reduce the size of the test set to prevent memory issues
    
        num_test = 2000
        mask = list(range(num_test))
        data_test = data_test[mask]
        labels_test = labels_test[mask]
    
        # we compute the accuracy for k from 1 to 20
    
        Ks = []
        accuracies = []
    
        for k in range(1, 21):
            accuracy = evaluate_knn(data_train, labels_train, data_test, labels_test, k)
            Ks.append(k)
            accuracies.append(accuracy)
    
        # we plot the variation of the accuracy as a function of k and save it as knn.png
    
        plt.plot(Ks, accuracies, "o")
        plt.title("Accuracy vs K")
        plt.xlabel("k")
        plt.ylabel("accuracy")
        plt.savefig("knn.png", bbox_inches="tight")
        plt.show()