# knn.py
# K-nearest-neighbors
# 1. function distance_matrix
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize.slsqp import concatenate
from read_cifar import read_cifar, split_dataset
# Location of the dataset directory; passed to read_cifar in the __main__ section.
path = "data/cifar-10-batches-py/"
# helper that computes the pairwise Euclidean distances between the rows of
# two matrices (the matrices only need to share their number of columns)
def equal_shape_distance_matrix(X, V):
    """
    compute the pairwise Euclidean distances between the rows of two matrices
    :param
        X: first matrix, shape (n, d)
        V: second matrix, shape (m, d)
    :return:
        an (n, m) matrix whose entry (i, j) is the Euclidean distance
        between X[i] and V[j]
    """
    X = np.asarray(X)
    V = np.asarray(V)
    # Work in floating point so integer inputs cannot overflow; floating
    # inputs keep their own precision.
    dtype = np.result_type(X.dtype, V.dtype, np.float32)
    X = X.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    # ||x - v||^2 = ||x||^2 + ||v||^2 - 2 * x.v, where the squared norms are
    # per-row sums (not the full Gram matrices X.X^T and V.V^T).
    x_sq = np.einsum("ij,ij->i", X, X)
    v_sq = np.einsum("ij,ij->i", V, V)
    squared = x_sq[:, np.newaxis] + v_sq[np.newaxis, :] - 2 * np.dot(X, V.transpose())

    # Rounding can leave tiny negative values; clamp them before the sqrt.
    np.maximum(squared, 0, out=squared)
    return np.sqrt(squared)
# distance_matrix function between two matrices of any shape
def distance_matrix(data_train, data_test):
    """
    compute the Euclidean distances between every test sample and every training sample
    :param
        data_train: the training data, one sample per row, shape (n_train, d)
        data_test: the test data, one sample per row, shape (n_test, d)
    :return:
        dist: matrix of shape (n_test, n_train); dist[i, j] is the Euclidean
        distance between data_test[i] and data_train[j]
    """
    data_train = np.asarray(data_train)
    data_test = np.asarray(data_test)
    # Work in floating point so integer inputs cannot overflow; floating
    # inputs keep their own precision.
    dtype = np.result_type(data_train.dtype, data_test.dtype, np.float32)
    data_train = data_train.astype(dtype, copy=False)
    data_test = data_test.astype(dtype, copy=False)

    # Per-row squared norms; einsum avoids a temporary as large as the data.
    train_sq = np.einsum("ij,ij->i", data_train, data_train)
    test_sq = np.einsum("ij,ij->i", data_test, data_test)

    # ||t - x||^2 = ||t||^2 + ||x||^2 - 2 * t.x, accumulated in place so only
    # one (n_test, n_train) array is ever allocated. Rows index test samples
    # and columns index ALL training samples (no remainder is dropped).
    dist = np.dot(data_test, data_train.transpose())
    dist *= -2
    dist += test_sq[:, np.newaxis]
    dist += train_sq[np.newaxis, :]

    # Rounding can leave tiny negative values; clamp them before the sqrt.
    np.maximum(dist, 0, out=dist)
    np.sqrt(dist, out=dist)
    return dist
# 2. the function knn_predict
def knn_predict(labels_train, dists, k):
    """
    compute the predicted labels for the test data by majority vote among the k nearest neighbors
    :param
        labels_train: non-negative integer labels of the training data, indexed
            like the columns of dists
        dists: distance matrix; row i holds the distances between test sample i
            and the training samples
        k: number of neighbors (must be >= 1)
    :return:
        labels_predicted: float array with one predicted label per row of dists;
        on a tied vote the smallest label wins (np.argmax returns the first maximum)
    :raises ValueError: if k is smaller than 1
    """
    # A non-positive k would either vote on nothing (k == 0) or silently
    # slice off the wrong neighbors (k < 0), so reject it up front.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # Accept plain sequences as well as arrays.
    labels_train = np.asarray(labels_train)
    dists = np.asarray(dists)

    # we initialize the array of predicted labels
    num_test = dists.shape[0]
    labels_predicted = np.zeros(num_test)
    for i in range(num_test):
        # indices of the k training samples with the smallest distances
        nearest = np.argsort(dists[i])[:k]
        # majority vote over the labels of those k neighbors
        labels_predicted[i] = np.argmax(np.bincount(labels_train[nearest]))
    return labels_predicted
# 4. evaluate_knn
def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
    """
    measure how often the k-nearest-neighbors prediction matches the true labels
    :param
        data_train: the data of the training set
        labels_train: the labels of data_train
        data_test: the data of the test set
        labels_test: the true labels of the test set
        k: number of neighbors
    :return:
        accuracy: fraction of predictions equal to labels_test
    """
    # distances first, then the vote among the k closest training samples
    pairwise = distance_matrix(data_train, data_test)
    predicted = knn_predict(labels_train, pairwise, k)

    # the number of rows of the distance matrix is the number of predictions
    total = pairwise.shape[0]
    hits = np.sum(predicted == labels_test)

    accuracy = float(hits) / total
    print("Got %d / %d correct, accuracy is : %f" % (hits, total, accuracy))
    return accuracy
if __name__ == "__main__":
    # load data and split it into train and test
    data, labels = read_cifar(path)
    # we choose the split factor 0.9
    data_train, data_test, labels_train, labels_test = split_dataset(data, labels, 0.9)
    print(data_test.shape)

    # we keep only the first num_test test samples to prevent memory issues
    # (a slice also copes with a test set smaller than num_test)
    num_test = 2000
    data_test = data_test[:num_test]
    labels_test = labels_test[:num_test]

    # we compute the accuracy for k from 1 to 20 inclusive
    # (range excludes its stop value, hence 21)
    # NOTE: evaluate_knn calls distance_matrix on every call, so the distances
    # are recomputed for each k.
    Ks = list(range(1, 21))
    accuracies = [
        evaluate_knn(data_train, labels_train, data_test, labels_test, k) for k in Ks
    ]

    # we plot the variation of the accuracy as a function of k and save it as knn.png
    plt.plot(Ks, accuracies, "o")
    plt.title("Accuracy vs K")
    plt.savefig("knn.png", bbox_inches="tight")
    plt.show()