diff --git a/read_cifar.py b/read_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..963000d69a97751b5f45b275935cea06e179440f
--- /dev/null
+++ b/read_cifar.py
@@ -0,0 +1,97 @@
+import numpy as np
+import pickle
+from sklearn.model_selection import train_test_split
+import matplotlib.pyplot as plt
+
+
+def unpickle(file):
+    """Load one pickled CIFAR-10 batch file into a bytes-keyed dict."""
+    with open(file, 'rb') as fo:
+        batch = pickle.load(fo, encoding='bytes')
+    return batch
+
+
+def read_cifar_batch(path):
+    """Read one CIFAR-10 batch and return (data, labels) as numpy arrays."""
+    dictionary = unpickle(path)
+    data = np.array(dictionary[b'data'], dtype=np.float32)
+    labels = np.array(dictionary[b'labels'], dtype=np.int64)
+
+    return data, labels
+
+
+def read_cifar(path1, path2, path3, path4, path5, path6):
+    """Read six CIFAR-10 batches (the five training batches plus the test
+    batch) and concatenate them into a single (data, labels) pair."""
+    batches = [read_cifar_batch(p) for p in (path1, path2, path3, path4, path5, path6)]
+    data = np.concatenate([d for d, _ in batches], axis=0)
+    labels = np.concatenate([l for _, l in batches])
+
+    return data, labels
+
+
+def split_dataset(data, labels):
+    """Shuffle the dataset and split it, keeping 10% as the test set."""
+    data_train, data_test, labels_train, labels_test = train_test_split(
+        data, labels, shuffle=True, test_size=0.1)
+
+    return data_train, data_test, labels_train, labels_test
+
+
+def distance_matrix(data_test, data_train):
+    """Euclidean distance matrix between two 2-D arrays.
+
+    Receives data_test (N, k) and data_train (M, k) and returns a 2-D array
+    dists (N, M) such that dists[i, j] is the distance between the i-th
+    data_test row and the j-th data_train row; in short, each row holds the
+    distances from one test point to every training point.
+    """
+    dists = np.array([np.sum((data_train - row) ** 2, axis=1) ** 0.5
+                      for row in data_test])
+
+    return dists
+
+
+def knn_predict(dists, labels_train, k):
+    """Predict a class for each test point by majority vote among its k
+    nearest training points; dists is the (N, M) matrix from distance_matrix."""
+    classif = []
+
+    for test_row in dists:
+        # Pair each training distance with its label, then sort by distance.
+        distances = np.stack((test_row, labels_train), axis=1)
+        distances = distances[distances[:, 0].argsort()]
+        # Count how often each of the 10 classes appears among the k nearest.
+        count_array = [np.count_nonzero(distances[:k, 1] == i) for i in range(10)]
+        classif.append(np.argmax(count_array))
+
+    return np.array(classif, dtype=int)
+
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    """Return the k-NN classification accuracy on the test set, in percent."""
+    classif = knn_predict(distance_matrix(data_test, data_train), labels_train, k)
+    acc = np.count_nonzero(classif == labels_test) / labels_test.size
+
+    return acc * 100
+
+
+data, labels = read_cifar_batch('data_batch_1')
+print(data.shape, labels.shape)
+data_train, data_test, labels_train, labels_test = split_dataset(data, labels)
+print(data_train.shape, data_test.shape, labels_train.shape)
+dists = distance_matrix(data_test, data_train)
+print(dists.shape)
+
+# Accuracy for k = 1..20 neighbors.
+result = [evaluate_knn(data_train, labels_train, data_test, labels_test, k)
+          for k in range(1, 21)]
+
+x = np.arange(1, 21)
+
+# Plot accuracy as a function of k.
+plt.title("k-NN accuracy on CIFAR-10")
+plt.xlabel("k neighbors")
+plt.ylabel("Accuracy %")
+plt.plot(x, result, color="red")
+plt.show()
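
Note (not part of the diff): the per-test-row Python loop in distance_matrix becomes the slow step once several batches are loaded. Below is a minimal sketch of a vectorized alternative, assuming the same (N, k) test and (M, k) train float inputs; the name distance_matrix_vectorized is hypothetical and does not appear in the file above.

import numpy as np

def distance_matrix_vectorized(data_test, data_train):
    # Same (N, M) result as distance_matrix, built from the expansion
    # ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2 instead of a Python loop.
    test_sq = np.sum(data_test ** 2, axis=1)[:, np.newaxis]      # (N, 1)
    train_sq = np.sum(data_train ** 2, axis=1)[np.newaxis, :]    # (1, M)
    cross = data_test @ data_train.T                             # (N, M)
    squared = np.maximum(test_sq - 2.0 * cross + train_sq, 0.0)  # clamp rounding noise
    return np.sqrt(squared)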