diff --git a/read_cifar.py b/read_cifar.py
index 963000d69a97751b5f45b275935cea06e179440f..b88037c7d4e4e5c90269861522678a45059c02a6 100644
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -9,104 +9,123 @@ from sklearn.neighbors import KNeighborsRegressor
 import matplotlib.pyplot as plt
 
-def unpickle(file):
-    import pickle
-    with open(file, 'rb') as fo:
-        dict = pickle.load(fo, encoding='bytes')
-    return dict
-
-def read_cifar_batch(path):
-    dictionary = unpickle(path)
-    data = np.array(dictionary[b'data'], dtype = np.float32)
-    labels = np.array(dictionary[b'labels'], dtype = np.int64)
-
-    return data , labels
-
-def read_cifar(path1,path2,path3,path4,path5,path6):
-    data , labels = read_cifar_batch(path1)
-    dataAux , labelsAux = read_cifar_batch(path2)
-    data = np.concatenate((data,dataAux),0)
-    labels = np.concatenate((labels,labelsAux))
-    dataAux , labelsAux = read_cifar_batch(path3)
-    data = np.concatenate((data,dataAux),0)
-    labels = np.concatenate((labels,labelsAux))
-    dataAux , labelsAux = read_cifar_batch(path4)
-    data = np.concatenate((data,dataAux),0)
-    labels = np.concatenate((labels,labelsAux))
-    dataAux , labelsAux = read_cifar_batch(path5)
-    data = np.concatenate((data,dataAux),0)
-    labels = np.concatenate((labels,labelsAux))
-    dataAux , labelsAux = read_cifar_batch(path6)
-    data = np.concatenate((data,dataAux),0)
-    labels = np.concatenate((labels,labelsAux))
-
-    return data , labels
-
-
-
-def split_dataset(data , labels):
-
-    data_train, data_test , labels_train , labels_test = train_test_split(data, labels,shuffle = True ,test_size = 0.1)
-
-    return data_train, data_test , labels_train , labels_test
-
-
-def distance_matrix(data_test,data_train):
+#
+# ATTENTION : THIS CODE IS A CONCATENATION OF read_cifar.py AND knn.py
+#
-    dists = np.array([np.sum((data_train-l)**2,axis=1)**.5 for l in data_test])
-    return dists
-#receives a 2d array data_train(M,k) and a data_test (N,k),
-#returning a 2d array(N,M) such that dists[i,j] represents
-#the distance between the i-th data_test row and the j-th data_train row
-#in resume, each column represent a distance of a training point to all other
-
-def knn_predict(dists , labels_train , k):
-    #classif = np.array(0)
-    print(labels_train[:20])
-    print(labels_train.size)
-    classif = []
-
-    for testRows in dists.T:
-
-        distances = np.stack((testRows,labels_train),axis = 1)
-        distances = distances[distances[:, 0].argsort()]
-        #for picturesClasses in distances[:k,1]:
-        countArray = [np.count_nonzero(distances[:k,1]==i) for i in range(0,10)]
-        classif = np.append(classif,np.argmax(countArray))
-
-    classif = np.array(classif , dtype = int)
-
-    return classif
-
-def evaluate_knn(data_train,labels_train,data_test,labels_test,k):
-
-    classif = np.array(knn_predict(distance_matrix(data_train,data_test) , labels_train , k))
-    result = np.array(classif == labels_test)
-    acc = np.count_nonzero(result) / np.size(result)
-
-    return acc*100
+def unpickle(file):
+    """Unpack a CIFAR-10 batch stored as a pickle file.
+    It returns a dictionary with the dataset and its labels."""
+    import pickle
+    with open(file, "rb") as fo:
+        dict = pickle.load(fo, encoding="bytes")
+    return dict
 
-datas,labels = read_cifar_batch('data_batch_1')
-print(datas.shape,labels.shape)
-dataTrain,dataTest,labelsTrain,labelsTest = split_dataset(datas,labels)
-print(dataTrain.shape,dataTest.shape,labelsTrain.shape)
-distanceMatrix = distance_matrix(dataTrain,dataTest)
-print(distanceMatrix.shape)
+def read_cifar_batch(path):
+    """Take the path of a single batch as a string and return a matrix data of size
+    (batch_size x data_size) and a vector labels of size batch_size."""
+    dictionary = unpickle(path)
+    data = np.array(dictionary[b"data"], dtype=np.float32)
+    labels = np.array(dictionary[b"labels"], dtype=np.int64)
+
+    return data, labels
+
+
+def read_cifar(path1, path2, path3, path4, path5, path6):
+    """Take the paths of the six batches (five data_batch files and the test_batch)
+    as strings and return a matrix data of shape (n_samples x data_size)
+    and a vector labels of size n_samples."""
+    data, labels = read_cifar_batch(path1)
+    dataAux, labelsAux = read_cifar_batch(path2)
+    data = np.concatenate((data, dataAux), 0)
+    labels = np.concatenate((labels, labelsAux))
+    dataAux, labelsAux = read_cifar_batch(path3)
+    data = np.concatenate((data, dataAux), 0)
+    labels = np.concatenate((labels, labelsAux))
+    dataAux, labelsAux = read_cifar_batch(path4)
+    data = np.concatenate((data, dataAux), 0)
+    labels = np.concatenate((labels, labelsAux))
+    dataAux, labelsAux = read_cifar_batch(path5)
+    data = np.concatenate((data, dataAux), 0)
+    labels = np.concatenate((labels, labelsAux))
+    dataAux, labelsAux = read_cifar_batch(path6)
+    data = np.concatenate((data, dataAux), 0)
+    labels = np.concatenate((labels, labelsAux))
+
+    return data, labels
+
+
+def split_dataset(data, labels):
+    """Split the dataset into a shuffled 90% training set and 10% test set."""
+    data_train, data_test, labels_train, labels_test = train_test_split(
+        data, labels, shuffle=True, test_size=0.1
+    )
+
+    return data_train, data_test, labels_train, labels_test
+
+
+def distance_matrix(data_test, data_train):
+    """Take the matrices data_test (N, d) and data_train (M, d) and return a 2d array (N, M)
+    such that dists[i, j] is the Euclidean distance between the i-th data_test row and the j-th data_train row
+    """
+    dists = np.array([np.sum((data_train - l) ** 2, axis=1) ** 0.5 for l in data_test])
+
+    return dists
+
+
+def knn_predict(dists, labels_train, k):
+    """Take the distance matrix dists (training points along axis 0, test points along axis 1),
+    the training labels and the number of neighbours k; return the class of each test sample by majority vote.
+    """
+    # classif = np.array(0)
+    print(labels_train[:20])
+    print(labels_train.size)
+    classif = []
+
+    for testRows in dists.T:
+
+        distances = np.stack((testRows, labels_train), axis=1)
+        distances = distances[distances[:, 0].argsort()]
+        # for picturesClasses in distances[:k,1]:
+        countArray = [np.count_nonzero(distances[:k, 1] == i) for i in range(0, 10)]
+        classif = np.append(classif, np.argmax(countArray))
+
+    classif = np.array(classif, dtype=int)
+
+    return classif
+
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    """Receive the training and test data and labels and the number of neighbours k.
+    It returns the accuracy of the k-NN classifier in percent."""
+    classif = np.array(
+        knn_predict(distance_matrix(data_train, data_test), labels_train, k)
+    )
+    result = np.array(classif == labels_test)
+    acc = np.count_nonzero(result) / np.size(result)
+
+    return acc * 100
+
+
+datas, labels = read_cifar_batch("data_batch_1")
+dataTrain, dataTest, labelsTrain, labelsTest = split_dataset(datas, labels)
+distanceMatrix = distance_matrix(dataTrain, dataTest)
 print()
 result = []
-for i in range (1,21):
-    result = np.append(result,evaluate_knn(dataTrain,labelsTrain,dataTest,labelsTest,i))
+# Evaluate the k-NN classifier for k ranging from 1 to 20
+for i in range(1, 21):
+    result = np.append(
+        result, evaluate_knn(dataTrain, labelsTrain, dataTest, labelsTest, i)
+    )
 x = np.arange(1, 21)
-
-# plotting
+
+# plot the accuracy as a function of k
 plt.title("Plot graph")
 plt.xlabel("K neighbors")
 plt.ylabel("Accuracy %")
-plt.plot(x, result, color ="red")
+plt.plot(x, result, color="red")
 plt.show()
-
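Side note, not part of the patch above: distance_matrix builds the distance matrix one test row per Python-level iteration. A minimal fully vectorised sketch of the same (N, M) Euclidean distance computation would use the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2; the name distance_matrix_vectorized and its use as a drop-in replacement are assumptions for illustration, not something the patch introduces.

import numpy as np


def distance_matrix_vectorized(data_test, data_train):
    """Same contract as distance_matrix: dists[i, j] is the Euclidean distance
    between data_test[i] and data_train[j], for inputs of shape (N, d) and (M, d)."""
    test_sq = np.sum(data_test ** 2, axis=1)[:, np.newaxis]    # (N, 1)
    train_sq = np.sum(data_train ** 2, axis=1)[np.newaxis, :]  # (1, M)
    cross = data_test @ data_train.T                           # (N, M)
    # clip tiny negatives caused by floating-point round-off before the sqrt
    return np.sqrt(np.maximum(test_sq - 2.0 * cross + train_sq, 0.0))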