import numpy as np
import pickle  # stdlib pickle is the Python-3 cPickle; replaces six.moves.cPickle
import os
import random


def unpickle(file):
    """Load one pickled CIFAR-10 batch file and return its raw dictionary.

    The batches shipped with CIFAR-10 were pickled under Python 2, so they
    must be loaded with encoding='bytes'; consequently the returned dict is
    keyed with bytes (b'data', b'labels'), not str.
    """
    with open(file, 'rb') as fo:
        batch_dict = pickle.load(fo, encoding='bytes')
    return batch_dict


def read_cifar_batch(batch_path):
    """Read a single CIFAR-10 batch.

    Takes the path of one batch file as a string and returns:
      - data: np.float32 array of shape (batch_size, data_size) — one
        flattened image per row (3072 values for 32x32x3 CIFAR images);
      - labels: np.int64 vector of length batch_size.
    """
    # The old code called the removed load_pickle() on an undefined file
    # handle and used str keys; use unpickle() and its bytes keys instead.
    batch_dict = unpickle(batch_path)
    data = batch_dict[b'data']
    labels = batch_dict[b'labels']
    # CIFAR batches already store (batch_size, 3072); the reshape is a
    # safeguard that flattens each sample to a row vector.
    data = data.reshape(data.shape[0], -1).astype(np.float32)
    labels = np.array(labels, dtype=np.int64)
    return data, labels


def read_cifar(directory_path):
    """Read the whole CIFAR-10 dataset (five training batches + test batch).

    Takes the path of the directory containing the six batch files and
    returns the concatenated (data, labels) arrays: data is np.float32 of
    shape (60000, 3072) and labels is np.int64 of length 60000 for the
    standard dataset (sizes adapt to whatever the batch files contain).
    """
    # Fixes the previous version, which referenced undefined names X / Y,
    # used the removed np.float alias, pre-allocated a labels buffer of the
    # wrong size (10000 instead of 60000), and stacked real data on top of
    # an uninitialized np.empty block only to slice the garbage off again.
    batch_names = ['data_batch_%d' % i for i in range(1, 6)] + ['test_batch']
    data_parts = []
    label_parts = []
    for name in batch_names:
        batch_data, batch_labels = read_cifar_batch(os.path.join(directory_path, name))
        data_parts.append(batch_data)
        label_parts.append(batch_labels)
    data = np.concatenate(data_parts, axis=0)
    labels = np.concatenate(label_parts, axis=0)
    return data, labels


def split_dataset(data,labels,split):
    """Split the dataset into a training set and a test set.

    Takes data and labels, two arrays with the same size in the first
    dimension, and split, a float between 0 and 1 that determines the split
    factor of the training set with respect to the test set.
    """