Update read_cifar file

6f171b98 · selalimi · 68b2b1d0 · 6f171b98
Commit 6f171b98 authored 1 year ago by selalimi
--- a/read_cifar.py
+++ b/read_cifar.py
@@ -2,87 +2,82 @@ import numpy as np
 import os
 import pickle

-# Commentaire global expliquant le but du code
-'''Here is the code to prepare the CIFAR dataset, create a function to read CIFAR batches, and split the dataset into training and testing sets:'''
-
-# Fonction read_cifar_batch :
+# Function read_cifar_batch:
 '''
 Arguments:
-Le chemin d'un seul batch en tant que chaîne de caractères.
+The path to a single batch file, as a string.

 Returns:
-Une matrice de données de taille (taille_du_batch , taille_des_données)
-Un vecteur d'étiquettes (labels) de taille (taille_du_batch)
+A data matrix of size (batch_size, data_size)
+A labels vector of size (batch_size)
 '''

 def read_cifar_batch(batch_path):
-    # Ouvre le fichier du batch et charge les données
+    # Unpickle the batch; encoding='bytes' gives the bytes keys used below (only load trusted files)
    with open(batch_path, 'rb') as f:
        batch_dict = pickle.load(f, encoding='bytes')
-    # Convertit les données en float32
+    # Cast the batch's data array to float32
    data = batch_dict[b'data'].astype(np.float32)
-    # Convertit les étiquettes en int64
+    # Build an int64 NumPy array from the batch's labels
    labels = np.array(batch_dict[b'labels'], dtype=np.int64)
    return data, labels

-# Fonction read_cifar :
+# Function read_cifar:
 '''
-*** lit le chemin du répertoire contenant tous les lots (y compris test_batch)***
+Reads every batch file in the given directory (including test_batch) and concatenates them.

-*Arguments :
-Le chemin du répertoire contenant les six lots (cinq data_batch et un test_batch) en tant que chaîne de caractères.
+Arguments:
+The directory path containing all six batches (five data_batch and one test_batch) as a string.

-*Returns :
-Une matrice de données de taille (taille_du_lot , taille_des_données).
-Un vecteur d'étiquettes(labels) de taille (taille_du_lot).
+Returns:
+- A data matrix of size (total_batch_size, data_size).
+- A labels vector of size (total_batch_size).
 '''
 def read_cifar(folder):
-    # Liste des noms de fichiers de batch
+    # Batch file names: the five data_batch files followed by test_batch
    batch_files = ["data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4", "data_batch_5", "test_batch"]
    data_list, labels_list = [], []

-    # Boucle sur les fichiers de batch
+    # Read every batch and collect its data and labels
    for batch_file in batch_files:
        path = os.path.join(folder, batch_file)
-        # Appelle read_cifar_batch pour lire chaque batch
+        # read_cifar_batch returns the (data, labels) pair of one batch file
        data, labels = read_cifar_batch(path)
        data_list.append(data)
        labels_list.append(labels)

-    # Combine les données de tous les batches
+    # Stack the per-batch data matrices row-wise into a single matrix
    data = np.vstack(data_list)
-    # Combine les étiquettes de tous les batches
+    # Concatenate the per-batch label vectors into a single vector
    labels = np.hstack(labels_list)

    return data, labels

-# Fonction pour diviser les données en ensembles d'entraînement et de test :
+# Function to split the data into training and testing sets:
 '''
-*Arguments :
-data et labels, deux tableaux de même taille dans la première dimension.
-split, un nombre flottant compris entre 0 et 1, qui détermine le facteur de répartition de l'ensemble d'entraînement par rapport à l'ensemble de test.
+Arguments:
+- data and labels, two arrays of the same size in the first dimension.
+- split, a floating-point number strictly between 0 and 1: the fraction of samples assigned to the training set (the remainder goes to the test set).

-*Renvoie :
-data_train : les données d'entraînement.
-labels_train : les étiquettes correspondantes.
-data_test : les données de test.
-labels_test : les étiquettes correspondantes.
+Returns:
+- data_train: the training data.
+- labels_train: the corresponding labels.
+- data_test: the testing data.
+- labels_test: the corresponding labels.
 '''
 def split_dataset(data, labels, split):
-    # Vérifie que le ratio de division est valide
+    # Check that the split ratio is strictly between 0 and 1 (NOTE: assert is skipped under python -O)
    assert 0 < split < 1
    n = len(data)
-    # Mélange les indices des données
+    # Random permutation of the sample indices (drawn from NumPy's global random state)
    indices = np.random.permutation(n)
    split_index = int(split * n)
-    # Sépare les indices pour l'ensemble d'entraînement et de test
+    # The first int(split * n) shuffled indices go to training, the rest to test
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
-    # Sépare les données et étiquettes en ensembles d'entraînement et de test
+    # Index data and labels with the same indices so samples stay paired with their labels
    data_train = data[train_indices]
    labels_train = labels[train_indices]
    data_test = data[test_indices]
    labels_test = labels[test_indices]
    return data_train, labels_train, data_test, labels_test
\ No newline at end of file
-
-