diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..708bba6ee1ff7e21644ef4a18c646cf19711ca8f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+data
\ No newline at end of file
diff --git a/knn.py b/knn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6014d059dca815518c14ddf3fd0e6211d0c99639
--- /dev/null
+++ b/knn.py
@@ -0,0 +1,58 @@
+import numpy as np
+from read_cifar import *
+from matplotlib import pyplot as plt
+
+def distance_matrix(mat1, mat2):
+    # Vectorised pairwise Euclidean distances using
+    # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
+    norm1 = np.sum(mat1**2, axis=1, keepdims=True)
+    norm2 = np.sum(mat2**2, axis=1, keepdims=True)
+    dot_products = np.dot(mat1, mat2.T)
+    # Clip tiny negative values caused by floating-point error before the sqrt.
+    dists = np.sqrt(np.maximum(norm1 - 2 * dot_products + norm2.T, 0))
+
+    return dists
+
+
+def knn_predict(dists, labels_train, k):
+    # dists[i, j] is the distance between test sample i and training sample j.
+    predicted_labels = np.zeros(dists.shape[0], dtype=int)
+
+    for i in range(dists.shape[0]):
+        nearest_indices = np.argsort(dists[i])[:k]
+        nearest_labels = [labels_train[j] for j in nearest_indices]
+        predicted_class = max(nearest_labels, key=nearest_labels.count)
+        predicted_labels[i] = predicted_class
+
+    return predicted_labels
+
+def evaluate_knn(data_train, labels_train, data_test, labels_test, k):
+    # Rows of the distance matrix must correspond to the test samples so that
+    # the k nearest neighbours are looked up among the training labels.
+    dists = distance_matrix(data_test, data_train)
+    predicted_labels = knn_predict(dists, labels_train, k)
+    accuracy = np.sum(predicted_labels == labels_test) / len(labels_test)
+
+    return accuracy
+
+
+def acc_graph():
+
+    axis = []
+    result = []
+    data_path = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py"
+    allData, allLabels = read_cifar(data_path)
+    data_train, labels_train, data_test, labels_test = split_dataset(allData, allLabels, split=0.9)
+
+    for i in range(1, 20):
+        axis.append(i)
+        acc = evaluate_knn(data_train, labels_train, data_test, labels_test, i)
+        result.append(acc)
+    plt.plot(axis, result)
+    plt.title("Accuracy as a function of the number of neighbors k")
+    plt.xlabel("Number of neighbors")
+    plt.ylabel("Accuracy")
+    # Save before plt.show(): once the plot window is closed the figure is
+    # discarded, so calling savefig() afterwards would write a blank image.
+    plt.savefig(r"C:\Intel\Desktop\DeepLearning\image-classification\results\Accuracy.png")
+    plt.show()
+
+if __name__ == "__main__":
+    acc_graph()
diff --git a/mlp.py b/mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e721658a20bd7453f39addaadcb1facb23fd5d
--- /dev/null
+++ b/mlp.py
@@ -0,0 +1,163 @@
+import numpy as np
+from read_cifar import *
+from matplotlib import pyplot as plt
+
+
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+
+def deriv_sigmoid(x):
+    return sigmoid(x) * (1 - sigmoid(x))
+
+def softmax(x):
+    # Subtract the row-wise maximum for numerical stability before exponentiating.
+    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
+    return e_x / e_x.sum(axis=1, keepdims=True)
+
+def learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate):
+    # One gradient-descent step on the MSE loss of a 2-layer sigmoid network.
+    a0 = data
+    a1 = sigmoid(np.matmul(a0, w1) + b1)
+    a2 = sigmoid(np.matmul(a1, w2) + b2)  # the predicted outputs
+
+    # Partial derivatives (backward pass), averaged over the batch
+    dc_da2 = 2 * (a2 - targets) / data.shape[0]
+    dc_dz2 = np.multiply(np.multiply(a2, (1 - a2)), dc_da2)
+    dc_dw2 = np.matmul(a1.T, dc_dz2)
+    dc_db2 = np.sum(dc_dz2, axis=0, keepdims=True)
+
+    dc_da1 = np.matmul(dc_dz2, w2.T)
+    dc_dz1 = np.multiply(np.multiply(a1, (1 - a1)), dc_da1)
+    dc_dw1 = np.matmul(a0.T, dc_dz1)
+    dc_db1 = np.sum(dc_dz1, axis=0, keepdims=True)
+
+    # Gradient-descent update
+    w1 = w1 - learning_rate * dc_dw1
+    w2 = w2 - learning_rate * dc_dw2
+    b1 = b1 - learning_rate * dc_db1
+    b2 = b2 - learning_rate * dc_db2
+
+    # Forward pass with the updated parameters
+    z1 = np.matmul(a0, w1) + b1
+    a1 = sigmoid(z1)
+    z2 = np.matmul(a1, w2) + b2
+    a2 = sigmoid(z2)
+    predictions = a2
+
+    # Compute loss (MSE)
+    loss = np.mean(np.square(predictions - targets))
+
+    return (w1, b1, w2, b2, loss)
+
+def one_hot(NDarray):
+    # Convert a vector of integer class labels into a one-hot matrix.
+    result = np.zeros((NDarray.shape[0], int(np.max(NDarray) + 1)))
+    for i in range(NDarray.shape[0]):
+        result[i, int(NDarray[i])] = 1
+    return result
+
+
+def cross_entropy(classes, prob):
+    # classes: one-hot targets, prob: predicted probabilities (softmax output).
+    # A small epsilon avoids log(0) when a predicted probability underflows.
+    loss = -np.sum(np.multiply(classes, np.log(prob + 1e-12)))
+    return loss / float(prob.shape[0])
+
+
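+# Gradient of the batch-averaged cross-entropy loss with a softmax output layer:
+# with a2 = softmax(z2) and one-hot targets y, the output-layer error
+# simplifies to dC/dz2 = (a2 - y) / N; the hidden-layer gradients then follow
+# from the chain rule using the sigmoid derivative a1 * (1 - a1).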
+def learn_once_cross_entropy(w1, b1, w2, b2, data, labels_train, learning_rate):
+    # One gradient-descent step on the cross-entropy loss (softmax output).
+    a0 = data
+    labels_one_hot = one_hot(labels_train)
+    a1 = sigmoid(np.matmul(a0, w1) + b1)
+    a2 = softmax(np.matmul(a1, w2) + b2)
+    nb_rows = data.shape[0]
+
+    # Gradient-descent optimization (backward pass)
+    dc_dz2 = (a2 - labels_one_hot) / nb_rows
+    dc_dw2 = np.matmul(a1.T, dc_dz2)
+    dc_db2 = np.dot(np.ones(nb_rows), dc_dz2)
+
+    dc_da1 = np.matmul(dc_dz2, w2.T)
+    dc_dz1 = np.multiply(np.multiply(a1, (1 - a1)), dc_da1)
+    dc_dw1 = np.matmul(a0.T, dc_dz1)
+    dc_db1 = np.dot(np.ones(nb_rows), dc_dz1)
+
+    # Gradient-descent update
+    w1 = w1 - learning_rate * dc_dw1
+    w2 = w2 - learning_rate * dc_dw2
+    b1 = b1 - learning_rate * dc_db1
+    b2 = b2 - learning_rate * dc_db2
+
+    # Forward pass with the updated parameters
+    z1 = np.matmul(a0, w1) + b1
+    a1 = sigmoid(z1)
+    z2 = np.matmul(a1, w2) + b2
+    a2 = softmax(z2)
+    # Compute loss (cross-entropy)
+    loss = cross_entropy(labels_one_hot, a2)
+    return (w1, b1, w2, b2, loss)
+
+def train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch):
+    # Train for num_epoch epochs and record the training accuracy (in %) per epoch.
+    train_accuracies = []
+    for i in range(num_epoch):
+        (w1, b1, w2, b2, loss) = learn_once_cross_entropy(w1, b1, w2, b2, data_train, labels_train, learning_rate)
+
+        a0 = data_train
+        z1 = np.matmul(a0, w1) + b1
+        a1 = sigmoid(z1)
+        z2 = np.matmul(a1, w2) + b2
+        a2 = softmax(z2)
+        predict = np.argmax(a2, axis=1)
+
+        train_accuracies.append((np.sum(predict == labels_train) / predict.shape[0]) * 100)
+
+    return (w1, b1, w2, b2, train_accuracies)
+
+def test_mlp(w1, b1, w2, b2, data_test, labels_test):
+    # Return the accuracy (in %) of the trained network on the test set.
+    a0 = data_test
+    z1 = np.matmul(a0, w1) + b1
+    a1 = sigmoid(z1)
+    z2 = np.matmul(a1, w2) + b2
+    a2 = softmax(z2)
+    predict = np.argmax(a2, axis=1)
+    return (np.sum(predict == labels_test) / predict.shape[0]) * 100
+
+def run_mlp_training(data_train, labels_train, data_test, labels_test, d_h, learning_rate, num_epoch):
+
+    d_in = data_train.shape[1]
+    d_out = 10
+    w1 = 2 * np.random.rand(d_in, d_h) - 1
+    b1 = np.zeros((1, d_h))
+    w2 = 2 * np.random.rand(d_h, d_out) - 1
+    b2 = np.zeros((1, d_out))
+
+    (w1, b1, w2, b2, train_accuracies) = train_mlp(w1, b1, w2, b2, data_train, labels_train, learning_rate, num_epoch)
+
+    test_acc = test_mlp(w1, b1, w2, b2, data_test, labels_test)
+
+    return (train_accuracies, test_acc)
+
+
+if __name__ == "__main__":
+
+    dir_batches = r'C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py'
+    (data, labels) = read_cifar(dir_batches)
+    (data_train, labels_train, data_test, labels_test) = split_dataset(data, labels, 0.9)
+
+    d_in = data_train.shape[1]
+    d_out = 10
+
+    w1 = 2 * np.random.rand(d_in, 64) - 1
+    b1 = np.zeros((1, 64))
+    w2 = 2 * np.random.rand(64, d_out) - 1
+    b2 = np.zeros((1, d_out))
+    acc = train_mlp(w1, b1, w2, b2, data_train, labels_train, 0.1, 100)[4]
+    epochs = [i + 1 for i in range(100)]
+
+    plt.plot(epochs, acc)
+    plt.title("Evolution of the training accuracy across learning epochs")
+    plt.xlabel("Epochs")
+    plt.ylabel("Accuracy (%)")
+    # Save before plt.show(); otherwise an empty figure is written to disk.
+    plt.savefig(r"C:\Intel\Desktop\DeepLearning\image-classification\results\mlp.png")
+    plt.show()
\ No newline at end of file
diff --git a/read_cifar.py b/read_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..0243ae22e24396c9ac0881a2cf26152363a11e6b
--- /dev/null
+++ b/read_cifar.py
@@ -0,0 +1,72 @@
+import os
+import numpy as np
+import pickle
+
+
+def unpickle(path):
+    # CIFAR-10 batch files are pickled dictionaries with byte-string keys.
+    with open(path, 'rb') as fo:
+        batch = pickle.load(fo, encoding='bytes')
+    return batch
+
+def read_cifar_batch(path):
+
+    batch_data = unpickle(path)
+    data = batch_data[b'data']
+    labels = batch_data[b'labels']
+    data = np.array(data, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int64)
+
+    return data, labels
+
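+# read_cifar loads every batch file found in the directory (the five
+# data_batch_* files plus test_batch) and concatenates them into one dataset
+# of 60000 images; batches.meta and readme.html are skipped.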
+def read_cifar(path):
+    file_list = os.listdir(path)
+    unwanted_files = ['data_batch_1', 'batches.meta', 'readme.html']
+    # Initialize the data and label arrays with the content of the first batch
+    (Matrix_Data, allLabels) = read_cifar_batch(os.path.join(path, 'data_batch_1'))
+    file_list.remove('data_batch_1')
+    for i in file_list:
+        if i not in unwanted_files:
+            (data, labels) = read_cifar_batch(os.path.join(path, i))
+            Matrix_Data = np.concatenate((Matrix_Data, data), axis=0)
+            allLabels = np.concatenate((allLabels, labels), axis=0)
+
+    return (Matrix_Data, allLabels)
+
+
+def split_dataset(data, labels, split):
+    if split < 0 or split > 1:
+        raise ValueError("The 'split' factor must be a float between 0 and 1")
+
+    # Determine the split index
+    split_idx = int(len(data) * split)
+
+    # Shuffle the data and labels using the same random order
+    shuffled_indices = np.random.permutation(len(data))
+    data_shuffled = data[shuffled_indices]
+    labels_shuffled = labels[shuffled_indices]
+
+    # Split the data and labels into training and test sets
+    data_train = data_shuffled[:split_idx]
+    labels_train = labels_shuffled[:split_idx]
+    data_test = data_shuffled[split_idx:]
+    labels_test = labels_shuffled[split_idx:]
+
+    return data_train, labels_train, data_test, labels_test
+
+
+
+
+if __name__ == "__main__":
+
+    batch_path = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\data_batch_1"
+    data_path = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py"
+    data, labels = read_cifar_batch(batch_path)
+    allData, allLabels = read_cifar(data_path)
+    print("Data shape:", data.shape)
+    print("Labels shape:", labels.shape)
+    data_train, labels_train, data_test, labels_test = split_dataset(allData, allLabels, split=0.9)
+    print("Data_train:", data_train)
+    print("Labels_train:", labels_train)
+    print("Data_test:", data_test)
+    print("Labels_test:", labels_test)
\ No newline at end of file
diff --git a/results.png b/results.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7eff72c4c83e0eb2e6492ce71903a251b16a6e
Binary files /dev/null and b/results.png differ
diff --git a/results/Accuracy.png b/results/Accuracy.png
new file mode 100644
index 0000000000000000000000000000000000000000..80049b7a4aabf69223690b38f733c8338e6dcdf6
Binary files /dev/null and b/results/Accuracy.png differ
diff --git a/results/mlp.png b/results/mlp.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c4a7f84e98eb288bb64a6d9f2211062f8f2460d
Binary files /dev/null and b/results/mlp.png differ
diff --git a/tests/test_knn.py b/tests/test_knn.py
new file mode 100644
index 0000000000000000000000000000000000000000..4de68bb414687bb036493ffd82c0859f6525e231
--- /dev/null
+++ b/tests/test_knn.py
@@ -0,0 +1,14 @@
+import numpy as np
+from knn import distance_matrix
+from knn import knn_predict
+from knn import evaluate_knn
+from read_cifar import read_cifar_batch
+
+(data, labels) = read_cifar_batch(r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\data_batch_1")
+(data_test, labels_test) = read_cifar_batch(r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\test_batch")
+
+# Unit tests
+
+assert distance_matrix(data, data_test).shape == (data.shape[0], data_test.shape[0])
+dists = distance_matrix(data_test, data)
+assert knn_predict(dists, labels, 2).shape == labels_test.shape
+assert 0 < evaluate_knn(data, labels, data_test, labels_test, 5) < 1
diff --git a/tests/test_mlp.py b/tests/test_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..534e951e90ba2be60a40eb2c3d24ad86356a126a
--- /dev/null
+++ b/tests/test_mlp.py
@@ -0,0 +1,47 @@
+import numpy as np
+from mlp import *
+from read_cifar import read_cifar_batch
+
+# Testing the MSE gradient-descent step
+N = 30
+d_in = 3
+d_h = 3
+d_out = 2
+w1 = 2 * np.random.rand(d_in, d_h) - 1
+b1 = np.zeros((1, d_h))
+w2 = 2 * np.random.rand(d_h, d_out) - 1
+b2 = np.zeros((1, d_out))
+
+data = np.random.rand(N, d_in)
+targets = np.random.rand(N, d_out)
+learning_rate = 0.5
+
+(w1n, b1n, w2n, b2n, loss) = learn_once_mse(w1, b1, w2, b2, data, targets, learning_rate)
+assert 0 < loss < 1
+assert w1n.shape == w1.shape
+
+# Test the one-hot encoding function
+
+assert (one_hot(np.array([1, 2, 0])) == [[0, 1, 0], [0, 0, 1], [1, 0, 0]]).all()
+
+# Testing the cross-entropy gradient descent and the training of the model
+(data, labels) = read_cifar_batch(r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\data_batch_1")
+
+N = data.shape[0]
+d_in = data.shape[1]
+d_h = 64
+d_out = 10
+w1 = 2 * np.random.rand(d_in, d_h) - 1
+b1 = np.zeros((1, d_h))
+w2 = 2 * np.random.rand(d_h, d_out) - 1
+b2 = np.zeros((1, d_out))
+learning_rate = 0.1
+num_epoch = 100
+
+assert 0 < learn_once_cross_entropy(w1, b1, w2, b2, data, labels, learning_rate)[4]
+print(train_mlp(w1, b1, w2, b2, data, labels, learning_rate, num_epoch)[4])
+
+# Test the model-testing function
+
+dir_test = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\test_batch"
+(data_test, labels_test) = read_cifar_batch(dir_test)
+assert 0 < test_mlp(w1, b1, w2, b2, data_test, labels_test) < 100
diff --git a/tests/test_read_cifar.py b/tests/test_read_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..92c865c44633354a5d3b964aca1d369e5ace1929
--- /dev/null
+++ b/tests/test_read_cifar.py
@@ -0,0 +1,20 @@
+import numpy as np
+from read_cifar import read_cifar_batch
+from read_cifar import read_cifar
+from read_cifar import split_dataset
+
+batch_path = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py\data_batch_1"
+data_path = r"C:\Intel\Desktop\DeepLearning\image-classification\data\cifar-10-batches-py"
+(data, labels) = read_cifar_batch(batch_path)
+(alldata, alllabels) = read_cifar(data_path)
+
+# Unit tests
+
+assert data.shape == (10000, 3072)
+assert labels.shape == (10000,)
+assert alldata.shape == (60000, 3072)
+assert alllabels.shape == (60000,)
+assert split_dataset(alldata, alllabels, 0.5)[0].shape == (30000, 3072)
+# Two independent calls use different random permutations, so the resulting
+# training splits should (almost surely) differ.
+assert not np.array_equal(split_dataset(alldata, alllabels, 0.5)[0], split_dataset(alldata, alllabels, 0.5)[0])