Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • main
1 result

Target

Select target project
  • edelland/deep_learning
  • cdemode/deep_learning
2 results
Select Git revision
  • main
1 result
Show changes
Commits on Source (19)
Showing
with 4133 additions and 0 deletions
File added
File added
File added
File added
File added
File added
%% Cell type:markdown id: tags:
# Chroma database
Chroma is an open-source vector database that is similar to Milvus and can be used with Windows systems. Here is an example of code illustrating its use.
%% Cell type:code id: tags:
``` python
# Installing the chromadb package
!pip install chromadb
```
%% Cell type:code id: tags:
``` python
# Importing the necessary module
from chromadb import PersistentClient
```
%% Cell type:code id: tags:
``` python
# Creating a database client stored in the "ragdb" folder, or loading it if it already exists
client = PersistentClient(path="./ragdb")
```
%% Cell type:code id: tags:
``` python
# Creating or loading a collection in ChromaDB
collection_name = "my_rag_collection"
# get_or_create_collection returns the collection when it already exists and
# creates it otherwise. This replaces a bare `except:` around get_collection,
# which would also have hidden unrelated failures (bad path, connection
# problems, typos) by silently trying to create the collection.
collection = client.get_or_create_collection(name=collection_name)
```
%% Cell type:code id: tags:
``` python
from sentence_transformers import SentenceTransformer
# Load an embedding model
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
# Define an embedding function
def text_embedding(text):
    """Encode `text` with the module-level `embedding_model` and return the
    encoding converted with `.tolist()`."""
    vector = embedding_model.encode(text)
    return vector.tolist()
```
%% Cell type:code id: tags:
``` python
# Adding documents with their embeddings and unique identifiers
documents = [
    "The sun rises in the east and sets in the west.",
    "Raindrops create soothing sounds as they hit the ground.",
    "Stars twinkle brightly in the clear night sky.",
    "The ocean waves crash gently against the shore.",
    "Mountains stand tall and majestic, covered in snow.",
    "Birds chirp melodiously during the early morning hours.",
    "The forest is alive with the sounds of rustling leaves and wildlife.",
    "A gentle breeze flows through the meadow, carrying the scent of flowers.",
]
# One embedding per document, in the same order as `documents`
embeddings = list(map(text_embedding, documents))
# Identifiers are the document positions rendered as strings: "0", "1", ...
ids = [str(position) for position in range(len(documents))]
collection.add(ids=ids, embeddings=embeddings, documents=documents)
```
%% Cell type:code id: tags:
``` python
# Querying to find the documents most similar to a given phrase.
# The phrase is embedded with the same function used for the stored documents.
query = "What happens in the forest during the day?"
# Alternative phrase to try:
# query = "Describe how stars appear in a clear night sky."
query_embedding = text_embedding(query)
# query_embeddings takes a list, so several phrases could be searched at once;
# here a single embedding is sent.
results = collection.query(
query_embeddings=[query_embedding],
n_results=2 # Number of desired similar results
)
```
%% Cell type:code id: tags:
``` python
# Displaying the results
# results['documents'] holds one list of matches per query embedding. A single
# embedding was queried, so its matches are the first entry; iterating the
# outer list instead would print the whole nested list on one line.
for result in results['documents'][0]:
    print("Similar document:", result)
```
import matplotlib.pyplot as plt
import numpy as np
def load_data(file_name, delimiter=','):
    """ Reads the file containing the data and returns the matrices corresponding to it

    The last column of the file is taken as the target variable; all the
    preceding columns are predictor variables.

    Parameters
    ----------
    file_name : name of the file containing the data (any source accepted by np.loadtxt)
    delimiter : character that separates the columns in the file (default is ",")

    Returns
    -------
    x : data matrix of dimension [N, nb_var]
    d : matrix containing the target variable values of dimension [N, nb_target]
    N : number of elements
    nb_var : number of predictor variables
    nb_target : number of target variables
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a single
    # row; without it np.loadtxt returns a 1-D array and shape[1] fails.
    data = np.loadtxt(file_name, delimiter=delimiter, ndmin=2)
    nb_target = 1
    N, nb_columns = data.shape
    nb_var = nb_columns - nb_target
    x = data[:, :nb_var]
    d = data[:, nb_var:].reshape(N, 1)
    return x, d, N, nb_var, nb_target
def normalization(x):
    """ Normalizes the data by centering and scaling the predictor variables

    Parameters
    ----------
    x : data matrix of dimension [N, nb_var]
    with N : number of elements and nb_var : number of predictor variables

    Returns
    -------
    x_norm : normalized data matrix of dimension [N, nb_var]
    mu : mean of the variables (one value per column)
    sigma : standard deviation of the variables (one value per column),
            returned as computed, including zeros for constant columns
    """
    mu = np.mean(x, 0)
    sigma = np.std(x, 0)
    # A constant column has sigma == 0 and dividing by it would fill the column
    # with NaN. Its centered values are already 0, so divide by 1 instead.
    divisor = np.where(sigma == 0, 1.0, sigma)
    x_norm = (x - mu) / divisor
    return x_norm, mu, sigma
def split_data(x,d,prop_val=0.2, prop_test=0.2):
    """ Shuffles the rows and cuts them into training, validation and test subsets

    Rows of x and d are shuffled with the same random order (np.random), so
    each example stays paired with its target.

    Parameters
    ----------
    x : data matrix of dimension [N, nb_var]
    d : target values matrix [N, nb_target]
    prop_val : proportion of validation data in the total data (between 0 and 1)
    prop_test : proportion of test data in the total data (between 0 and 1)

    Returns
    -------
    x_train, d_train : training data and targets (the rows left after validation and test)
    x_val, d_val : validation data and targets
    x_test, d_test : test data and targets
    """
    assert prop_val + prop_test < 1.0
    N = x.shape[0]
    order = np.arange(N)
    np.random.shuffle(order)
    nb_val = int(N*prop_val)
    nb_test = int(N*prop_test)
    # Row offsets at which the validation and test parts begin
    val_start = N - nb_val - nb_test
    test_start = val_start + nb_val
    x_shuffled = x[order,:]
    d_shuffled = d[order,:]
    return (
        x_shuffled[:val_start,:], d_shuffled[:val_start,:],
        x_shuffled[val_start:test_start,:], d_shuffled[val_start:test_start,:],
        x_shuffled[test_start:,:], d_shuffled[test_start:,:],
    )
def compute_cross_entropy_cost(y, d):
    """ Computes the value of the cross-entropy cost function

    Parameters
    ----------
    y : predicted data matrix (softmax), one column per element
    d : actual data matrix, one-hot encoded, same dimension as y

    Returns
    -------
    cost : value corresponding to the cost function (sum over all entries divided
           by the number of columns of y)
    """
    N = y.shape[1]
    # A predicted probability of exactly 0 gives log(0) = -inf, and 0 * -inf is
    # NaN, which would poison the whole sum. Raising zeros to the smallest
    # positive normal float keeps the log finite and leaves every other value
    # untouched.
    y_safe = np.clip(y, np.finfo(np.float64).tiny, None)
    cost = - np.sum(d*np.log(y_safe)) / N
    return cost
def forward_pass(x, W, b, activation):
    """ Propagates the input through every layer of the network

    Parameters
    ----------
    x : input matrix, dimension nb_var x N
    W : list containing the weight matrices of the network
    b : list containing the bias matrices of the network (its length sets the number of layers)
    activation : list containing the activation functions of the network layers

    Returns
    -------
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers, starting with x itself
    """
    potentials = []
    outputs = [x]
    for layer in range(len(b)):
        # The previous layer's output is always the last one stored
        potential = W[layer].dot(outputs[-1]) + b[layer]
        potentials.append(potential)
        outputs.append(activation[layer](potential))
    return potentials, outputs
def backward_pass(delta_h, a, h, W, activation):
    """ Backpropagates the output gradient through the network

    Parameters
    ----------
    delta_h : matrix containing the gradient of the cost with respect to the network output
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers (h[0] is the network input)
    W : list containing the weight matrices of the network
    activation : list containing the activation functions of the network layers;
                 each is called as f(a, True) to obtain its derivative

    Returns
    -------
    delta_W : list containing the gradient matrices of the network's weight layers
    delta_b : list containing the gradient matrices of the network's bias layers
    Both lists are ordered from the first layer to the last.
    """
    nb_layers = len(W)
    delta_W = [None] * nb_layers
    delta_b = [None] * nb_layers
    grad_output = delta_h
    # Walk the layers from the output back to the input
    for layer in reversed(range(nb_layers)):
        grad_potential = grad_output * activation[layer](a[layer], True)
        # NOTE(review): the bias gradient is averaged over the elements while
        # the weight gradient below is summed over them; confirm this is the
        # intended scaling (the learning rates in the scripts are tuned to it).
        delta_b[layer] = grad_potential.mean(1).reshape(-1,1)
        delta_W[layer] = grad_potential.dot(h[layer].T)
        grad_output = (W[layer].T).dot(grad_potential)
    return delta_W, delta_b
def sigmoid(z, deriv=False):
    """ Evaluates the sigmoid function, or its derivative, at z

    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns the value of the sigmoid function, if True returns its derivative

    Returns
    -------
    s : value of the sigmoid function applied to z or its derivative. Same dimension as z
    """
    value = 1 / (1 + np.exp(-z))
    # The derivative is expressed from the function value: s * (1 - s)
    return value * (1 - value) if deriv else value
def linear(z, deriv=False):
    """ Evaluates the identity (linear) activation, or its derivative, at z

    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns z unchanged, if True returns the derivative

    Returns
    -------
    s : z itself, or the constant 1 (a scalar, relying on broadcasting) when deriv is True
    """
    return 1 if deriv else z
def relu(z, deriv=False):
    """ Evaluates the ReLU function, or its derivative, at z

    Parameters
    ----------
    z : array (its .shape attribute is read)
    deriv : boolean. If False returns the value of the ReLU function, if True returns its derivative

    Returns
    -------
    s : value of the ReLU function applied to z or its derivative. Same dimension as z.
        The derivative is taken as 1 wherever z >= 0 (so also at z == 0).
    """
    zeros = np.zeros(z.shape)
    if not deriv:
        return np.maximum(zeros, z)
    # Reuse the zero matrix as the result and switch on the non-negative entries
    zeros[np.nonzero(z >= 0)] = 1.0
    return zeros
def softmax(z, deriv=False):
    """ Computes the value of the softmax function or its derivative applied to z

    The softmax is taken along axis 0, i.e. each column of z is normalized
    into a probability distribution.

    Parameters
    ----------
    z : data matrix
    deriv : boolean. If False returns the value of the softmax function, if True returns
            the constant 1 (see below)

    Returns
    -------
    s : value of the softmax function applied to z (same dimension as z), or 1 when deriv is True
    """
    if deriv:
        # The training script passes delta_h = y - d, which already is the
        # gradient of the cross-entropy cost with respect to the softmax input,
        # so no additional derivative factor is applied here.
        return 1
    # Subtracting the per-column maximum does not change the softmax value but
    # keeps np.exp from overflowing to inf (inf / inf = NaN) on large inputs.
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=0)
def one_hot_encoding(d):
    """ Turns class labels into one-hot rows: a single 1 per row, 0 everywhere else

    Parameters
    ----------
    d : matrix containing the values of the target variable (class of the elements) of dimension [N, 1]
    with N : number of elements

    Returns
    -------
    e : encoded data matrix of dimension [N, nb_classes]
    with nb_classes the number of classes (maximum + 1) of the values in d
    """
    labels = d.astype(int).flatten()
    nb_classes = labels.max() + 1
    # Row k of the identity matrix is the one-hot code of class k
    return np.eye(nb_classes)[labels]
def classification_accuracy(y,d):
    """ Computes the proportion of elements whose predicted class matches the true class

    The class of an element is the index of the largest value in its column.

    Parameters
    ----------
    y : network outputs matrix of dimension [nb_output_neurons x N]
    d : true values matrix [nb_output_neurons x N]
    with N : number of elements and nb_output_neurons : number of neurons in the output layer

    Returns
    -------
    t : classification accuracy
    """
    predicted_classes = np.argmax(y,axis=0)
    true_classes = np.argmax(d,axis=0)
    matches = predicted_classes == true_classes
    return np.mean(matches)
# ===================== Part 1: Reading and Normalizing the Data =====================
print("Reading the data ...")
x, d, N, nb_var, nb_target = load_data("iris.txt")
# x, d, N, nb_var, nb_target = load_data("scores.txt")

# Display the first examples of the dataset: up to 10, or fewer when the
# dataset itself is shorter (a fixed range of 10 would raise IndexError).
print("Displaying the first 10 examples of the dataset: ")
for i in range(min(10, N)):
    print(f"x = {x[i,:]}, d = {d[i]}")

# Normalization of the variables (centering and scaling)
print("Normalizing the variables ...")
x, mu, sigma = normalization(x)
# Class labels become one-hot rows so they can be compared to the softmax output
d = one_hot_encoding(d)

# Split the data into training, validation, and test subsets
x_train, d_train, x_val, d_val, x_test, d_test = split_data(x,d)

# ===================== Part 2: Training =====================
# Learning rate and number of iterations
alpha = 0.0001
nb_iters = 10000
training_costs = np.zeros(nb_iters)
validation_costs = np.zeros(nb_iters)

# Network dimensions
D_c = [nb_var, 15, 15, d_train.shape[1]] # list containing the number of neurons for each layer
activation = [relu, relu, softmax] # list containing the activation functions for hidden layers and output layer

# Random initialization of network weights (uniform in [-1, 1)), biases start at zero
W = []
b = []
for i in range(len(D_c)-1):
    W.append(2 * np.random.random((D_c[i+1], D_c[i])) - 1)
    b.append(np.zeros((D_c[i+1],1)))

# Data is presented as column vectors at the network input
x_train = x_train.T
d_train = d_train.T
x_val = x_val.T
d_val = d_val.T
x_test = x_test.T
d_test = d_test.T

for t in range(nb_iters):
    # Forward pass: calculate predicted output y on validation data
    a, h = forward_pass(x_val, W, b, activation)
    y_val = h[-1] # Predicted output

    # Forward pass: calculate predicted output y on training data.
    # This must come last so that a and h hold the training activations
    # needed by the backward pass below.
    a, h = forward_pass(x_train, W, b, activation)
    y_train = h[-1] # Predicted output

    # Compute the cross-entropy loss function
    training_costs[t] = compute_cross_entropy_cost(y_train,d_train)
    validation_costs[t] = compute_cross_entropy_cost(y_val,d_val)

    # Backward pass: backpropagation
    delta_h = (y_train-d_train) # For the last layer
    delta_W, delta_b = backward_pass(delta_h, a, h, W, activation)

    # Update weights and biases
    for i in range(len(b)-1,-1,-1):
        b[i] -= alpha * delta_b[i]
        W[i] -= alpha * delta_W[i]

# Costs and predictions reported here are those of the last iteration, i.e.
# computed just before the final weight update.
print("Final cost on the training set: ", training_costs[-1])
print("Classification accuracy on the training set: ", classification_accuracy(y_train, d_train))
print("Final cost on the validation set: ", validation_costs[-1])
print("Classification accuracy on the validation set: ", classification_accuracy(y_val, d_val))

# Display cost function evolution during backpropagation
plt.figure(0)
plt.title("Cost function evolution during backpropagation")
plt.plot(np.arange(training_costs.size), training_costs, label="Training")
plt.plot(np.arange(validation_costs.size), validation_costs, label="Validation")
plt.legend(loc="upper left")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

# ===================== Part 3: Evaluation on the test set =====================
# Forward pass: calculate predicted output y on test data
a, h = forward_pass(x_test, W, b, activation)
y_test = h[-1] # Predicted output
cost = compute_cross_entropy_cost(y_test,d_test)
print("Cost on the test set: ", cost)
print("Classification accuracy on the test set: ", classification_accuracy(y_test, d_test))
import matplotlib.pyplot as plt
import numpy as np
def read_data(file_name, delimiter=','):
    """ Reads the file containing the data and returns the corresponding matrices

    The last column of the file is taken as the target variable; all the
    preceding columns are predictor variables.

    Parameters
    ----------
    file_name : name of the file containing the data (any source accepted by np.loadtxt)
    delimiter : character separating columns in the file ("," by default)

    Returns
    -------
    x : data matrix of size [N, num_vars]
    d : matrix containing the target variable values of size [N, num_targets]
    N : number of elements
    num_vars : number of predictor variables
    num_targets : number of target variables
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a single
    # row; without it np.loadtxt returns a 1-D array and shape[1] fails.
    data = np.loadtxt(file_name, delimiter=delimiter, ndmin=2)
    num_targets = 1
    N, num_columns = data.shape
    num_vars = num_columns - num_targets
    x = data[:, :num_vars]
    d = data[:, num_vars:].reshape(N, 1)
    return x, d, N, num_vars, num_targets
def normalization(x):
    """ Normalizes the data by centering and scaling the predictor variables

    Parameters
    ----------
    x : data matrix of size [N, num_vars]
    with N : number of elements and num_vars : number of predictor variables

    Returns
    -------
    x_norm : centered-scaled data matrix of size [N, num_vars]
    mu : mean of the variables (one value per column)
    sigma : standard deviation of the variables (one value per column),
            returned as computed, including zeros for constant columns
    """
    mu = np.mean(x, 0)
    sigma = np.std(x, 0)
    # A constant column has sigma == 0 and dividing by it would fill the column
    # with NaN. Its centered values are already 0, so divide by 1 instead.
    divisor = np.where(sigma == 0, 1.0, sigma)
    x_norm = (x - mu) / divisor
    return x_norm, mu, sigma
def split_data(x, d, val_prop=0.2, test_prop=0.2):
    """ Shuffles the rows and cuts them into training, validation and test subsets

    Rows of x and d are shuffled with the same random order (np.random), so
    each example stays paired with its target.

    Parameters
    ----------
    x : data matrix of size [N, num_vars]
    d : matrix of target values [N, num_targets]
    val_prop : proportion of validation data over the entire dataset (between 0 and 1)
    test_prop : proportion of test data over the entire dataset (between 0 and 1)

    Returns
    -------
    x_train, d_train : training data and targets (the rows left after validation and test)
    x_val, d_val : validation data and targets
    x_test, d_test : test data and targets
    """
    assert val_prop + test_prop < 1.0
    N = x.shape[0]
    order = np.arange(N)
    np.random.shuffle(order)
    num_val = int(N*val_prop)
    num_test = int(N*test_prop)
    # Row offsets at which the validation and test parts begin
    val_start = N - num_val - num_test
    test_start = val_start + num_val
    x_shuffled = x[order,:]
    d_shuffled = d[order,:]
    return (
        x_shuffled[:val_start,:], d_shuffled[:val_start,:],
        x_shuffled[val_start:test_start,:], d_shuffled[val_start:test_start,:],
        x_shuffled[test_start:,:], d_shuffled[test_start:,:],
    )
def calculate_mse_cost(y, d):
    """ Calculates the value of the MSE (mean squared error) cost function

    The sum of squared errors is halved and divided by the number of columns of y.

    Parameters
    ----------
    y : matrix of predicted data, one column per element
    d : matrix of actual data, same size as y

    Returns
    -------
    cost : value corresponding to the MSE cost function (mean squared error)
    """
    N = y.shape[1]
    squared_errors = np.square(y - d)
    return np.sum(squared_errors) / 2 / N
def forward_pass(x, W, b, activation):
    """ Propagates the input through every layer of the network

    Parameters
    ----------
    x : input matrix, of size num_vars x N
    W : list containing the weight matrices of the network
    b : list containing the bias matrices of the network (its length sets the number of layers)
    activation : list containing the activation functions of the network layers

    Returns
    -------
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers, starting with x itself
    """
    potentials = []
    outputs = [x]
    for layer in range(len(b)):
        # The previous layer's output is always the last one stored
        potential = W[layer].dot(outputs[-1]) + b[layer]
        potentials.append(potential)
        outputs.append(activation[layer](potential))
    return potentials, outputs
def backward_pass(delta_h, a, h, W, activation):
    """ Backpropagates the output gradient through the network

    Parameters
    ----------
    delta_h : matrix containing the gradient of the cost with respect to the output of the network
    a : list containing the input potentials of the network layers
    h : list containing the outputs of the network layers (h[0] is the network input)
    W : list containing the weight matrices of the network
    activation : list containing the activation functions of the network layers;
                 each is called as f(a, True) to obtain its derivative

    Returns
    -------
    delta_W : list containing the gradient matrices of the network layer weights
    delta_b : list containing the gradient matrices of the network layer biases
    Both lists are ordered from the first layer to the last.
    """
    num_layers = len(W)
    delta_W = [None] * num_layers
    delta_b = [None] * num_layers
    grad_output = delta_h
    # Walk the layers from the output back to the input
    for layer in reversed(range(num_layers)):
        grad_potential = grad_output * activation[layer](a[layer], True)
        # NOTE(review): the bias gradient is averaged over the elements while
        # the weight gradient below is summed over them; confirm this is the
        # intended scaling (the learning rates in the scripts are tuned to it).
        delta_b[layer] = grad_potential.mean(1).reshape(-1,1)
        delta_W[layer] = grad_potential.dot(h[layer].T)
        grad_output = (W[layer].T).dot(grad_potential)
    return delta_W, delta_b
def sigmoid(z, deriv=False):
    """ Evaluates the sigmoid function, or its derivative, at z

    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns the value of the sigmoid function, if True returns its derivative

    Returns
    -------
    s : value of the sigmoid function applied to z or its derivative. Same dimension as z
    """
    value = 1 / (1 + np.exp(-z))
    if not deriv:
        return value
    # The derivative is expressed from the function value: s * (1 - s)
    return value * (1 - value)
def linear(z, deriv=False):
    """ Evaluates the identity (linear) activation, or its derivative, at z

    Parameters
    ----------
    z : can be a scalar or a matrix
    deriv : boolean. If False returns z unchanged, if True returns the derivative

    Returns
    -------
    s : z itself, or the constant 1 (a scalar, relying on broadcasting) when deriv is True
    """
    if not deriv:
        return z
    return 1
def relu(z, deriv=False):
    """ Evaluates the relu function, or its derivative, at z

    Parameters
    ----------
    z : array (its .shape attribute is read)
    deriv : boolean. If False returns the value of the relu function, if True returns its derivative

    Returns
    -------
    s : value of the relu function applied to z or its derivative. Same dimension as z.
        The derivative is taken as 1 wherever z >= 0 (so also at z == 0).
    """
    zeros = np.zeros(z.shape)
    if not deriv:
        return np.maximum(zeros, z)
    # Reuse the zero matrix as the result and switch on the non-negative entries
    zeros[np.nonzero(z >= 0)] = 1.0
    return zeros
# ===================== Part 1: Data Reading and Normalization =====================
print("Reading data ...")
x, d, N, num_vars, num_targets = read_data("food_truck.txt")
# x, d, N, num_vars, num_targets = read_data("houses.txt")

# Displaying the first examples from the dataset: up to 10, or fewer when the
# dataset itself is shorter (a fixed range of 10 would raise IndexError).
print("Displaying the first 10 examples from the dataset: ")
for i in range(min(10, N)):
    print(f"x = {x[i,:]}, d = {d[i]}")

# Normalizing the variables (centering and scaling)
print("Normalizing the variables ...")
x, mu, sigma = normalization(x)
# Targets are rescaled by their maximum; dmax is kept so that predictions can
# be converted back to the original scale.
dmax = d.max()
d = d / dmax

# Splitting the data into training, validation, and test subsets
x_train, d_train, x_val, d_val, x_test, d_test = split_data(x, d)

# ===================== Part 2: Training =====================
# Choosing the learning rate and number of iterations
alpha = 0.001
num_iters = 500
train_costs = np.zeros(num_iters)
val_costs = np.zeros(num_iters)

# Network dimensions
D_c = [num_vars, 5, 10, num_targets] # list containing the number of neurons for each layer
activation = [relu, sigmoid, linear] # list containing the activation functions for the hidden layers and the output layer

# Random initialization of the network weights (uniform in [-1, 1)), biases start at zero
W = []
b = []
for i in range(len(D_c)-1):
    W.append(2 * np.random.random((D_c[i+1], D_c[i])) - 1)
    b.append(np.zeros((D_c[i+1],1)))

# Data is presented as column vectors at the input of the network
x_train = x_train.T
d_train = d_train.T
x_val = x_val.T
d_val = d_val.T
x_test = x_test.T
d_test = d_test.T

for t in range(num_iters):
    # Forward pass: calculating predicted output y on validation data
    a, h = forward_pass(x_val, W, b, activation)
    y_val = h[-1] # Predicted output

    # Forward pass: calculating predicted output y on training data.
    # This must come last so that a and h hold the training activations
    # needed by the backward pass below.
    a, h = forward_pass(x_train, W, b, activation)
    y_train = h[-1] # Predicted output

    # Calculating the MSE loss function
    train_costs[t] = calculate_mse_cost(y_train, d_train)
    val_costs[t] = calculate_mse_cost(y_val, d_val)

    # Backward pass: backpropagation
    delta_h = (y_train-d_train) # For the last layer
    delta_W, delta_b = backward_pass(delta_h, a, h, W, activation)

    # Updating weights and biases
    for i in range(len(b)-1,-1,-1):
        b[i] -= alpha * delta_b[i]
        W[i] -= alpha * delta_W[i]

# Costs reported here are those of the last iteration, i.e. computed just
# before the final weight update.
print("Final cost on the training set: ", train_costs[-1])
print("Final cost on the validation set: ", val_costs[-1])

# Plotting the evolution of the cost function during backpropagation
plt.figure(0)
plt.title("Evolution of the cost function during backpropagation")
plt.plot(np.arange(train_costs.size), train_costs, label="Training")
plt.plot(np.arange(val_costs.size), val_costs, label="Validation")
plt.legend(loc="upper left")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

# ===================== Part 3: Evaluation on the test set =====================
# Forward pass: calculating predicted output y on test data
a, h = forward_pass(x_test, W, b, activation)
y_test = h[-1] # Predicted output
cost = calculate_mse_cost(y_test, d_test)
print("Test set cost: ", cost)
File added
6.1101,17.592
5.5277,9.1302
8.5186,13.662
7.0032,11.854
5.8598,6.8233
8.3829,11.886
7.4764,4.3483
8.5781,12
6.4862,6.5987
5.0546,3.8166
5.7107,3.2522
14.164,15.505
5.734,3.1551
8.4084,7.2258
5.6407,0.71618
5.3794,3.5129
6.3654,5.3048
5.1301,0.56077
6.4296,3.6518
7.0708,5.3893
6.1891,3.1386
20.27,21.767
5.4901,4.263
6.3261,5.1875
5.5649,3.0825
18.945,22.638
12.828,13.501
10.957,7.0467
13.176,14.692
22.203,24.147
5.2524,-1.22
6.5894,5.9966
9.2482,12.134
5.8918,1.8495
8.2111,6.5426
7.9334,4.5623
8.0959,4.1164
5.6063,3.3928
12.836,10.117
6.3534,5.4974
5.4069,0.55657
6.8825,3.9115
11.708,5.3854
5.7737,2.4406
7.8247,6.7318
7.0931,1.0463
5.0702,5.1337
5.8014,1.844
11.7,8.0043
5.5416,1.0179
7.5402,6.7504
5.3077,1.8396
7.4239,4.2885
7.6031,4.9981
6.3328,1.4233
6.3589,-1.4211
6.2742,2.4756
5.6397,4.6042
9.3102,3.9624
9.4536,5.4141
8.8254,5.1694
5.1793,-0.74279
21.279,17.929
14.908,12.054
18.959,17.054
7.2182,4.8852
8.2951,5.7442
10.236,7.7754
5.4994,1.0173
20.341,20.992
10.136,6.6799
7.3345,4.0259
6.0062,1.2784
7.2259,3.3411
5.0269,-2.6807
6.5479,0.29678
7.5386,3.8845
5.0365,5.7014
10.274,6.7526
5.1077,2.0576
5.7292,0.47953
5.1884,0.20421
6.3557,0.67861
9.7687,7.5435
6.5159,5.3436
8.5172,4.2415
9.1802,6.7981
6.002,0.92695
5.5204,0.152
5.0594,2.8214
5.7077,1.8451
7.6366,4.2959
5.8707,7.2029
5.3054,1.9869
8.2934,0.14454
13.394,9.0551
5.4369,0.61705
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
def read_data(file_name, delimiter=','):
    """ Reads the file containing the data and returns the corresponding tensors

    The last column of the file is taken as the target variable; all the
    preceding columns are predictor variables.

    Parameters
    ----------
    file_name : name of the file containing the data (any source accepted by np.loadtxt)
    delimiter : character separating columns in the file ("," by default)

    Returns
    -------
    x : float32 tensor of size [N, num_vars] holding the predictor variables
    d : float32 tensor of size [N, 1] holding the target variable
    N : number of elements
    num_vars : number of predictor variables
    num_targets : number of target variables (always 1)
    """
    # ndmin=2 keeps the array two-dimensional even when the file holds a single
    # row; without it np.loadtxt returns a 1-D array and shape[1] fails.
    data = np.loadtxt(file_name, delimiter=delimiter, ndmin=2)
    num_targets = 1
    N, num_columns = data.shape
    num_vars = num_columns - num_targets
    x = torch.tensor(data[:, :num_vars], dtype=torch.float32)
    d = torch.tensor(data[:, num_vars:], dtype=torch.float32).view(N, 1)
    return x, d, N, num_vars, num_targets
def normalization(x):
    """ Normalizes the data by centering and scaling the predictor variables

    Parameters
    ----------
    x : tensor of size [N, num_vars]

    Returns
    -------
    x_norm : centered-scaled tensor, same size as x
    mu : per-column mean
    sigma : per-column standard deviation as returned by Tensor.std(0),
            returned as computed, including zeros for constant columns
    """
    mu = x.mean(0)
    sigma = x.std(0)
    # A constant column has sigma == 0 and dividing by it would fill the column
    # with NaN. Its centered values are already 0, so divide by 1 instead.
    divisor = torch.where(sigma == 0, torch.ones_like(sigma), sigma)
    x_norm = (x - mu) / divisor
    return x_norm, mu, sigma
def split_data(x, d, val_prop=0.2, test_prop=0.2):
    """ Shuffles the rows and cuts them into training, validation and test subsets

    Rows of x and d are reordered with the same random permutation
    (torch.randperm), so each example stays paired with its target.

    Parameters
    ----------
    x : tensor of predictor variables, one row per element
    d : tensor of target values, one row per element
    val_prop : proportion of validation data over the entire dataset (between 0 and 1)
    test_prop : proportion of test data over the entire dataset (between 0 and 1)

    Returns
    -------
    x_train, d_train, x_val, d_val, x_test, d_test
    """
    assert val_prop + test_prop < 1.0
    N = x.size(0)
    order = torch.randperm(N)
    num_val = int(N * val_prop)
    num_test = int(N * test_prop)
    # Row offsets at which the validation and test parts begin
    val_start = N - num_val - num_test
    test_start = val_start + num_val
    x_shuffled = x[order]
    d_shuffled = d[order]
    return (
        x_shuffled[:val_start], d_shuffled[:val_start],
        x_shuffled[val_start:test_start], d_shuffled[val_start:test_start],
        x_shuffled[test_start:], d_shuffled[test_start:],
    )
# Define the neural network class
class NeuralNetwork(nn.Module):
    """Fully connected network built from a list of layer sizes.

    Parameters
    ----------
    layer_dims : list of layer widths, input first and output last; one
                 nn.Linear is created per consecutive pair
    activations : list of activation names, one per linear layer; supported
                  names are 'relu', 'sigmoid' and 'linear' (no activation)

    Raises
    ------
    ValueError : when an activation name is not one of the supported names.
                 Previously such a name was silently treated as 'linear',
                 which hid typos.
    """

    # Activation name -> module class; None means nothing follows the linear layer
    _ACTIVATIONS = {'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, 'linear': None}

    def __init__(self, layer_dims, activations):
        super().__init__()
        num_layers = len(layer_dims) - 1
        # Validate the names before building anything
        for i in range(num_layers):
            if activations[i] not in self._ACTIVATIONS:
                raise ValueError(
                    f"Unknown activation {activations[i]!r}; "
                    f"expected one of {sorted(self._ACTIVATIONS)}"
                )
        layers = []
        for i in range(num_layers):
            layers.append(nn.Linear(layer_dims[i], layer_dims[i + 1]))
            activation_class = self._ACTIVATIONS[activations[i]]
            if activation_class is not None:
                layers.append(activation_class())
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the stacked layers and return the network output."""
        return self.model(x)
def calculate_mse_cost(y, d):
    """ Calculates the MSE loss function, halved: mean of the squared errors divided by 2 """
    errors = y - d
    mean_squared_error = (errors ** 2).mean()
    return mean_squared_error / 2
# ===================== Part 1: Data Reading and Normalization =====================
print("Reading data ...")
x, d, N, num_vars, num_targets = read_data("food_truck.txt")
# x, d, N, num_vars, num_targets = read_data("houses.txt")

# Displaying the first examples from the dataset: up to 10, or fewer when the
# dataset itself is shorter (a fixed range of 10 would raise IndexError).
print("Displaying the first 10 examples from the dataset: ")
for i in range(min(10, N)):
    print(f"x = {x[i]}, d = {d[i]}")

# Normalizing the variables
print("Normalizing the variables ...")
x, mu, sigma = normalization(x)
# Targets are rescaled by their maximum; dmax is kept so that predictions can
# be converted back to the original scale.
dmax = d.max()
d = d / dmax

# Splitting the data
x_train, d_train, x_val, d_val, x_test, d_test = split_data(x, d)

# ===================== Part 2: Training =====================
# Hyperparameters
alpha = 0.001
num_iters = 500
layer_dims = [num_vars, 5, 10, num_targets]
activations = ['relu', 'sigmoid', 'linear']

# Model, loss, and optimizer
model = NeuralNetwork(layer_dims, activations)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=alpha)

train_costs = []
val_costs = []
for t in range(num_iters):
    # Training forward pass
    model.train()
    y_train = model(x_train)
    train_loss = criterion(y_train, d_train)

    # Backpropagation
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Validation forward pass (no gradients needed). Note that it runs after
    # the parameter update, whereas train_loss was measured before it.
    model.eval()
    with torch.no_grad():
        y_val = model(x_val)
        val_loss = criterion(y_val, d_val)

    train_costs.append(train_loss.item())
    val_costs.append(val_loss.item())

print("Final cost on the training set: ", train_costs[-1])
print("Final cost on the validation set: ", val_costs[-1])

# Plotting the evolution of the cost function
plt.figure()
plt.title("Evolution of the cost function during training")
plt.plot(range(num_iters), train_costs, label="Training")
plt.plot(range(num_iters), val_costs, label="Validation")
plt.legend(loc="upper left")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

# ===================== Part 3: Evaluation on the Test Set =====================
model.eval()
with torch.no_grad():
    y_test = model(x_test)
    test_loss = criterion(y_test, d_test)
print("Test set cost: ", test_loss.item())
%% Cell type:markdown id:7edf7168 tags:
### **_Deep Learning - Bsc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id:4c69d182 tags:
# Practical Session 4 – Convolutional Neural Networks
%% Cell type:markdown id:fa71eda4 tags:
The objective of this tutorial is to use the PyTorch library for building, training, and evaluating CNN models.
%% Cell type:markdown id:23f266da tags:
## Sequence 1: Training a CNN to classify CIFAR10 images
The goal is to train a Convolutional Neural Network (CNN) on the CIFAR10 image dataset and to measure its image-classification accuracy.
Be sure to check the PyTorch tutorials and documentation when needed:
https://pytorch.org/tutorials/
https://pytorch.org/docs/stable/index.html
%% Cell type:markdown id:4ba1c82d tags:
You can check whether a GPU is available on your machine and, if so, train on it to speed up the process.
%% Cell type:code id:6e18f2fd tags:
``` python
import torch
# check if CUDA is available; later cells read train_on_gpu to decide where
# the model and the batches live
train_on_gpu = torch.cuda.is_available()
if train_on_gpu:
    print("CUDA is available! Training on GPU ...")
else:
    print("CUDA is not available. Training on CPU ...")
```
%% Cell type:markdown id:5cf214eb tags:
Next we load the CIFAR10 dataset
%% Cell type:code id:462666a2 tags:
``` python
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
# loading parameters
num_workers = 0  # number of subprocesses to use for data loading
batch_size = 20  # how many samples per batch to load
valid_size = 0.2  # fraction of the training set held out for validation

# convert images to tensors, then normalize each of the 3 channels with
# mean 0.5 and standard deviation 0.5
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]
)

# choose the training and test datasets (downloaded into "data" when missing)
train_data = datasets.CIFAR10("data", train=True, download=True, transform=transform)
test_data = datasets.CIFAR10("data", train=False, download=True, transform=transform)

# shuffle the training indices and reserve the first `split` of them for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders (combine dataset and sampler); training and validation
# both read from train_data and differ only by their sampler
loader_options = {"batch_size": batch_size, "num_workers": num_workers}
train_loader = torch.utils.data.DataLoader(
    train_data, sampler=train_sampler, **loader_options
)
valid_loader = torch.utils.data.DataLoader(
    train_data, sampler=valid_sampler, **loader_options
)
test_loader = torch.utils.data.DataLoader(test_data, **loader_options)

# specify the image classes
classes = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]
```
%% Cell type:markdown id:58ec3903 tags:
CNN definition (this one is an example)
%% Cell type:code id:317bf070 tags:
``` python
import torch.nn as nn
import torch.nn.functional as F
# define the CNN architecture
class Net(nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    The first linear layer expects 16 * 5 * 5 features per sample, which is what
    the two un-padded 5x5 convolutions and 2x2 poolings yield for 3x32x32
    inputs. The last layer produces 10 scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor (one pooling layer shared by both stages)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Fully connected classifier head
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for a batch ``x``."""
        # Each stage: convolution -> ReLU -> 2x2 max-pooling
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten the feature maps to one vector per sample
        flat = x.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(hidden)
# Build the network and print its layer structure
model = Net()
print(model)
# With CUDA available, move the model's parameters to the GPU
if train_on_gpu:
    model = model.cuda()
```
%% Cell type:markdown id:a2dc4974 tags:
Loss function and training using SGD (Stochastic Gradient Descent) optimizer
%% Cell type:code id:4b53f229 tags:
``` python
import torch.optim as optim
criterion = nn.CrossEntropyLoss()  # specify loss function
optimizer = optim.SGD(model.parameters(), lr=0.01)  # specify optimizer
n_epochs = 30  # number of epochs to train the model
train_loss_list = []  # per-epoch average training loss, kept for plotting
# Best validation loss seen so far (np.inf: the np.Inf alias was removed in NumPy 2.0)
valid_loss_min = np.inf

for epoch in range(n_epochs):
    # Running sums of per-sample losses for this epoch
    train_loss = 0.0
    valid_loss = 0.0

    # ---- Training phase ----
    model.train()
    for data, target in train_loader:
        # Move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # Clear the gradients of all optimized variables
        optimizer.zero_grad()
        # Forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # Calculate the batch loss
        loss = criterion(output, target)
        # Backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # Perform a single optimization step (parameter update)
        optimizer.step()
        # The criterion averages over the batch, so weight by the batch size
        # to accumulate a sum over samples
        train_loss += loss.item() * data.size(0)

    # ---- Validation phase ----
    model.eval()
    # No parameter update happens here, so gradient tracking is switched off
    with torch.no_grad():
        for data, target in valid_loader:
            if train_on_gpu:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            loss = criterion(output, target)
            valid_loss += loss.item() * data.size(0)

    # Average per sample: the sums above are over samples, so divide by the
    # number of samples each sampler yields (not by the number of batches)
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(valid_loader.sampler)
    train_loss_list.append(train_loss)

    # Print training/validation statistics
    print(
        "Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}".format(
            epoch, train_loss, valid_loss
        )
    )

    # Save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print(
            "Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...".format(
                valid_loss_min, valid_loss
            )
        )
        torch.save(model.state_dict(), "model_cifar.pt")
        valid_loss_min = valid_loss
```
%% Cell type:markdown id:13e1df74 tags:
Does overfitting occur? If so, apply early stopping.
%% Cell type:code id:d39df818 tags:
``` python
import matplotlib.pyplot as plt
# Training loss curve: one point per epoch stored in train_loss_list
plt.plot(
    range(len(train_loss_list)),  # x axis: epoch index
    train_loss_list,  # y axis: average training loss of that epoch
)
plt.title("Performance of Model 1")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()
```
%% Cell type:markdown id:11df8fd4 tags:
Now loading the model with the lowest validation loss value
%% Cell type:code id:e93efdfc tags:
``` python
# Restore the checkpoint written by the training loop (lowest validation loss)
model.load_state_dict(torch.load("./model_cifar.pt"))

# Running sum of per-sample test losses
test_loss = 0.0
# Per-class counters, indexed by label id
class_correct = [0.0] * 10
class_total = [0.0] * 10

model.eval()
# Evaluation only: no gradients are needed
with torch.no_grad():
    for data, target in test_loader:
        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the batch loss
        loss = criterion(output, target)
        # the criterion averages over the batch; weight by batch size to sum over samples
        test_loss += loss.item() * data.size(0)
        # predicted class = index of the highest score
        _, pred = torch.max(output, 1)
        # per-sample hit flags and labels as plain Python lists
        hits = pred.eq(target.view_as(pred)).cpu().tolist()
        batch_labels = target.cpu().tolist()
        # Walk the samples actually present in this batch: the last batch can be
        # shorter than `batch_size`, which an index loop over `batch_size` would overrun
        for label, hit in zip(batch_labels, hits):
            class_correct[label] += float(hit)
            class_total[label] += 1

# average test loss per sample (the sum above is over samples, not batches)
test_loss = test_loss / len(test_loader.dataset)
print("Test Loss: {:.6f}\n".format(test_loss))

# Per-class accuracy
for i in range(10):
    if class_total[i] > 0:
        print(
            "Test Accuracy of %5s: %2d%% (%2d/%2d)"
            % (
                classes[i],
                100 * class_correct[i] / class_total[i],
                np.sum(class_correct[i]),
                np.sum(class_total[i]),
            )
        )
    else:
        print("Test Accuracy of %5s: N/A (no training examples)" % (classes[i]))

# Overall accuracy
print(
    "\nTest Accuracy (Overall): %2d%% (%2d/%2d)"
    % (
        100.0 * np.sum(class_correct) / np.sum(class_total),
        np.sum(class_correct),
        np.sum(class_total),
    )
)
```
%% Cell type:markdown id:944991a2 tags:
### Experiments:
Build a new network with the following structure.
- It has 3 convolutional layers of kernel size 3 and padding of 1.
- The first convolutional layer must output 16 channels, the second 32 and the third 64.
- At each convolutional layer output, we apply a ReLU activation then a MaxPool with kernel size of 2.
- Then, three fully connected layers, the first two being followed by a ReLU activation.
- The first fully connected layer will have an output size of 512.
- The second fully connected layer will have an output size of 64.
Compare the results obtained with this new network to those obtained previously.
%% Cell type:markdown id:201470f9 tags:
## Sequence 2: Working with pre-trained models.
PyTorch offers several pre-trained models https://pytorch.org/vision/0.8/models.html
We will use ResNet50 trained on the ImageNet dataset (https://www.image-net.org/index.php). Use the following code with the file `imagenet-simple-labels.json`, which contains the ImageNet labels, and the image `dog.png`, which we will use as a test image.
%% Cell type:code id:b4d13080 tags:
``` python
import json
from PIL import Image
from torchvision import models
# Choose an image to pass through the model
test_image = "dog.png"
# Configure matplotlib for pretty inline plots
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'

# Load the class labels (a JSON list; the predicted class index is used to look up a name)
with open("imagenet-simple-labels.json", encoding="utf-8") as f:
    labels = json.load(f)

# Preprocessing: resize to 224x224, convert to a tensor, normalize the 3 channels
data_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
)

# Load the image. PNG files may carry an alpha channel or a palette; force
# 3-channel RGB so that the 3-value Normalize above always applies.
image = Image.open(test_image).convert("RGB")
plt.imshow(image)
plt.xticks([])
plt.yticks([])

# Apply the transformation and add the batch dimension
# (append .cuda() here to send the image to the GPU)
image = data_transform(image).unsqueeze(0)

# Download the model if it's not there already. It will take a bit on the first run, after that it's fast
# NOTE(review): recent torchvision releases prefer the `weights=` argument over
# `pretrained=True` — confirm against the installed version before switching.
model = models.resnet50(pretrained=True)
# To run on the GPU, also call model.cuda() here

# Set layers such as dropout and batchnorm in evaluation mode
model.eval()

# Get the model output; inference only, so gradient tracking is disabled
with torch.no_grad():
    out = model(image)

# Find the predicted class (argmax converted to a plain int before indexing the list)
print("Predicted class is: {}".format(labels[out.argmax().item()]))
```
%% Cell type:markdown id:184cfceb tags:
### Experiments:
Study the code and the results obtained. Possibly add other images downloaded from the internet.
Experiment with other pre-trained CNN models.
%% Cell type:markdown id:5d57da4b tags:
## Sequence 3: Transfer Learning
For this work, we will use a pre-trained model (ResNet18) as a feature extractor and will refine the classification by training only the last fully connected layer of the network. Thus, the output layer of the pre-trained network will be replaced by a layer adapted to the new classes to be recognized, which in our case will be ants and bees.
Download and unzip in your working directory the dataset available at the address :
https://download.pytorch.org/tutorial/hymenoptera_data.zip
Execute the following code in order to display some images of the dataset.
%% Cell type:code id:be2d31f5 tags:
``` python
import os
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
# One preprocessing pipeline per split:
#   - "train": random crop + random horizontal flip (augmentation), then normalization
#   - "val":   deterministic resize + center crop, then the same normalization
data_transforms = {
    "train": transforms.Compose(
        [
            # ImageNet models were trained on 224x224 images
            transforms.RandomResizedCrop(224),
            # flip horizontally 50% of the time - increases train set variability
            transforms.RandomHorizontalFlip(),
            # convert it to a PyTorch tensor
            transforms.ToTensor(),
            # ImageNet models expect this norm
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    ),
    "val": transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    ),
}

# Root folder of the dataset; it holds one sub-folder per split
data_dir = "hymenoptera_data"

# An ImageFolder dataset per split, each with its own transform pipeline
image_datasets = {
    split_name: datasets.ImageFolder(
        os.path.join(data_dir, split_name), data_transforms[split_name]
    )
    for split_name in ("train", "val")
}

# Shuffled loaders with batches of 4 images
dataloaders = {
    split_name: torch.utils.data.DataLoader(
        dataset, batch_size=4, shuffle=True, num_workers=0
    )
    for split_name, dataset in image_datasets.items()
}

# Number of images per split and the class names found in the training folder
dataset_sizes = {
    split_name: len(dataset) for split_name, dataset in image_datasets.items()
}
class_names = image_datasets["train"].classes

# First CUDA device when available, CPU otherwise
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Helper function for displaying images
def imshow(inp, title=None):
    """Display a normalized image tensor with matplotlib.

    Args:
        inp: tensor exposing ``.numpy()``; its axes are reordered with
            ``transpose((1, 2, 0))`` before display, and the per-channel
            normalization is undone.
        title: optional title drawn above the image.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])
    # Move the first axis last, as expected by plt.imshow
    pixels = inp.numpy().transpose((1, 2, 0))
    # Undo the normalization, then clamp to [0, 1]
    pixels = np.clip(channel_std * pixels + channel_mean, 0, 1)
    plt.imshow(pixels)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated
    plt.show()
# Draw one batch (images and their class indices) from the training loader
inputs, classes = next(iter(dataloaders["train"]))
# Tile the images of the batch into a single grid image
out = torchvision.utils.make_grid(inputs)
# Show the grid, titled with the class name of every image in the batch
imshow(out, title=[class_names[label] for label in classes])
```
%% Cell type:markdown id:bbd48800 tags:
Now, execute the following code which uses a pre-trained model ResNet18 having replaced the output layer for the ants/bees classification and performs the model training by only changing the weights of this output layer.
%% Cell type:code id:572d824c tags:
``` python
import copy
import time
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a "train" phase (forward, backward, optimizer step) and a
    "val" phase (forward only). The weights giving the highest validation
    accuracy are kept and loaded back into the model before returning.

    Reads the module-level ``dataloaders`` and ``dataset_sizes`` dictionaries
    (keyed by "train" / "val") and the module-level ``device``.

    Args:
        model: network to train; updated in place.
        criterion: callable ``criterion(outputs, labels)`` returning the batch loss.
        optimizer: optimizer updating the trainable parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of epochs to run.

    Returns:
        Tuple ``(model, epoch_time)`` where ``epoch_time`` lists the duration
        in seconds of each epoch.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    epoch_time = []  # we'll keep track of the time needed for each epoch

    for epoch in range(num_epochs):
        epoch_start = time.time()
        print("Epoch {}/{}".format(epoch + 1, num_epochs))
        print("-" * 10)

        # Each epoch has a training and validation phase
        for phase in ["train", "val"]:
            is_train = phase == "train"
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            # Plain int so the accuracy below also works for an empty loader
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # Forward; track history only in the training phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Statistics: the loss is weighted by the batch size so that the
                # epoch loss below is a per-sample average
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            # Step the scheduler AFTER this epoch's optimizer updates; stepping it
            # first would skip the initial learning rate of the schedule
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print("{} Loss: {:.4f} Acc: {:.4f}".format(phase, epoch_loss, epoch_acc))

            # Deep copy the model whenever validation accuracy improves
            if phase == "val" and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        # Add the epoch time
        t_epoch = time.time() - epoch_start
        epoch_time.append(t_epoch)
        print()

    time_elapsed = time.time() - since
    print(
        "Training complete in {:.0f}m {:.0f}s".format(
            time_elapsed // 60, time_elapsed % 60
        )
    )
    print("Best val Acc: {:4f}".format(best_acc))

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model, epoch_time
# Start from a pre-trained ResNet18 and freeze every one of its parameters
model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad_(False)

# Swap the final fully connected layer for a new 2-output layer.
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)

# Move the model to the selected device
model = model.to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Only the parameters of the new final layer are handed to the optimizer
optimizer_conv = optim.SGD(model.fc.parameters(), lr=0.001, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

# Train for 10 epochs; returns the model and the per-epoch durations
model, epoch_time = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer_conv,
    scheduler=exp_lr_scheduler,
    num_epochs=10,
)
```
%% Cell type:markdown id:bbd48800 tags:
### Experiments:
Study the code and the results obtained.
Modify the code and add an "eval_model" function to allow
the evaluation of the model on a test set (different from the learning and validation sets used during the learning phase). Study the results obtained.
Now modify the code to replace the current classification layer with a set of two layers, using a "relu" activation function for the middle layer. Repeat the experiments and study the results obtained.
Experiment with other models and datasets.
Practical_sessions/Session_4/dog.png

84.4 KiB

["tench",
"goldfish",
"great white shark",
"tiger shark",
"hammerhead shark",
"electric ray",
"stingray",
"cock",
"hen",
"ostrich",
"brambling",
"goldfinch",
"house finch",
"junco",
"indigo bunting",
"American robin",
"bulbul",
"jay",
"magpie",
"chickadee",
"American dipper",
"kite",
"bald eagle",
"vulture",
"great grey owl",
"fire salamander",
"smooth newt",
"newt",
"spotted salamander",
"axolotl",
"American bullfrog",
"tree frog",
"tailed frog",
"loggerhead sea turtle",
"leatherback sea turtle",
"mud turtle",
"terrapin",
"box turtle",
"banded gecko",
"green iguana",
"Carolina anole",
"desert grassland whiptail lizard",
"agama",
"frilled-necked lizard",
"alligator lizard",
"Gila monster",
"European green lizard",
"chameleon",
"Komodo dragon",
"Nile crocodile",
"American alligator",
"triceratops",
"worm snake",
"ring-necked snake",
"eastern hog-nosed snake",
"smooth green snake",
"kingsnake",
"garter snake",
"water snake",
"vine snake",
"night snake",
"boa constrictor",
"African rock python",
"Indian cobra",
"green mamba",
"sea snake",
"Saharan horned viper",
"eastern diamondback rattlesnake",
"sidewinder",
"trilobite",
"harvestman",
"scorpion",
"yellow garden spider",
"barn spider",
"European garden spider",
"southern black widow",
"tarantula",
"wolf spider",
"tick",
"centipede",
"black grouse",
"ptarmigan",
"ruffed grouse",
"prairie grouse",
"peacock",
"quail",
"partridge",
"grey parrot",
"macaw",
"sulphur-crested cockatoo",
"lorikeet",
"coucal",
"bee eater",
"hornbill",
"hummingbird",
"jacamar",
"toucan",
"duck",
"red-breasted merganser",
"goose",
"black swan",
"tusker",
"echidna",
"platypus",
"wallaby",
"koala",
"wombat",
"jellyfish",
"sea anemone",
"brain coral",
"flatworm",
"nematode",
"conch",
"snail",
"slug",
"sea slug",
"chiton",
"chambered nautilus",
"Dungeness crab",
"rock crab",
"fiddler crab",
"red king crab",
"American lobster",
"spiny lobster",
"crayfish",
"hermit crab",
"isopod",
"white stork",
"black stork",
"spoonbill",
"flamingo",
"little blue heron",
"great egret",
"bittern",
"crane",
"limpkin",
"common gallinule",
"American coot",
"bustard",
"ruddy turnstone",
"dunlin",
"common redshank",
"dowitcher",
"oystercatcher",
"pelican",
"king penguin",
"albatross",
"grey whale",
"killer whale",
"dugong",
"sea lion",
"Chihuahua",
"Japanese Chin",
"Maltese",
"Pekingese",
"Shih Tzu",
"King Charles Spaniel",
"Papillon",
"toy terrier",
"Rhodesian Ridgeback",
"Afghan Hound",
"Basset Hound",
"Beagle",
"Bloodhound",
"Bluetick Coonhound",
"Black and Tan Coonhound",
"Treeing Walker Coonhound",
"English foxhound",
"Redbone Coonhound",
"borzoi",
"Irish Wolfhound",
"Italian Greyhound",
"Whippet",
"Ibizan Hound",
"Norwegian Elkhound",
"Otterhound",
"Saluki",
"Scottish Deerhound",
"Weimaraner",
"Staffordshire Bull Terrier",
"American Staffordshire Terrier",
"Bedlington Terrier",
"Border Terrier",
"Kerry Blue Terrier",
"Irish Terrier",
"Norfolk Terrier",
"Norwich Terrier",
"Yorkshire Terrier",
"Wire Fox Terrier",
"Lakeland Terrier",
"Sealyham Terrier",
"Airedale Terrier",
"Cairn Terrier",
"Australian Terrier",
"Dandie Dinmont Terrier",
"Boston Terrier",
"Miniature Schnauzer",
"Giant Schnauzer",
"Standard Schnauzer",
"Scottish Terrier",
"Tibetan Terrier",
"Australian Silky Terrier",
"Soft-coated Wheaten Terrier",
"West Highland White Terrier",
"Lhasa Apso",
"Flat-Coated Retriever",
"Curly-coated Retriever",
"Golden Retriever",
"Labrador Retriever",
"Chesapeake Bay Retriever",
"German Shorthaired Pointer",
"Vizsla",
"English Setter",
"Irish Setter",
"Gordon Setter",
"Brittany",
"Clumber Spaniel",
"English Springer Spaniel",
"Welsh Springer Spaniel",
"Cocker Spaniels",
"Sussex Spaniel",
"Irish Water Spaniel",
"Kuvasz",
"Schipperke",
"Groenendael",
"Malinois",
"Briard",
"Australian Kelpie",
"Komondor",
"Old English Sheepdog",
"Shetland Sheepdog",
"collie",
"Border Collie",
"Bouvier des Flandres",
"Rottweiler",
"German Shepherd Dog",
"Dobermann",
"Miniature Pinscher",
"Greater Swiss Mountain Dog",
"Bernese Mountain Dog",
"Appenzeller Sennenhund",
"Entlebucher Sennenhund",
"Boxer",
"Bullmastiff",
"Tibetan Mastiff",
"French Bulldog",
"Great Dane",
"St. Bernard",
"husky",
"Alaskan Malamute",
"Siberian Husky",
"Dalmatian",
"Affenpinscher",
"Basenji",
"pug",
"Leonberger",
"Newfoundland",
"Pyrenean Mountain Dog",
"Samoyed",
"Pomeranian",
"Chow Chow",
"Keeshond",
"Griffon Bruxellois",
"Pembroke Welsh Corgi",
"Cardigan Welsh Corgi",
"Toy Poodle",
"Miniature Poodle",
"Standard Poodle",
"Mexican hairless dog",
"grey wolf",
"Alaskan tundra wolf",
"red wolf",
"coyote",
"dingo",
"dhole",
"African wild dog",
"hyena",
"red fox",
"kit fox",
"Arctic fox",
"grey fox",
"tabby cat",
"tiger cat",
"Persian cat",
"Siamese cat",
"Egyptian Mau",
"cougar",
"lynx",
"leopard",
"snow leopard",
"jaguar",
"lion",
"tiger",
"cheetah",
"brown bear",
"American black bear",
"polar bear",
"sloth bear",
"mongoose",
"meerkat",
"tiger beetle",
"ladybug",
"ground beetle",
"longhorn beetle",
"leaf beetle",
"dung beetle",
"rhinoceros beetle",
"weevil",
"fly",
"bee",
"ant",
"grasshopper",
"cricket",
"stick insect",
"cockroach",
"mantis",
"cicada",
"leafhopper",
"lacewing",
"dragonfly",
"damselfly",
"red admiral",
"ringlet",
"monarch butterfly",
"small white",
"sulphur butterfly",
"gossamer-winged butterfly",
"starfish",
"sea urchin",
"sea cucumber",
"cottontail rabbit",
"hare",
"Angora rabbit",
"hamster",
"porcupine",
"fox squirrel",
"marmot",
"beaver",
"guinea pig",
"common sorrel",
"zebra",
"pig",
"wild boar",
"warthog",
"hippopotamus",
"ox",
"water buffalo",
"bison",
"ram",
"bighorn sheep",
"Alpine ibex",
"hartebeest",
"impala",
"gazelle",
"dromedary",
"llama",
"weasel",
"mink",
"European polecat",
"black-footed ferret",
"otter",
"skunk",
"badger",
"armadillo",
"three-toed sloth",
"orangutan",
"gorilla",
"chimpanzee",
"gibbon",
"siamang",
"guenon",
"patas monkey",
"baboon",
"macaque",
"langur",
"black-and-white colobus",
"proboscis monkey",
"marmoset",
"white-headed capuchin",
"howler monkey",
"titi",
"Geoffroy's spider monkey",
"common squirrel monkey",
"ring-tailed lemur",
"indri",
"Asian elephant",
"African bush elephant",
"red panda",
"giant panda",
"snoek",
"eel",
"coho salmon",
"rock beauty",
"clownfish",
"sturgeon",
"garfish",
"lionfish",
"pufferfish",
"abacus",
"abaya",
"academic gown",
"accordion",
"acoustic guitar",
"aircraft carrier",
"airliner",
"airship",
"altar",
"ambulance",
"amphibious vehicle",
"analog clock",
"apiary",
"apron",
"waste container",
"assault rifle",
"backpack",
"bakery",
"balance beam",
"balloon",
"ballpoint pen",
"Band-Aid",
"banjo",
"baluster",
"barbell",
"barber chair",
"barbershop",
"barn",
"barometer",
"barrel",
"wheelbarrow",
"baseball",
"basketball",
"bassinet",
"bassoon",
"swimming cap",
"bath towel",
"bathtub",
"station wagon",
"lighthouse",
"beaker",
"military cap",
"beer bottle",
"beer glass",
"bell-cot",
"bib",
"tandem bicycle",
"bikini",
"ring binder",
"binoculars",
"birdhouse",
"boathouse",
"bobsleigh",
"bolo tie",
"poke bonnet",
"bookcase",
"bookstore",
"bottle cap",
"bow",
"bow tie",
"brass",
"bra",
"breakwater",
"breastplate",
"broom",
"bucket",
"buckle",
"bulletproof vest",
"high-speed train",
"butcher shop",
"taxicab",
"cauldron",
"candle",
"cannon",
"canoe",
"can opener",
"cardigan",
"car mirror",
"carousel",
"tool kit",
"carton",
"car wheel",
"automated teller machine",
"cassette",
"cassette player",
"castle",
"catamaran",
"CD player",
"cello",
"mobile phone",
"chain",
"chain-link fence",
"chain mail",
"chainsaw",
"chest",
"chiffonier",
"chime",
"china cabinet",
"Christmas stocking",
"church",
"movie theater",
"cleaver",
"cliff dwelling",
"cloak",
"clogs",
"cocktail shaker",
"coffee mug",
"coffeemaker",
"coil",
"combination lock",
"computer keyboard",
"confectionery store",
"container ship",
"convertible",
"corkscrew",
"cornet",
"cowboy boot",
"cowboy hat",
"cradle",
"crane",
"crash helmet",
"crate",
"infant bed",
"Crock Pot",
"croquet ball",
"crutch",
"cuirass",
"dam",
"desk",
"desktop computer",
"rotary dial telephone",
"diaper",
"digital clock",
"digital watch",
"dining table",
"dishcloth",
"dishwasher",
"disc brake",
"dock",
"dog sled",
"dome",
"doormat",
"drilling rig",
"drum",
"drumstick",
"dumbbell",
"Dutch oven",
"electric fan",
"electric guitar",
"electric locomotive",
"entertainment center",
"envelope",
"espresso machine",
"face powder",
"feather boa",
"filing cabinet",
"fireboat",
"fire engine",
"fire screen sheet",
"flagpole",
"flute",
"folding chair",
"football helmet",
"forklift",
"fountain",
"fountain pen",
"four-poster bed",
"freight car",
"French horn",
"frying pan",
"fur coat",
"garbage truck",
"gas mask",
"gas pump",
"goblet",
"go-kart",
"golf ball",
"golf cart",
"gondola",
"gong",
"gown",
"grand piano",
"greenhouse",
"grille",
"grocery store",
"guillotine",
"barrette",
"hair spray",
"half-track",
"hammer",
"hamper",
"hair dryer",
"hand-held computer",
"handkerchief",
"hard disk drive",
"harmonica",
"harp",
"harvester",
"hatchet",
"holster",
"home theater",
"honeycomb",
"hook",
"hoop skirt",
"horizontal bar",
"horse-drawn vehicle",
"hourglass",
"iPod",
"clothes iron",
"jack-o'-lantern",
"jeans",
"jeep",
"T-shirt",
"jigsaw puzzle",
"pulled rickshaw",
"joystick",
"kimono",
"knee pad",
"knot",
"lab coat",
"ladle",
"lampshade",
"laptop computer",
"lawn mower",
"lens cap",
"paper knife",
"library",
"lifeboat",
"lighter",
"limousine",
"ocean liner",
"lipstick",
"slip-on shoe",
"lotion",
"speaker",
"loupe",
"sawmill",
"magnetic compass",
"mail bag",
"mailbox",
"tights",
"tank suit",
"manhole cover",
"maraca",
"marimba",
"mask",
"match",
"maypole",
"maze",
"measuring cup",
"medicine chest",
"megalith",
"microphone",
"microwave oven",
"military uniform",
"milk can",
"minibus",
"miniskirt",
"minivan",
"missile",
"mitten",
"mixing bowl",
"mobile home",
"Model T",
"modem",
"monastery",
"monitor",
"moped",
"mortar",
"square academic cap",
"mosque",
"mosquito net",
"scooter",
"mountain bike",
"tent",
"computer mouse",
"mousetrap",
"moving van",
"muzzle",
"nail",
"neck brace",
"necklace",
"nipple",
"notebook computer",
"obelisk",
"oboe",
"ocarina",
"odometer",
"oil filter",
"organ",
"oscilloscope",
"overskirt",
"bullock cart",
"oxygen mask",
"packet",
"paddle",
"paddle wheel",
"padlock",
"paintbrush",
"pajamas",
"palace",
"pan flute",
"paper towel",
"parachute",
"parallel bars",
"park bench",
"parking meter",
"passenger car",
"patio",
"payphone",
"pedestal",
"pencil case",
"pencil sharpener",
"perfume",
"Petri dish",
"photocopier",
"plectrum",
"Pickelhaube",
"picket fence",
"pickup truck",
"pier",
"piggy bank",
"pill bottle",
"pillow",
"ping-pong ball",
"pinwheel",
"pirate ship",
"pitcher",
"hand plane",
"planetarium",
"plastic bag",
"plate rack",
"plow",
"plunger",
"Polaroid camera",
"pole",
"police van",
"poncho",
"billiard table",
"soda bottle",
"pot",
"potter's wheel",
"power drill",
"prayer rug",
"printer",
"prison",
"projectile",
"projector",
"hockey puck",
"punching bag",
"purse",
"quill",
"quilt",
"race car",
"racket",
"radiator",
"radio",
"radio telescope",
"rain barrel",
"recreational vehicle",
"reel",
"reflex camera",
"refrigerator",
"remote control",
"restaurant",
"revolver",
"rifle",
"rocking chair",
"rotisserie",
"eraser",
"rugby ball",
"ruler",
"running shoe",
"safe",
"safety pin",
"salt shaker",
"sandal",
"sarong",
"saxophone",
"scabbard",
"weighing scale",
"school bus",
"schooner",
"scoreboard",
"CRT screen",
"screw",
"screwdriver",
"seat belt",
"sewing machine",
"shield",
"shoe store",
"shoji",
"shopping basket",
"shopping cart",
"shovel",
"shower cap",
"shower curtain",
"ski",
"ski mask",
"sleeping bag",
"slide rule",
"sliding door",
"slot machine",
"snorkel",
"snowmobile",
"snowplow",
"soap dispenser",
"soccer ball",
"sock",
"solar thermal collector",
"sombrero",
"soup bowl",
"space bar",
"space heater",
"space shuttle",
"spatula",
"motorboat",
"spider web",
"spindle",
"sports car",
"spotlight",
"stage",
"steam locomotive",
"through arch bridge",
"steel drum",
"stethoscope",
"scarf",
"stone wall",
"stopwatch",
"stove",
"strainer",
"tram",
"stretcher",
"couch",
"stupa",
"submarine",
"suit",
"sundial",
"sunglass",
"sunglasses",
"sunscreen",
"suspension bridge",
"mop",
"sweatshirt",
"swimsuit",
"swing",
"switch",
"syringe",
"table lamp",
"tank",
"tape player",
"teapot",
"teddy bear",
"television",
"tennis ball",
"thatched roof",
"front curtain",
"thimble",
"threshing machine",
"throne",
"tile roof",
"toaster",
"tobacco shop",
"toilet seat",
"torch",
"totem pole",
"tow truck",
"toy store",
"tractor",
"semi-trailer truck",
"tray",
"trench coat",
"tricycle",
"trimaran",
"tripod",
"triumphal arch",
"trolleybus",
"trombone",
"tub",
"turnstile",
"typewriter keyboard",
"umbrella",
"unicycle",
"upright piano",
"vacuum cleaner",
"vase",
"vault",
"velvet",
"vending machine",
"vestment",
"viaduct",
"violin",
"volleyball",
"waffle iron",
"wall clock",
"wallet",
"wardrobe",
"military aircraft",
"sink",
"washing machine",
"water bottle",
"water jug",
"water tower",
"whiskey jug",
"whistle",
"wig",
"window screen",
"window shade",
"Windsor tie",
"wine bottle",
"wing",
"wok",
"wooden spoon",
"wool",
"split-rail fence",
"shipwreck",
"yawl",
"yurt",
"website",
"comic book",
"crossword",
"traffic sign",
"traffic light",
"dust jacket",
"menu",
"plate",
"guacamole",
"consomme",
"hot pot",
"trifle",
"ice cream",
"ice pop",
"baguette",
"bagel",
"pretzel",
"cheeseburger",
"hot dog",
"mashed potato",
"cabbage",
"broccoli",
"cauliflower",
"zucchini",
"spaghetti squash",
"acorn squash",
"butternut squash",
"cucumber",
"artichoke",
"bell pepper",
"cardoon",
"mushroom",
"Granny Smith",
"strawberry",
"orange",
"lemon",
"fig",
"pineapple",
"banana",
"jackfruit",
"custard apple",
"pomegranate",
"hay",
"carbonara",
"chocolate syrup",
"dough",
"meatloaf",
"pizza",
"pot pie",
"burrito",
"red wine",
"espresso",
"cup",
"eggnog",
"alp",
"bubble",
"cliff",
"coral reef",
"geyser",
"lakeshore",
"promontory",
"shoal",
"seashore",
"valley",
"volcano",
"baseball player",
"bridegroom",
"scuba diver",
"rapeseed",
"daisy",
"yellow lady's slipper",
"corn",
"acorn",
"rose hip",
"horse chestnut seed",
"coral fungus",
"agaric",
"gyromitra",
"stinkhorn mushroom",
"earth star",
"hen-of-the-woods",
"bolete",
"ear",
"toilet paper"]
%% Cell type:markdown id: tags:
### **_Deep Learning - Bsc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id: tags:
# Practical Session 5 – Monitoring the training with Weights & Biases
The objective of this short tutorial is to learn how to monitor a CNN training with [Weights and Biases](https://wandb.ai/site/). With W&B, you can track and compare your experiments, visualize your model training and performance.
#### Installation
You'll need to install `wandb`.
```shell
pip install wandb
```
Have a look at the documentation for integrating [Weights & Biases into PyTorch](https://docs.wandb.ai/guides/integrations/pytorch/).
Then, study the code below and the information logged in W&B.
As the computation is heavy, particularly during training, we encourage you to use a GPU. If your laptop is not equipped with one, you may use one of these remote Jupyter servers, where you can select execution on GPU:
1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)
This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the kernel "Python PyTorch" to run it on GPU and have access to PyTorch module.
2) [Google Colaboratory](https://colab.research.google.com/)
Before executing the notebook, select the execution on GPU : "Exécution" Menu -> "Modifier le type d'exécution" and select "T4 GPU".
%% Cell type:code id: tags:
```
import wandb
# Start a W&B run. The hyperparameters stored in `config` are read back
# through `wandb.config` by the following cells.
wandb.init(
    project="cnn_cifar10",  # project name: set your own
    config=dict(
        epochs=5,
        batch_size=64,
        learning_rate=0.01,
        optimizer="Adam",
    ),
)
```
%% Cell type:code id: tags:
```
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.optim as optim
# --- Loading configuration ---
num_workers = 0  # subprocesses used for data loading
batch_size = wandb.config.batch_size  # samples per mini-batch, taken from the W&B run config
valid_size = 0.2  # fraction of the training set held out for validation

# Turn images into float tensors, then normalize every channel with mean 0.5 / std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# CIFAR10 training and test sets, stored in (and downloaded to) the "data" folder
train_data = datasets.CIFAR10("data", train=True, download=True, transform=transform)
test_data = datasets.CIFAR10("data", train=False, download=True, transform=transform)

# Shuffle the training indices; the first `split` of them form the validation set
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
valid_idx = indices[:split]
train_idx = indices[split:]

# Samplers that draw (in random order) only from their own index subset
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Training and validation loaders both read `train_data`, each through its own sampler
train_loader = torch.utils.data.DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
    num_workers=num_workers,
)
valid_loader = torch.utils.data.DataLoader(
    train_data,
    sampler=valid_sampler,
    batch_size=batch_size,
    num_workers=num_workers,
)
# The test loader walks the whole test set
test_loader = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size,
    num_workers=num_workers,
)

# Human-readable class names, indexed by label id
classes = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]
```
%% Cell type:code id: tags:
```
import torch.nn as nn
import torch.nn.functional as F
# define the CNN architecture
class Net(nn.Module):
    """Small CNN classifier: two conv + max-pool stages, then three linear layers.

    The first linear layer takes 16 * 5 * 5 features per sample, matching the
    output of the two un-padded 5x5 convolutions and 2x2 poolings for 3x32x32
    inputs. The final layer outputs 10 scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Convolution / pooling layers
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Fully connected layers
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Compute the raw class scores (no softmax applied) for a batch ``x``."""
        # Stage 1 and stage 2: convolution -> ReLU -> max-pooling
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        # One flat feature vector per sample
        features = stage2.view(-1, 16 * 5 * 5)
        features = F.relu(self.fc1(features))
        features = F.relu(self.fc2(features))
        return self.fc3(features)
```
%% Cell type:code id: tags:
```
# Define model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
```
%% Cell type:code id: tags:
```
# Best (lowest) average validation loss seen so far. float("inf") is used
# instead of np.Inf because that alias was removed in NumPy 2.0.
valid_loss_min = float("inf")
# Training loop
for epoch in range(wandb.config.epochs):
    epoch_loss_train = 0
    correct_train = 0
    total_train = 0
    # Training
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss_train += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    # Validation
    epoch_loss_valid = 0
    correct_valid = 0
    total_valid = 0
    model.eval()
    # No gradients are needed for validation; disabling autograd avoids
    # building a graph for every validation batch.
    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_loss_valid += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_valid += labels.size(0)
            correct_valid += (predicted == labels).sum().item()
    # Per-epoch metrics: accuracy in percent, loss averaged over the batches
    accuracy_train = 100 * correct_train / total_train
    avg_loss_train = epoch_loss_train / len(train_loader)
    accuracy_valid = 100 * correct_valid / total_valid
    avg_loss_valid = epoch_loss_valid / len(valid_loader)
    # Save model if validation loss has decreased (the average is compared and
    # reported, so the printed value matches the "valid_loss" logged below)
    if avg_loss_valid <= valid_loss_min:
        print(
            "Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...".format(
                valid_loss_min, avg_loss_valid
            )
        )
        torch.save(model.state_dict(), "model_cifar.pt")
        valid_loss_min = avg_loss_valid
    # Log metrics to wandb in a single call: every wandb.log call advances the
    # run's step, so logging train and validation metrics separately would
    # place them on different steps.
    wandb.log(
        {
            "epoch": epoch + 1,
            "train_loss": avg_loss_train,
            "train_accuracy": accuracy_train,
            "valid_loss": avg_loss_valid,
            "valid_accuracy": accuracy_valid,
        }
    )
    print(f"Epoch {epoch+1}, Loss: {avg_loss_train:.4f}, Accuracy: {accuracy_train:.2f}%")
```
%% Cell type:code id: tags:
```
torch.save(model.state_dict(), "model_cifar.pth")
wandb.save("model_cifar.pth")
```
%% Cell type:code id: tags:
```
# Log an example image
# `images` is the leftover loop variable from an earlier cell (presumably the
# last validation batch - verify). It went through the Normalize transform of
# the data-loading cell, so its values are not in [0, 1].
wandb.log({"example_image": [wandb.Image(images[0].cpu())]})
# Log gradients
# NOTE(review): this runs after the training loop and right before
# wandb.finish(). wandb.watch presumably only records backward passes that
# happen after it is called, so nothing may be logged from here - confirm and
# consider calling it before the training loop instead.
wandb.watch(model, log="all")
```
%% Cell type:code id: tags:
```
# Finish the wandb run
wandb.finish()
```
%% Cell type:markdown id: tags:
## Experiments
Run several trainings with different tuning of the hyperparameters and check the result in W&B.
%% Cell type:markdown id: tags:
### **_Deep Learning - Bsc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id: tags:
# Practical Session 6 – Vision Transformers
The objective of this tutorial is to use the PyTorch library for building, training, and evaluating Vision Transformer models.
The notebook contains code cells with the **"# TO DO"** comments. Your goal is to complete these cells and run the proposed experiments.
As the computation is heavy, particularly during training, we encourage you to use a GPU. If your laptop is not equipped with one, you may use one of these remote Jupyter servers, where you can select execution on a GPU:
1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)
This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the kernel "Python PyTorch" to run it on GPU and have access to PyTorch module.
2) [Google Colaboratory](https://colab.research.google.com/)
Before executing the notebook, select the execution on GPU : "Exécution" Menu -> "Modifier le type d'exécution" and select "T4 GPU".
%% Cell type:markdown id: tags:
### Goal of the TD
Transformers were introduced by [Vaswani et al. in 2017](https://arxiv.org/abs/1706.03762) in the context of NLP (Natural Language Processing), and particularly for Machine Translation.
Its great success has led to its adaptation to various applications, including image classification. In this trend, [Dosovitskiy et al. in 2020](https://arxiv.org/abs/2010.11929) have proposed Vision Transformers (ViT) that we will study and implement from scratch in this TD.
The principle is illustrated in the following picture from this paper.
![Vision Transformers](./figures/vit.png "Vision Transformers")
First, an input image is “cut” into sub-images equally sized.
Each such sub-image goes through a linear embedding. From then, each sub-image becomes a one-dimensional vector.
A positional embedding is then added to these vectors (tokens). The positional embedding allows the network to know where each sub-image is positioned originally in the image. Without this information, the network would not be able to know where each such image would be placed, leading to potentially wrong predictions.
These tokens are then passed, together with a special classification token, to the transformer encoder blocks, where each block is composed of: a Layer Normalization (LN), followed by a Multi-head Self-Attention (MSA) and a residual connection; then a second LN, a Multi-Layer Perceptron (MLP), and again a residual connection. These blocks are connected back-to-back.
Finally, a classification MLP head is used for the final classification only on the special classification token, which by the end of this process has global information about the picture.
%% Cell type:markdown id: tags:
### Implementation of the ViT model
%% Cell type:markdown id: tags:
First, we import the required modules.
%% Cell type:code id: tags:
```
# Import modules
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision.datasets.mnist import MNIST
from torchvision.transforms import ToTensor
```
%% Cell type:markdown id: tags:
For this first experiment, we will use the MNIST dataset, which contains 28x28 grayscale images of handwritten digits (0–9).
%% Cell type:code id: tags:
```
# Load data
transform = ToTensor()
train_set = MNIST(
root="datasets", train=True, download=True, transform=transform
)
test_set = MNIST(
root="datasets", train=False, download=True, transform=transform
)
train_loader = DataLoader(train_set, shuffle=True, batch_size=128)
test_loader = DataLoader(test_set, shuffle=False, batch_size=128)
```
%% Cell type:markdown id: tags:
### "Patchification"
The transformer encoder was originally developed with sequence data in mind, such as English sentences. However, as an image is not a sequence, we need to “sequencify” an image. To do this, we break it into multiple sub-images and map each sub-image to a vector.
We do so by simply reshaping our input, which has size (N, C, H, W), where N is the batch size, C the number of channels and (H,W) the image dimension. In the case of MNIST, dimensions are (N, 1, 28, 28). The target dimension is (N, #Patches, Patch dimensionality), where the dimensionality of a patch is adjusted accordingly.
In this example, we break each (1, 28, 28) into 7x7 patches (hence, each of size 4x4). That is, we are going to obtain 7x7=49 sub-images out of a single image.
Thus, we reshape input (N, 1, 28, 28) to (N, PxP, C x H/P x W/P) = (N, 49, 16)
Notice that, while each patch is a picture of size 1x4x4, we flatten it to a 16-dimensional vector. Also, in this case, we only had a single color channel. If we had multiple color channels, those would also have been flattened into the vector.
%% Cell type:code id: tags:
```
def patchify(images, n_patches):
    """Split a batch of square images into flattened, non-overlapping patches.

    Args:
        images: Tensor of shape (N, C, H, W) with H == W.
        n_patches: Number of patches along each spatial axis; must divide H.

    Returns:
        A new tensor of shape (N, n_patches**2, C * (H // n_patches)**2) in the
        default floating-point dtype, allocated on the same device as
        ``images``. Patches are ordered row by row over the image, and each
        patch is flattened in (channel, row, column) order.

    Raises:
        AssertionError: If the images are not square or their side is not a
            multiple of ``n_patches``.
    """
    n, c, h, w = images.shape
    assert h == w, "Patchify method is implemented for square images only"
    assert h % n_patches == 0, "Image size must be divisible by the number of patches"
    patch_size = h // n_patches

    # Allocate the result on the input's device (the previous version always
    # allocated on CPU, which broke models running on a GPU).
    patches = torch.empty(
        n,
        n_patches**2,
        c * patch_size**2,
        dtype=torch.get_default_dtype(),
        device=images.device,
    )
    # View the image as a grid: (N, C, patch_row, y, patch_col, x) ...
    grid = images.reshape(n, c, n_patches, patch_size, n_patches, patch_size)
    # ... and copy it, with the patch indices moved in front of (C, y, x), into
    # the output seen as (N, patch_row, patch_col, C, y, x). A single strided
    # copy replaces the Python loop over images and patches.
    patches.view(n, n_patches, n_patches, c, patch_size, patch_size).copy_(
        grid.permute(0, 2, 4, 1, 3, 5)
    )
    return patches
```
%% Cell type:markdown id: tags:
### Linear embedding
Now that we have our flattened patches, we can map each of them through a Linear mapping. While each patch was a 4x4=16 dimensional vector, the linear mapping can map to any arbitrary vector size. Thus, we will use for this a parameter `hidden_d` for "hidden dimension".
In this example, we will use a hidden dimension of 8, but in principle, any number can be put here. We will thus be mapping each 16-dimensional patch to an 8-dimensional patch.
%% Cell type:markdown id: tags:
### Positional encoding
Positional encoding allows the model to understand where each patch would be placed in the original image. While it is theoretically possible to learn such positional embeddings, previous work by [Vaswani et al. in 2017](https://arxiv.org/abs/1706.03762) suggests that we can just add sines and cosines waves.
In particular, positional encoding adds high-frequency values to the first dimensions and lower-frequency values to the latter dimensions.
In each sequence, for token i we add to its j-th coordinate the following value:
![Positional encoding](./figures/positional_encoding.png "Positional encoding").
This positional embedding is a function of the number of elements in the sequence and the dimensionality of each element. Thus, it is always a 2-dimensional tensor or “rectangle”.
Here is a simple function that, given the number of tokens and the dimensionality of each of them, outputs a matrix where each coordinate (i,j) is the value to be added to token i in dimension j.
This positional encoding is added to our model after the linear mapping and the addition of the class token.
%% Cell type:code id: tags:
```
def get_positional_embeddings(sequence_length, d):
    """Build the sinusoidal positional-encoding table.

    Args:
        sequence_length: Number of tokens (rows of the table).
        d: Dimensionality of each token (columns of the table).

    Returns:
        A (sequence_length, d) tensor whose entry (i, j) is
        sin(i / 10000**(j / d)) for even j and cos(i / 10000**((j - 1) / d))
        for odd j.
    """
    table = torch.ones(sequence_length, d)
    for position in range(sequence_length):
        row = table[position]  # view into the table: writes land in `table`
        for dim in range(d):
            is_even = dim % 2 == 0
            # An odd dimension shares the frequency of the even one before it.
            paired_dim = dim if is_even else dim - 1
            angle = position / (10000 ** (paired_dim / d))
            row[dim] = np.sin(angle) if is_even else np.cos(angle)
    return table
```
%% Cell type:markdown id: tags:
### Multi-Head Self-Attention
The objective is now that, for a single image, each patch has to be updated based on some similarity measure with the other patches. We do so by linearly mapping each patch (that is now an 8-dimensional vector in our example) to 3 distinct vectors: q, k, and v (query, key, value).
Then, for a single patch, we are going to compute the dot product between its q vector with all of the k vectors, divide by the square root of the dimensionality of these vectors (sqrt(8)), softmax these so-called attention cues, and finally multiply each attention cue with the v vectors associated with the different k vectors and sum all up.
In this way, each patch assumes a new value that is based on its similarity (after the linear mapping to q, k, and v) with other patches. This whole procedure, however, is carried out H times on H sub-vectors of our current 8-dimensional patches, where H is the number of Heads.
Once all results are obtained, they are concatenated together. Finally, the result is passed through a linear layer (for good measure).
The intuitive idea behind attention is that it allows modeling the relationship between the inputs. What makes a ‘0’ a zero are not the individual pixel values, but how they relate to each other.
This is implemented in the MSA class:
%% Cell type:code id: tags:
```
class MSA(nn.Module):
    """Multi-head self-attention with independent Q/K/V projections per head.

    The token dimension ``d`` is split into ``n_heads`` contiguous slices of
    size ``d // n_heads``. Each head projects its slice to queries, keys and
    values with its own linear layers, applies scaled dot-product attention,
    and the head outputs are concatenated back to dimension ``d``.

    Args:
        d: Token dimensionality; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
    """

    def __init__(self, d, n_heads=2):
        super().__init__()
        self.d = d
        self.n_heads = n_heads
        assert d % n_heads == 0, f"Can't divide dimension {d} into {n_heads} heads"
        d_head = d // n_heads
        self.q_mappings = nn.ModuleList(
            [nn.Linear(d_head, d_head) for _ in range(self.n_heads)]
        )
        self.k_mappings = nn.ModuleList(
            [nn.Linear(d_head, d_head) for _ in range(self.n_heads)]
        )
        self.v_mappings = nn.ModuleList(
            [nn.Linear(d_head, d_head) for _ in range(self.n_heads)]
        )
        self.d_head = d_head
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sequences):
        """Apply self-attention to a batch of token sequences.

        Args:
            sequences: Tensor of shape (N, seq_length, token_dim).

        Returns:
            Tensor of shape (N, seq_length, token_dim).
        """
        # Each head works on its own slice of the token dimension; the whole
        # batch is processed at once instead of looping over the sequences.
        head_outputs = []
        for head in range(self.n_heads):
            seq = sequences[..., head * self.d_head : (head + 1) * self.d_head]
            q = self.q_mappings[head](seq)
            k = self.k_mappings[head](seq)
            v = self.v_mappings[head](seq)
            # Scaled dot-product attention: similarity of every query with
            # every key, scaled by sqrt(d_head), normalised over the keys,
            # then used to average the values.
            scores = q @ k.transpose(-2, -1) / (self.d_head**0.5)
            attention = self.softmax(scores) @ v
            head_outputs.append(attention)
        # Concatenate the heads back to (N, seq_length, token_dim)
        return torch.cat(head_outputs, dim=-1)
```
%% Cell type:markdown id: tags:
Notice that, for each head, we create distinct Q, K, and V mapping functions (square matrices of size 4x4 in our example).
Since our inputs will be sequences of size (N, 50, 8), and we only use 2 heads, we will at some point have an (N, 50, 2, 4) tensor, use a nn.Linear(4, 4) module on it, and then come back, after concatenation, to an (N, 50, 8) tensor.
Also notice that using loops is not the most efficient way to compute the multi-head self-attention, but it makes the code much clearer for learning.
%% Cell type:markdown id: tags:
### Transformer Encoder Blocks
The next step is to create the transformer encoder block class.
Layer normalization (LN) is a popular block that, given an input, subtracts its mean and divides by the standard deviation. It is applied to the last dimension only. We can thus make each of our 50x8 matrices (representing a single sequence) have mean 0 and std 1. After we run our (N, 50, 8) tensor through LN, we still get the same dimensionality.
We will also use residual connections, which consist in adding the original input to the result of some computation. This, intuitively, allows a network to become more powerful while also preserving the set of possible functions that the model can approximate.
We will add a residual connection that will add our original (N, 50, 8) tensor to the (N, 50, 8) obtained after LN and MSA.
Next is to add a simple residual connection between what we already have and what we get after passing the current tensor through another LN and an MLP. The MLP is composed of two layers, where the hidden layer typically is four times as big (this is a parameter).
The transformer encoder block class (which will be a component of the future ViT class) is thus as follows:
%% Cell type:code id: tags:
```
class ViTBlock(nn.Module):
    """Transformer encoder block: LN -> MSA -> residual, LN -> MLP -> residual.

    Args:
        hidden_d: Token dimensionality.
        n_heads: Number of attention heads used by the MSA layer.
        mlp_ratio: Size of the MLP hidden layer as a multiple of ``hidden_d``.
    """

    def __init__(self, hidden_d, n_heads, mlp_ratio=4):
        super().__init__()
        self.hidden_d = hidden_d
        self.n_heads = n_heads
        self.norm1 = nn.LayerNorm(hidden_d)
        self.mhsa = MSA(hidden_d, n_heads)
        self.norm2 = nn.LayerNorm(hidden_d)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_d, mlp_ratio * hidden_d),
            nn.GELU(),
            nn.Linear(mlp_ratio * hidden_d, hidden_d),
        )

    def forward(self, x):
        """Return the updated tokens; the output has the same shape as ``x``."""
        # First residual connection around layer norm + self-attention
        out = x + self.mhsa(self.norm1(x))
        # Second residual connection around layer norm + MLP
        out = out + self.mlp(self.norm2(out))
        return out
```
%% Cell type:markdown id: tags:
### ViT model
Now that the encoder block is ready, we just need to insert it in our bigger ViT model which is responsible for patchifying before the transformer blocks, and carrying out the classification after.
To help classification, we will use an additional **classification token** to the input sequence. This is a special token that we add to our model that has the role of capturing information about the other tokens. This will happen with the MSA block. When information about all other tokens will be present here, we will be able to classify the image using only this special token. The initial value of the special token (the one fed to the transformer encoder) is a parameter of the model that needs to be learned.
Thus, we will add a parameter to our model and convert our (N, 49, 8) tokens tensor to an (N, 50, 8) tensor (we add the special token to each sequence).
We could have an arbitrary number of transformer blocks. In this example, to keep it simple, we will use only 2. We also add a parameter specifying how many heads each encoder block uses.
Finally, we can extract just the classification token (first token) out of our N sequences, and use each token to get N classifications.
Since we decided that each token is an 8-dimensional vector, and since we have 10 possible digits, we can implement the classification MLP as a simple 8x10 matrix, activated with the SoftMax function.
The output of our model should be an (N, 10) tensor.
%% Cell type:code id: tags:
```
class ViT(nn.Module):
    """Minimal Vision Transformer for image classification.

    Images are cut into ``n_patches x n_patches`` patches, each patch is
    linearly embedded to ``hidden_d`` dimensions, a learnable classification
    token is prepended, sinusoidal positional embeddings are added, the tokens
    go through ``n_blocks`` encoder blocks, and the classification token is
    mapped to a distribution over ``out_d`` classes.

    Args:
        chw: Input shape as (C, H, W); H and W must be divisible by n_patches.
        n_patches: Number of patches along each spatial axis.
        n_blocks: Number of transformer encoder blocks.
        hidden_d: Token dimensionality.
        n_heads: Number of attention heads per encoder block.
        out_d: Number of output classes.
    """

    def __init__(self, chw, n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10):
        # Super constructor
        super().__init__()
        # Attributes
        self.chw = chw  # ( C , H , W )
        self.n_patches = n_patches
        self.n_blocks = n_blocks
        self.n_heads = n_heads
        self.hidden_d = hidden_d
        # Input and patches sizes
        assert (
            chw[1] % n_patches == 0
        ), "Input shape not entirely divisible by number of patches"
        assert (
            chw[2] % n_patches == 0
        ), "Input shape not entirely divisible by number of patches"
        # Integer division: the asserts above guarantee an exact result, and
        # pixel counts should be ints rather than floats.
        self.patch_size = (chw[1] // n_patches, chw[2] // n_patches)
        # 1) Linear mapper
        self.input_d = chw[0] * self.patch_size[0] * self.patch_size[1]
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)
        # 2) Learnable classification token
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))
        # 3) Positional embedding (fixed, so stored as a non-persistent buffer:
        #    it follows the model across devices but is not saved or trained)
        self.register_buffer(
            "positional_embeddings",
            get_positional_embeddings(n_patches**2 + 1, hidden_d),
            persistent=False,
        )
        # 4) Transformer encoder blocks
        self.blocks = nn.ModuleList(
            [ViTBlock(hidden_d, n_heads) for _ in range(n_blocks)]
        )
        # 5) Classification MLP
        # NOTE(review): the head ends with Softmax while the training cell
        # feeds the output to CrossEntropyLoss, which presumably applies
        # log-softmax itself - confirm whether raw logits were intended.
        self.mlp = nn.Sequential(nn.Linear(self.hidden_d, out_d), nn.Softmax(dim=-1))

    def forward(self, images):
        """Classify a batch of images.

        Args:
            images: Tensor of shape (N, C, H, W) matching ``chw``.

        Returns:
            Tensor of shape (N, out_d) holding one class distribution per image.
        """
        n = images.shape[0]
        # Dividing images into patches; moved explicitly to the model's device
        # so this does not depend on where patchify allocates its output
        patches = patchify(images, self.n_patches).to(self.positional_embeddings.device)
        # Running linear layer tokenization
        # Map the vector corresponding to each patch to the hidden size dimension
        tokens = self.linear_mapper(patches)
        # Adding classification token to the tokens
        tokens = torch.cat((self.class_token.expand(n, 1, -1), tokens), dim=1)
        # Adding positional embedding (broadcast over the batch dimension)
        out = tokens + self.positional_embeddings
        # Transformer Blocks
        for block in self.blocks:
            out = block(out)
        # Getting the classification token only
        out = out[:, 0]
        # Map to output dimension, output category distribution
        return self.mlp(out)
```
%% Cell type:markdown id: tags:
### ViT training
The ViT model being built, the next step is to train it on the MNIST dataset.
%% Cell type:markdown id: tags:
First, we initialize the model and the hyperparameters.
%% Cell type:code id: tags:
```
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the selected device (with the GPU name when running on CUDA)
print(
"Using device: ",
device,
f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "",
)
# Build the ViT for 1x28x28 inputs: 7x7 patches, 2 encoder blocks,
# 8-dimensional tokens, 2 attention heads, 10 output classes
model = ViT(
(1, 28, 28), n_patches=7, n_blocks=2, hidden_d=8, n_heads=2, out_d=10
).to(device)
# Training hyperparameters: number of epochs and learning rate
N_EPOCHS = 5
LR = 0.005
```
%% Cell type:markdown id: tags:
Training of the ViT model:
%% Cell type:code id: tags:
```
# Optimizer and loss used to train the ViT
optimizer = Adam(model.parameters(), lr=LR)
criterion = CrossEntropyLoss()
model.train()
for epoch in range(N_EPOCHS):
    # Average training loss over the batches of this epoch
    train_loss = 0.0
    for batch in train_loader:
        x, y = batch
        x, y = x.to(device), y.to(device)
        y_hat = model(x)
        loss = criterion(y_hat, y)
        train_loss += loss.detach().cpu().item() / len(train_loader)
        # Gradients computation and parameters update: clear the gradients of
        # the previous step, backpropagate the loss, then let the optimizer
        # apply the update. Without these three calls the weights never change.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f}")
```
%% Cell type:markdown id: tags:
### ViT test
Finally, let's test the trained model.
%% Cell type:code id: tags:
```
# Evaluation only: switch to eval mode and disable gradient tracking
model.eval()
with torch.no_grad():
    correct, total = 0, 0
    test_loss = 0.0
    for batch in test_loader:
        x, y = batch
        x, y = x.to(device), y.to(device)
        y_hat = model(x)
        # Average test loss over the batches
        loss = criterion(y_hat, y)
        test_loss += loss.detach().cpu().item() / len(test_loader)
        # A prediction is correct when the most probable class is the label
        correct += torch.sum(torch.argmax(y_hat, dim=1) == y).detach().cpu().item()
        total += len(x)
print(f"Test loss: {test_loss:.2f}")
print(f"Test accuracy: {correct / total * 100:.2f}%")
```
%% Cell type:markdown id: tags:
## Further experiments
1. Adapt the code to apply the ViT model on CIFAR dataset.
2. Make use of a validation set to evaluate overfitting.
3. Evaluate the model with a dimension of 16 for the tokens and 4 encoder blocks.
Practical_sessions/Session_6/figures/positional_encoding.png

7.41 KiB

Practical_sessions/Session_6/figures/vit.png

337 KiB

%% Cell type:markdown id: tags:
### **_Deep Learning - Bsc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id: tags:
# Practical Session 7 – Large Language Models
The objective of this tutorial is to learn to work with LLM models for sentence generation and classification. The pretrained models and tokenizers will be obtained from the [Hugging Face platform](https://huggingface.co/).
This notebook contains 8 parts:
1. Using a Hugging Face text generation model
2. Using Pipeline of Hugging Face for text classification
3. Using Pipeline with a specific model and tokenizer of Hugging Face
4. Experimenting with models from Hugging Face
5. Training a LLM for sentence classification using the **Trainer** class
6. Fine tuning a LLM model with a custom head
7. Sharing a model on Hugging Face platform
8. Further experiments
Before going further into experiments, your task is to understand the provided code, which gives an overview of using LLMs with Hugging Face.
**This code is intentionally not commented. It is your objective to add all the necessary comments to ensure your proper understanding of the code.**
You might frequently rely on [Hugging Face’s documentation](https://huggingface.co/docs).
---
As the computation can be heavy, particularly during training, we encourage you to use a GPU. If your laptop is not equipped with one, you may use one of these remote Jupyter servers, where you can select execution on a GPU:
1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)
This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the kernel "Python PyTorch" to run it on GPU and have access to PyTorch module.
2) [Google Colaboratory](https://colab.research.google.com/)
Before executing the notebook, select the execution on GPU : "Runtime" -> "Change runtime type" --> "T4 GPU".
%% Cell type:markdown id: tags:
### Installing required libraries
%% Cell type:code id: tags:
``` python
!pip install huggingface_hub
!pip install ipywidgets
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install scikit-learn
```
%% Cell type:markdown id: tags:
### Log in to Hugging Face
First, you need to create an account on [Hugging Face platform](https://huggingface.co/join).
Then you can log in to your account directly from the notebook.
%% Cell type:code id: tags:
``` python
from huggingface_hub import notebook_login
notebook_login()
```
%% Cell type:markdown id: tags:
### Part 1 - Using a Hugging Face text generation model
%% Cell type:code id: tags:
``` python
from transformers import AutoTokenizer, AutoModelForCausalLM
# model_name = "mistralai/Mistral-7B"
# model_name = "deepseek-ai/DeepSeek-R1"
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
# model_name = "homebrewltd/AlphaMaze-v0.2-1.5B"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
```
%% Cell type:code id: tags:
``` python
# Prompt to complete, tokenized into PyTorch tensors
input_text = "Hello. Who are you ?"
encoded_input = tokenizer(input_text, return_tensors="pt")
# Some tokenizers (GPT-2 among them) define no padding token; fall back to the
# end-of-sequence token so that generate() receives a valid pad_token_id.
pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)
# Generate up to 100 tokens (prompt included). `temperature` only has an
# effect when sampling is enabled, hence do_sample=True; with the default
# greedy decoding it is ignored.
output = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=100,
    do_sample=True,
    temperature=0.8,
    pad_token_id=pad_token_id,
)
```
%% Cell type:code id: tags:
``` python
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
```
%% Cell type:markdown id: tags:
### Part 2 - Using Pipeline of Hugging Face for text classification
%% Cell type:code id: tags:
``` python
from transformers import pipeline
classifier = pipeline("text-classification")
```
%% Cell type:code id: tags:
``` python
classifier("We are very happy to welcome you at Centrale Lyon.")
```
%% Cell type:code id: tags:
``` python
results = classifier(["We are very happy to welcome you at Centrale Lyon.", "We hope you don't hate it."])
for result in results:
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
```
%% Cell type:markdown id: tags:
### Part 3 - Using Pipeline with a specific model and tokenizer of Hugging Face
%% Cell type:code id: tags:
``` python
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
```
%% Cell type:code id: tags:
``` python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
%% Cell type:code id: tags:
``` python
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
classifier("We are very hapy to present you this incredible model.")
```
%% Cell type:markdown id: tags:
### Part 4 - Experimenting with models from Hugging Face
%% Cell type:code id: tags:
``` python
from transformers import AutoTokenizer
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
%% Cell type:code id: tags:
``` python
encoding = tokenizer("We are very happy to welcome you at Centrale Lyon.")
print(encoding)
```
%% Cell type:code id: tags:
``` python
batch = tokenizer(
["We are very happy to welcome you at Centrale Lyon.", "We hope you don't hate it."],
padding=True,
truncation=True,
max_length=512,
return_tensors="pt",
)
print(batch)
```
%% Cell type:code id: tags:
``` python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto")
print(model)
```
%% Cell type:code id: tags:
``` python
outputs = model(**batch)
print(outputs)
```
%% Cell type:code id: tags:
``` python
from torch import nn
predictions = nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
```
%% Cell type:code id: tags:
``` python
save_directory = "./save_pretrained"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
```
%% Cell type:code id: tags:
``` python
loaded_model = AutoModelForSequenceClassification.from_pretrained("./save_pretrained")
```
%% Cell type:markdown id: tags:
### Part 5 - Training a LLM for sentence classification using the **Trainer** class
%% Cell type:code id: tags:
``` python
from transformers import AutoModelForSequenceClassification
model_name = "distilbert/distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto")
```
%% Cell type:code id: tags:
``` python
from transformers import TrainingArguments
training_args = TrainingArguments(
output_dir="save_folder/",
learning_rate=2e-5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=2,
)
```
%% Cell type:code id: tags:
``` python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
%% Cell type:code id: tags:
``` python
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")
```
%% Cell type:code id: tags:
``` python
def tokenize_dataset(dataset):
    """Tokenize the "text" field of a batch of examples.

    Args:
        dataset: Batch passed by ``Dataset.map(..., batched=True)``; only its
            "text" entry is read.

    Returns:
        The tokenizer output for the texts. Texts longer than the model's
        maximum input length are truncated so that they can always be fed to
        the model.
    """
    return tokenizer(dataset["text"], truncation=True)
```
%% Cell type:code id: tags:
``` python
dataset = dataset.map(tokenize_dataset, batched=True)
```
%% Cell type:code id: tags:
``` python
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
```
%% Cell type:code id: tags:
``` python
from transformers import Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
processing_class=tokenizer,
data_collator=data_collator,
)
```
%% Cell type:code id: tags:
``` python
trainer.train()
```
%% Cell type:code id: tags:
``` python
save_directory = "./tomatoes_save_pretrained"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)
```
%% Cell type:code id: tags:
``` python
model = AutoModelForSequenceClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)
```
%% Cell type:code id: tags:
``` python
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
```
%% Cell type:code id: tags:
``` python
t = dataset['test'][345]
print(t)
classifier(t['text'])
```
%% Cell type:markdown id: tags:
### Part 6 - Fine tuning a LLM model with a custom head
%% Cell type:code id: tags:
``` python
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
```
%% Cell type:code id: tags:
``` python
dataset = load_dataset("imdb")
```
%% Cell type:code id: tags:
``` python
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
```
%% Cell type:code id: tags:
``` python
def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded/truncated to 512 tokens.

    Args:
        examples: Batch of examples; only its "text" entry is read.

    Returns:
        The tokenizer output, where every sequence has exactly 512 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True, padding="max_length")
# Tokenize every split, processing the examples in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Drop the raw text column, which is no longer needed once tokenized
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# Rename the target column to "labels", the key read by the training and
# evaluation functions defined below
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Return PyTorch tensors when the datasets are indexed
tokenized_datasets.set_format("torch")
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
# Batches of 8 examples; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)
```
%% Cell type:code id: tags:
``` python
# Load the pretrained DistilBERT encoder and freeze all of its weights so that
# only layers added on top of it can be trained. requires_grad_(False) turns
# off gradients for every parameter and returns the module itself.
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased").requires_grad_(False)
```
%% Cell type:code id: tags:
``` python
class CustomBERTModel(torch.nn.Module):
    """Encoder followed by a small two-class classification head.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` tensor when called.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        hidden_size = self.bert.config.hidden_size
        # Head: hidden_size -> 128 -> 2, with ReLU and dropout in between
        head_layers = [
            torch.nn.Linear(hidden_size, 128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(128, 2),
        ]
        self.custom_head = torch.nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the two class scores computed from the first token's state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The first position holds the [CLS] token output
        cls_state = encoded.last_hidden_state[:, 0, :]
        return self.custom_head(cls_state)
```
%% Cell type:code id: tags:
``` python
# Load the pretrained encoder and freeze it: only the custom head is trained
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
for param in bert_model.parameters():
    param.requires_grad = False
model = CustomBERTModel(bert_model)
# Pick the best available device instead of hard-coding "mps", which only
# exists on Apple-silicon machines: CUDA GPU first, then Apple MPS, then CPU.
# getattr guards against PyTorch builds that do not ship the MPS backend.
mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)
```
%% Cell type:code id: tags:
``` python
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()
```
%% Cell type:code id: tags:
``` python
def train_epoch(model, data_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keywords.
        data_loader: Iterable of batches with "input_ids", "attention_mask"
            and "labels" entries.
        optimizer: Optimizer updating the model parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        # Move the model inputs, then the targets, to the training device
        inputs = {
            key: batch[key].to(device) for key in ("input_ids", "attention_mask")
        }
        targets = batch["labels"].to(device)
        # Forward pass, loss, backward pass, parameter update
        logits = model(**inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(data_loader)
```
%% Cell type:code id: tags:
``` python
def evaluate(model, data_loader, criterion, device):
    """Score ``model`` on ``data_loader`` with gradient tracking disabled.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches; each batch is indexed with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(mean_loss, accuracy, precision, recall, f1)``. The mean
        loss is the summed per-batch loss divided by ``len(data_loader)``;
        precision, recall and F1 are computed with ``average="binary"``.
    """
    model.eval()
    batch_losses = []
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            labels = batch["labels"].to(device)
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            batch_losses.append(criterion(logits, labels).item())
            # The predicted class is the index of the largest output value.
            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(expected, predicted, average="binary")
    mean_loss = sum(batch_losses) / len(data_loader)
    return mean_loss, accuracy, precision, recall, f1
```
%% Cell type:code id: tags:
``` python
# Train for a fixed number of epochs. After each epoch, report the training
# loss and the test-set metrics, then write a checkpoint of the weights.
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    print(f"Train Loss: {train_loss:.4f}")

    val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(
        model=model,
        data_loader=test_loader,
        criterion=criterion,
        device=device,
    )
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1 Score: {val_f1:.4f}")

    torch.save(model.state_dict(), f"custom_bert_epoch_{epoch + 1}.pth")

# Output recorded from a previous run:
# (After 76 minutes of training)
# Epoch 1/3
# Train Loss: 0.6708
# Validation Loss: 0.6415
# Accuracy: 0.7917, Precision: 0.8218, Recall: 0.7450, F1 Score: 0.7815
# Epoch 2/3
# Train Loss: 0.6172
# Validation Loss: 0.5825
# Accuracy: 0.8051, Precision: 0.8142, Recall: 0.7907, F1 Score: 0.8023
# Epoch 3/3
# Train Loss: 0.5634
# Validation Loss: 0.5300
# Accuracy: 0.8098, Precision: 0.8339, Recall: 0.7738, F1 Score: 0.8027
```
%% Cell type:code id: tags:
``` python
# Persist only the weights (state dict), not the whole module object.
model_save_path = "custom_bert_model.pth"
torch.save(obj=model.state_dict(), f=model_save_path)
```
%% Cell type:code id: tags:
``` python
# Rebuild the architecture, then restore the saved weights into it.
loadedbert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")
loaded_model = CustomBERTModel(loadedbert_model)
# map_location places the stored tensors on the current device, so the
# checkpoint still loads when it was written on a device that is unavailable here.
loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))
loaded_model.to(device)
```
%% Cell type:code id: tags:
``` python
# Sanity-check the reloaded model on the first example of the first test batch.
batch = next(iter(test_loader))
ids = batch['input_ids'][0]
attention_mask = batch['attention_mask'][0]
label = batch['labels'][0]
ids = ids.to(device)
attention_mask = attention_mask.to(device)
text = tokenizer.decode(ids, skip_special_tokens=True)
print(text)
print(label)
loaded_model.eval()
# Run the *reloaded* model (not the one still in memory from training), and
# skip gradient tracking since this is inference only.
with torch.no_grad():
    output = loaded_model(input_ids=ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
output = output.squeeze(0)
print(output)
prediction = torch.argmax(output, dim=-1)
print(prediction)
print(label)
# `label` was never moved to `device`; compare on the CPU to avoid mixing devices.
print(prediction.cpu() == label)
```
%% Cell type:markdown id: tags:
### Part 7 - Sharing a model on the Hugging Face platform
%% Cell type:code id: tags:
``` python
from transformers import DistilBertPreTrainedModel, DistilBertModel
import torch.nn as nn
class CustomDistilBERTModel(DistilBertPreTrainedModel):
    """DistilBERT encoder with an MLP classification head.

    The head reads the hidden state at sequence position 0 (the [CLS] token)
    and emits ``config.num_labels`` values per example.
    """

    def __init__(self, config, freeze_backbone=True):
        """Build the encoder and the head from ``config``.

        Args:
            config: Model configuration; ``hidden_size`` and ``num_labels``
                are read from it.
            freeze_backbone: When true, gradients are disabled for every
                parameter of the DistilBERT encoder.
        """
        super().__init__(config)
        self.distilbert = DistilBertModel(config)
        head_layers = [
            nn.Linear(config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, config.num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)
        if freeze_backbone:
            self.distilbert.requires_grad_(False)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return the classifier output for each example in the batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Optional attention mask forwarded to the encoder.
            labels: Accepted but not used by this method.

        Returns:
            The output of ``classifier`` applied to the position-0 hidden
            state of every sequence.
        """
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        cls_state = encoded.last_hidden_state[:, 0, :]  # [CLS] token output
        return self.classifier(cls_state)
```
%% Cell type:code id: tags:
``` python
from transformers import AutoConfig, AutoModel, DistilBertConfig


class CustomDistilBertConfig(DistilBertConfig):
    """DistilBERT configuration published under its own ``model_type``."""

    model_type = "custom-distilbert"


# The Auto* registries map a model_type string to a config class, and that
# config class to a model class. The model must advertise the same config
# class it is registered under, so point it at the custom config first.
CustomDistilBERTModel.config_class = CustomDistilBertConfig
AutoConfig.register("custom-distilbert", CustomDistilBertConfig)
AutoModel.register(CustomDistilBertConfig, CustomDistilBERTModel)
```
%% Cell type:code id: tags:
``` python
from transformers import DistilBertTokenizer

# Build the configuration with the model's own config class so that the saved
# config.json carries whatever model_type that class declares.
config = CustomDistilBERTModel.config_class.from_pretrained("distilbert-base-uncased", num_labels=2)
config.architectures = ["CustomDistilBERTModel"]
# Load the pretrained backbone weights. Calling the constructor directly would
# leave the (frozen) encoder randomly initialised; only the classifier head is
# meant to start from scratch.
model = CustomDistilBERTModel.from_pretrained("distilbert-base-uncased", config=config)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model.save_pretrained("custom_distilbert_model")
tokenizer.save_pretrained("custom_distilbert_model")
print("Custom model and tokenizer saved locally!")
```
%% Cell type:code id: tags:
``` python
# Use Apple-silicon MPS when present, otherwise fall back to CUDA or the CPU,
# so the cell does not fail on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)
```
%% Cell type:code id: tags:
``` python
# `model` now refers to the CustomDistilBERTModel, but the existing optimizer
# was built over the previous model's parameters and would never update this
# one. Create a fresh optimizer over the parameters that are trainable.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=2e-5)

num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}")
    val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, test_loader, criterion, device)
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1 Score: {val_f1:.4f}")
    # Use a distinct file name so the checkpoints written by the earlier
    # training loop (custom_bert_epoch_*.pth) are not overwritten.
    torch.save(model.state_dict(), f"custom_distilbert_epoch_{epoch + 1}.pth")
```
%% Cell type:code id: tags:
``` python
# Upload the model and its tokenizer to the Hugging Face Hub under the same
# repository name.
# NOTE(review): presumably requires being logged in to the Hub first — confirm.
model.push_to_hub("custom-distilbert-model")
tokenizer.push_to_hub("custom-distilbert-model")
```
%% Cell type:code id: tags:
``` python
from transformers import AutoTokenizer, AutoModel
# Reload the tokenizer and model through the Auto* classes.
# NOTE(review): "your_hf_id" looks like a placeholder for the account that
# owns the repository — replace it before running.
loaded_tokenizer = AutoTokenizer.from_pretrained("your_hf_id/custom-distilbert-model")
loaded_model = AutoModel.from_pretrained("your_hf_id/custom-distilbert-model")
```
%% Cell type:markdown id: tags:
### Part 8 - Further experiments
%% Cell type:markdown id: tags:
Now that you know the basics of working with LLMs through the Hugging Face platform, it is time to experiment with:
- different [NLP tasks](https://huggingface.co/tasks)
- different [models](https://huggingface.co/models?pipeline_tag=text-classification&sort=trending)
- different [datasets](https://huggingface.co/datasets?task_categories=task_categories:text-classification&sort=trending)
... and to share your fine-tuned models on the platform.
Also, don't forget to monitor your training runs with [Weights & Biases](https://wandb.ai/home).