%% Cell type:markdown id: tags:
### **_Deep Learning - BSc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id: tags:
# Practical Session 5 – Monitoring the training with Weights & Biases
The objective of this short tutorial is to learn how to monitor the training of a CNN with [Weights and Biases](https://wandb.ai/site/). With W&B, you can track and compare your experiments, and visualize your model's training and performance.
#### Installation
You'll need to install `wandb`.
```shell
pip install wandb
```
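Before the first run, you will also need to authenticate with your W&B account (a free account is sufficient):
```shell
wandb login
```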
Have a look at the documentation for integrating [Weights & Biases into PyTorch](https://docs.wandb.ai/guides/integrations/pytorch/).
Then, study the code below and the information logged in W&B.
As the computation is heavy, particularly during training, we encourage you to use a GPU. If your laptop is not equipped with one, you may use one of these remote Jupyter servers, where you can run the notebook on a GPU:
1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)
This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the "Python PyTorch" kernel to run it on a GPU with access to the PyTorch module.
2) [Google Colaboratory](https://colab.research.google.com/)
Before executing the notebook, enable GPU execution: open the "Runtime" menu ("Exécution"), choose "Change runtime type" ("Modifier le type d'exécution"), and select "T4 GPU".
%% Cell type:code id: tags:
```
import wandb

# Initialize a W&B run and register the hyperparameters of the experiment
wandb.init(
    project="cnn_cifar10",  # project name in the W&B interface
    config={  # hyperparameters tracked with this run
        "epochs": 5,
        "batch_size": 64,
        "learning_rate": 0.01,
        "optimizer": "Adam",
    },
)
```
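%% Cell type:markdown id: tags:
As a quick optional check, the hyperparameters registered at init time can be read back through `wandb.config`; the cells below rely on this so that every run is fully described by its configuration.
%% Cell type:code id: tags:
```
# The config object holds the hyperparameters declared in wandb.init above
config = wandb.config
print(config.epochs, config.batch_size, config.learning_rate, config.optimizer)
```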
%% Cell type:code id: tags:
```
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.optim as optim

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = wandb.config.batch_size
# percentage of training set to use as validation
valid_size = 0.2

# convert data to a normalized torch.FloatTensor
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# choose the training and test datasets
train_data = datasets.CIFAR10("data", train=True, download=True, transform=transform)
test_data = datasets.CIFAR10("data", train=False, download=True, transform=transform)

# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders (combine dataset and sampler)
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
)
valid_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, num_workers=num_workers
)

# specify the image classes
classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
```
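%% Cell type:markdown id: tags:
Before training, it can be useful to eyeball a few samples directly in the W&B interface. This is a minimal sketch: `images_preview` and `labels_preview` are local names introduced here, and the images are un-normalized from [-1, 1] back to [0, 1] for display.
%% Cell type:code id: tags:
```
# Log a small batch of training images with their class names to W&B
images_preview, labels_preview = next(iter(train_loader))
wandb.log({
    "training_samples": [
        wandb.Image(img * 0.5 + 0.5, caption=classes[lbl])
        for img, lbl in zip(images_preview[:8], labels_preview[:8])
    ]
})
```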
%% Cell type:code id: tags:
```
import torch.nn as nn
import torch.nn.functional as F

# define the CNN architecture (LeNet-style network for 3x32x32 inputs)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 3x32x32 -> 6x14x14
        x = self.pool(F.relu(self.conv2(x)))  # 6x14x14 -> 16x5x5
        x = x.view(-1, 16 * 5 * 5)            # flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)                       # class logits
        return x
```
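%% Cell type:markdown id: tags:
A quick optional shape check helps verify the architecture before training: CIFAR-10 images are 3×32×32, and the network should map a batch of them to one logit per class.
%% Cell type:code id: tags:
```
# Forward a dummy batch through an untrained network to check the output shape
dummy = torch.zeros(4, 3, 32, 32)
print(Net()(dummy).shape)  # expected: torch.Size([4, 10])
```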
%% Cell type:code id: tags:
```
# Define model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
```
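%% Cell type:markdown id: tags:
W&B can also record gradients and parameter histograms during training. Since `wandb.watch` works by registering hooks on the model, it has to be called before the training loop for anything to be recorded; the `log_freq` value below is an arbitrary choice.
%% Cell type:code id: tags:
```
# Ask W&B to log gradients and parameters every 100 training batches
wandb.watch(model, criterion, log="all", log_freq=100)
```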
%% Cell type:code id: tags:
```
valid_loss_min = np.inf  # track the best (lowest) validation loss seen so far

# Training loop
for epoch in range(wandb.config.epochs):
    epoch_loss_train = 0
    correct_train = 0
    total_train = 0

    # Training
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss_train += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Validation (no gradients needed)
    epoch_loss_valid = 0
    correct_valid = 0
    total_valid = 0
    model.eval()
    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_loss_valid += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_valid += labels.size(0)
            correct_valid += (predicted == labels).sum().item()

    # Save model if validation loss has decreased
    if epoch_loss_valid <= valid_loss_min:
        print(
            "Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...".format(
                valid_loss_min, epoch_loss_valid
            )
        )
        torch.save(model.state_dict(), "model_cifar.pt")
        valid_loss_min = epoch_loss_valid

    accuracy_train = 100 * correct_train / total_train
    avg_loss_train = epoch_loss_train / len(train_loader)
    accuracy_valid = 100 * correct_valid / total_valid
    avg_loss_valid = epoch_loss_valid / len(valid_loader)

    # Log all metrics of the epoch in a single call so they share the same step
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_loss_train,
        "train_accuracy": accuracy_train,
        "valid_loss": avg_loss_valid,
        "valid_accuracy": accuracy_valid,
    })
    print(f"Epoch {epoch+1}, Loss: {avg_loss_train:.4f}, Accuracy: {accuracy_train:.2f}%")
```
%% Cell type:code id: tags:
```
# Save the final weights and upload the checkpoint file to W&B
torch.save(model.state_dict(), "model_cifar.pth")
wandb.save("model_cifar.pth")
```
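%% Cell type:markdown id: tags:
The test set prepared earlier has not been used yet. Below is a minimal sketch of a test evaluation; reloading the checkpoint with the lowest validation loss ("model_cifar.pt") is an assumption, and you could equally evaluate the final weights.
%% Cell type:code id: tags:
```
# Evaluate the best checkpoint (lowest validation loss) on the test set
model.load_state_dict(torch.load("model_cifar.pt", map_location=device))
model.eval()
correct_test, total_test = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()
test_accuracy = 100 * correct_test / total_test
print(f"Test accuracy: {test_accuracy:.2f}%")
wandb.log({"test_accuracy": test_accuracy})
```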
%% Cell type:code id: tags:
```
# Log an example image (reuses the last batch of images still in scope)
wandb.log({"example_image": [wandb.Image(images[0].cpu())]})

# Note: wandb.watch registers hooks on the model, so it only records
# gradients for training steps executed after it is called; it is normally
# placed before the training loop (see above).
wandb.watch(model, log="all")
```
%% Cell type:code id: tags:
```
# Finish the wandb run
wandb.finish()
```
%% Cell type:markdown id: tags:
## Experiments
Run several trainings with different hyperparameter settings and compare the results in W&B (see the sweep sketch below).
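%% Cell type:markdown id: tags:
One convenient way to automate such a search is a W&B sweep. The sketch below uses the standard sweeps API; it assumes the training code above has been wrapped into a `train()` function (hypothetical here) that calls `wandb.init()` itself and reads its hyperparameters from `wandb.config`, as the sweeps documentation recommends.
%% Cell type:code id: tags:
```
# Minimal random-search sweep over two hyperparameters (sketch)
sweep_config = {
    "method": "random",
    "metric": {"name": "valid_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"values": [0.01, 0.001, 0.0001]},
        "batch_size": {"values": [32, 64, 128]},
    },
}
sweep_id = wandb.sweep(sweep_config, project="cnn_cifar10")
# train is assumed to be a function wrapping the training loop above
# wandb.agent(sweep_id, function=train, count=5)
```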