diff --git a/Practical_sessions/Session_5/Subject_5_Training_monitoring.ipynb b/Practical_sessions/Session_5/Subject_5_Training_monitoring.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6101f6cb146cec4f225979b63cf5a48435393165 --- /dev/null +++ b/Practical_sessions/Session_5/Subject_5_Training_monitoring.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **_Deep Learning - Bsc Data Science for Responsible Business - Centrale Lyon_**\n", + "\n", + "2024-2025\n", + "\n", + "Emmanuel Dellandréa\t " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Practical Session 5 – Monitoring the training with Weights & Biases\n", + "\n", + "The objective of this short tutorial is to learn how to monitor a CNN training with [Weights and Biases](https://wandb.ai/site/). With W&B, you can track and compare your experiments, visualize your model training and performance.\n", + "\n", + "#### Installation\n", + "\n", + "You'll need to install `wand`.\n", + "\n", + "```shell\n", + "pip install wandb \n", + "```\n", + "\n", + "\n", + "Have a look at the documentation of for integrating [Weights & Biases into Pytorch](https://docs.wandb.ai/guides/integrations/pytorch/).\n", + "\n", + "Then, study the code below and the informations registered in W&B.\n", + "\n", + "As the computation is heavy, particularly during training, we encourage you to use a GPU. If your laptob is not equiped, you may use one of these remote jupyter servers, where you can select the execution on GPU :\n", + "\n", + "1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)\n", + "\n", + "This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the kernel \"Python PyTorch\" to run it on GPU and have access to PyTorch module.\n", + "\n", + "2) [Google Colaboratory](https://colab.research.google.com/)\n", + "\n", + "Before executing the notebook, select the execution on GPU : \"Exécution\" Menu -> \"Modifier le type d'exécution\" and select \"T4 GPU\". " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 123 + }, + "id": "W_z5YjQ5b5nn", + "outputId": "ece809cd-f96f-4127-b8d5-1905e0577603" + }, + "outputs": [], + "source": [ + "import wandb\n", + "\n", + "# Initialize wandb\n", + "wandb.init(\n", + " project=\"cnn_cifar10\", # Set your project name\n", + " config={ # Define hyperparameters\n", + " \"epochs\": 5,\n", + " \"batch_size\": 64,\n", + " \"learning_rate\": 0.01,\n", + " \"optimizer\": \"Adam\"\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W6lKd0LylANS", + "outputId": "a4d3da41-bdb1-4bb8-8759-45f75c71f480" + }, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "from torchvision import datasets, transforms\n", + "from torch.utils.data.sampler import SubsetRandomSampler\n", + "import torch.optim as optim\n", + "\n", + "# number of subprocesses to use for data loading\n", + "num_workers = 0\n", + "# how many samples per batch to load\n", + "batch_size = wandb.config.batch_size\n", + "# percentage of training set to use as validation\n", + "valid_size = 0.2\n", + "\n", + "# convert data to a normalized torch.FloatTensor\n", + "transform = transforms.Compose(\n", + " [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]\n", + ")\n", + "\n", + "# choose the training and test datasets\n", + "train_data = datasets.CIFAR10(\"data\", train=True, download=True, transform=transform)\n", + "test_data = datasets.CIFAR10(\"data\", train=False, download=True, transform=transform)\n", + "\n", + "# obtain training indices that will be used for validation\n", + "num_train = len(train_data)\n", + "indices = list(range(num_train))\n", + "np.random.shuffle(indices)\n", + "split = int(np.floor(valid_size * num_train))\n", + "train_idx, valid_idx = indices[split:], indices[:split]\n", + "\n", + "# define samplers for obtaining training and validation batches\n", + "train_sampler = SubsetRandomSampler(train_idx)\n", + "valid_sampler = SubsetRandomSampler(valid_idx)\n", + "\n", + "# prepare data loaders (combine dataset and sampler)\n", + "train_loader = torch.utils.data.DataLoader(\n", + " train_data, batch_size=wandb.config.batch_size, sampler=train_sampler, num_workers=num_workers\n", + ")\n", + "valid_loader = torch.utils.data.DataLoader(\n", + " train_data, batch_size=wandb.config.batch_size, sampler=valid_sampler, num_workers=num_workers\n", + ")\n", + "test_loader = torch.utils.data.DataLoader(\n", + " test_data, batch_size=wandb.config.batch_size, num_workers=num_workers\n", + ")\n", + "\n", + "# specify the image classes\n", + "classes = [\n", + " \"airplane\",\n", + " \"automobile\",\n", + " \"bird\",\n", + " \"cat\",\n", + " \"deer\",\n", + " \"dog\",\n", + " \"frog\",\n", + " \"horse\",\n", + " \"ship\",\n", + " \"truck\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Cvfx-93dlLGD" + }, + "outputs": [], + "source": [ + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "# define the CNN architecture\n", + "\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self):\n", + " super(Net, self).__init__()\n", + " self.conv1 = nn.Conv2d(3, 6, 5)\n", + " self.pool = nn.MaxPool2d(2, 2)\n", + " self.conv2 = nn.Conv2d(6, 16, 5)\n", + " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", + " self.fc2 = nn.Linear(120, 84)\n", + " self.fc3 = nn.Linear(84, 10)\n", + "\n", + " def forward(self, x):\n", + " x = self.pool(F.relu(self.conv1(x)))\n", + " x = self.pool(F.relu(self.conv2(x)))\n", + " x = x.view(-1, 16 * 5 * 5)\n", + " x = F.relu(self.fc1(x))\n", + " x = F.relu(self.fc2(x))\n", + " x = self.fc3(x)\n", + " return x\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7f5T3YLjl4zF" + }, + "outputs": [], + "source": [ + "# Define model, loss, and optimizer\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model = Net().to(device)\n", + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "22ghR_nQcGzM", + "outputId": "35c1707d-6c75-44f5-d32c-8b75eb37c3a1" + }, + "outputs": [], + "source": [ + "valid_loss_min = np.Inf\n", + "\n", + "# Training loop\n", + "for epoch in range(wandb.config.epochs):\n", + " epoch_loss_train = 0\n", + " correct_train = 0\n", + " total_train = 0\n", + "\n", + " # Training\n", + " model.train()\n", + " for images, labels in train_loader:\n", + " images, labels = images.to(device), labels.to(device)\n", + "\n", + " optimizer.zero_grad()\n", + " outputs = model(images)\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " epoch_loss_train += loss.item()\n", + " _, predicted = torch.max(outputs, 1)\n", + " total_train += labels.size(0)\n", + " correct_train += (predicted == labels).sum().item()\n", + "\n", + " # Validation\n", + " epoch_loss_valid = 0\n", + " correct_valid = 0\n", + " total_valid = 0\n", + "\n", + " model.eval()\n", + " for images, labels in valid_loader:\n", + " images, labels = images.to(device), labels.to(device)\n", + "\n", + " outputs = model(images)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " epoch_loss_valid += loss.item()\n", + " _, predicted = torch.max(outputs, 1)\n", + " total_valid += labels.size(0)\n", + " correct_valid += (predicted == labels).sum().item()\n", + "\n", + " # Save model if validation loss has decreased\n", + " if epoch_loss_valid <= valid_loss_min:\n", + " print(\n", + " \"Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...\".format(\n", + " valid_loss_min, epoch_loss_valid\n", + " )\n", + " )\n", + " torch.save(model.state_dict(), \"model_cifar.pt\")\n", + " valid_loss_min = epoch_loss_valid\n", + "\n", + "\n", + " accuracy_train = 100 * correct_train / total_train\n", + " avg_loss_train = epoch_loss_train / len(train_loader)\n", + " accuracy_valid = 100 * correct_valid / total_valid\n", + " avg_loss_valid = epoch_loss_valid / len(valid_loader)\n", + "\n", + " # Log metrics to wandb\n", + " wandb.log({\"epoch\": epoch+1, \"train_loss\": avg_loss_train, \"train_accuracy\": accuracy_train})\n", + " wandb.log({\"valid_loss\": avg_loss_valid, \"valid_accuracy\": accuracy_valid})\n", + "\n", + " print(f\"Epoch {epoch+1}, Loss: {avg_loss_train:.4f}, Accuracy: {accuracy_train:.2f}%\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4WDrYAw_cLoq", + "outputId": "18c34920-bdaa-483d-b36f-ddbc5ad45a14" + }, + "outputs": [], + "source": [ + "torch.save(model.state_dict(), \"model_cifar.pth\")\n", + "wandb.save(\"model_cifar.pth\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aDn0cbNocOC0" + }, + "outputs": [], + "source": [ + "# Log an example image\n", + "wandb.log({\"example_image\": [wandb.Image(images[0].cpu())]})\n", + "\n", + "# Log gradients\n", + "wandb.watch(model, log=\"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 483 + }, + "id": "2wqQ2knajXZD", + "outputId": "3d11450e-b08d-4f35-f3cf-5f8c0293cfcc" + }, + "outputs": [], + "source": [ + "# Finish the wandb run\n", + "wandb.finish()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiments\n", + "\n", + "Run several trainings with different tuning of the hyperparameters and check the result in W&B." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}