%% Cell type:markdown id: tags:
### **_Deep Learning - BSc Data Science for Responsible Business - Centrale Lyon_**
2024-2025
Emmanuel Dellandréa
%% Cell type:markdown id: tags:
# Practical Session 5 – Monitoring the training with Weights & Biases
The objective of this short tutorial is to learn how to monitor the training of a CNN with [Weights and Biases](https://wandb.ai/site/). With W&B, you can track and compare your experiments, and visualize your model's training and performance.
#### Installation
You'll need to install `wandb`.
```shell
pip install wandb
```
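Before the first run, you will also need to authenticate with your W&B account (a free account is sufficient):
```shell
wandb login
```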
Have a look at the documentation for integrating [Weights & Biases into PyTorch](https://docs.wandb.ai/guides/integrations/pytorch/).
Then, study the code below and the information logged in W&B.
As the computation is heavy, particularly during training, we encourage you to use a GPU. If your laptop is not equipped with one, you may use one of these remote Jupyter servers, where you can run the notebook on a GPU:
1) [jupyter.mi90.ec-lyon.fr](https://jupyter.mi90.ec-lyon.fr/)
This server is accessible within the campus network. If outside, you need to use a VPN. Before executing the notebook, select the "Python PyTorch" kernel to run it on a GPU with access to the PyTorch module.
2) [Google Colaboratory](https://colab.research.google.com/)
Before executing the notebook, enable GPU execution: open the "Runtime" menu ("Exécution"), choose "Change runtime type" ("Modifier le type d'exécution"), and select "T4 GPU".
%% Cell type:code id: tags:
```
import wandb

# Initialize a W&B run and register the hyperparameters of the experiment
wandb.init(
    project="cnn_cifar10",  # project name in the W&B interface
    config={  # hyperparameters tracked with this run
        "epochs": 5,
        "batch_size": 64,
        "learning_rate": 0.01,
        "optimizer": "Adam",
    },
)
```
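%% Cell type:markdown id: tags:
As a quick optional check, the hyperparameters registered at init time can be read back through `wandb.config`; the cells below rely on this so that every run is fully described by its configuration.
%% Cell type:code id: tags:
```
# The config object holds the hyperparameters declared in wandb.init above
config = wandb.config
print(config.epochs, config.batch_size, config.learning_rate, config.optimizer)
```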
%% Cell type:code id: tags:
```
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch.optim as optim

# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = wandb.config.batch_size
# percentage of training set to use as validation
valid_size = 0.2

# convert data to a normalized torch.FloatTensor
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)

# choose the training and test datasets
train_data = datasets.CIFAR10("data", train=True, download=True, transform=transform)
test_data = datasets.CIFAR10("data", train=False, download=True, transform=transform)

# obtain training indices that will be used for validation
num_train = len(train_data)
indices = list(range(num_train))
np.random.shuffle(indices)
split = int(np.floor(valid_size * num_train))
train_idx, valid_idx = indices[split:], indices[:split]

# define samplers for obtaining training and validation batches
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# prepare data loaders (combine dataset and sampler)
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
)
valid_loader = torch.utils.data.DataLoader(
    train_data, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=batch_size, num_workers=num_workers
)

# specify the image classes
classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
```
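%% Cell type:markdown id: tags:
Before training, it can be useful to eyeball a few samples directly in the W&B interface. This is a minimal sketch: `images_preview` and `labels_preview` are local names introduced here, and the images are un-normalized from [-1, 1] back to [0, 1] for display.
%% Cell type:code id: tags:
```
# Log a small batch of training images with their class names to W&B
images_preview, labels_preview = next(iter(train_loader))
wandb.log({
    "training_samples": [
        wandb.Image(img * 0.5 + 0.5, caption=classes[lbl])
        for img, lbl in zip(images_preview[:8], labels_preview[:8])
    ]
})
```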
%% Cell type:code id: tags:
```
import torch.nn as nn
import torch.nn.functional as F

# define the CNN architecture (LeNet-style network for 3x32x32 inputs)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # 3x32x32 -> 6x14x14
        x = self.pool(F.relu(self.conv2(x)))  # 6x14x14 -> 16x5x5
        x = x.view(-1, 16 * 5 * 5)            # flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)                       # class logits
        return x
```
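%% Cell type:markdown id: tags:
A quick optional shape check helps verify the architecture before training: CIFAR-10 images are 3×32×32, and the network should map a batch of them to one logit per class.
%% Cell type:code id: tags:
```
# Forward a dummy batch through an untrained network to check the output shape
dummy = torch.zeros(4, 3, 32, 32)
print(Net()(dummy).shape)  # expected: torch.Size([4, 10])
```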
%% Cell type:code id: tags:
```
# Define model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
```
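%% Cell type:markdown id: tags:
W&B can also record gradients and parameter histograms during training. Since `wandb.watch` works by registering hooks on the model, it has to be called before the training loop for anything to be recorded; the `log_freq` value below is an arbitrary choice.
%% Cell type:code id: tags:
```
# Ask W&B to log gradients and parameters every 100 training batches
wandb.watch(model, criterion, log="all", log_freq=100)
```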
%% Cell type:code id: tags:
```
valid_loss_min = np.inf  # track the best (lowest) validation loss seen so far

# Training loop
for epoch in range(wandb.config.epochs):
    epoch_loss_train = 0
    correct_train = 0
    total_train = 0

    # Training
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss_train += loss.item()
        _, predicted = torch.max(outputs, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Validation (no gradients needed)
    epoch_loss_valid = 0
    correct_valid = 0
    total_valid = 0
    model.eval()
    with torch.no_grad():
        for images, labels in valid_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_loss_valid += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_valid += labels.size(0)
            correct_valid += (predicted == labels).sum().item()

    # Save model if validation loss has decreased
    if epoch_loss_valid <= valid_loss_min:
        print(
            "Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...".format(
                valid_loss_min, epoch_loss_valid
            )
        )
        torch.save(model.state_dict(), "model_cifar.pt")
        valid_loss_min = epoch_loss_valid

    accuracy_train = 100 * correct_train / total_train
    avg_loss_train = epoch_loss_train / len(train_loader)
    accuracy_valid = 100 * correct_valid / total_valid
    avg_loss_valid = epoch_loss_valid / len(valid_loader)

    # Log all metrics of the epoch in a single call so they share the same step
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_loss_train,
        "train_accuracy": accuracy_train,
        "valid_loss": avg_loss_valid,
        "valid_accuracy": accuracy_valid,
    })
    print(f"Epoch {epoch+1}, Loss: {avg_loss_train:.4f}, Accuracy: {accuracy_train:.2f}%")
```
%% Cell type:code id: tags:
```
# Save the final weights and upload the checkpoint file to W&B
torch.save(model.state_dict(), "model_cifar.pth")
wandb.save("model_cifar.pth")
```
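%% Cell type:markdown id: tags:
The test set prepared earlier has not been used yet. Below is a minimal sketch of a test evaluation; reloading the checkpoint with the lowest validation loss ("model_cifar.pt") is an assumption, and you could equally evaluate the final weights.
%% Cell type:code id: tags:
```
# Evaluate the best checkpoint (lowest validation loss) on the test set
model.load_state_dict(torch.load("model_cifar.pt", map_location=device))
model.eval()
correct_test, total_test = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()
test_accuracy = 100 * correct_test / total_test
print(f"Test accuracy: {test_accuracy:.2f}%")
wandb.log({"test_accuracy": test_accuracy})
```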
%% Cell type:code id: tags:
```
# Log an example image (reuses the last batch of images still in scope)
wandb.log({"example_image": [wandb.Image(images[0].cpu())]})

# Note: wandb.watch registers hooks on the model, so it only records
# gradients for training steps executed after it is called; it is normally
# placed before the training loop (see above).
wandb.watch(model, log="all")
```
%% Cell type:code id: tags:
```
# Finish the wandb run
wandb.finish()
```
%% Cell type:markdown id: tags:
## Experiments
Run several trainings with different hyperparameter settings and compare the results in W&B (see the sweep sketch below).
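%% Cell type:markdown id: tags:
One convenient way to automate such a search is a W&B sweep. The sketch below uses the standard sweeps API; it assumes the training code above has been wrapped into a `train()` function (hypothetical here) that calls `wandb.init()` itself and reads its hyperparameters from `wandb.config`, as the sweeps documentation recommends.
%% Cell type:code id: tags:
```
# Minimal random-search sweep over two hyperparameters (sketch)
sweep_config = {
    "method": "random",
    "metric": {"name": "valid_loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"values": [0.01, 0.001, 0.0001]},
        "batch_size": {"values": [32, 64, 128]},
    },
}
sweep_id = wandb.sweep(sweep_config, project="cnn_cifar10")
# train is assumed to be a function wrapping the training loop above
# wandb.agent(sweep_id, function=train, count=5)
```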