Commit fc91605b authored by Brussart Paul-emile

First commit: Adding reinforce_cartpole.py

import gym
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
# Number of episodes to run the environment
N_Episodes = 500
# Discount factor for future rewards
Gamma = 0.99
# Learning rate for the Adam optimizer
LR = 5e-3
# Define the neural network model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(4, 128)  # Fully connected layer: 4 observation features -> 128 hidden units
        self.fc2 = nn.Linear(128, 2)  # Two possible outputs: push left or push right
        self.dropout = nn.Dropout(0.25)
        self.softmax = nn.Softmax(dim=-1)  # Specify dim explicitly (implicit dim is deprecated)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  # No ReLU on the output layer: pass raw logits to softmax
        x = self.softmax(x)  # Softmax turns the logits into action probabilities
        return x
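
# Optional sanity-check sketch (an illustrative addition, assuming CartPole-v1's
# 4-dimensional observation and 2 discrete actions, as used below): a random
# input should map to a length-2 probability vector that sums to 1.
_probe = Net()(torch.rand(4))
assert _probe.shape == (2,)
assert torch.isclose(_probe.sum(), torch.tensor(1.0))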
# Initialize the model
model = Net()
# Initialize the Adam optimizer
optimizer = optim.Adam(model.parameters(), LR)
# Make the CartPole-v1 environment
env = gym.make("CartPole-v1")
# Get the maximum number of steps allowed in each episode
maxSteps = env.spec.max_episode_steps
# List to store the rewards accumulated through the episodes
rewardsList = []
for episode in range(N_Episodes):
    # Reset the environment for a new episode
    # (pre-0.26 gym API: reset() returns only the observation, step() returns 4 values)
    observation = env.reset()
    # Tensor holding the (discounted) reward collected at each step
    rewards = torch.zeros(maxSteps)
    # Tensor holding the probability of the action taken at each step
    buffer = torch.zeros(maxSteps)
    # Done flag: True once the episode has ended
    done = False
    # Step counter for the current episode
    trainSize = 0
    # Run the episode until it terminates
    while not done:
        # Pass the current observation through the model to get action probabilities
        prob = model(torch.tensor(observation, dtype=torch.float32))
        # Sample an action from the action probabilities
        m = Categorical(prob)
        action = m.sample()
        # Take the action and get the next observation, reward, done flag, and info
        observation, reward, done, info = env.step(action.item())
        # Store the probability of the action taken
        buffer[trainSize] = prob[action]
        # Store the immediate reward
        rewards[trainSize] = reward
        # Propagate the new reward back to earlier steps, so that rewards[i]
        # ends up holding the discounted return G_i = sum_k Gamma**(k-i) * r_k
        for i in range(trainSize):
            rewards[i] += Gamma ** (trainSize - i) * reward
        trainSize += 1
        # Visualisation of the environment
        env.render()
    # Trim the reward and probability tensors to the actual episode length
    rewards = rewards[0:trainSize]
    buffer = buffer[0:trainSize]
    # Rescale the returns (L2 normalization); F.normalize is not in-place,
    # so its result must be assigned back
    rewards = F.normalize(rewards, dim=0)
    # REINFORCE loss: negative sum of log action probabilities weighted by the returns
    loss = -torch.sum(torch.log(buffer) * rewards)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # The episode length doubles as the episode's total (undiscounted) reward
    rewardsList.append(trainSize)
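
# Optional plotting sketch: matplotlib is imported above but never used, so this
# draws the per-episode return (episode length) stored in rewardsList.
# The axis labels and title are illustrative choices.
plt.plot(rewardsList)
plt.xlabel("Episode")
plt.ylabel("Episode length (total reward)")
plt.title("REINFORCE on CartPole-v1")
plt.show()
# Close the environment's render window
env.close()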