Commit ec80fa30 authored by Brussart Paul-emile

Updated reinforce_cartpole.py, added figure of the results

parent fc91605b
reinforce_cartpole.png  2.34 KiB

reinforce_cartpole.py

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Number of episodes to run the environment
N_Episodes = 500
# Discount factor for future rewards
Gamma = 0.99
# Learning rate for the Adam optimizer
LR = 5e-3

# Set up the environment (this script uses the classic gym API, where
# reset() returns only the observation and step() returns 4 values)
env = gym.make('CartPole-v1')

# Set up the agent as a neural network
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(4, 128)   # fully connected layer: 4 observation features in
        self.dropout = nn.Dropout(p=0.6)
        self.fc2 = nn.Linear(128, 2)   # two possible outputs: right or left

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        # Apply softmax to get action probabilities
        return F.softmax(x, dim=1)

# Initialize the agent and the Adam optimizer
model = Model()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Keep track of the number of rewards collected in each episode
rewardsByEpisode = []

# Training loop
for episode in range(N_Episodes):
    # Reset the environment for a new episode
    state = env.reset()
    log_probs = []
    rewards = []
    # Run the episode until it terminates
    while True:
        # Pass the current observation through the model to get action probabilities
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = model(state)
        # Sample an action from the action probabilities
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        log_probs.append(m.log_prob(action))
        # Take the action and get the next state, reward, and done flag
        state, reward, done, _ = env.step(action.item())
        rewards.append(reward)
        if done:
            break
        # Visualization of the environment (commented out to save time)
        # env.render()

    # Compute the discounted returns and normalize them
    returns = []
    discounted_return = 0
    for reward in rewards[::-1]:
        discounted_return = reward + Gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    rewardsByEpisode.append(len(rewards))

    # Compute the REINFORCE loss: -log pi(a|s) weighted by the return
    model_loss = []
    for log_prob, return_ in zip(log_probs, returns):
        model_loss.append(-log_prob * return_)
    model_loss = torch.cat(model_loss).sum()

    # Update the model
    optimizer.zero_grad()
    model_loss.backward()
    optimizer.step()

    print("Episode number:", episode)
    print("Number of rewards:", len(rewards))

# Plot the number of rewards collected in each episode
x = list(range(len(rewardsByEpisode)))
plt.xlabel('Episode number')
plt.ylabel('Number of rewards given')
plt.plot(x, rewardsByEpisode, '--')
# Save the figure before showing it, otherwise the saved image is blank
plt.savefig('reinforce_cartpole.png')
plt.show()