From ec80fa3086a05f01f6adaf9e2abbe157a7eb734b Mon Sep 17 00:00:00 2001
From: Paul-Emile Brussart <paul-emile.brussart@ecl19.ec-lyon.fr>
Date: Mon, 13 Feb 2023 17:43:29 +0100
Subject: [PATCH] Updated reinforce_cartpole.py, added figure of the results

---
 reinforce_cartpole.png | Bin 0 -> 2396 bytes
 reinforce_cartpole.py  | 161 ++++++++++++++++++-----------------------
 2 files changed, 69 insertions(+), 92 deletions(-)
 create mode 100644 reinforce_cartpole.png

diff --git a/reinforce_cartpole.png b/reinforce_cartpole.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a14610e2263b5a2a77866d3cf0821195bbb5efa
GIT binary patch
literal 2396
zcmeAS@N?(olHy`uVBq!ia0y~yU}|7sV0^&A#=yW}dhyN^1_lPp64!{5;QX|b^2DN4
z2H(Vzf}H%4oXjMJvecsD%=|oKJySg+9fgdNl7eC@ef?ax0=@jAbbT!|c~=Go2F?PH
z$YKTt{zMRFTw%XFlYxQbq^FBxNX4AD*9;jM6nG989N7Q-Ayc0kS5?{hGdc_mM(5r$
zFfiORYG7b6WD{UeIL5%la74m^p+SO~gFzsffsrAJM}fhihmnP$!*Ep5Xb_C1g3*jH
oS{95Jhod#ZXr)Nkc7dK+CHtP`%Qq!4FfcH9y85}Sb4q9e0GTD882|tP

literal 0
HcmV?d00001

diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index 54505ce..d1494db 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -1,109 +1,86 @@
 import gym
 import torch
-import matplotlib.pyplot as plt
 import torch.nn as nn
-import torch.optim as optim
 import torch.nn.functional as F
-from torch.distributions import Categorical
-
-# Number of episodes to run the environment
-N_Episodes = 500
-
-# Discount factor for future rewards
-Gamma = 0.99
+import torch.optim as optim
+import matplotlib.pyplot as plt
 
-# Learning rate for the Adam optimizer
-LR = 5e-3
+# setup the environment
+env = gym.make('CartPole-v1')
 
-# Define the neural network model
-class Net(nn.Module):
+# setup the agent as a neural network
+class Model(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
-        self.fc1 = nn.Linear(4, 128) # Using fully connected layers
-        self.fc2 = nn.Linear(128, 2) # Two possible outputs: right or left
-        self.dropout = nn.Dropout(0.25)
-        self.softmax = nn.Softmax()
+        super(Model, self).__init__()
+        self.fc1 = nn.Linear(4, 128)
+        self.dropout = nn.Dropout(p=0.6)
+        self.fc2 = nn.Linear(128, 2)
 
     def forward(self, x):
         x = F.relu(self.fc1(x))
-        x = self.dropout(x)
-        x = F.relu(self.fc2(x))
-        x = self.softmax(x) # Apply softmax activation function to get action probabilities
-        return x
-
-# Initialize the model
-model = Net()
-
-# Initialize the Adam optimizer
-optimizer = optim.Adam(model.parameters(), LR)
-
-# Make the CartPole-v1 environment
-env = gym.make("CartPole-v1")
-
-# Get the maximum number of steps allowed in each episode
-maxSteps = env.spec.max_episode_steps
-
-# List to store the rewards accumulated through the episodes
-rewardsList = []
-
-for episode in range(N_Episodes):
-
-    # Reset the environment for a new episode
-    observation = env.reset()
-
-    # Initialize the rewards tensor
-    rewards = torch.zeros(maxSteps)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.softmax(x, dim=1)
+
+# initialize the agent
+model = Model()
+optimizer = optim.Adam(model.parameters(), lr=5e-3)
+
+# keep track of the number of rewards collected in each episode
+rewardsByEpisode = []
+
+# training loop
+for episode in range(500):
+    # reset the environment
+    state = env.reset()
+    log_probs = []
+    rewards = []
 
-    # Initialize the buffer tensor
-    buffer = torch.zeros(maxSteps)
-
-    # Set the done flag to False, indicating the episode has not ended
-    done = False
-
-    # TrainSize counter to keep track of the number of steps in the episode
-    trainSize = 0
-
-    # Run the episode until it terminates
-    while not(done):
-
-        # Pass the current observation through the model to get action probabilities
-        prob = model(torch.tensor(observation))
-
-        # Sample an action from the action probabilities
-        m = Categorical(prob)
+    # repeat until the end of the episode
+    while True:
+        state = torch.from_numpy(state).float().unsqueeze(0)
+        probs = model(state)
+        m = torch.distributions.Categorical(probs)
         action = m.sample()
-
-        # Take the action and get the next state, reward, done flag, and info
-        state, reward, done, info = env.step(action.item())
-
-        # Store the probability of the action taken in the buffer tensor
-        buffer[trainSize] = prob[action]
-
-        # Store the reward in the rewards tensor
-        rewards[trainSize] = reward
-
-        # Accumulate the rewards over time
-        for i in range(trainSize):
-            rewards[i] += Gamma ** (trainSize-i) * reward
-
-        trainSize += 1
-
-        # Vizualisation of the environment
-        env.render()
+        log_probs.append(m.log_prob(action))
+        state, reward, done, _ = env.step(action.item())
+        rewards.append(reward)
+        if done:
+            break
+
+
+    # Visualisation of the environment (commented out to save time)
+    #env.render()
+
+    # compute the discounted returns and normalize them
+    returns = []
+    discounted_return = 0
+    for reward in rewards[::-1]:
+        discounted_return = reward + 0.99 * discounted_return
+        returns.insert(0, discounted_return)
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
 
-
-    # Set the size for the list of rewards and the buffer
-    rewards = rewards[0:trainSize]
-    buffer = buffer[0:trainSize]
-
-    # Normalizing the rewards
-    F.normalize(rewards, dim=0)
-
-
-    loss = - torch.sum(torch.multiply(torch.log10(buffer), rewards))
+    rewardsByEpisode.append(len(rewards))
+    # compute the model loss
+    model_loss = []
+    for log_prob, return_ in zip(log_probs, returns):
+        model_loss.append(-log_prob * return_)
+    model_loss = torch.cat(model_loss).sum()
+
+    # update the model
     optimizer.zero_grad()
-    loss.backward()
+    model_loss.backward()
     optimizer.step()
+    print("Episode number:", episode)
+    print("Number of rewards:", len(rewards))
+
+# x axis: one point per episode
+x = list(range(len(rewardsByEpisode)))
 
-    rewardsList.append(trainSize)
+plt.xlabel('Episode number')
+plt.ylabel('Number of rewards given')
+plt.plot(x, rewardsByEpisode, '--')
+plt.savefig('reinforce_cartpole.png')
+plt.show()
--
GitLab
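
Reference sketch (not part of the patch): the core of the updated training step is the discounted-return computation followed by the REINFORCE loss. The snippet below isolates that step so it can be read or tested on its own. The helper name reinforce_loss and the dummy inputs are illustrative assumptions; the arithmetic itself mirrors the code added in the diff (gamma = 0.99, mean/std normalization, loss = -sum of log-prob * return).

import torch

def reinforce_loss(log_probs, rewards, gamma=0.99):
    # discounted returns, accumulated backwards over the episode
    returns = []
    discounted_return = 0.0
    for reward in reversed(rewards):
        discounted_return = reward + gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns)
    # normalize the returns (zero mean, unit variance), as in the patched script
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    # log_probs is expected to be a list of shape-(1,) tensors, as produced by
    # Categorical(probs).log_prob(action) with a batch of size 1
    return torch.cat([-lp * g for lp, g in zip(log_probs, returns)]).sum()

# example usage with dummy values (illustrative only)
dummy_log_probs = [torch.tensor([-0.7], requires_grad=True) for _ in range(5)]
dummy_rewards = [1.0] * 5
loss = reinforce_loss(dummy_log_probs, dummy_rewards)
loss.backward()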