From ec80fa3086a05f01f6adaf9e2abbe157a7eb734b Mon Sep 17 00:00:00 2001
From: Paul-Emile Brussart <paul-emile.brussart@ecl19.ec-lyon.fr>
Date: Mon, 13 Feb 2023 17:43:29 +0100
Subject: [PATCH] Updated reinforce_cartpole.py, added figure of the results

---
 reinforce_cartpole.png | Bin 0 -> 2396 bytes
 reinforce_cartpole.py  | 161 ++++++++++++++++++-----------------------
 2 files changed, 69 insertions(+), 92 deletions(-)
 create mode 100644 reinforce_cartpole.png

diff --git a/reinforce_cartpole.png b/reinforce_cartpole.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a14610e2263b5a2a77866d3cf0821195bbb5efa
GIT binary patch
literal 2396
zcmeAS@N?(olHy`uVBq!ia0y~yU}|7sV0^&A#=yW}dhyN^1_lPp64!{5;QX|b^2DN4
z2H(Vzf}H%4oXjMJvecsD%=|oKJySg+9fgdNl7eC@ef?ax0=@jAbbT!|c~=Go2F?PH
z$YKTt{zMRFTw%XFlYxQbq^FBxNX4AD*9;jM6nG989N7Q-Ayc0kS5?{hGdc_mM(5r$
zFfiORYG7b6WD{UeIL5%la74m^p+SO~gFzsffsrAJM}fhihmnP$!*Ep5Xb_C1g3*jH
oS{95Jhod#ZXr)Nkc7dK+CHtP`%Qq!4FfcH9y85}Sb4q9e0GTD882|tP

literal 0
HcmV?d00001

diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index 54505ce..d1494db 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -1,109 +1,86 @@
 import gym
 import torch
-import matplotlib.pyplot as plt
 import torch.nn as nn
-import torch.optim as optim
 import torch.nn.functional as F
-from torch.distributions import Categorical
-
-# Number of episodes to run the environment
-N_Episodes = 500
-
-# Discount factor for future rewards
-Gamma = 0.99
+import torch.optim as optim
+import matplotlib.pyplot as plt
 
-# Learning rate for the Adam optimizer
-LR = 5e-3
+# setup the environment
+env = gym.make('CartPole-v1')
 
-# Define the neural network model
-class Net(nn.Module):
+# setup the agent as a neural network
+class Model(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
-        self.fc1 = nn.Linear(4, 128) # Using fully connected layers
-        self.fc2 = nn.Linear(128, 2) # Two possible outputs: right or left
-        self.dropout = nn.Dropout(0.25)
-        self.softmax = nn.Softmax()
+        super(Model, self).__init__()
+        self.fc1 = nn.Linear(4, 128)
+        self.dropout = nn.Dropout(p=0.6)
+        self.fc2 = nn.Linear(128, 2)
 
     def forward(self, x):
         x = F.relu(self.fc1(x))
-        x = self.dropout(x)
-        x = F.relu(self.fc2(x))
-        x = self.softmax(x) # Apply softmax activation function to get action probabilities
-        return x
-
-# Initialize the model
-model = Net()
-
-# Initialize the Adam optimizer
-optimizer = optim.Adam(model.parameters(), LR)
-
-# Make the CartPole-v1 environment
-env = gym.make("CartPole-v1")
-
-# Get the maximum number of steps allowed in each episode
-maxSteps = env.spec.max_episode_steps
-
-# List to store the rewards accumulated through the episodes
-rewardsList = []
-
-for episode in range(N_Episodes):
-
-    # Reset the environment for a new episode
-    observation = env.reset()
-
-    # Initialize the rewards tensor
-    rewards = torch.zeros(maxSteps)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.softmax(x, dim=1)
+
+# initialize the agent
+model = Model()
+optimizer = optim.Adam(model.parameters(), lr=5e-3)
+
+# keep track of the number of rewards collected in each episode
+rewardsByEpisode = []
+
+# training loop
+for episode in range(500):
+    # reset the environment
+    state = env.reset()
+    log_probs = []
+    rewards = []
 
-    # Initialize the buffer tensor
-    buffer = torch.zeros(maxSteps)
-
-    # Set the done flag to False, indicating the episode has not ended
-    done = False
-
-    # TrainSize counter to keep track of the number of steps in the episode
-    trainSize = 0
-
-    # Run the episode until it terminates
-    while not(done):
-
-        # Pass the current observation through the model to get action probabilities
-        prob = model(torch.tensor(observation))
-
-        # Sample an action from the action probabilities
-        m = Categorical(prob)
+    # repeat until the end of the episode
+    while True:
+        state = torch.from_numpy(state).float().unsqueeze(0)
+        probs = model(state)
+        m = torch.distributions.Categorical(probs)
         action = m.sample()
-
-        # Take the action and get the next state, reward, done flag, and info
-        state, reward, done, info = env.step(action.item())
-
-        # Store the probability of the action taken in the buffer tensor
-        buffer[trainSize] = prob[action]
-
-        # Store the reward in the rewards tensor
-        rewards[trainSize] = reward
-
-        # Accumulate the rewards over time
-        for i in range(trainSize):
-            rewards[i] += Gamma ** (trainSize-i) * reward
-
-        trainSize += 1
-
-        # Vizualisation of the environment
-        env.render()
+        log_probs.append(m.log_prob(action))
+        state, reward, done, _ = env.step(action.item())
+        rewards.append(reward)
+        if done:
+            break
+
+
+    # Visualisation of the environment (commented out to save time)
+    #env.render()
+
+    # compute the discounted returns and normalize them
+    returns = []
+    discounted_return = 0
+    for reward in rewards[::-1]:
+        discounted_return = reward + 0.99 * discounted_return
+        returns.insert(0, discounted_return)
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
 
-
-    # Set the size for the list of rewards and the buffer
-    rewards = rewards[0:trainSize]
-    buffer = buffer[0:trainSize]
-
-    # Normalizing the rewards
-    F.normalize(rewards, dim=0)
-
-
-    loss = - torch.sum(torch.multiply(torch.log10(buffer), rewards))
+    rewardsByEpisode.append(len(rewards))
+    # compute the model loss
+    model_loss = []
+    for log_prob, return_ in zip(log_probs, returns):
+        model_loss.append(-log_prob * return_)
+    model_loss = torch.cat(model_loss).sum()
+
+    # update the model
     optimizer.zero_grad()
-    loss.backward()
+    model_loss.backward()
     optimizer.step()
+    print("Episode number:", episode)
+    print("Number of rewards:", len(rewards))
+
+# x axis: one point per episode
+x = list(range(len(rewardsByEpisode)))
 
-    rewardsList.append(trainSize)
+plt.xlabel('Episode number')
+plt.ylabel('Number of rewards given')
+plt.plot(x, rewardsByEpisode, '--')
+plt.savefig('reinforce_cartpole.png')
+plt.show()
--
GitLab
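
Reference sketch (not part of the patch): the core of the updated training step is the discounted-return computation followed by the REINFORCE loss. The snippet below isolates that step so it can be read or tested on its own. The helper name reinforce_loss and the dummy inputs are illustrative assumptions; the arithmetic itself mirrors the code added in the diff (gamma = 0.99, mean/std normalization, loss = -sum of log-prob * return).

import torch

def reinforce_loss(log_probs, rewards, gamma=0.99):
    # discounted returns, accumulated backwards over the episode
    returns = []
    discounted_return = 0.0
    for reward in reversed(rewards):
        discounted_return = reward + gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns)
    # normalize the returns (zero mean, unit variance), as in the patched script
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    # log_probs is expected to be a list of shape-(1,) tensors, as produced by
    # Categorical(probs).log_prob(action) with a batch of size 1
    return torch.cat([-lp * g for lp, g in zip(log_probs, returns)]).sum()

# example usage with dummy values (illustrative only)
dummy_log_probs = [torch.tensor([-0.7], requires_grad=True) for _ in range(5)]
dummy_rewards = [1.0] * 5
loss = reinforce_loss(dummy_log_probs, dummy_rewards)
loss.backward()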