Commit ec80fa30 authored by Brussart Paul-emile

Updated reinforce_cartpole.py, added figure of the results

parent fc91605b
reinforce_cartpole.png  2.34 KiB

reinforce_cartpole.py

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Number of episodes to run the environment
N_Episodes = 500
# Discount factor for future rewards
Gamma = 0.99
# Learning rate for the Adam optimizer
LR = 5e-3

# Set up the environment (this script uses the classic gym API, where
# reset() returns only the observation and step() returns 4 values)
env = gym.make('CartPole-v1')

# Set up the agent as a neural network
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(4, 128)   # fully connected layer: 4 observation features in
        self.dropout = nn.Dropout(p=0.6)
        self.fc2 = nn.Linear(128, 2)   # two possible outputs: right or left

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        # Apply softmax to get action probabilities
        return F.softmax(x, dim=1)

# Initialize the agent and the Adam optimizer
model = Model()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Keep track of the number of rewards collected in each episode
rewardsByEpisode = []

# Training loop
for episode in range(N_Episodes):
    # Reset the environment for a new episode
    state = env.reset()
    log_probs = []
    rewards = []
    # Run the episode until it terminates
    while True:
        # Pass the current observation through the model to get action probabilities
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = model(state)
        # Sample an action from the action probabilities
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        log_probs.append(m.log_prob(action))
        # Take the action and get the next state, reward, and done flag
        state, reward, done, _ = env.step(action.item())
        rewards.append(reward)
        if done:
            break
        # Visualization of the environment (commented out to save time)
        # env.render()

    # Compute the discounted returns and normalize them
    returns = []
    discounted_return = 0
    for reward in rewards[::-1]:
        discounted_return = reward + Gamma * discounted_return
        returns.insert(0, discounted_return)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
    rewardsByEpisode.append(len(rewards))

    # Compute the REINFORCE loss: -log pi(a|s) weighted by the return
    model_loss = []
    for log_prob, return_ in zip(log_probs, returns):
        model_loss.append(-log_prob * return_)
    model_loss = torch.cat(model_loss).sum()

    # Update the model
    optimizer.zero_grad()
    model_loss.backward()
    optimizer.step()

    print("Episode number:", episode)
    print("Number of rewards:", len(rewards))

# Plot the number of rewards collected in each episode
x = list(range(len(rewardsByEpisode)))
plt.xlabel('Episode number')
plt.ylabel('Number of rewards given')
plt.plot(x, rewardsByEpisode, '--')
# Save the figure before showing it, otherwise the saved image is blank
plt.savefig('reinforce_cartpole.png')
plt.show()