Commit fc91605b authored by Brussart Paul-emile

First commit: Adding reinforce_cartpole.py

import gym
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
# Number of episodes to run the environment
N_Episodes = 500
# Discount factor for future rewards
Gamma = 0.99
# Learning rate for the Adam optimizer
LR = 5e-3
# Define the neural network model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(4, 128)  # Fully connected layer: 4 observation features -> 128 hidden units
        self.fc2 = nn.Linear(128, 2)  # Two possible outputs: push left or push right
        self.dropout = nn.Dropout(0.25)
        self.softmax = nn.Softmax(dim=-1)  # Specify dim explicitly (implicit dim is deprecated)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  # No ReLU on the output layer: pass raw logits to softmax
        x = self.softmax(x)  # Softmax turns the logits into action probabilities
        return x
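
# Optional sanity-check sketch (an illustrative addition, assuming CartPole-v1's
# 4-dimensional observation and 2 discrete actions, as used below): a random
# input should map to a length-2 probability vector that sums to 1.
_probe = Net()(torch.rand(4))
assert _probe.shape == (2,)
assert torch.isclose(_probe.sum(), torch.tensor(1.0))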
# Initialize the model
model = Net()
# Initialize the Adam optimizer
optimizer = optim.Adam(model.parameters(), LR)
# Make the CartPole-v1 environment
env = gym.make("CartPole-v1")
# Get the maximum number of steps allowed in each episode
maxSteps = env.spec.max_episode_steps
# List to store the rewards accumulated through the episodes
rewardsList = []
for episode in range(N_Episodes):
    # Reset the environment for a new episode
    # (pre-0.26 gym API: reset() returns only the observation, step() returns 4 values)
    observation = env.reset()
    # Tensor holding the (discounted) reward collected at each step
    rewards = torch.zeros(maxSteps)
    # Tensor holding the probability of the action taken at each step
    buffer = torch.zeros(maxSteps)
    # Done flag: True once the episode has ended
    done = False
    # Step counter for the current episode
    trainSize = 0
    # Run the episode until it terminates
    while not done:
        # Pass the current observation through the model to get action probabilities
        prob = model(torch.tensor(observation, dtype=torch.float32))
        # Sample an action from the action probabilities
        m = Categorical(prob)
        action = m.sample()
        # Take the action and get the next observation, reward, done flag, and info
        observation, reward, done, info = env.step(action.item())
        # Store the probability of the action taken
        buffer[trainSize] = prob[action]
        # Store the immediate reward
        rewards[trainSize] = reward
        # Propagate the new reward back to earlier steps, so that rewards[i]
        # ends up holding the discounted return G_i = sum_k Gamma**(k-i) * r_k
        for i in range(trainSize):
            rewards[i] += Gamma ** (trainSize - i) * reward
        trainSize += 1
        # Visualisation of the environment
        env.render()
    # Trim the reward and probability tensors to the actual episode length
    rewards = rewards[0:trainSize]
    buffer = buffer[0:trainSize]
    # Rescale the returns (L2 normalization); F.normalize is not in-place,
    # so its result must be assigned back
    rewards = F.normalize(rewards, dim=0)
    # REINFORCE loss: negative sum of log action probabilities weighted by the returns
    loss = -torch.sum(torch.log(buffer) * rewards)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # The episode length doubles as the episode's total (undiscounted) reward
    rewardsList.append(trainSize)
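
# Optional plotting sketch: matplotlib is imported above but never used, so this
# draws the per-episode return (episode length) stored in rewardsList.
# The axis labels and title are illustrative choices.
plt.plot(rewardsList)
plt.xlabel("Episode")
plt.ylabel("Episode length (total reward)")
plt.title("REINFORCE on CartPole-v1")
plt.show()
# Close the environment's render window
env.close()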