"""Setup the CartPole environment
Setup the agent as a simple neural network with:
- One fully connected layer with 128 units and ReLU activation followed by a dropout layer
- One fully connected layer followed by softmax activation
Repeat 500 times:
Reset the environment
Reset the buffer
Repeat until the end of the episode:
Compute action probabilities
Sample the action based on the probabilities and store its probability in the buffer
Step the environment with the action
Compute and store in the buffer the return using gamma=0.99
Normalize the return
Compute the policy loss as -sum(log(prob) * return)
Update the policy using an Adam optimizer and a learning rate of 5e-3
"""
import gym
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
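# Note: this script targets the classic Gym API, where env.reset() returns only the
# observation and env.step() returns (observation, reward, done, info);
# gym>=0.26 and Gymnasium changed both signatures.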
# Create the CartPole environment
env = gym.make("CartPole-v1")
# Reset the environment and get the initial observation
observation = env.reset()
print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('threshold: ', env.spec.reward_threshold)
# Define the agent
# Define model's parameters
observation_size = env.observation_space.shape[0]
hidden_size = 128
n_actions = env.action_space.n
# Define the neural network model
model = nn.Sequential(
    nn.Linear(observation_size, hidden_size),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(hidden_size, n_actions),
    # Softmax over the action dimension (dim=-1 also handles batched inputs)
    nn.Softmax(dim=-1)
)
print(model)
# Agent's parameters
learning_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
gamma = 0.99
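# REINFORCE objective: maximize E[sum_t log(pi(a_t | s_t)) * G_t], where G_t is the
# discounted return from step t; the policy loss below is the negative of this sum.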
final_rewards = []
for _ in range(500):
    # Reset the environment and get the initial observation
    observation = env.reset()
    done = False
    transitions = []
    # Reset the buffer
    buffer = [[], []]
    rewards = []
    # Loop for an episode
    t = 0
    while not done:
        # Compute action probabilities
        # pylint: disable=E1101
        action_proba = model(torch.from_numpy(observation).float())
        # pylint: enable=E1101
        # Sample the action based on the probabilities
        action = np.random.choice(np.array([0, 1]), p=action_proba.data.numpy())
        # Store its probability in the buffer
        buffer[0].append(action_proba.data.numpy())
        # Step the environment with the action
        previous_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((previous_observation, action, reward))
        rewards.append(reward)
        t += 1
    # Compute and store in the buffer the discounted return
    # G_t = sum_{i >= t} gamma^(i - t) * r_i
    for t in range(len(transitions)):
        return_agent = 0
        for i in range(t, len(transitions)):
            reward = transitions[i][2]
            return_agent += (gamma ** (i - t)) * reward
        buffer[1].append(return_agent)
    # Normalize the return
    # pylint: disable=E1101
    returns = torch.FloatTensor(buffer[1])
    # pylint: enable=E1101
    returns /= returns.max()
    # Compute the policy loss
    states = torch.Tensor(np.array([state for (state, _, _) in transitions]))
    actions = torch.Tensor([action for (_, action, _) in transitions])
    predictions = model(states)
    # Probability of the action that was actually taken in each state
    probabilities = predictions.gather(dim=1, index=actions.long().view(-1, 1)).squeeze()
    # pylint: disable=E1101
    policy_loss = -torch.sum(torch.log(probabilities) * returns)
    # pylint: enable=E1101
    # Update the policy
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    final_rewards.append(len(rewards))
# Render the environment to visualize the agent's behavior
env.render()
# Plot figure of rewards over episodes
plt.plot(final_rewards)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.savefig("./images/REINFORCE.png")