import gym
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
# Define the neural network model
class Policy(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Map an observation to a probability distribution over actions
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
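
# A minimal shape check (a sketch, not part of the training script; the
# 4-dimensional input matches CartPole-v1's observation space and the
# 2 outputs its action space):
#
#     policy = Policy(input_size=4, output_size=2)
#     probs = policy(torch.zeros(1, 4))  # shape (1, 2), each row sums to 1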
def reinforce():
    # Create the environment
    env = gym.make("CartPole-v1")

    # Set up the agent
    policy = Policy(
        input_size=env.observation_space.shape[0],
        output_size=env.action_space.n,
    )
    optimizer = optim.Adam(policy.parameters(), lr=5e-3)

    # Training loop
    num_episodes = 500
    gamma = 0.99
    episode_rewards = []

    for episode in tqdm(range(num_episodes)):
        log_probs = []
        rewards = []

        # Reset the environment and get the initial observation
        observation, info = env.reset()
        terminated = truncated = False
        episode_reward = 0

        while not (terminated or truncated):
            # Compute action probabilities for the current observation
            action_probs = policy(torch.FloatTensor(observation).unsqueeze(0))

            # Sample an action and store its log-probability as a graph tensor,
            # so that gradients can flow back into the policy parameters
            action = torch.multinomial(action_probs, num_samples=1).item()
            log_probs.append(torch.log(action_probs[0, action]))

            # Step the environment with the chosen action
            observation, reward, terminated, truncated, info = env.step(action)
            # env.render()  # only works if the env was made with render_mode="human"

            episode_reward += reward
            rewards.append(reward)

        # Compute the discounted return from each step onward
        returns = []
        discounted_sum = 0.0
        for reward in reversed(rewards):
            discounted_sum = reward + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize the returns to reduce the variance of the gradient estimate
        returns = torch.FloatTensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Compute the REINFORCE policy loss
        policy_loss = -torch.sum(torch.stack(log_probs) * returns)

        # Update the policy
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        episode_rewards.append(episode_reward)

    env.close()
    return episode_rewards
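
# For reference, the update in reinforce() follows the standard REINFORCE
# policy-gradient estimator:
#
#     grad_theta J(theta) ≈ sum_t G_t * grad_theta log pi_theta(a_t | s_t)
#
# where G_t is the discounted return from step t onward, normalized per
# episode to reduce the variance of the gradient estimate.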
def plot_rewards(episode_rewards):
    # Plot the total reward across episodes
    plt.plot(episode_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('REINFORCE: Total Reward across Episodes')
    plt.show()
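
# Raw per-episode rewards are noisy; a moving-average view is often easier to
# read. This is an optional sketch (the default window of 20 episodes is an
# arbitrary choice, not part of the original script):
def plot_smoothed_rewards(episode_rewards, window=20):
    rewards = np.asarray(episode_rewards, dtype=float)
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
    else:
        smoothed = rewards
    offset = len(rewards) - len(smoothed)
    plt.plot(rewards, alpha=0.3, label='per episode')
    plt.plot(np.arange(len(smoothed)) + offset, smoothed,
             label='moving average (window={})'.format(window))
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('REINFORCE: Total Reward across Episodes (smoothed)')
    plt.legend()
    plt.show()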
if __name__ == "__main__":
    episode_rewards = reinforce()
    plot_rewards(episode_rewards)