reinforce_cartpole.py
Forked from Dellandrea Emmanuel / MSO_3_4-TD1
Majdi Karim authored
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment and get the initial observation
observation, info = env.reset(seed=42)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Define the agent neural network model
class Policy(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return F.softmax(x, dim=-1)
policy_model = Policy(state_size, action_size)
optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
gamma = 0.99
policy_losses = []
for i in range(500):
    # Reset the environment and initialise the per-episode buffers
    observation, info = env.reset()
    episode_rewards = []
    log_probabilities = []
    terminated = False
    truncated = False
    # Render the environment to visualize the agent's behavior
    env.render()

    while not (terminated or truncated):
        # Get action probabilities from the policy model
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action_distribution = Categorical(action_probabilities)
        # Sample an action and store its log-probability for the policy-gradient update
        action = action_distribution.sample()
        log_probability = action_distribution.log_prob(action)
        log_probabilities.append(log_probability)

        # Take a step in the environment and update the current observation
        observation, reward, terminated, truncated, info = env.step(action.item())
        episode_rewards.append(reward)
    # Compute the discounted return G_t for each step of the episode
    returns = []
    R = 0
    for r in reversed(episode_rewards):
        R = r + gamma * R
        returns.insert(0, R)

    # Compute the policy loss: sum over the episode of -log pi(a_t | s_t) * G_t
    policy_loss = torch.stack([-log_prob * R for log_prob, R in zip(log_probabilities, returns)]).sum()
    policy_losses.append(policy_loss.item())

    # Update the policy model
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
env.close()
# Plot the policy loss against iterations
plt.plot(range(500), policy_losses)
plt.xlabel('Iterations')
plt.ylabel('Policy Loss')
plt.title('Policy Loss vs. Iterations')
plt.show()
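
# --- Optional: a minimal evaluation sketch (not part of the original script) ---
# After training, the learned policy can be rolled out greedily by taking the
# argmax over the action probabilities instead of sampling. This is only a
# sketch under the assumption that it runs right after the training loop above;
# the names `eval_env` and `total_reward` are introduced here, everything else
# comes from the script itself.
eval_env = gym.make("CartPole-v1", render_mode="human")
policy_model.eval()  # disable dropout for evaluation
observation, info = eval_env.reset()
total_reward = 0.0
terminated = truncated = False
with torch.no_grad():
    while not (terminated or truncated):
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action = torch.argmax(action_probabilities).item()  # greedy action
        observation, reward, terminated, truncated, info = eval_env.step(action)
        total_reward += reward
print(f"Evaluation episode reward: {total_reward}")
eval_env.close()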