From 3b582a8d9de216481b4ac31f406d061989699b7b Mon Sep 17 00:00:00 2001
From: Majdi Karim <karim.majdi@etu.ec-lyon.fr>
Date: Tue, 5 Mar 2024 21:30:16 +0000
Subject: [PATCH] Add new file

---
 a2c_sb3_cartpole.py | 93 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 a2c_sb3_cartpole.py

diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
new file mode 100644
index 0000000..a767524
--- /dev/null
+++ b/a2c_sb3_cartpole.py
@@ -0,0 +1,93 @@
+import gymnasium as gym
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.distributions import Categorical
+import matplotlib.pyplot as plt
+
+
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+# Reset the environment once (with a fixed seed) and get the initial observation
+observation, info = env.reset(seed=42)
+
+state_size = env.observation_space.shape[0]
+action_size = env.action_space.n
+
+
+# Define the agent neural network model
+class Policy(nn.Module):
+    def __init__(self, state_size, action_size, hidden_size=128):
+        super(Policy, self).__init__()
+        self.fc1 = nn.Linear(state_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
+        self.fc2 = nn.Linear(hidden_size, action_size)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.softmax(x, dim=-1)
+
+
+policy_model = Policy(state_size, action_size)
+optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
+
+gamma = 0.99
+episodes_rewards = []
+
+for i in range(500):
+    # Reset the environment and the per-episode buffers
+    observation, info = env.reset()
+    episode_rewards = []
+    log_probabilities = []
+    terminated = False
+    truncated = False
+    # Render the environment to visualize the agent's behavior
+    env.render()
+
+    while not (terminated or truncated):
+        # Get action probabilities from the policy model
+        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
+        action_distribution = Categorical(action_probabilities)
+
+        # Sample an action and store its log-probability for the policy-gradient update
+        action = action_distribution.sample()
+        log_probabilities.append(action_distribution.log_prob(action))
+
+        # Take a step in the environment
+        next_observation, reward, terminated, truncated, info = env.step(action.item())
+        episode_rewards.append(reward)
+
+        # Update observation
+        observation = next_observation
+
+    # Compute the discounted return at each step of the episode
+    returns = []
+    R = 0
+    for r in reversed(episode_rewards):
+        R = r + gamma * R
+        returns.insert(0, R)
+
+    # Compute the policy loss (stacking the log-prob tensors keeps the computation graph intact)
+    policy_loss = torch.stack(
+        [-log_prob * R for log_prob, R in zip(log_probabilities, returns)]
+    ).sum()
+    episodes_rewards.append(sum(episode_rewards))
+
+    # Update the policy model
+    optimizer.zero_grad()
+    policy_loss.backward()
+    optimizer.step()
+
+env.close()
+
+# Plot the total reward obtained in each episode
+plt.plot(episodes_rewards)
+plt.xlabel("Episode")
+plt.ylabel("Total reward")
+plt.show()
--
GitLab
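
Note: the filename a2c_sb3_cartpole.py refers to Stable-Baselines3's A2C, while the patch above trains a from-scratch REINFORCE-style policy gradient. For comparison, a minimal sketch of the Stable-Baselines3 route is given below; the total_timesteps budget and the "MlpPolicy" preset are illustrative assumptions, not part of the patch.

import gymnasium as gym
from stable_baselines3 import A2C

# Train A2C on CartPole-v1 (no rendering during training, for speed)
env = gym.make("CartPole-v1")
model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)  # illustrative budget, tune as needed

# Roll out the trained policy for one episode with rendering
eval_env = gym.make("CartPole-v1", render_mode="human")
obs, info = eval_env.reset()
done = False
while not done:
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    done = terminated or truncated
eval_env.close()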