Commit 93ba5cfa authored by Majdi Karim

Update a2c_sb3_cartpole.py

parent 29f96ab1

import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import A2C
from huggingface_sb3 import push_to_hub
from huggingface_hub import login
print(f"{gym.__version__=}")
env = gym.make("CartPole-v1", render_mode="rgb_array")
model = A2C("MlpPolicy", env, verbose=1)
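# "MlpPolicy" selects SB3's default fully-connected actor-critic network; SB3 also
# wraps the Gymnasium env in a VecEnv internally, which is why evaluate() below
# interacts with model.get_env().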
def evaluate(model, num_episodes=100, deterministic=True):
    """Roll out `num_episodes` episodes with an SB3 model and return the mean reward."""
    vec_env = model.get_env()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        obs = vec_env.reset()
        while not done:
            # _states is only useful when using recurrent (LSTM) policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # With a VecEnv, step() returns a 4-tuple and action/reward/done are arrays
            obs, reward, done, info = vec_env.step(action)
            episode_rewards.append(reward)
        all_episode_rewards.append(sum(episode_rewards))
    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
    return mean_episode_reward
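# Quick sanity check (illustrative only, not required for training): the freshly
# initialised policy should score well below the 500-step cap of CartPole-v1.
baseline_mean_reward = evaluate(model, num_episodes=10)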
# ---------------------------------------------------------------------------
# From-scratch REINFORCE baseline (plain PyTorch, independent of the SB3 agent).
# Call train_reinforce() to run it.
# ---------------------------------------------------------------------------
def train_reinforce(num_episodes=500, gamma=0.99, lr=5e-3):
    # Create the environment
    env = gym.make("CartPole-v1", render_mode="human")
    # Reset the environment and get the initial observation
    observation, info = env.reset()
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # Define the agent neural network model
    class Policy(nn.Module):
        def __init__(self, state_size, action_size, hidden_size=128):
            super().__init__()
            self.fc1 = nn.Linear(state_size, hidden_size)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.6)  # adjust dropout probability as needed
            self.fc2 = nn.Linear(hidden_size, action_size)

        def forward(self, x):
            x = self.fc1(x)
            x = self.relu(x)
            x = self.dropout(x)
            x = self.fc2(x)
            return F.softmax(x, dim=-1)

    policy_model = Policy(state_size, action_size)
    optimizer = optim.Adam(policy_model.parameters(), lr=lr)
    policy_losses = []

    for episode in range(num_episodes):
        # Reset the environment and the per-episode buffers
        observation, info = env.reset(seed=42)
        episode_rewards = []
        log_probabilities = []
        terminated = truncated = False
        # Render the environment to visualize the agent's behaviour
        env.render()
        while not (terminated or truncated):
            # Get action probabilities from the policy model
            action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
            action_distribution = Categorical(action_probabilities)
            # Sample an action and keep its log-probability for the policy update
            action = action_distribution.sample()
            log_probabilities.append(action_distribution.log_prob(action))
            # Take a step in the environment (Gymnasium step() returns a 5-tuple)
            next_observation, reward, terminated, truncated, info = env.step(action.item())
            episode_rewards.append(reward)
            # Update observation
            observation = next_observation

        # Compute the discounted return at every step of the episode
        returns = []
        R = 0
        for r in reversed(episode_rewards):
            R = r + gamma * R
            returns.insert(0, R)

        # Compute the policy loss and update the policy model
        policy_loss = torch.stack(
            [-log_prob * R for log_prob, R in zip(log_probabilities, returns)]
        ).sum()
        policy_losses.append(policy_loss.item())
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

    env.close()

    # Plot the policy loss against training episodes
    plt.plot(range(num_episodes), policy_losses)
    plt.xlabel('Episodes')
    plt.ylabel('Policy Loss')
    plt.title('Policy Loss vs. Episodes')
    plt.show()
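# Optional: run the from-scratch baseline as well (left commented out because the
# "human" render mode and the blocking plt.show() make it slow to execute).
# train_reinforce(num_episodes=500)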
# Use a separate environment for evaluation
eval_env = gym.make("CartPole-v1", render_mode="rgb_array")
# Train the agent for 10000 steps
model.learn(total_timesteps=10_000)
# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
login(token="****************")
# Save the trained model
model.save("ECL-TD-RL1-a2c_cartpole.zip")
# Load the trained model
model = A2C.load("ECL-TD-RL1-a2c_cartpole.zip")
push_to_hub(
repo_id="Karim-20/a2c_cartpole",
filename="ECL-TD-RL1-a2c_cartpole.zip",
commit_message="Add CartPole-v1 environment, agent trained with A2C"
)
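# To reuse the uploaded checkpoint elsewhere, it can be pulled back from the Hub
# with huggingface_sb3.load_from_hub (a sketch; repo_id and filename match the
# push_to_hub call above):
# from huggingface_sb3 import load_from_hub
# checkpoint = load_from_hub(
#     repo_id="Karim-20/a2c_cartpole",
#     filename="ECL-TD-RL1-a2c_cartpole.zip",
# )
# model = A2C.load(checkpoint)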