diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index e7f5770d47cc7421e7739599b5d3ed29c63da134..7d50d417c474f4b1b6bfcd9bf622551aa36a54fc 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -1,121 +1,62 @@
 import gymnasium as gym
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.distributions import Categorical
-import matplotlib.pyplot as plt
-
-
-
-
-
-
-
-
-
-
-# Create the environment
-env = gym.make("CartPole-v1", render_mode="human")
-
-# Reset the environment and get the initial observation
-observation = env.reset()
-
-state_size = env.observation_space.shape[0]
-action_size = env.action_space.n
-# Define the agent neural network model
-class Policy(nn.Module):
-    def __init__(self, state_size, action_size, hidden_size=128):
-        super(Policy, self).__init__()
-        self.fc1 = nn.Linear(state_size, hidden_size)
-        self.relu = nn.ReLU()
-        self.dropout = nn.Dropout(p=0.6) # Adjust dropout probability as needed
-        self.fc2 = nn.Linear(hidden_size, action_size)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.dropout(x)
-        x = self.fc2(x)
-        return F.softmax(x)
-
-policy_model = Policy(state_size, action_size)
-optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
-
-gamma = 0.99
-episodes_rewards = []
-
-for i in range(500):
-    # Reset the environment
-    # init buffers
-    observation, info = env.reset(seed=42)
-    episode_rewards = []
-    logarithmich_probabilities = []
-    terminated = False
-    # Render the environment to visualize the agent's behavior
-    env.render()
-
-    while terminated == False:
-        # Get action probabilities from the policy model
-        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
-        action_distribution = Categorical(action_probabilities)
-
-        # Sample an action from the action distribution
-        action = action_distribution.sample()
-        logarithmich_probability = action_distribution.log_prob(action)
-        logarithmich_probabilities.append(logarithmich_probability)
-        print(int(action.item()))
-        # Take a step in the environment
-        #print(env.step(action.item()))
-        next_observation, reward, done, a, b = env.step(action.item())
-        episode_rewards.append(reward)
-
-        # Update observation
-        observation = next_observation
-
-
-    # Compute the return for the episode
-    returns = []
-    R = 0
-    for r in reversed(episode_rewards):
-        R = r + gamma * R
-        returns.insert(0, R)
-
-    # Compute the policy loss
-    policy_loss = torch.tensor([-loga_prob * R for loga_prob, R in zip(logarithmich_probabilities, returns)]).sum()
-    episodes_rewards += [-policy_loss]
-    # Update the policy model
-    optimizer.zero_grad()
-    policy_loss.backward()
-    optimizer.step()
-
-
-env.close()
-
-
-
-
-# Plot the policy loss against iterations
-plt.plot([i for i in range(0,500)],episodes_rewards)
-plt.xlabel('Iterations')
-plt.ylabel('Policy Loss')
-plt.title('Policy Loss vs. Iterations')
-plt.show()
-
+import numpy as np
+from stable_baselines3.common.evaluation import evaluate_policy
+from stable_baselines3 import A2C
+from huggingface_sb3 import push_to_hub
+from huggingface_hub import login
+print(f"{gym.__version__=}")
+env = gym.make("CartPole-v1", render_mode="rgb_array")
+model = A2C("MlpPolicy", env, verbose=1)
+def evaluate(model, num_episodes=100, deterministic=True):
+
+    vec_env = model.get_env()
+    all_episode_rewards = []
+    for i in range(num_episodes):
+        episode_rewards = []
+        done = False
+        obs = vec_env.reset()
+        while not done:
+            # _states are only useful when using LSTM policies
+            action, _states = model.predict(obs, deterministic=deterministic)
+            # here, action, reward and done are arrays because the env is vectorized
+            # also note that step() returns a 4-tuple, since model.get_env() returns an SB3 VecEnv
+            obs, reward, done, info = vec_env.step(action)
+            episode_rewards.append(reward)
+        all_episode_rewards.append(sum(episode_rewards))
+    mean_episode_reward = np.mean(all_episode_rewards)
+    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
+    return mean_episode_reward
+# Use a separate environment for evaluation
+eval_env = gym.make("CartPole-v1", render_mode="rgb_array")
+# Train the agent for 10000 steps
+model.learn(total_timesteps=10_000)
+# Evaluate the trained agent
+mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
+print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
+login(token="****************")
+# Save the trained model
+model.save("ECL-TD-RL1-a2c_cartpole.zip")
+# Load the trained model
+model = A2C.load("ECL-TD-RL1-a2c_cartpole.zip")
+push_to_hub(
+    repo_id="Karim-20/a2c_cartpole",
+    filename="ECL-TD-RL1-a2c_cartpole.zip",
+    commit_message="Add CartPole-v1 environment, agent trained with A2C"
+)
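
Not part of the diff above: a minimal sketch of how the uploaded checkpoint could be pulled back from the Hub and re-evaluated, assuming the Karim-20/a2c_cartpole repo and ECL-TD-RL1-a2c_cartpole.zip filename used in the push_to_hub call, and using huggingface_sb3.load_from_hub, which downloads a file from the Hub and returns its local path.

import gymnasium as gym
from huggingface_sb3 import load_from_hub
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Download the checkpoint pushed above (repo_id and filename taken from the diff)
checkpoint = load_from_hub(
    repo_id="Karim-20/a2c_cartpole",
    filename="ECL-TD-RL1-a2c_cartpole.zip",
)
model = A2C.load(checkpoint)

# Re-evaluate the downloaded agent on a fresh CartPole environment
eval_env = gym.make("CartPole-v1", render_mode="rgb_array")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")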