Commit 93ba5cfa authored by Majdi Karim

Update a2c_sb3_cartpole.py

parent 29f96ab1

import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import A2C
from huggingface_sb3 import push_to_hub
from huggingface_hub import login
print(f"{gym.__version__=}")
env = gym.make("CartPole-v1", render_mode="rgb_array")
model = A2C("MlpPolicy", env, verbose=1)
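# "MlpPolicy" selects SB3's default fully-connected actor-critic network; SB3 also
# wraps the Gymnasium env in a VecEnv internally, which is why evaluate() below
# interacts with model.get_env().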
def evaluate(model, num_episodes=100, deterministic=True):
    """Roll out `num_episodes` episodes with an SB3 model and return the mean reward."""
    vec_env = model.get_env()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        obs = vec_env.reset()
        while not done:
            # _states is only useful when using recurrent (LSTM) policies
            action, _states = model.predict(obs, deterministic=deterministic)
            # With a VecEnv, step() returns a 4-tuple and action/reward/done are arrays
            obs, reward, done, info = vec_env.step(action)
            episode_rewards.append(reward)
        all_episode_rewards.append(sum(episode_rewards))
    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
    return mean_episode_reward
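# Quick sanity check (illustrative only, not required for training): the freshly
# initialised policy should score well below the 500-step cap of CartPole-v1.
baseline_mean_reward = evaluate(model, num_episodes=10)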
# ---------------------------------------------------------------------------
# From-scratch REINFORCE baseline (plain PyTorch, independent of the SB3 agent).
# Call train_reinforce() to run it.
# ---------------------------------------------------------------------------
def train_reinforce(num_episodes=500, gamma=0.99, lr=5e-3):
    # Create the environment
    env = gym.make("CartPole-v1", render_mode="human")
    # Reset the environment and get the initial observation
    observation, info = env.reset()
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # Define the agent neural network model
    class Policy(nn.Module):
        def __init__(self, state_size, action_size, hidden_size=128):
            super().__init__()
            self.fc1 = nn.Linear(state_size, hidden_size)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.6)  # adjust dropout probability as needed
            self.fc2 = nn.Linear(hidden_size, action_size)

        def forward(self, x):
            x = self.fc1(x)
            x = self.relu(x)
            x = self.dropout(x)
            x = self.fc2(x)
            return F.softmax(x, dim=-1)

    policy_model = Policy(state_size, action_size)
    optimizer = optim.Adam(policy_model.parameters(), lr=lr)
    policy_losses = []

    for episode in range(num_episodes):
        # Reset the environment and the per-episode buffers
        observation, info = env.reset(seed=42)
        episode_rewards = []
        log_probabilities = []
        terminated = truncated = False
        # Render the environment to visualize the agent's behaviour
        env.render()
        while not (terminated or truncated):
            # Get action probabilities from the policy model
            action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
            action_distribution = Categorical(action_probabilities)
            # Sample an action and keep its log-probability for the policy update
            action = action_distribution.sample()
            log_probabilities.append(action_distribution.log_prob(action))
            # Take a step in the environment (Gymnasium step() returns a 5-tuple)
            next_observation, reward, terminated, truncated, info = env.step(action.item())
            episode_rewards.append(reward)
            # Update observation
            observation = next_observation

        # Compute the discounted return at every step of the episode
        returns = []
        R = 0
        for r in reversed(episode_rewards):
            R = r + gamma * R
            returns.insert(0, R)

        # Compute the policy loss and update the policy model
        policy_loss = torch.stack(
            [-log_prob * R for log_prob, R in zip(log_probabilities, returns)]
        ).sum()
        policy_losses.append(policy_loss.item())
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

    env.close()

    # Plot the policy loss against training episodes
    plt.plot(range(num_episodes), policy_losses)
    plt.xlabel('Episodes')
    plt.ylabel('Policy Loss')
    plt.title('Policy Loss vs. Episodes')
    plt.show()
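# Optional: run the from-scratch baseline as well (left commented out because the
# "human" render mode and the blocking plt.show() make it slow to execute).
# train_reinforce(num_episodes=500)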
# Use a separate environment for evaluation
eval_env = gym.make("CartPole-v1", render_mode="rgb_array")
# Train the agent for 10000 steps
model.learn(total_timesteps=10_000)
# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
login(token="****************")
# Save the trained model
model.save("ECL-TD-RL1-a2c_cartpole.zip")
# Load the trained model
model = A2C.load("ECL-TD-RL1-a2c_cartpole.zip")
push_to_hub(
repo_id="Karim-20/a2c_cartpole",
filename="ECL-TD-RL1-a2c_cartpole.zip",
commit_message="Add CartPole-v1 environment, agent trained with A2C"
)
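# To reuse the uploaded checkpoint elsewhere, it can be pulled back from the Hub
# with huggingface_sb3.load_from_hub (a sketch; repo_id and filename match the
# push_to_hub call above):
# from huggingface_sb3 import load_from_hub
# checkpoint = load_from_hub(
#     repo_id="Karim-20/a2c_cartpole",
#     filename="ECL-TD-RL1-a2c_cartpole.zip",
# )
# model = A2C.load(checkpoint)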