Commit bb0e23c5 authored by Chahri Hanane
Commit message: Your commit message
Parent: 5fcf0aca
File added
import gymnasium as gym # Use gymnasium instead of gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
# Create the CartPole environment (Gymnasium syntax)
env = gym.make("CartPole-v1")
# Initialize the A2C model
model = A2C("MlpPolicy", env, verbose=1)
# Train the model
model.learn(total_timesteps=100000)
# Save the model
model.save("a2c_cartpole")
# Evaluate the trained model
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"✅ Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
# Close the environment
env.close()
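The saved policy can be reloaded later without retraining. A minimal sketch (assuming the a2c_cartpole.zip written by the script above sits in the working directory) that replays one rendered episode with the trained agent:

import gymnasium as gym
from stable_baselines3 import A2C

# Quick visual check: reload the saved A2C policy and watch a single episode.
# Assumes "a2c_cartpole.zip" produced by the training script above.
model = A2C.load("a2c_cartpole")
env = gym.make("CartPole-v1", render_mode="human")
obs, _ = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _ = env.step(int(action))
    done = terminated or truncated
env.close()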
import gymnasium as gym  # Gymnasium API: reset() returns (obs, info) and step() returns a 5-tuple, as used below
import torch
import torch.nn as nn
import numpy as np
# Define the same Policy Network structure
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.softmax(self.fc2(x))
        return x

# Load the trained model
def load_model(filepath, env):
    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
    model.load_state_dict(torch.load(filepath))
    model.eval()
    return model

# Evaluate the model for 100 episodes
def evaluate_model(env, model, num_episodes=100, success_threshold=195):
    success_count = 0
    rewards = []
    for episode in range(num_episodes):
        state = env.reset()[0]
        total_reward = 0
        while True:
            state_tensor = torch.tensor(state, dtype=torch.float32)
            action_probs = model(state_tensor)
            action = torch.argmax(action_probs).item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = next_state
            if terminated or truncated:
                break
        rewards.append(total_reward)
        if total_reward >= success_threshold:
            success_count += 1
        print(f"Episode {episode+1}: Total Reward = {total_reward}")
    success_rate = success_count / num_episodes * 100
    print(f"\n✅ Success Rate: {success_rate:.2f}% ({success_count}/{num_episodes} episodes)")
    return success_rate, rewards

if __name__ == "__main__":
    env = gym.make("CartPole-v1", render_mode="human")  # Render for visualization
    model = load_model("reinforce_cartpole.pth", env)
    success_rate, _ = evaluate_model(env, model, num_episodes=100)
    env.close()
    # Save success rate for README update
    with open("evaluation_result.txt", "w") as f:
        f.write(f"Success Rate: {success_rate:.2f}%\n")
Success Rate: 100.00%
import gymnasium as gym  # Gymnasium replaces the deprecated gym package
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from huggingface_sb3 import package_to_hub

# Define your Hugging Face repository ID
repo_id = "pinkiexi/a2c-cartpole-v1"  # Change this!

# Create the evaluation environment.
# package_to_hub expects a vectorized eval env; render_mode="rgb_array" lets it record a replay video.
eval_env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1", render_mode="rgb_array"))])

# Load the trained model
model = A2C.load("a2c_cartpole")

# Upload the model to the Hugging Face Hub
package_to_hub(
    model=model,
    model_name="A2C",
    model_architecture="A2C",
    env_id="CartPole-v1",
    repo_id=repo_id,
    commit_message="Upload trained A2C model for CartPole",
    eval_env=eval_env,
)
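package_to_hub pushes the checkpoint and a generated model card to the Hub, so the script needs a write-access token before it runs. A minimal sketch of the login step (the repo_id pinkiexi/a2c-cartpole-v1 above is the author's; replace it with your own namespace):

from huggingface_hub import login

# Authenticate once before calling package_to_hub; the token comes from
# https://huggingface.co/settings/tokens and needs write access.
login()  # or login(token="hf_..."), or run `huggingface-cli login` in a shell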
File added
import gymnasium as gym  # Gymnasium API, matching the reset()/step() usage below
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
# Define the Policy Network
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, output_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.softmax(self.fc2(x))
        return x

# Define the REINFORCE Algorithm
class REINFORCE:
    def __init__(self, env, learning_rate=5e-3, gamma=0.99):
        self.env = env
        self.gamma = gamma
        self.policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
        self.episode_rewards = []

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32)
        action_probs = self.policy(state)
        action = torch.multinomial(action_probs, 1).item()
        return action, action_probs[action]

    def train(self, num_episodes=500):
        all_rewards = []  # Store rewards for plotting
        for episode in range(num_episodes):
            state = self.env.reset()[0]
            episode_memory = []
            episode_reward = 0
            while True:
                action, action_prob = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                episode_memory.append((action_prob, reward))
                episode_reward += reward
                state = next_state
                if terminated or truncated:
                    break

            # Compute returns
            returns = []
            discounted_sum = 0
            for _, reward in reversed(episode_memory):
                discounted_sum = reward + self.gamma * discounted_sum
                returns.insert(0, discounted_sum)
            returns = torch.tensor(returns)
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

            # Compute policy loss
            loss = 0
            for (action_prob, _), G in zip(episode_memory, returns):
                loss -= torch.log(action_prob) * G

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            all_rewards.append(episode_reward)
            print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

        # Save the model
        torch.save(self.policy.state_dict(), "reinforce_cartpole.pth")
        print("Training complete. Model saved!")

        # Plot rewards
        plt.plot(all_rewards)
        plt.xlabel("Episodes")
        plt.ylabel("Total Reward")
        plt.title("Training Progress")
        plt.savefig("reward_plot.png")  # Save the plot
        print("Reward plot saved as reward_plot.png.")

if __name__ == "__main__":
    env = gym.make("CartPole-v1", render_mode=None)  # Change render_mode to "human" for visualization
    agent = REINFORCE(env)
    agent.train(num_episodes=500)
    env.close()
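For reference, the reversed loop in train() accumulates the discounted return G_t = r_t + gamma * G_{t+1} before the log-probability loss is formed. A small worked example of that accumulation (illustrative values only, gamma = 0.99 as in the class default):

gamma = 0.99
rewards = [1.0, 1.0, 1.0]           # CartPole gives +1 per surviving step
returns, discounted_sum = [], 0.0
for r in reversed(rewards):         # walk the episode backwards
    discounted_sum = r + gamma * discounted_sum
    returns.insert(0, discounted_sum)
print(returns)                      # approximately [2.9701, 1.99, 1.0]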
reward_plot.png (48.5 KiB): training reward curve generated by the script above.
