diff --git a/a2c_cartpole.zip b/a2c_cartpole.zip
new file mode 100644
index 0000000000000000000000000000000000000000..3639d680731143430104e9a3afe38eb30d2fb43d
Binary files /dev/null and b/a2c_cartpole.zip differ
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..209d42c65667d721298b94cd2f94167b4570d526
--- /dev/null
+++ b/a2c_sb3_cartpole.py
@@ -0,0 +1,23 @@
+import gymnasium as gym  # Use gymnasium instead of the deprecated gym package
+from stable_baselines3 import A2C
+from stable_baselines3.common.evaluation import evaluate_policy
+
+# Create the CartPole environment (Gymnasium syntax)
+env = gym.make("CartPole-v1")
+
+# Initialize the A2C model
+model = A2C("MlpPolicy", env, verbose=1)
+
+# Train the model
+model.learn(total_timesteps=100000)
+
+# Save the model
+model.save("a2c_cartpole")
+
+# Evaluate the trained model
+mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
+
+print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
+
+# Close the environment
+env.close()
diff --git a/evaluate_reinforce_cartpole.py b/evaluate_reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..831c97dabec0cd4565359281ed32c338d1c8c4ef
--- /dev/null
+++ b/evaluate_reinforce_cartpole.py
@@ -0,0 +1,68 @@
+import gym  # requires gym>=0.26 API: reset() returns (obs, info), step() returns a 5-tuple
+import torch
+import torch.nn as nn
+import numpy as np
+
+# Define the same Policy Network structure used during training
+class PolicyNetwork(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(PolicyNetwork, self).__init__()
+        self.fc1 = nn.Linear(input_dim, 128)
+        self.dropout = nn.Dropout(0.2)
+        self.fc2 = nn.Linear(128, output_dim)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x):
+        x = torch.relu(self.fc1(x))
+        x = self.dropout(x)
+        x = self.softmax(self.fc2(x))
+        return x
+
+# Load the trained model
+def load_model(filepath, env):
+    model = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
+    model.load_state_dict(torch.load(filepath))
+    model.eval()  # disable dropout for evaluation
+    return model
+
+# Evaluate the model for 100 episodes (195 is the CartPole-v0 solve threshold; v1 episodes cap at 500)
+def evaluate_model(env, model, num_episodes=100, success_threshold=195):
+    success_count = 0
+    rewards = []
+
+    for episode in range(num_episodes):
+        state = env.reset()[0]
+        total_reward = 0
+
+        while True:
+            state_tensor = torch.tensor(state, dtype=torch.float32)
+            action_probs = model(state_tensor)
+            action = torch.argmax(action_probs).item()  # greedy action selection
+
+            next_state, reward, terminated, truncated, _ = env.step(action)
+            total_reward += reward
+            state = next_state
+
+            if terminated or truncated:
+                break
+
+        rewards.append(total_reward)
+        if total_reward >= success_threshold:
+            success_count += 1
+
+        print(f"Episode {episode+1}: Total Reward = {total_reward}")
+
+    success_rate = success_count / num_episodes * 100
+    print(f"\nSuccess Rate: {success_rate:.2f}% ({success_count}/{num_episodes} episodes)")
+
+    return success_rate, rewards
+
+if __name__ == "__main__":
+    env = gym.make("CartPole-v1", render_mode="human")  # Render for visualization
+    model = load_model("reinforce_cartpole.pth", env)
+    success_rate, _ = evaluate_model(env, model, num_episodes=100)
+    env.close()
+
+    # Save success rate for README update
+    with open("evaluation_result.txt", "w") as f:
+        f.write(f"Success Rate: {success_rate:.2f}%\n")
diff --git a/evaluation_result.txt b/evaluation_result.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0be35bff0c32753099d9d8ed6b09faf260992058
--- /dev/null
+++ b/evaluation_result.txt
@@ -0,0 +1 @@
+Success Rate: 100.00%
diff --git a/push_to_hub.py b/push_to_hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6c8a105b356bb0926a5ad027a97dcb42179c312
--- /dev/null
+++ b/push_to_hub.py
@@ -0,0 +1,24 @@
+import gymnasium as gym
+from stable_baselines3 import A2C
+from stable_baselines3.common.vec_env import DummyVecEnv
+from huggingface_sb3 import package_to_hub
+
+# Define your Hugging Face repository ID
+repo_id = "pinkiexi/a2c-cartpole-v1"  # Change this!
+
+# Create a vectorized eval environment; package_to_hub records a replay video, so rgb_array rendering is needed
+env = DummyVecEnv([lambda: gym.make("CartPole-v1", render_mode="rgb_array")])
+
+# Load the trained model
+model = A2C.load("a2c_cartpole")
+
+# Upload the model to Hugging Face Hub
+package_to_hub(
+    model=model,
+    model_name="A2C",
+    model_architecture="A2C",
+    env_id="CartPole-v1",
+    repo_id=repo_id,
+    commit_message="Upload trained A2C model for CartPole",
+    eval_env=env,  # vectorized environment used for evaluation and the replay video
+)
diff --git a/reinforce_cartpole.pth b/reinforce_cartpole.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fef2108e9d370f6a85a40fccc5f5713e73df3497
Binary files /dev/null and b/reinforce_cartpole.pth differ
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..da7aa518bb88ecf190ab90c86f2f13cf5dca1802
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,93 @@
+import gym  # requires gym>=0.26 API: reset() returns (obs, info), step() returns a 5-tuple
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Define the Policy Network
+class PolicyNetwork(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(PolicyNetwork, self).__init__()
+        self.fc1 = nn.Linear(input_dim, 128)
+        self.dropout = nn.Dropout(0.2)
+        self.fc2 = nn.Linear(128, output_dim)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x):
+        x = torch.relu(self.fc1(x))
+        x = self.dropout(x)
+        x = self.softmax(self.fc2(x))
+        return x
+
+# Define the REINFORCE Algorithm
+class REINFORCE:
+    def __init__(self, env, learning_rate=5e-3, gamma=0.99):
+        self.env = env
+        self.gamma = gamma
+        self.policy = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)
+        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
+        self.episode_rewards = []
+
+    def select_action(self, state):
+        state = torch.tensor(state, dtype=torch.float32)
+        action_probs = self.policy(state)
+        action = torch.multinomial(action_probs, 1).item()  # sample from the policy distribution
+        return action, action_probs[action]
+
+    def train(self, num_episodes=500):
+        all_rewards = []  # Store rewards for plotting
+
+        for episode in range(num_episodes):
+            state = self.env.reset()[0]
+            episode_memory = []
+            episode_reward = 0
+
+            while True:
+                action, action_prob = self.select_action(state)
+                next_state, reward, terminated, truncated, _ = self.env.step(action)
+                episode_memory.append((action_prob, reward))
+                episode_reward += reward
+                state = next_state
+                if terminated or truncated:
+                    break
+
+            # Compute discounted returns for each step of the episode
+            returns = []
+            discounted_sum = 0
+            for _, reward in reversed(episode_memory):
+                discounted_sum = reward + self.gamma * discounted_sum
+                returns.insert(0, discounted_sum)
+
+            returns = torch.tensor(returns)
+            returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize returns
+
+            # Compute policy loss
+            loss = 0
+            for (action_prob, _), G in zip(episode_memory, returns):
+                loss -= torch.log(action_prob) * G
+
+            self.optimizer.zero_grad()
+            loss.backward()
+            self.optimizer.step()
+
+            all_rewards.append(episode_reward)
+            print(f"Episode {episode + 1}: Total Reward = {episode_reward}")
+
+        # Save the model
+        torch.save(self.policy.state_dict(), "reinforce_cartpole.pth")
+        print("Training complete. Model saved!")
+
+        # Plot rewards
+        plt.plot(all_rewards)
+        plt.xlabel("Episodes")
+        plt.ylabel("Total Reward")
+        plt.title("Training Progress")
+        plt.savefig("reward_plot.png")  # Save the plot
+        print("Reward plot saved as reward_plot.png.")
+
+if __name__ == "__main__":
+    env = gym.make("CartPole-v1", render_mode=None)  # Change render_mode to "human" for visualization
+    agent = REINFORCE(env)
+    agent.train(num_episodes=500)
+    env.close()
diff --git a/reward_plot.png b/reward_plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..7ccb35b44f14dc330883c4062492af46fd74f947
Binary files /dev/null and b/reward_plot.png differ