diff --git a/README.md b/README.md
index f49f3ded47e0ae40091a7f395e4233bea2d902a8..29da3e74e8aed76ea80f93dc1225dfeb83e45bea 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,6 @@
 ### Get familiar with Hugging Face Hub
 
 Link to the model on the hub :
+
+# REINFORCE
+Plot showing the total reward across episodes:
+![REINFORCE rewards](images/reinforce_rewards.png)
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index ba510f9e1f147190ab70bbb3c7165400ed5f7b66..3666845bf8010fd247e855d9f0b60b8487ebc311 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -2,8 +2,15 @@ import gymnasium as gym
 import cv2
 from stable_baselines3 import A2C
 from huggingface_sb3 import package_to_hub, push_to_hub
+from stable_baselines3.common.vec_env import DummyVecEnv
 
-env = gym.make("CartPole-v1", render_mode="rgb_array")
+env_id = "CartPole-v1"
+
+# CartPole-v1 is already registered by Gymnasium, so no manual registration is needed.
+# Keep render_mode="rgb_array" so a replay video can be recorded when packaging the model.
+env = gym.make(env_id, render_mode="rgb_array")
 
 model = A2C("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)
@@ -18,15 +25,16 @@ for i in range(1000):
 #     if done:
 #         obs = vec_env.reset()
 
-if __name__ == "__main__":
-    package_to_hub(model=model, 
-                   model_name="a2c-CartPole-v1",
-                   model_architecture="a2c",
-                   env_id="CartPole-v1",
-                   eval_env=env,
-                   repo_id="oscarchaufour/a2c-CartPole-v1",
-                   commit_message="Test commit")
-
-    push_to_hub(repo_id="oscarchaufour/a2c-CartPole-v1",
-                filename="a2c_sb3_cartpole.zip",
-                commit_message="Added A2C CartPole model")
\ No newline at end of file
+# Save the trained model locally; A2C.save writes a2c_sb3_cartpole.zip
+model.save("a2c_sb3_cartpole")
+
+# Package the model (evaluation + replay video) and push it to the Hugging Face Hub
+package_to_hub(model=model,
+               model_name="a2c-CartPole-v1",
+               model_architecture="a2c",
+               env_id=env_id,
+               eval_env=DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")]),
+               repo_id="oscarchaufour/a2c-CartPole-v1",
+               commit_message="Initial commit of A2C CartPole model")
+
+# Push the saved SB3 zip to the same repo
+push_to_hub(repo_id="oscarchaufour/a2c-CartPole-v1",
+            filename="a2c_sb3_cartpole.zip",
+            commit_message="Added A2C CartPole model")
\ No newline at end of file
diff --git a/images/reinforce_rewards.png b/images/reinforce_rewards.png
new file mode 100644
index 0000000000000000000000000000000000000000..e76d632e17aa3bc917cd5e2de89c865baaeb0bcf
Binary files /dev/null and b/images/reinforce_rewards.png differ
diff --git a/neural_network_model.zip b/neural_network_model.zip
new file mode 100644
index 0000000000000000000000000000000000000000..ba6de9a0b83bb0d2cf55768c2497c4f1a0e9345c
Binary files /dev/null and b/neural_network_model.zip differ
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index c732e8588847e62a8fe552b4973550dd754c65a3..bf5d1bb07e35868c96494835dad522ff77b29a78 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -5,6 +5,7 @@ import torch.optim as optim
 import matplotlib.pyplot as plt
 from tqdm import tqdm
 import numpy as np
+from torch.distributions import Categorical
 
 # Define the neural network model
 class Policy(nn.Module):
@@ -24,77 +25,68 @@ class Policy(nn.Module):
         x = self.softmax(x)
         return x
 
-
-def reinforce():
-    # Create the environment
-    env = gym.make("CartPole-v1")
-
-    # Set up the agent
-    policy = Policy(
-        input_size=env.observation_space.shape[0],
-        output_size=env.action_space.n
-    )
-    optimizer = optim.Adam(policy.parameters(), lr=5e-3)
-
+def reinforce(policy, env, optimizer):
     # Training loop
-    num_episodes = 500
+    num_episodes = 200
     gamma = 0.99
-    episode_rewards = []
+    episodes_rewards = []
 
     for episode in tqdm(range(num_episodes)):
-        action_probabilities = []
-        episode_rewards_weighted = []
-        # Reset the environment and get the initial observation
-        observation = env.reset()[0]
+        # Reset the environment and get the initial observation
+        observation, _ = env.reset()
+        rewards = []
+        log_probabilities = []
         terminated = False
         episode_reward = 0
-        step = 0
 
         while not terminated:
-            step += 1
             # Compute action probabilities
-            action_probs = policy(torch.FloatTensor(observation).unsqueeze(0))
-
+            observation_tensor = torch.FloatTensor(observation).unsqueeze(0)
+            action_probabilities = policy(observation_tensor)
+
+            cat = Categorical(action_probabilities)
             # Sample action based on probabilities and store its probability in the buffer
-            action = torch.multinomial(action_probs, num_samples=1).item()
+            action = cat.sample()
+            log_probability = cat.log_prob(action)
 
             # Step the environment with the action
-            observation, reward, terminated, truncated, info = env.step(action)
+            observation, reward, terminated, truncated, _ = env.step(action.item())
+            # Also stop when the TimeLimit wrapper truncates the episode
+            terminated = terminated or truncated
             env.render()
 
-            # Compute and store the return in the buffer
+            # Compute and store the reward in the buffer
             episode_reward += reward
-            episode_rewards_weighted.append(reward * gamma ** step)
-            #episode_rewards.append(episode_reward)
-
-            # Store the action probabilities
-            action_probabilities.append(action_probs[0][action])
-
-        # Normalize the return
-        # Convert action_probabilities to a tensor with requires_grad=True
-        action_probabilities_tensor = torch.FloatTensor(action_probabilities).requires_grad_(True)
-
-        episode_rewards_tensor = torch.FloatTensor(episode_rewards_weighted)
-        episode_rewards_tensor -= torch.mean(episode_rewards_tensor)
-        episode_rewards_tensor /= torch.std(episode_rewards_tensor)
+            log_probabilities.append(log_probability)
+            rewards.append(reward)
+
+        episodes_rewards.append(episode_reward)
+
+        # Compute the discounted returns and normalize them
+        weighted_rewards = 0
+        returns = []
+        for r in rewards[::-1]:
+            weighted_rewards = r + gamma * weighted_rewards
+            returns.insert(0, weighted_rewards)
+        returns = torch.tensor(returns)
+        returns = (returns - returns.mean()) / (returns.std() + 1e-5)
 
         # Compute policy loss
-        log_probs = torch.log(action_probabilities_tensor.squeeze(0))
-        policy_loss = -torch.sum(log_probs * torch.tensor(episode_rewards_weighted))
+        loss = []
+        for log_prob, weighted_return in zip(log_probabilities, returns):
+            loss.append(log_prob * weighted_return)
+        policy_loss = -torch.cat(loss).sum()
 
         # Update the policy
         optimizer.zero_grad()
         policy_loss.backward()
         optimizer.step()
 
-        episode_rewards.append(episode_reward)
-    return episode_rewards
+    return episodes_rewards
 
 
-def plot_rewards(episode_rewards):
+def plot_rewards(episodes_rewards):
     # Plot the total reward across episodes
-    plt.plot(episode_rewards)
+    plt.plot(episodes_rewards)
     plt.xlabel('Episode')
     plt.ylabel('Total Reward')
     plt.title('REINFORCE: Total Reward across Episodes')
@@ -102,5 +94,33 @@ def plot_rewards(episode_rewards):
 
 
 if __name__ == "__main__":
-    episode_rewards = reinforce()
-    plot_rewards(episode_rewards)
\ No newline at end of file
+    # Create the environment
+    env = gym.make("CartPole-v1")
+
+    # Set up the agent
+    policy = Policy(
+        input_size=env.observation_space.shape[0],
+        output_size=env.action_space.n
+    )
+    optimizer = optim.Adam(policy.parameters(), lr=5e-3)
+    episodes_rewards = reinforce(policy, env, optimizer)
+    env.close()
+
+    # Plot the rewards
+    plot_rewards(episodes_rewards)
+
+
+    # Serialize the model and save it to a .zip file
+    # import pickle
+    # import zipfile
+
+    # Assuming `policy` is your neural network model
+
+    # # Step 1: Serialize the model
+    # model_bytes = pickle.dumps(policy)
+
+    # # Step 2: Create a .zip file containing the serialized model
+    # zip_filename = ".zip"
+    # with zipfile.ZipFile(zip_filename, 'w') as zipf:
+    #     zipf.writestr("model.pkl", model_bytes)
+
diff --git a/test.ipynb b/test.ipynb
index b0249dbcf21e2d326dd711520f27a6557899644e..8a16e6581015df166f4ea76288d775d5fc00158f 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -708,13 +708,41 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Reinforce"
+    "## Upload model to Hugging Face hub"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
-   "source": []
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_sb3\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m package_to_hub, push_to_hub\n\u001b[1;32m----> 3\u001b[0m package_to_hub(model\u001b[38;5;241m=\u001b[39m\u001b[43mmodel\u001b[49m, \n\u001b[0;32m      4\u001b[0m                model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      5\u001b[0m                model_architecture\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      6\u001b[0m                env_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      7\u001b[0m                eval_env\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m      8\u001b[0m                repo_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moscarchaufour/a2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      9\u001b[0m                commit_message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTest commit\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m     11\u001b[0m push_to_hub(repo_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moscarchaufour/a2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     12\u001b[0m             filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c_sb3_cartpole.zip\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     13\u001b[0m             commit_message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAdded A2C CartPole model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'model' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from huggingface_sb3 import package_to_hub, push_to_hub\n",
+    "\n",
+    "package_to_hub(model=model, \n",
+    "               model_name=\"a2c-CartPole-v1\",\n",
+    "               model_architecture=\"a2c\",\n",
+    "               env_id=\"CartPole-v1\",\n",
+    "               eval_env=None,\n",
+    "               repo_id=\"oscarchaufour/a2c-CartPole-v1\",\n",
+    "               commit_message=\"Test commit\")\n",
+    "    \n",
+    "push_to_hub(repo_id=\"oscarchaufour/a2c-CartPole-v1\",\n",
+    "            filename=\"a2c_sb3_cartpole.zip\",\n",
+    "            commit_message=\"Added A2C CartPole model\")\n"
+   ]
  }
 ],
 "metadata": {