diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f408c60f80a560fd2848dadb46ad7e3c45bd4024 --- /dev/null +++ b/.gitignore @@ -0,0 +1,179 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +models/ +wandb/ +runs/ +videos/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc \ No newline at end of file diff --git a/README.md b/README.md index 4ce3432a1a0d4298e018985d6a0e49ef80bca78b..cf4e98c81cbb201e2031ab823a23c246a4b046e4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,37 @@ # TD1-Reinforcement-learning +## REINFORCE Implementation for CartPole + +This repository contains an implementation of the REINFORCE algorithm (Monte Carlo Policy Gradient) to solve the CartPole-v1 environment from OpenAI Gym. + +### Implementation Details + +The implementation consists of: + +1. A simple policy network with: + - Input layer (4 units for state space) + - Hidden layer (128 units with ReLU activation and dropout) + - Output layer (2 units with softmax activation for action probabilities) + +2. REINFORCE algorithm features: + - Uses PyTorch for neural network and automatic differentiation + - Implements full episode Monte Carlo returns with discount factor γ=0.99 + - Uses Adam optimizer with learning rate 5e-3 + - Includes return normalization for training stability + +### Training Results + +The agent was trained for 500 episodes. The plot below shows the total reward obtained in each episode during training: + + + +### Files + +- `reinforce_cartpole.py`: Contains the implementation of the policy network and REINFORCE algorithm +- `reinforce_cartpole.pth`: Saved model weights after training +- `training_plot.png`: Visualization of the training progress + + ## Getting started @@ -91,3 +123,15 @@ For open source projects, say how it is licensed. ## Project status If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. 
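+
+### Example: Loading the Trained Policy
+
+A minimal usage sketch, assuming `reinforce_cartpole.py` and `reinforce_cartpole.pth` sit in the working directory (adjust the paths if they live under `cartpole_first/`). It loads the saved weights and rolls out one episode with the trained (stochastic) policy:
+
+```python
+import gymnasium as gym
+import torch
+
+from reinforce_cartpole import PolicyNet, device
+
+env = gym.make("CartPole-v1")
+policy = PolicyNet().to(device)
+policy.load_state_dict(torch.load("reinforce_cartpole.pth", map_location=device))
+policy.eval()
+
+state, _ = env.reset()
+done, total_reward = False, 0.0
+while not done:
+    with torch.no_grad():
+        action, _ = policy.act(state)  # sample an action from the softmax policy
+    state, reward, terminated, truncated, _ = env.step(action)
+    done = terminated or truncated
+    total_reward += reward
+
+print(f"Episode reward: {total_reward}")
+env.close()
+```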
+
+
+### Evaluation Results
+After training, the agent was evaluated on 100 episodes:
+- Success Rate: 100.00%
+- Average Reward: 498.60
+
+### HuggingFace Model
+https://huggingface.co/SimRams/a2c_sb3_cartpole
+
+### Wandb Link
+https://wandb.ai/sim-ramos01-centrale-lyon/sb3/runs/bv67u8pe?nw=nwusersimramos01
\ No newline at end of file
diff --git a/a2c/a2c_cartpole.zip b/a2c/a2c_cartpole.zip
new file mode 100644
index 0000000000000000000000000000000000000000..a45efa4e32af09ae6d864ca559471edd85ed86b1
Binary files /dev/null and b/a2c/a2c_cartpole.zip differ
diff --git a/a2c/a2c_sb3_cartpole.py b/a2c/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..12c4cf7e91fccd4ae3c86f8d670d243e460c90b5
--- /dev/null
+++ b/a2c/a2c_sb3_cartpole.py
@@ -0,0 +1,47 @@
+from stable_baselines3 import A2C
+from stable_baselines3.common.env_util import make_vec_env
+from huggingface_hub import login
+from huggingface_sb3 import package_to_hub, push_to_hub
+import wandb
+from wandb.integration.sb3 import WandbCallback
+
+
+# Parallel environments
+vec_env = make_vec_env("CartPole-v1", n_envs=4)
+
+# Initialize a wandb run so that WandbCallback can log and f"models/{run.id}" resolves
+# (the "sb3" project name mirrors wandb_cartpole.py; adjust if needed)
+run = wandb.init(project="sb3", sync_tensorboard=True)
+
+model = A2C("MlpPolicy", vec_env, verbose=1)
+model.learn(total_timesteps=250, callback=WandbCallback(
+    gradient_save_freq=100,
+    model_save_path=f"models/{run.id}",
+    verbose=2,
+    ))
+
+# model.save("a2c_cartpole")
+
+# model = A2C.load("a2c_cartpole")
+
+# obs = vec_env.reset()
+# while True:
+#     action, _states = model.predict(obs)
+#     obs, rewards, dones, info = vec_env.step(action)
+#     vec_env.render("human")
+
+# login()
+
+# package_to_hub(model=model,
+#                model_name="a2c_sb3_cartpole",
+#                model_architecture="a2c_sb3_cartpole",
+#                env_id="CartPole-v1",
+#                eval_env=vec_env,
+#                repo_id="SimRams/a2c_sb3_cartpole",
+#                commit_message="Test commit")
+
+# push_to_hub(
+#     repo_id="SimRams/a2c_sb3_cartpole",
+#     filename="a2c_cartpole.zip",
+#     commit_message="Test commit",
+# )
\ No newline at end of file
diff --git a/a2c/a2c_sb3_panda_reach.py b/a2c/a2c_sb3_panda_reach.py
new file mode 100644
index 0000000000000000000000000000000000000000..5465a933aacf38e8059368fef6a628bddd9692f4
--- /dev/null
+++ b/a2c/a2c_sb3_panda_reach.py
@@ -0,0 +1,91 @@
+import os
+import gymnasium as gym
+import panda_gym
+import wandb
+from wandb.integration.sb3 import WandbCallback
+from stable_baselines3 import A2C
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+from stable_baselines3.common.evaluation import evaluate_policy
+from huggingface_hub import login
+from huggingface_sb3 import package_to_hub
+
+# Configuration
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 500000,
+    "env_id": "PandaReachJointsDense-v3",
+    "n_envs": 4
+}
+
+# Initialize wandb
+run = wandb.init(
+    project="sb3-panda-reach",
+    entity="sim-ramos01-centrale-lyon",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=True,
+)
+
+# Create environment with video recording
+def make_env():
+    env = gym.make(config["env_id"], render_mode="rgb_array")
+    env = Monitor(env)
+    return env
+
+# Create vectorized environment
+env = DummyVecEnv([make_env for _ in range(config["n_envs"])])
+env = VecVideoRecorder(env, f"videos/{run.id}",
+                       record_video_trigger=lambda x: x % 2000 == 0,
+                       video_length=200)
+
+# Initialize the model
+model = A2C(
+    config["policy_type"],
+    env,
+    verbose=1,
+    tensorboard_log=f"runs/{run.id}"
+)
+
+# Train the model
+model.learn(
+    total_timesteps=config["total_timesteps"],
+    callback=WandbCallback(
+        gradient_save_freq=100,
+        model_save_path=f"models/{run.id}",
+
verbose=2, + ) +) + +# Save the model +model_name = f"a2c_panda_reach_{run.id}" +model.save(model_name) + +# Evaluate the model +eval_env = gym.make(config["env_id"], render_mode="rgb_array") +mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10) +print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}") + +# Log final metrics +wandb.log({ + "eval/mean_reward": mean_reward, + "eval/std_reward": std_reward +}) + +# Upload to Hugging Face Hub +repo_id = "SimRams/a2c-panda-reach" +package_to_hub( + model=model, + model_name=model_name, + model_architecture="A2C", + env_id=config["env_id"], + eval_env=eval_env, + repo_id=repo_id, + commit_message="Training A2C on PandaReachJointsDense-v3" +) + +# Cleanup +env.close() +eval_env.close() +wandb.finish() diff --git a/a2c/wandb_cartpole.py b/a2c/wandb_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..b9009b3b41f45996da971c4421dcebad9cbce2f1 --- /dev/null +++ b/a2c/wandb_cartpole.py @@ -0,0 +1,44 @@ +import gymnasium as gym +from stable_baselines3 import A2C +from stable_baselines3.common.monitor import Monitor +from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder +from huggingface_hub import login +from huggingface_sb3 import push_to_hub +import wandb +from wandb.integration.sb3 import WandbCallback +import time + +config = { + "policy": "MlpPolicy", + "env_name": "CartPole-v1", + "total_timesteps": 25000 +} + +experiment_name = f"A2C_{int(time.time())}" + +run = wandb.init( + name=experiment_name, + project="sb3", + config=config, + sync_tensorboard=True, # auto-upload sb3's tensorboard metrics + monitor_gym=True, # auto-upload the videos of agents playing the game + save_code=True, # optional +) + + +def make_env(): + env = gym.make("CartPole-v1",render_mode="rgb_array") + env = Monitor(env) # record stats such as returns + return env + +env = DummyVecEnv([make_env]) + +#env = VecVideoRecorder(env, "videos", + #record_video_trigger=lambda x: x % 2000 == 0, video_length=200) # record videos + +model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}") +model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback( + gradient_save_freq=100, + #model_save_path=f"models/{run.id}", + verbose=2, +)) diff --git a/cartpole_first/evaluate_reinforce_cartpole.py b/cartpole_first/evaluate_reinforce_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..b021def2dbc903635264b5cc6ecf06628abc9099 --- /dev/null +++ b/cartpole_first/evaluate_reinforce_cartpole.py @@ -0,0 +1,59 @@ +import numpy as np +import gymnasium as gym +import torch +from reinforce_cartpole import PolicyNet, device + +def evaluate_policy(policy, env, num_episodes=100): + successes = 0 + total_rewards = [] + + with torch.no_grad(): # Disable gradient computation for evaluation + for i_episode in range(num_episodes): + state, _ = env.reset() + episode_reward = 0 + done = False + + while not done: + + action, _ = policy.act(state) + + state, reward, terminated, truncated, _ = env.step(action) + done = terminated or truncated + episode_reward += reward + + total_rewards.append(episode_reward) + + if episode_reward >= 195: # OpenAI's definition of solving CartPole + successes += 1 + + print(f'Episode {i_episode + 1}: Reward = {episode_reward}') + + success_rate = successes / num_episodes * 100 + avg_reward = np.mean(total_rewards) + print(f'\nEvaluation Results:') + print(f'Success Rate: {success_rate:.2f}%') + print(f'Average Reward: {avg_reward:.2f}') + + 
return success_rate, avg_reward + +if __name__ == '__main__': + # Setup environment + env = gym.make('CartPole-v1') + + # Load policy + policy = PolicyNet() + policy.load_state_dict(torch.load('reinforce_cartpole.pth')) + policy.to(device) + policy.eval() + + # Evaluate + success_rate, avg_reward = evaluate_policy(policy, env) + + # Update README with results + with open('README.md', 'a') as f: + f.write(f'\n\n### Evaluation Results\n') + f.write(f'After training, the agent was evaluated on 100 episodes:\n') + f.write(f'- Success Rate: {success_rate:.2f}%\n') + f.write(f'- Average Reward: {avg_reward:.2f}\n') + + env.close() diff --git a/cartpole_first/main.py b/cartpole_first/main.py new file mode 100644 index 0000000000000000000000000000000000000000..822638ca6e8a01ee3448f8f7430c183e67b0788c --- /dev/null +++ b/cartpole_first/main.py @@ -0,0 +1,22 @@ +import gymnasium as gym + +# Create the environment +env = gym.make("CartPole-v1", render_mode="human") + +# Reset the environment and get the initial observation +observation = env.reset() + +for _ in range(1000): + # Select a random action from the action space + action = env.action_space.sample() + # Apply the action to the environment + # Returns next observation, reward, done signal (indicating + # if the episode has ended), and an additional info dictionary + observation, reward, terminated, truncated, info = env.step(action) + # Render the environment to visualize the agent's behavior + env.render() + if terminated: + # Terminated before max step + break + +env.close() diff --git a/cartpole_first/reinforce_cartpole.pth b/cartpole_first/reinforce_cartpole.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba06fb4700fd4c64263d214c2b609cf3b7c77778 Binary files /dev/null and b/cartpole_first/reinforce_cartpole.pth differ diff --git a/cartpole_first/reinforce_cartpole.py b/cartpole_first/reinforce_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..48474d8483d8472c416352e6df733ab36c8a6e4d --- /dev/null +++ b/cartpole_first/reinforce_cartpole.py @@ -0,0 +1,150 @@ +import numpy as np + +from collections import deque + +import matplotlib.pyplot as plt + +# PyTorch +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.activation import LeakyReLU +import torch.optim as optim +from torch.distributions import Categorical + +# Gym +import gymnasium as gym + +# Hugging Face Hub +# from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub. 
+# import imageio
+
+# -----------------
+# Init
+# -----------------
+
+env_id = "CartPole-v1"
+# Create the env
+env = gym.make(env_id)
+
+# Create the evaluation env
+eval_env = gym.make(env_id)
+
+class PolicyNet(nn.Module):
+    def __init__(self):
+        super(PolicyNet, self).__init__()
+        self.fc1 = nn.Linear(4, 128)
+        self.fc2 = nn.Linear(128, 2)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, 0.5, training=self.training)  # dropout only active in training mode
+
+        x = self.fc2(x)
+        return F.softmax(x, dim=1)
+
+    def act(self, state):
+        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+        probs = self.forward(state).cpu()
+        m = Categorical(probs)
+        action = m.sample()
+        return action.item(), m.log_prob(action)
+
+AgentPolicy = PolicyNet()
+
+# -----------------
+# TRAINING
+# -----------------
+
+def reinforce(policy, optimizer, n_training_episodes, gamma, print_every):
+    # Help us to calculate the score during the training
+    scores_deque = deque(maxlen=100)
+    scores = []
+    # Line 3 of pseudocode
+    for i_episode in range(1, n_training_episodes + 1):
+        ## Reset the buffer
+        saved_log_probs = []
+        rewards = []
+        ## Reset the environment
+        state, _ = env.reset()
+        done = False
+
+        ## Repeat until the end of the episode:
+        while not done:
+            ## Compute action probabilities
+            action, log_prob = policy.act(state)
+            ## Save the probs in the buffer
+            saved_log_probs.append(log_prob)
+            ## Step the environment with the action
+            state, reward, terminated, truncated, _ = env.step(action)
+            done = terminated or truncated
+            rewards.append(reward)
+
+            # env.render()
+
+
+
+        scores_deque.append(sum(rewards))
+        scores.append(sum(rewards))
+        ## Compute the returns
+        returns = deque()
+        n_steps = len(rewards)
+        ## Store in the buffer the return using gamma=0.99
+        for t in range(n_steps)[::-1]:
+            disc_return_t = returns[0] if len(returns) > 0 else 0
+            returns.appendleft(gamma * disc_return_t + rewards[t])
+
+        ## Normalize the return
+        eps = np.finfo(np.float32).eps.item()
+        # eps is the smallest representable float, which is
+        # added to the standard deviation of the returns to avoid numerical instabilities
+        returns = torch.tensor(returns)
+        returns = (returns - returns.mean()) / (returns.std() + eps)
+
+        ## Compute the policy loss as -sum(log(prob) * return):
+        policy_loss = []
+        for log_prob, disc_return in zip(saved_log_probs, returns):
+            policy_loss.append(-log_prob * disc_return)
+        policy_loss = torch.cat(policy_loss).sum()
+
+        ## Update the policy using an Adam optimizer and a learning rate of 5e-3
+        optimizer.zero_grad()
+        policy_loss.backward()
+        optimizer.step()
+
+        if i_episode % print_every == 0:
+            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))
+
+    return scores
+
+lr = 5e-3
+gamma = 0.99
+
+cartpole_optimizer = optim.Adam(AgentPolicy.parameters(), lr=lr)
+
+# Set the device
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+if __name__ == '__main__':
+    # Training hyperparameters
+    n_training_episodes = 500
+    print_every = 10
+
+    # Initialize and setup the agent
+    AgentPolicy.to(device)
+
+    # Train the agent
+    scores = reinforce(AgentPolicy, cartpole_optimizer, n_training_episodes, gamma, print_every)
+
+    # Save the trained model
+    torch.save(AgentPolicy.state_dict(), 'reinforce_cartpole.pth')
+
+    # Plot the scores
+    plt.figure(figsize=(10,5))
+    plt.plot(scores)
+    plt.title('REINFORCE Training - CartPole-v1')
+    plt.xlabel('Episode')
+    plt.ylabel('Total Reward')
+    plt.savefig('training_plot.png')
+    plt.close()
+
diff --git a/cartpole_first/training_plot.png
b/cartpole_first/training_plot.png new file mode 100644 index 0000000000000000000000000000000000000000..f2eca5829e9f9da7b084b13fd2d463a6e1239c46 Binary files /dev/null and b/cartpole_first/training_plot.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9398b1e67bd7300be09a64ac11c2f430bffa4f7 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +numpy>=1.24.3,<2.0.0 +torch>=2.1.0 +gymnasium>=0.29.1 +matplotlib>=3.7.1 +stable-baselines3 +stable-baselines3[extra] +moviepy +huggingface-sb3 +huggingface_hub