diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f408c60f80a560fd2848dadb46ad7e3c45bd4024
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,179 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+models/
+wandb/
+runs/
+videos/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
\ No newline at end of file
diff --git a/README.md b/README.md
index 4ce3432a1a0d4298e018985d6a0e49ef80bca78b..cf4e98c81cbb201e2031ab823a23c246a4b046e4 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,37 @@
 # TD1-Reinforcement-learning
 
+## REINFORCE Implementation for CartPole
+
+This repository contains an implementation of the REINFORCE algorithm (Monte Carlo Policy Gradient) to solve the CartPole-v1 environment from Gymnasium (the maintained successor to OpenAI Gym).
+
+### Implementation Details
+
+The implementation consists of:
+
+1. A simple policy network (sketched in the code block after this list) with:
+   - Input layer (4 units for state space)
+   - Hidden layer (128 units with ReLU activation and dropout)
+   - Output layer (2 units with softmax activation for action probabilities)
+
+2. REINFORCE algorithm features:
+   - Uses PyTorch for neural network and automatic differentiation
+   - Implements full episode Monte Carlo returns with discount factor γ=0.99
+   - Uses Adam optimizer with learning rate 5e-3
+   - Includes return normalization for training stability
+
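+A minimal sketch of this network, lightly adapted from `cartpole_first/reinforce_cartpole.py` (here the device is passed as an argument to `act` rather than read from a module-level global):
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Categorical
+
+class PolicyNet(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = nn.Linear(4, 128)  # 4 state features -> 128 hidden units
+        self.fc2 = nn.Linear(128, 2)  # 128 hidden units -> 2 action logits
+
+    def forward(self, x):
+        # Dropout is only active in training mode
+        x = F.dropout(F.relu(self.fc1(x)), 0.5, training=self.training)
+        return F.softmax(self.fc2(x), dim=1)  # action probabilities
+
+    def act(self, state, device="cpu"):
+        # Sample an action and return it with its log-probability (used by the REINFORCE loss)
+        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+        m = Categorical(self.forward(state).cpu())
+        action = m.sample()
+        return action.item(), m.log_prob(action)
+```
+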
+### Training Results
+
+The agent was trained for 500 episodes. The plot below shows the total reward obtained in each episode during training:
+
+![Training Plot](cartpole_first/training_plot.png)
+
+### Files
+
+- `cartpole_first/reinforce_cartpole.py`: Implementation of the policy network and the REINFORCE training loop
+- `cartpole_first/evaluate_reinforce_cartpole.py`: Evaluation script (100 episodes on the saved policy)
+- `cartpole_first/reinforce_cartpole.pth`: Saved model weights after training
+- `cartpole_first/training_plot.png`: Visualization of the training progress
+
+
 
 
 ## Getting started
@@ -91,3 +123,15 @@ For open source projects, say how it is licensed.
 
 ## Project status
 If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
+
+
+### Evaluation Results
+After training, the agent was evaluated on 100 episodes with `cartpole_first/evaluate_reinforce_cartpole.py`:
+- Success Rate: 100.00%
+- Average Reward: 498.60
+
+### HuggingFace Model
+https://huggingface.co/SimRams/a2c_sb3_cartpole
+
+### Wandb link
+https://wandb.ai/sim-ramos01-centrale-lyon/sb3/runs/bv67u8pe?nw=nwusersimramos01
\ No newline at end of file
diff --git a/a2c/a2c_cartpole.zip b/a2c/a2c_cartpole.zip
new file mode 100644
index 0000000000000000000000000000000000000000..a45efa4e32af09ae6d864ca559471edd85ed86b1
Binary files /dev/null and b/a2c/a2c_cartpole.zip differ
diff --git a/a2c/a2c_sb3_cartpole.py b/a2c/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..12c4cf7e91fccd4ae3c86f8d670d243e460c90b5
--- /dev/null
+++ b/a2c/a2c_sb3_cartpole.py
@@ -0,0 +1,41 @@
+import wandb
+from wandb.integration.sb3 import WandbCallback
+from stable_baselines3 import A2C
+from stable_baselines3.common.env_util import make_vec_env
+from huggingface_hub import login
+from huggingface_sb3 import package_to_hub, push_to_hub
+
+
+# Initialize a wandb run so the WandbCallback below can log metrics and save models
+# (project name matches the one used in wandb_cartpole.py)
+run = wandb.init(project="sb3", sync_tensorboard=True)
+
+# Parallel environments
+vec_env = make_vec_env("CartPole-v1", n_envs=4)
+
+model = A2C("MlpPolicy", vec_env, verbose=1)
+model.learn(total_timesteps=250, callback=WandbCallback(
+        gradient_save_freq=100,
+        model_save_path=f"models/{run.id}",
+        verbose=2,
+    ))
+
+# model.save("a2c_cartpole")
+
+# model = A2C.load("a2c_cartpole")
+
+# obs = vec_env.reset()
+# while True:
+#     action, _states = model.predict(obs)
+#     obs, rewards, dones, info = vec_env.step(action)
+#     vec_env.render("human")
+
+# login()
+
+# package_to_hub(model=model, 
+#                model_name="a2c_sb3_cartpole",
+#                model_architecture="a2c_sb3_cartpole",
+#                env_id="CartPole-v1",
+#                eval_env=vec_env,
+#                repo_id="SimRams/a2c_sb3_cartpole",
+#                commit_message="Test commit")
+
+# push_to_hub(
+#     repo_id="SimRams/a2c_sb3_cartpole",
+#     filename="a2c_cartpole.zip",
+#     commit_message="Test commit",
+# )
\ No newline at end of file
diff --git a/a2c/a2c_sb3_panda_reach.py b/a2c/a2c_sb3_panda_reach.py
new file mode 100644
index 0000000000000000000000000000000000000000..5465a933aacf38e8059368fef6a628bddd9692f4
--- /dev/null
+++ b/a2c/a2c_sb3_panda_reach.py
@@ -0,0 +1,91 @@
+import os
+import gymnasium as gym
+import panda_gym
+import wandb
+from wandb.integration.sb3 import WandbCallback
+from stable_baselines3 import A2C
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+from stable_baselines3.common.evaluation import evaluate_policy
+from huggingface_hub import login
+from huggingface_sb3 import package_to_hub
+
+# Configuration
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 500000,
+    "env_id": "PandaReachJointsDense-v3",
+    "n_envs": 4
+}
+
+# Initialize wandb
+run = wandb.init(
+    project="sb3-panda-reach",
+    entity="sim-ramos01-centrale-lyon",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=True,
+)
+
+# Create environment with video recording
+def make_env():
+    env = gym.make(config["env_id"], render_mode="rgb_array")
+    env = Monitor(env)
+    return env
+
+# Create vectorized environment
+env = DummyVecEnv([make_env for _ in range(config["n_envs"])])
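+# Record a 200-step video clip every 2000 vectorized environment steps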
+env = VecVideoRecorder(env, f"videos/{run.id}",
+                      record_video_trigger=lambda x: x % 2000 == 0,
+                      video_length=200)
+
+# Initialize the model
+model = A2C(
+    config["policy_type"],
+    env,
+    verbose=1,
+    tensorboard_log=f"runs/{run.id}"
+)
+
+# Train the model
+model.learn(
+    total_timesteps=config["total_timesteps"],
+    callback=WandbCallback(
+        gradient_save_freq=100,
+        model_save_path=f"models/{run.id}",
+        verbose=2,
+    )
+)
+
+# Save the model
+model_name = f"a2c_panda_reach_{run.id}"
+model.save(model_name)
+
+# Evaluate the model
+eval_env = gym.make(config["env_id"], render_mode="rgb_array")
+mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
+print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
+
+# Log final metrics
+wandb.log({
+    "eval/mean_reward": mean_reward,
+    "eval/std_reward": std_reward
+})
+
+# Upload to Hugging Face Hub
+repo_id = "SimRams/a2c-panda-reach"
+package_to_hub(
+    model=model,
+    model_name=model_name,
+    model_architecture="A2C",
+    env_id=config["env_id"],
+    eval_env=eval_env,
+    repo_id=repo_id,
+    commit_message="Training A2C on PandaReachJointsDense-v3"
+)
+
+# Cleanup
+env.close()
+eval_env.close()
+wandb.finish()
diff --git a/a2c/wandb_cartpole.py b/a2c/wandb_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9009b3b41f45996da971c4421dcebad9cbce2f1
--- /dev/null
+++ b/a2c/wandb_cartpole.py
@@ -0,0 +1,44 @@
+import gymnasium as gym
+from stable_baselines3 import A2C
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+from huggingface_hub import login
+from huggingface_sb3 import push_to_hub
+import wandb
+from wandb.integration.sb3 import WandbCallback
+import time
+
+config = {
+    "policy": "MlpPolicy",
+    "env_name": "CartPole-v1",
+    "total_timesteps": 25000
+}
+
+experiment_name = f"A2C_{int(time.time())}"
+
+run = wandb.init(
+    name=experiment_name,
+    project="sb3",
+    config=config,
+    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
+    monitor_gym=True,  # auto-upload the videos of agents playing the game
+    save_code=True,  # optional
+)
+
+
+def make_env():
+    env = gym.make("CartPole-v1", render_mode="rgb_array")
+    env = Monitor(env)  # record stats such as returns
+    return env
+
+env = DummyVecEnv([make_env])
+
+# env = VecVideoRecorder(env, "videos",
+#                        record_video_trigger=lambda x: x % 2000 == 0, video_length=200)  # record videos
+
+model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
+model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback(
+    gradient_save_freq=100,
+    #model_save_path=f"models/{run.id}",
+    verbose=2,
+))
diff --git a/cartpole_first/evaluate_reinforce_cartpole.py b/cartpole_first/evaluate_reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..b021def2dbc903635264b5cc6ecf06628abc9099
--- /dev/null
+++ b/cartpole_first/evaluate_reinforce_cartpole.py
@@ -0,0 +1,59 @@
+import numpy as np
+import gymnasium as gym
+import torch
+from reinforce_cartpole import PolicyNet, device
+
+def evaluate_policy(policy, env, num_episodes=100):
+    successes = 0
+    total_rewards = []
+    
+    with torch.no_grad():  # Disable gradient computation for evaluation
+        for i_episode in range(num_episodes):
+            state, _ = env.reset()
+            episode_reward = 0
+            done = False
+
+            while not done:
+
+                action, _ = policy.act(state)
+                
+                state, reward, terminated, truncated, _ = env.step(action)
+                done = terminated or truncated
+                episode_reward += reward
+            
+            total_rewards.append(episode_reward)
+            
+            if episode_reward >= 195:  # solving threshold for CartPole-v0 (CartPole-v1 is considered solved at 475)
+                successes += 1
+
+            print(f'Episode {i_episode + 1}: Reward = {episode_reward}')
+    
+    success_rate = successes / num_episodes * 100
+    avg_reward = np.mean(total_rewards)
+    print(f'\nEvaluation Results:')
+    print(f'Success Rate: {success_rate:.2f}%')
+    print(f'Average Reward: {avg_reward:.2f}')
+    
+    return success_rate, avg_reward
+
+if __name__ == '__main__':
+    # Setup environment
+    env = gym.make('CartPole-v1')
+    
+    # Load policy
+    policy = PolicyNet()
+    policy.load_state_dict(torch.load('reinforce_cartpole.pth'))
+    policy.to(device)
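+    # eval() puts the network in evaluation mode so dropout is disabled during rollouts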
+    policy.eval()
+    
+    # Evaluate
+    success_rate, avg_reward = evaluate_policy(policy, env)
+    
+    # Update README with results
+    with open('README.md', 'a') as f:
+        f.write(f'\n\n### Evaluation Results\n')
+        f.write(f'After training, the agent was evaluated on 100 episodes:\n')
+        f.write(f'- Success Rate: {success_rate:.2f}%\n')
+        f.write(f'- Average Reward: {avg_reward:.2f}\n')
+    
+    env.close()
diff --git a/cartpole_first/main.py b/cartpole_first/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..822638ca6e8a01ee3448f8f7430c183e67b0788c
--- /dev/null
+++ b/cartpole_first/main.py
@@ -0,0 +1,22 @@
+import gymnasium as gym
+
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+# Reset the environment and get the initial observation
+observation, info = env.reset()
+
+for _ in range(1000):
+    # Select a random action from the action space
+    action = env.action_space.sample()
+    # Apply the action to the environment
+    # Returns next observation, reward, done signal (indicating
+    # if the episode has ended), and an additional info dictionary
+    observation, reward, terminated, truncated, info = env.step(action)
+    # Render the environment to visualize the agent's behavior
+    env.render()
+    if terminated or truncated:
+        # Episode ended (pole dropped or time limit reached) before the loop finished
+        break
+
+env.close()
diff --git a/cartpole_first/reinforce_cartpole.pth b/cartpole_first/reinforce_cartpole.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ba06fb4700fd4c64263d214c2b609cf3b7c77778
Binary files /dev/null and b/cartpole_first/reinforce_cartpole.pth differ
diff --git a/cartpole_first/reinforce_cartpole.py b/cartpole_first/reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..48474d8483d8472c416352e6df733ab36c8a6e4d
--- /dev/null
+++ b/cartpole_first/reinforce_cartpole.py
@@ -0,0 +1,150 @@
+import numpy as np
+
+from collections import deque
+
+import matplotlib.pyplot as plt
+
+# PyTorch
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.distributions import Categorical
+
+# Gym
+import gymnasium as gym
+
+# Hugging Face Hub
+# from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.
+# import imageio
+
+# -----------------
+# Init
+# -----------------
+
+env_id = "CartPole-v1"
+# Create the env
+env = gym.make(env_id)
+
+# Create the evaluation env
+eval_env = gym.make(env_id)
+
+class PolicyNet(nn.Module):
+    def __init__(self):
+        super(PolicyNet, self).__init__()
+        self.fc1 = nn.Linear(4, 128)
+        self.fc2 = nn.Linear(128, 2)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, 0.5, training=self.training)  # only active in training mode; policy.eval() disables it
+
+        x = self.fc2(x)
+        return F.softmax(x, dim=1)
+
+    def act(self, state):
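+        # Convert the state to a batched tensor, sample an action from the softmax
+        # distribution, and return it together with its log-probability
+        # (needed later for the REINFORCE loss).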
+        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+        probs = self.forward(state).cpu()
+        m = Categorical(probs)
+        action = m.sample()
+        return action.item(), m.log_prob(action)
+
+AgentPolicy = PolicyNet()
+
+# -----------------
+# TRAINING
+# -----------------
+
+def reinforce(policy, optimizer, n_training_episodes, gamma, print_every):
+    # Help us to calculate the score during the training
+    scores_deque = deque(maxlen=100)
+    scores = []
+    # Line 3 of pseudocode
+    for i_episode in range(1, n_training_episodes + 1):
+        ## Reset the buffer
+        saved_log_probs = []
+        rewards = []
+        ## Reset the environment
+        state, _ = env.reset()
+        done = False
+        
+        ## Repeat until the end of the episode:
+        while not done:
+            ## Compute action probabilities
+            action, log_prob = policy.act(state)
+            ## Save the probs in the buffer
+            saved_log_probs.append(log_prob)
+            ## Step the environment with the action
+            state, reward, terminated, truncated, _ = env.step(action)
+            done = terminated or truncated
+            rewards.append(reward)
+
+            # env.render()
+            
+        
+
+        scores_deque.append(sum(rewards))
+        scores.append(sum(rewards))
+        ## Compute the returns
+        returns = deque()
+        n_steps = len(rewards)
+        ## Store in the buffer the return using gamma=0.99 
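+        # G_t = r_t + gamma * G_{t+1}, computed backwards so each appendleft is O(1)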
+        for t in reversed(range(n_steps)):
+            disc_return_t = returns[0] if len(returns) > 0 else 0
+            returns.appendleft(gamma * disc_return_t + rewards[t])
+
+        ## Normalize the return
+        eps = np.finfo(np.float32).eps.item()
+        # eps is the smallest representable float, which is
+        # added to the standard deviation of the returns to avoid numerical instabilities
+        returns = torch.tensor(returns)
+        returns = (returns - returns.mean()) / (returns.std() + eps)
+
+        ## Compute the policy loss as -sum(log(prob) * return):
+        policy_loss = []
+        for log_prob, disc_return in zip(saved_log_probs, returns):
+            policy_loss.append(-log_prob * disc_return)
+        policy_loss = torch.cat(policy_loss).sum()
+
+        ## Update the policy using an Adam optimizer and a learning rate of 5e-3
+        optimizer.zero_grad()
+        policy_loss.backward()
+        optimizer.step()
+
+        if i_episode % print_every == 0:
+            print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))
+
+    return scores
+
+lr = 5e-3
+gamma = 0.99
+
+cartpole_optimizer = optim.Adam(AgentPolicy.parameters(), lr=lr)
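+# NOTE: the optimizer is created before AgentPolicy is moved to the device; this is fine
+# because nn.Module.to() moves parameters in place, so the optimizer keeps referencing
+# the same Parameter objects.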
+
+# Set the device
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+if __name__ == '__main__':
+    # Training hyperparameters
+    n_training_episodes = 500
+    print_every = 10
+
+    # Initialize and setup the agent
+    AgentPolicy.to(device)
+
+    # Train the agent
+    scores = reinforce(AgentPolicy, cartpole_optimizer, n_training_episodes, gamma, print_every)
+
+    # Save the trained model
+    torch.save(AgentPolicy.state_dict(), 'reinforce_cartpole.pth')
+
+    # Plot the scores
+    plt.figure(figsize=(10,5))
+    plt.plot(scores)
+    plt.title('REINFORCE Training - CartPole-v1')
+    plt.xlabel('Episode')
+    plt.ylabel('Total Reward')
+    plt.savefig('training_plot.png')
+    plt.close()
+
diff --git a/cartpole_first/training_plot.png b/cartpole_first/training_plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..f2eca5829e9f9da7b084b13fd2d463a6e1239c46
Binary files /dev/null and b/cartpole_first/training_plot.png differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9398b1e67bd7300be09a64ac11c2f430bffa4f7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+numpy>=1.24.3,<2.0.0
+torch>=2.1.0
+gymnasium>=0.29.1
+matplotlib>=3.7.1
+stable-baselines3[extra]
+wandb
+panda-gym
+moviepy
+huggingface-sb3
+huggingface_hub