diff --git a/README.md b/README.md
index f49f3ded47e0ae40091a7f395e4233bea2d902a8..29da3e74e8aed76ea80f93dc1225dfeb83e45bea 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,6 @@
 ### Get familiar with Hugging Face Hub
 
 Link to the model on the hub : 
+
+# REINFORCE
+Plot showing the total reward across episodes: ![REINFORCE total reward across episodes](images/reinforce_rewards.png)
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index ba510f9e1f147190ab70bbb3c7165400ed5f7b66..3666845bf8010fd247e855d9f0b60b8487ebc311 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -2,8 +2,15 @@ import gymnasium as gym
 import cv2
 from stable_baselines3 import A2C
 from huggingface_sb3 import package_to_hub, push_to_hub
 
-env = gym.make("CartPole-v1", render_mode="rgb_array")
+env_id = "CartPole-v1"
+
+# CartPole-v1 is already registered by gymnasium (with a 500-step limit), so no manual
+# registration is needed; render_mode="rgb_array" lets package_to_hub record a replay video
+env = gym.make(env_id, render_mode="rgb_array")
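+
+# A2C ("advantage actor-critic") trains an actor (the policy) and a critic (a value-function
+# estimate) at the same time; MlpPolicy is a small fully-connected network, which is enough
+# for CartPole's 4-dimensional observation vector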
 
 model = A2C("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)
@@ -18,15 +25,16 @@ for i in range(1000):
     # if done:
     #   obs = vec_env.reset()
 
-if __name__ == "__main__":
-    package_to_hub(model=model, 
-                    model_name="a2c-CartPole-v1",
-                    model_architecture="a2c",
-                    env_id="CartPole-v1",
-                    eval_env=env,
-                    repo_id="oscarchaufour/a2c-CartPole-v1",
-                    commit_message="Test commit")
-        
-    push_to_hub(repo_id="oscarchaufour/a2c-CartPole-v1",
-                filename="a2c_sb3_cartpole.zip",
-                commit_message="Added A2C CartPole model")
\ No newline at end of file
+# Save the trained model locally so the .zip can also be pushed as a standalone file
+model.save("a2c_sb3_cartpole")
+
+# Package and push the model to the Hugging Face Hub: package_to_hub evaluates the agent,
+# records a replay video, generates a model card and uploads everything to the repo
+# (both calls require being logged in, e.g. via `huggingface-cli login`)
+package_to_hub(model=model,
+               model_name="a2c-CartPole-v1",
+               model_architecture="a2c",
+               env_id=env_id,
+               eval_env=env,
+               repo_id="oscarchaufour/a2c-CartPole-v1",
+               commit_message="Initial commit of A2C CartPole model")
+
+# push_to_hub uploads a single local file, here the saved model zip
+push_to_hub(repo_id="oscarchaufour/a2c-CartPole-v1",
+            filename="a2c_sb3_cartpole.zip",
+            commit_message="Added A2C CartPole model")
\ No newline at end of file
diff --git a/images/reinforce_rewards.png b/images/reinforce_rewards.png
new file mode 100644
index 0000000000000000000000000000000000000000..e76d632e17aa3bc917cd5e2de89c865baaeb0bcf
Binary files /dev/null and b/images/reinforce_rewards.png differ
diff --git a/neural_network_model.zip b/neural_network_model.zip
new file mode 100644
index 0000000000000000000000000000000000000000..ba6de9a0b83bb0d2cf55768c2497c4f1a0e9345c
Binary files /dev/null and b/neural_network_model.zip differ
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index c732e8588847e62a8fe552b4973550dd754c65a3..bf5d1bb07e35868c96494835dad522ff77b29a78 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -5,6 +5,7 @@ import torch.optim as optim
 import matplotlib.pyplot as plt
 from tqdm import tqdm 
 import numpy as np
+from torch.distributions import Categorical
 
 # Define the neural network model
 class Policy(nn.Module):
@@ -24,77 +25,68 @@ class Policy(nn.Module):
         x = self.softmax(x)
         return x
 
-
-def reinforce():
-    # Create the environment
-    env = gym.make("CartPole-v1")
-
-    # Set up the agent
-    policy = Policy(
-        input_size=env.observation_space.shape[0],
-        output_size=env.action_space.n
-    )
-    optimizer = optim.Adam(policy.parameters(), lr=5e-3)
-
+def reinforce(policy, env, optimizer):
     # Training loop
-    num_episodes = 500
+    num_episodes = 200
     gamma = 0.99
-    episode_rewards = []
+    episodes_rewards = []
 
     for episode in tqdm(range(num_episodes)):
-        action_probabilities = []
-        episode_rewards_weighted = []
-
         # Reset the environment and get the initial observation
-        observation = env.reset()[0]
+        observation, _ = env.reset()  # gymnasium's reset() returns (observation, info)
 
+        rewards = []
+        log_probabilities = []
         terminated = False
         episode_reward = 0
-        step = 0
 
         while not terminated:
-            step += 1
             # Compute action probabilities
-            action_probs = policy(torch.FloatTensor(observation).unsqueeze(0))
-
+            observation_tensor = torch.FloatTensor(observation).unsqueeze(0)
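+            # unsqueeze(0) adds the batch dimension the policy network expects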
+            action_probabilities = policy(observation_tensor)
+            
+            cat = Categorical(action_probabilities)
             # Sample action based on probabilities and store its probability in the buffer
-            action = torch.multinomial(action_probs, num_samples=1).item()
+            action = cat.sample()
+            log_probability = cat.log_prob(action)
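+            # the log-probability of the sampled action is needed later for the policy-gradient loss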
 
             # Step the environment with the action
-            observation, reward, terminated, truncated, info = env.step(action)
+            observation, reward, terminated, truncated, _ = env.step(action.item())
             env.render()
 
-            # Compute and store the return in the buffer
+            # Compute and store the reward in the buffer
             episode_reward += reward
-            episode_rewards_weighted.append(reward * gamma ** step)
-            #episode_rewards.append(episode_reward)
-
-            # Store the action probabilities
-            action_probabilities.append(action_probs[0][action])
-
-        # Normalize the return
-        # Convert action_probabilities to a tensor with requires_grad=True
-        action_probabilities_tensor = torch.FloatTensor(action_probabilities).requires_grad_(True)
-
-        episode_rewards_tensor = torch.FloatTensor(episode_rewards_weighted)
-        episode_rewards_tensor -= torch.mean(episode_rewards_tensor)
-        episode_rewards_tensor /= torch.std(episode_rewards_tensor)
+            log_probabilities.append(log_probability)
+            rewards.append(reward)
+
+            # treat truncation (hitting the 500-step time limit) as the end of the episode
+            terminated = terminated or truncated
+        
+        episodes_rewards.append(episode_reward)
+
+        # Compute the returns
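+        # (discounted return G_t = r_t + gamma * G_{t+1}, built by walking the rewards backwards)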
+        weighted_rewards = 0
+        returns = []
+        for r in rewards[::-1]:
+            weighted_rewards = r + gamma * weighted_rewards
+            returns.insert(0, weighted_rewards)
+        returns = torch.tensor(returns)
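+        # normalizing the returns is a standard variance-reduction trick; the 1e-5 avoids division by zero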
+        returns = (returns - returns.mean()) / (returns.std() + 1e-5)
 
         # Compute policy loss
-        log_probs = torch.log(action_probabilities_tensor.squeeze(0))
-        policy_loss = -torch.sum(log_probs * torch.tensor(episode_rewards_weighted))
+        loss = []
+        for log_prob, discounted_return in zip(log_probabilities, returns):
+            loss.append(log_prob * discounted_return)
+        policy_loss = -torch.cat(loss).sum()
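+        # minimizing this loss is gradient ascent on sum_t log pi(a_t|s_t) * G_t, the REINFORCE objective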
 
         # Update the policy
         optimizer.zero_grad()
         policy_loss.backward()
         optimizer.step()
-        episode_rewards.append(episode_reward)
-    return episode_rewards
+    return episodes_rewards
 
 
-def plot_rewards(episode_rewards):
+def plot_rewards(episodes_rewards):
     # Plot the total reward across episodes
-    plt.plot(episode_rewards)
+    plt.plot(episodes_rewards)
     plt.xlabel('Episode')
     plt.ylabel('Total Reward')
     plt.title('REINFORCE: Total Reward across Episodes')
@@ -102,5 +94,33 @@ def plot_rewards(episode_rewards):
 
 
 if __name__ == "__main__":
-    episode_rewards = reinforce()
-    plot_rewards(episode_rewards)
\ No newline at end of file
+    # Create the environment
+    env = gym.make("CartPole-v1")
+
+    # Set up the agent
+    policy = Policy(
+        input_size=env.observation_space.shape[0],
+        output_size=env.action_space.n
+    )
+    optimizer = optim.Adam(policy.parameters(), lr=5e-3)
+    episodes_rewards = reinforce(policy, env, optimizer)
+    env.close()
+
+    # plot the rewards
+    plot_rewards(episodes_rewards)
+
+
+    # Serialize the model and save it to a .zip file
+    # import pickle
+    # import zipfile
+
+    # Assuming `policy` is your neural network model
+
+    # # Step 1: Serialize the model
+    # model_bytes = pickle.dumps(policy)
+
+    # # Step 2: Create a .zip file containing the serialized model
+    # zip_filename = "neural_network_model.zip"
+    # with zipfile.ZipFile(zip_filename, 'w') as zipf:
+    #     zipf.writestr("model.pkl", model_bytes)
+
diff --git a/test.ipynb b/test.ipynb
index b0249dbcf21e2d326dd711520f27a6557899644e..8a16e6581015df166f4ea76288d775d5fc00158f 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -708,13 +708,41 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Reinforce"
+    "## Upload model to Hugging Face hub"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
-   "source": []
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[8], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_sb3\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m package_to_hub, push_to_hub\n\u001b[1;32m----> 3\u001b[0m package_to_hub(model\u001b[38;5;241m=\u001b[39m\u001b[43mmodel\u001b[49m, \n\u001b[0;32m      4\u001b[0m                 model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      5\u001b[0m                 model_architecture\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      6\u001b[0m                 env_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      7\u001b[0m                 eval_env\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m      8\u001b[0m                 repo_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moscarchaufour/a2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      9\u001b[0m                 commit_message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTest commit\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m     11\u001b[0m push_to_hub(repo_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moscarchaufour/a2c-CartPole-v1\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     12\u001b[0m             filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124ma2c_sb3_cartpole.zip\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     13\u001b[0m             commit_message\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAdded A2C CartPole model\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'model' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from huggingface_sb3 import package_to_hub, push_to_hub\n",
+    "\n",
+    "package_to_hub(model=model, \n",
+    "                model_name=\"a2c-CartPole-v1\",\n",
+    "                model_architecture=\"a2c\",\n",
+    "                env_id=\"CartPole-v1\",\n",
+    "                eval_env=None,\n",
+    "                repo_id=\"oscarchaufour/a2c-CartPole-v1\",\n",
+    "                commit_message=\"Test commit\")\n",
+    "    \n",
+    "push_to_hub(repo_id=\"oscarchaufour/a2c-CartPole-v1\",\n",
+    "            filename=\"a2c_sb3_cartpole.zip\",\n",
+    "            commit_message=\"Added A2C CartPole model\")\n"
+   ]
   }
  ],
  "metadata": {