diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index e7f5770d47cc7421e7739599b5d3ed29c63da134..7d50d417c474f4b1b6bfcd9bf622551aa36a54fc 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -1,121 +1,78 @@
 import gymnasium as gym
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.distributions import Categorical
-import matplotlib.pyplot as plt
-
-
-
-
-
-
-
-
-
-
-# Create the environment
-env = gym.make("CartPole-v1", render_mode="human")
-
-# Reset the environment and get the initial observation
-observation = env.reset()
-
-state_size = env.observation_space.shape[0]
-action_size = env.action_space.n
-# Define the agent neural network model
-class Policy(nn.Module):
-    def __init__(self, state_size, action_size, hidden_size=128):
-        super(Policy, self).__init__()
-        self.fc1 = nn.Linear(state_size, hidden_size)
-        self.relu = nn.ReLU()
-        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
-        self.fc2 = nn.Linear(hidden_size, action_size)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.relu(x)
-        x = self.dropout(x)
-        x = self.fc2(x)
-        return F.softmax(x)
-
-policy_model = Policy(state_size, action_size)
-optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
-
-gamma = 0.99
-episodes_rewards = []
-
-for i in range(500):
-    # Reset the environment
-    # init buffers
-    observation, info = env.reset(seed=42)
-    episode_rewards = []
-    logarithmich_probabilities = []
-    terminated = False
-    # Render the environment to visualize the agent's behavior
-    env.render()
-
-    while terminated == False:
-        # Get action probabilities from the policy model
-        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
-        action_distribution = Categorical(action_probabilities)
-
-        # Sample an action from the action distribution
-        action = action_distribution.sample()
-        logarithmich_probability = action_distribution.log_prob(action)
-        logarithmich_probabilities.append(logarithmich_probability)
-        print(int(action.item()))
-        # Take a step in the environment
-        #print(env.step(action.item()))
-        next_observation, reward, done, a, b = env.step(action.item())
-        episode_rewards.append(reward)
-
-        # Update observation
-        observation = next_observation
-
-
-    # Compute the return for the episode
-    returns = []
-    R = 0
-    for r in reversed(episode_rewards):
-       R = r + gamma * R
-       returns.insert(0, R)
-
-    # Compute the policy loss
-    policy_loss = torch.tensor([-loga_prob * R for loga_prob, R in zip(logarithmich_probabilities, returns)]).sum()
-    episodes_rewards += [-policy_loss]
-    # Update the policy model
-    optimizer.zero_grad()
-    policy_loss.backward()
-    optimizer.step()
-
-
-env.close()
-
-
-
-
-# Plot the policy loss against iterations
-plt.plot([i for i in range(0,500)],episodes_rewards)
-plt.xlabel('Iterations')
-plt.ylabel('Policy Loss')
-plt.title('Policy Loss vs. Iterations')
-plt.show()
-
+import numpy as np
+from stable_baselines3.common.evaluation import evaluate_policy
+from stable_baselines3 import A2C
+from huggingface_sb3 import push_to_hub
+from huggingface_hub import login
 
 
 
+print(f"{gym.__version__=}")
 
 
+env = gym.make("CartPole-v1", render_mode="rgb_array")
+model = A2C("MlpPolicy", env, verbose=1)
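+# (Optional, illustrative) A2C exposes the usual on-policy hyperparameters if the
+# defaults need tuning; the values below are simply SB3's defaults, shown as a sketch:
+# model = A2C("MlpPolicy", env, learning_rate=7e-4, n_steps=5, gamma=0.99, verbose=1)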
 
+def evaluate(model, num_episodes=100, deterministic=True):
+    """Return the mean total (undiscounted) reward per episode over `num_episodes` episodes."""
+    vec_env = model.get_env()
+    all_episode_rewards = []
+    for i in range(num_episodes):
+        episode_rewards = []
+        done = False
+        obs = vec_env.reset()
+        while not done:
+            # _states are only useful when using LSTM policies
+            action, _states = model.predict(obs, deterministic=deterministic)
+            # here, action, reward and done are arrays because the model uses a vectorized env
+            # also note that VecEnv.step() returns a 4-tuple, not the 5-tuple of the
+            # Gymnasium API, since model.get_env() wraps the env in an SB3 VecEnv
+            obs, reward, done, info = vec_env.step(action)
+            episode_rewards.append(reward)
 
+        all_episode_rewards.append(sum(episode_rewards))
 
+    mean_episode_reward = np.mean(all_episode_rewards)
+    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
 
+    return mean_episode_reward
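+# (Optional, illustrative) the helper above can be used to sanity-check the untrained
+# agent before learning; the exact score varies from run to run:
+# mean_reward_before_train = evaluate(model, num_episodes=100, deterministic=True)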
 
+# Use a separate environment for evaluation
+eval_env = gym.make("CartPole-v1", render_mode="rgb_array")
 
+# Train the agent for 10000 steps
+model.learn(total_timesteps=10_000)
 
+# Evaluate the trained agent
+mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
 
+print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
 
+# Log in to the Hugging Face Hub (token redacted here; avoid committing real tokens)
+login(token="****************")
 
+# Save the trained model
+model.save("ECL-TD-RL1-a2c_cartpole.zip")
 
+# Load the trained model
+model = A2C.load("ECL-TD-RL1-a2c_cartpole.zip")
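+# (Optional, illustrative) quick rollout of the reloaded agent on the Gymnasium env,
+# using the standard 5-tuple step API ("rgb_array" mode returns frames, no window):
+# obs, info = eval_env.reset()
+# for _ in range(1000):
+#     action, _states = model.predict(obs, deterministic=True)
+#     obs, reward, terminated, truncated, info = eval_env.step(int(action))
+#     if terminated or truncated:
+#         obs, info = eval_env.reset()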
 
+push_to_hub(
+    repo_id="Karim-20/a2c_cartpole",
+    filename="ECL-TD-RL1-a2c_cartpole.zip",
+    commit_message="Add CartPole-v1 environment, trained with an A2C agent"
+)