diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..c7d51a90750c8d9f217162789317b9bd59738097
Binary files /dev/null and b/.DS_Store differ
diff --git a/README.md b/README.md
index 501d29b5c0ac88bab630dca91cd6f824e180b050..c0693a2e274c7710b9d22073bff9f84d2f4cb349 100644
--- a/README.md
+++ b/README.md
@@ -208,3 +208,10 @@ Updates by Léo Schneider, Emmanuel Dellandréa
 
 ## License
 MIT
+
+## Link to Hugging Face
+https://huggingface.co/emipaga/A2C_cartpole
+
+## Link to Weights & Biases
+
+https://wandb.ai/emilien-paga23/sb3/reports/Untitled-Report--Vmlldzo3MDMxNjk4
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9cf3e445edceb8f302638c7079f651269da6079
--- /dev/null
+++ b/a2c_sb3_cartpole.py
@@ -0,0 +1,50 @@
+import gymnasium as gym
+
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+import wandb
+from wandb.integration.sb3 import WandbCallback
+import numpy as np
+from stable_baselines3 import A2C, PPO
+
+
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 10000,
+    "env_name": "CartPole-v1",
+}
+run = wandb.init(
+    project="sb3",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=True,
+)
+
+
+def make_env():
+    env = gym.make(config["env_name"], render_mode="rgb_array")
+    env = Monitor(env)
+    return env
+
+
+env = DummyVecEnv([make_env])
+env = VecVideoRecorder(
+    env,
+    f"videos/{run.id}",
+    record_video_trigger=lambda x: x % 2000 == 0,
+    video_length=200,
+)
+
+model = A2C(config["policy_type"], env, verbose=1, tensorboard_log=f"runs/{run.id}")
+model.learn(
+    total_timesteps=config["total_timesteps"],
+    callback=WandbCallback(
+        gradient_save_freq=100,
+        model_save_path=f"models/{run.id}",
+        verbose=2,
+    ),
+)
+run.finish()
+
+
diff --git a/a2c_sb3_panda_reach.py b/a2c_sb3_panda_reach.py
new file mode 100644
index 0000000000000000000000000000000000000000..b11b99ea943ea77fe69fefd6b120865c430b51d2
--- /dev/null
+++ b/a2c_sb3_panda_reach.py
@@ -0,0 +1,50 @@
+import wandb
+import gymnasium as gym
+from stable_baselines3 import A2C
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+from wandb.integration.sb3 import WandbCallback
+import panda_gym
+
+# Initialize Weights & Biases
+config = {
+    "policy_type": "MultiInputPolicy",  # Policy type
+    "total_timesteps": 500000,  # Total number of training timesteps
+    "env_name": "PandaReach-v3",  # Environment name
+}
+
+run = wandb.init(project="panda-gym-training",
+    config=config,
+    sync_tensorboard=True,
+    save_code=True,
+)
+
+
+
+def make_env():
+    env = gym.make(config["env_name"])
+    return env
+
+env = DummyVecEnv([make_env])
+
+env = VecVideoRecorder(
+    env,
+    f"videos/{run.id}",
+    record_video_trigger=lambda x: x % 50000 == 0,
+    video_length=200,
+)
+
+
+model = A2C(config["policy_type"], env, verbose=1)
+
+
+model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback())
+
+
+model.save("a2c_panda_reach_model")
+
+
+nom_artefact = "a2c_panda_reach_model"
+with wandb.init(project="panda-gym-training", job_type="upload") as run:
+    artefact = wandb.Artifact(name=nom_artefact, type="model")
+    artefact.add_file("a2c_panda_reach_model.zip")  # Add the trained model file
+    run.log_artifact(artefact)
diff --git a/reinforce_carpole.py b/reinforce_carpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba20898c06694bf2068c2c0611cb51fa874f69bc
--- /dev/null
+++ b/reinforce_carpole.py
@@ -0,0 +1,115 @@
+import gym
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import numpy as np
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+
+# Hyperparameters
+learning_rate = 5e-3
+gamma = 0.99
+num_episodes = 500
+
+# Initialize environment
+env = gym.make("CartPole-v1")
+input_size = env.observation_space.shape[0]
+output_size = env.action_space.n
+hidden_size = 128
+
+# Define neural network model for the agent
+class PolicyNetwork(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(PolicyNetwork, self).__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.5)
+        self.fc2 = nn.Linear(hidden_size, output_size)
+        self.softmax = nn.Softmax(dim=0)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return self.softmax(x)
+
+
+# Function to normalize returns
+def normalize_returns(returns):
+    returns = torch.tensor(returns, dtype=torch.float32)
+    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
+    return returns
+
+
+
+# Initialize policy network
+policy_network = PolicyNetwork(input_size, hidden_size, output_size)
+optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)
+
+# List to store total rewards for each episode
+total_rewards = []
+
+# Training loop with tqdm progress bar
+for episode in tqdm(range(num_episodes), desc="Training", unit="episode"):
+    episode_reward = 0  # Track total reward for this episode
+    returns = []
+    saved_log_probs = []
+    rewards = []
+    terminated, truncated = False, False
+
+    # Reset environment
+    observation = env.reset()
+    observation = observation[0]
+    step = 0
+    # Episode loop
+    while not (terminated or truncated) and step < 1000:
+        # Select action based on policy
+        observation_tensor = torch.tensor(observation, dtype=torch.float32)
+        action_probs = policy_network(observation_tensor)
+        action = torch.multinomial(action_probs, num_samples=1).item()
+        saved_log_probs.append(torch.log(action_probs[action]))
+
+        # Take action
+        next_observation, reward, terminated, truncated, _ = env.step(action)
+        episode_reward += reward  # Accumulate total reward
+        rewards.append(reward)
+        observation = next_observation
+        step += 1
+        if step == 1000:
+            print("Episode exceeded 1000 steps")
+
+
+    # Store total reward for this episode
+    total_rewards.append(episode_reward)
+
+    # Calculate returns
+    episode_returns = []
+    G = 0
+    for r in rewards[::-1]:
+        G = r + gamma * G
+        episode_returns.insert(0, G)
+    returns.extend(episode_returns)
+
+    # Normalize returns
+    returns = normalize_returns(returns)
+
+    # Compute policy loss
+    policy_loss = [-log_prob * G for log_prob, G in zip(saved_log_probs, returns)]
+    policy_loss = torch.stack(policy_loss).sum()
+
+    # Update policy network
+    optimizer.zero_grad()
+    policy_loss.backward()
+    optimizer.step()
+
+# Close environment
+env.close()
+
+# Plot total rewards across episodes
+plt.plot(total_rewards)
+plt.xlabel('Episode')
+plt.ylabel('Total Reward')
+plt.title('Total Reward per Episode')
+plt.grid(True)
+plt.show()
\ No newline at end of file