@@ -208,3 +208,10 @@ Updates by Léo Schneider, Emmanuel Dellandréa
## License
MIT
## Link to Hugging Face
https://huggingface.co/emipaga/A2C_cartpole
## Link to wandb report
https://wandb.ai/emilien-paga23/sb3/reports/Untitled-Report--Vmlldzo3MDMxNjk4
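
The published checkpoint can be pulled back from the Hub with the `huggingface_sb3` helper; a minimal sketch, assuming the model is stored as a standard SB3 zip (the filename below is an assumption, not confirmed by the repo):

from huggingface_sb3 import load_from_hub
from stable_baselines3 import A2C

# Download the checkpoint from the Hugging Face Hub (the filename is assumed)
checkpoint = load_from_hub(repo_id="emipaga/A2C_cartpole", filename="a2c_cartpole.zip")
model = A2C.load(checkpoint)
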
import gymnasium as gym
import wandb
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

# Training configuration
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 10000,
    "env_name": "CartPole-v1",
}

# Start a W&B run that syncs TensorBoard metrics, gym monitoring and the source code
run = wandb.init(
    project="sb3",
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

def make_env():
    env = gym.make(config["env_name"], render_mode="rgb_array")
    env = Monitor(env)  # record episode statistics for logging
    return env

# Vectorize the environment and record a 200-step video every 2000 steps
env = DummyVecEnv([make_env])
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda x: x % 2000 == 0,
    video_length=200,
)

# Train an A2C agent, logging gradients and checkpoints to W&B
model = A2C(config["policy_type"], env, verbose=1, tensorboard_log=f"runs/{run.id}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    ),
)
run.finish()
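
A quick sanity check of the trained agent can be added before `run.finish()`; a minimal sketch using SB3's `evaluate_policy`, reusing `make_env` and `model` from the script above (the 10-episode count is an arbitrary choice):

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the trained policy over a few episodes on a fresh, non-recording environment
eval_env = DummyVecEnv([make_env])
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
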
import gymnasium as gym
import panda_gym  # registers the PandaReach environments
import wandb
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

# Initialize Weights & Biases
config = {
    "policy_type": "MultiInputPolicy",  # policy type (dict observations)
    "total_timesteps": 500000,          # total number of training timesteps
    "env_name": "PandaReach-v3",        # environment name
}
run = wandb.init(
    project="panda-gym-training",
    config=config,
    sync_tensorboard=True,
    save_code=True,
)

def make_env():
    env = gym.make(config["env_name"], render_mode="rgb_array")  # rgb_array rendering is needed for video recording
    return env

# Vectorize the environment and record a 200-step video every 50000 steps
env = DummyVecEnv([make_env])
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda x: x % 50000 == 0,
    video_length=200,
)

# Train the A2C agent, save the final model locally and close the training run
model = A2C(config["policy_type"], env, verbose=1)
model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback())
model.save("a2c_panda_reach_model")
run.finish()

# Upload the trained model as a W&B artifact in a dedicated run
nom_artefact = "a2c_panda_reach_model"
with wandb.init(project="panda-gym-training", job_type="upload") as run:
    artefact = wandb.Artifact(name=nom_artefact, type="model")
    artefact.add_file("a2c_panda_reach_model.zip")  # add the trained model file
    run.log_artifact(artefact)
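
To reuse the uploaded model elsewhere, the artifact can be pulled back down in a later run; a minimal sketch, assuming the artifact name above and the default `latest` alias (the `job_type` label is arbitrary):

import wandb
from stable_baselines3 import A2C

# Download the logged model artifact and reload it with SB3
with wandb.init(project="panda-gym-training", job_type="download") as run:
    artefact = run.use_artifact("a2c_panda_reach_model:latest", type="model")
    artefact_dir = artefact.download()
model = A2C.load(f"{artefact_dir}/a2c_panda_reach_model.zip")
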
import gymnasium as gym
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
learning_rate = 5e-3
gamma = 0.99
num_episodes = 500

# Initialize environment
env = gym.make("CartPole-v1")
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
hidden_size = 128

# Define the policy network for the REINFORCE agent
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

# Function to normalize returns (zero mean, unit variance)
def normalize_returns(returns):
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns

# Initialize policy network and optimizer
policy_network = PolicyNetwork(input_size, hidden_size, output_size)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)

# List to store total rewards for each episode
total_rewards = []

# Training loop with tqdm progress bar
for episode in tqdm(range(num_episodes), desc="Training", unit="episode"):
    episode_reward = 0  # Track total reward for this episode
    saved_log_probs = []
    rewards = []
    terminated = truncated = False

    # Reset environment
    observation, _ = env.reset()
    step = 0

    # Episode loop (stop on termination, truncation or the 1000-step safety cap)
    while not (terminated or truncated) and step < 1000:
        # Select action by sampling from the policy distribution
        observation_tensor = torch.tensor(observation, dtype=torch.float32)
        action_probs = policy_network(observation_tensor)
        action = torch.multinomial(action_probs, num_samples=1).item()
        saved_log_probs.append(torch.log(action_probs[action]))

        # Take action
        next_observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward  # Accumulate total reward
        rewards.append(reward)
        observation = next_observation
        step += 1

    if step == 1000:
        print("Episode stopped at the 1000-step safety cap")

    # Store total reward for this episode
    total_rewards.append(episode_reward)

    # Calculate discounted returns G_t = r_t + gamma * G_{t+1}
    returns = []
    G = 0
    for r in rewards[::-1]:
        G = r + gamma * G
        returns.insert(0, G)

    # Normalize returns
    returns = normalize_returns(returns)

    # Compute the REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    policy_loss = [-log_prob * G for log_prob, G in zip(saved_log_probs, returns)]
    policy_loss = torch.stack(policy_loss).sum()

    # Update policy network
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

# Close environment
env.close()

# Plot total rewards across episodes
plt.plot(total_rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode')
plt.grid(True)
plt.show()
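
After training, the policy can be sanity-checked with a greedy rollout (switching the network to eval mode so the dropout layer is disabled); a small sketch reusing `policy_network` from the script above on a fresh environment:

# Greedy rollout of the trained policy (eval mode disables dropout)
policy_network.eval()
eval_env = gym.make("CartPole-v1")
observation, _ = eval_env.reset()
terminated = truncated = False
episode_reward = 0
with torch.no_grad():
    while not (terminated or truncated):
        action_probs = policy_network(torch.tensor(observation, dtype=torch.float32))
        action = torch.argmax(action_probs).item()
        observation, reward, terminated, truncated, _ = eval_env.step(action)
        episode_reward += reward
print(f"Greedy episode reward: {episode_reward}")
eval_env.close()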