Commit 3b147287 authored by Paganelli Emilien

Almost finished

parent 840aba24
.DS_Store 0 → 100644
File added
@@ -208,3 +208,10 @@ Updates by Léo Schneider, Emmanuel Dellandréa
## License
MIT
## Link to Hugging Face
https://huggingface.co/emipaga/A2C_cartpole
## Link to Weights & Biases (wandb)
https://wandb.ai/emilien-paga23/sb3/reports/Untitled-Report--Vmlldzo3MDMxNjk4
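To reuse the published CartPole checkpoint, one option is the huggingface_sb3 helper. The sketch below is only an assumption about the repository layout: the exact filename of the zip inside emipaga/A2C_cartpole is not listed here, so replace it with the real checkpoint name.

from huggingface_sb3 import load_from_hub
from stable_baselines3 import A2C

# Hypothetical file name inside the Hub repo; adjust to the actual checkpoint name.
checkpoint = load_from_hub(repo_id="emipaga/A2C_cartpole", filename="a2c-CartPole-v1.zip")
model = A2C.load(checkpoint)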
import gymnasium as gym
import wandb
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

# Training configuration
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 10000,
    "env_name": "CartPole-v1",
}

# Initialize the Weights & Biases run
run = wandb.init(
    project="sb3",
    config=config,
    sync_tensorboard=True,  # upload SB3's TensorBoard metrics
    monitor_gym=True,       # upload videos of the agent playing
    save_code=True,         # save the code used for this run
)

def make_env():
    env = gym.make(config["env_name"], render_mode="rgb_array")
    env = Monitor(env)  # record episode statistics (reward, length)
    return env

# Vectorized environment with periodic video recording
env = DummyVecEnv([make_env])
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda x: x % 2000 == 0,
    video_length=200,
)

# Train A2C and log everything to W&B
model = A2C(config["policy_type"], env, verbose=1, tensorboard_log=f"runs/{run.id}")
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    ),
)
run.finish()
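A quick way to sanity-check the run is to reload the checkpoint written by WandbCallback and evaluate it. This is only a sketch: it assumes the callback saved the model as models/<run.id>/model.zip, which is its default file name.

import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Assumed path: WandbCallback saves the checkpoint as model.zip under model_save_path.
model = A2C.load(f"models/{run.id}/model")
eval_env = gym.make("CartPole-v1")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")
eval_env.close()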
import gymnasium as gym
import wandb
import panda_gym  # registers the Panda robot environments (PandaReach-v3, ...)
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from wandb.integration.sb3 import WandbCallback

# Training configuration
config = {
    "policy_type": "MultiInputPolicy",  # policy type (dict observations)
    "total_timesteps": 500000,          # total number of training timesteps
    "env_name": "PandaReach-v3",        # environment name
}

# Initialize Weights & Biases
run = wandb.init(
    project="panda-gym-training",
    config=config,
    sync_tensorboard=True,
    save_code=True,
)

def make_env():
    # rgb_array rendering so VecVideoRecorder can capture frames
    env = gym.make(config["env_name"], render_mode="rgb_array")
    return env

# Vectorized environment with periodic video recording
env = DummyVecEnv([make_env])
env = VecVideoRecorder(
    env,
    f"videos/{run.id}",
    record_video_trigger=lambda x: x % 50000 == 0,
    video_length=200,
)

# Train A2C, log to W&B and save the model locally
model = A2C(config["policy_type"], env, verbose=1)
model.learn(total_timesteps=config["total_timesteps"], callback=WandbCallback())
model.save("a2c_panda_reach_model")

# Upload the trained model to W&B as an artifact
nom_artefact = "a2c_panda_reach_model"
with wandb.init(project="panda-gym-training", job_type="upload") as run:
    artefact = wandb.Artifact(name=nom_artefact, type="model")
    artefact.add_file("a2c_panda_reach_model.zip")  # add the trained model file
    run.log_artifact(artefact)
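For completeness, here is a hedged sketch of reloading the locally saved Panda model and rolling it out with a deterministic policy. It assumes panda_gym is installed so that PandaReach-v3 is registered and that a2c_panda_reach_model.zip sits in the working directory.

import gymnasium as gym
import panda_gym
from stable_baselines3 import A2C

eval_env = gym.make("PandaReach-v3")
model = A2C.load("a2c_panda_reach_model", env=eval_env)

obs, info = eval_env.reset()
for _ in range(200):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    if terminated or truncated:
        obs, info = eval_env.reset()
eval_env.close()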
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt

# Hyperparameters
learning_rate = 5e-3
gamma = 0.99
num_episodes = 500
max_steps = 1000  # hard cap on episode length

# Initialize environment
env = gym.make("CartPole-v1")
input_size = env.observation_space.shape[0]
output_size = env.action_space.n
hidden_size = 128

# Policy network: maps an observation to action probabilities
class PolicyNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

# Normalize returns to zero mean and unit variance (stabilizes the gradient)
def normalize_returns(returns):
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns

# Initialize policy network and optimizer
policy_network = PolicyNetwork(input_size, hidden_size, output_size)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)

# Total reward obtained in each episode
total_rewards = []

# Training loop (REINFORCE) with a tqdm progress bar
for episode in tqdm(range(num_episodes), desc="Training", unit="episode"):
    episode_reward = 0  # total reward for this episode
    saved_log_probs = []
    rewards = []
    terminated = truncated = False

    # Reset environment
    observation, _ = env.reset()
    step = 0

    # Roll out one episode
    while not (terminated or truncated) and step < max_steps:
        # Sample an action from the current policy
        observation_tensor = torch.tensor(observation, dtype=torch.float32)
        action_probs = policy_network(observation_tensor)
        action = torch.multinomial(action_probs, num_samples=1).item()
        saved_log_probs.append(torch.log(action_probs[action]))

        # Take the action in the environment
        observation, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        rewards.append(reward)
        step += 1

    if step == max_steps:
        print(f"Episode {episode} reached the {max_steps}-step cap")

    # Store total reward for this episode
    total_rewards.append(episode_reward)

    # Compute discounted returns G_t = r_t + gamma * G_{t+1}
    returns = []
    G = 0
    for r in rewards[::-1]:
        G = r + gamma * G
        returns.insert(0, G)
    returns = normalize_returns(returns)

    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
    policy_loss = [-log_prob * G for log_prob, G in zip(saved_log_probs, returns)]
    policy_loss = torch.stack(policy_loss).sum()

    # Update policy network
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

# Close environment
env.close()

# Plot total rewards across episodes
plt.plot(total_rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Total Reward per Episode')
plt.grid(True)
plt.show()
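To see how the learned policy behaves after training, a small greedy evaluation rollout like the one below can be appended. This is an addition, not part of the original script: it disables dropout via eval() and picks the argmax action instead of sampling.

# Greedy evaluation rollout with the trained policy (addition, not in the original script)
policy_network.eval()  # disable dropout for evaluation
eval_env = gym.make("CartPole-v1")
observation, _ = eval_env.reset()
terminated = truncated = False
total_reward = 0.0
with torch.no_grad():
    while not (terminated or truncated):
        probs = policy_network(torch.tensor(observation, dtype=torch.float32))
        action = torch.argmax(probs).item()  # greedy action
        observation, reward, terminated, truncated, _ = eval_env.step(action)
        total_reward += reward
print(f"Greedy evaluation return: {total_reward}")
eval_env.close()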