diff --git a/README.md b/README.md
index 0c50cc419c84a15a5e515b3706d6aa27ac978538..3c9e45e491730e3f7dcbfd581d99bc05d716396b 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,12 @@
 - <b>Save:</b> [policy_cartpole.pth](saves/policy_cartpole.pth)
 - <b>Code:</b> [reinforce_cartpole.py](reinforce_cartpole.py)
 
 Below is the rewards accross 300 episodes :
-
-
+
+Model: [policy_cartpole.pth](saves/policy_cartpole.pth)
 
 #### 1.2 Evaluation
 - <b>Code:</b> [evaluate_reinforce_cartpole.py](evaluate_reinforce_cartpole.py)
-The evaluation has been done one 100 episodes and the sucess threshold is set at a score of 400.
+The evaluation has been done on 100 episodes and the success threshold is set at a score of 400.
 
 We finally have an evaluation with 100% of sucess:
@@ -26,5 +26,5 @@ We finally have an evaluation with 100% of sucess:
 
 Here we set up a complete pipeline to solve Cartpole environment with A2C algorithm.
 Wandb has been set up to follow the learning phase.
-
+https://wandb.ai/maximecerise-ecl/cartpole-a2c
 
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index 50f784709d629804612a5849af456d3b1e5d60f3..3cce8ed108e21d7267db74b3f5fa82ca08b2234f 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -13,15 +13,15 @@
 env = DummyVecEnv([lambda: env])
 
 wandb.init(
     entity="maximecerise-ecl",
-    project="cartpole-a2c_",
+    project="cartpole-a2c",
     sync_tensorboard=True,
     monitor_gym=True,
     save_code=True
 )
 
-model = A2C("MlpPolicy", env, verbose=1)
-model.learn(total_timesteps=5000)
+model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./a2c_tensorboard/")
+model.learn(total_timesteps=500000)
 
 model.save("a2c_cartpole")
 
diff --git a/evaluate_reinforce_cartpole.py b/evaluate_reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..42f48985e181b66a9c169bdba2d8819398e49eeb
--- /dev/null
+++ b/evaluate_reinforce_cartpole.py
@@ -0,0 +1,52 @@
+import gym
+import torch
+import numpy as np
+import torch.nn as nn
+import torch.optim as optim
+from reinforce_cartpole import PolicyNetwork
+
+def evaluate_reinforce_cpole():
+    env = gym.make("CartPole-v1", render_mode="human")
+    obs_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+
+    # Load the trained policy network
+    policy = PolicyNetwork(obs_dim, action_dim)
+    policy.load_state_dict(torch.load("saves/policy_cartpole.pth"))
+    policy.eval()  # Switch to evaluation mode
+
+    num_episodes = 100
+    success_threshold = 400  # Score an episode must reach to count as a success
+    success_count = 0
+    scores = []
+
+    for episode in range(num_episodes):
+        state, _ = env.reset()
+        state = torch.tensor(state, dtype=torch.float32)
+        done = False
+        total_reward = 0
+
+        while not done:
+            with torch.no_grad():
+                action_probs = policy(state)
+                action = torch.argmax(action_probs).item()  # Pick the most probable action
+
+            next_state, reward, terminated, truncated, _ = env.step(action)
+            total_reward += reward
+
+            state = torch.tensor(next_state, dtype=torch.float32)
+            done = terminated or truncated
+
+        scores.append(total_reward)
+        if total_reward >= success_threshold:
+            success_count += 1
+
+        print(f"Episode {episode+1}: Score = {total_reward}")
+
+    success_rate = success_count / num_episodes * 100
+    print(f"\nSuccesses: {success_count}/{num_episodes} ({success_rate:.2f}%)")
+
+    env.close()
+
+if __name__ == "__main__":
+    evaluate_reinforce_cpole()
diff --git a/videos/rl-video-step-0-to-step-1000.mp4 b/videos/rl-video-step-0-to-step-1000.mp4
index 93ec3d2d9597f8803bbcc193c167c8779a59dea8..98f6318f63cb99401362e83ad589912a7b61629f 100644
Binary files a/videos/rl-video-step-0-to-step-1000.mp4 and b/videos/rl-video-step-0-to-step-1000.mp4 differ
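The patch above trains the A2C agent with stable-baselines3 and saves it as `a2c_cartpole`, while the new evaluation script only covers the REINFORCE policy. Below is a minimal sketch of how the saved A2C model could be evaluated, assuming stable-baselines3's `evaluate_policy` helper and the same `CartPole-v1` environment as `a2c_sb3_cartpole.py`; this is an illustration, not code from the repository, and depending on the stable-baselines3 version `gymnasium` may be required instead of `gym`.

```python
# Illustrative sketch (not part of the patch): evaluate the saved A2C model.
import gym  # with stable-baselines3 >= 2.0, use `import gymnasium as gym` instead
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Recreate the environment the model was trained on.
env = gym.make("CartPole-v1")

# Load the model saved by a2c_sb3_cartpole.py (written to a2c_cartpole.zip).
model = A2C.load("a2c_cartpole")

# Average the episode return over 100 evaluation episodes.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"Mean reward over 100 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

env.close()
```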