Commit 81651728 authored by MaximeCerise

evaluate_reinforce_cartpole.py added

parent 15d811ac
@@ -10,12 +10,12 @@
- <b>Save:</b> [policy_cartpole.pth](saves/policy_cartpole.pth)
- <b>Code:</b> [reinforce_cartpole.py](reinforce_cartpole.py)
Below are the rewards across 300 episodes:
![Rewards across episodes](saves/plot_rewards.png)
model: [policy_cartpole.pth](saves/policy_cartpole.pth)
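For reference, a minimal sketch of how the per-episode rewards behind this plot could be logged and saved; the actual logging lives in reinforce_cartpole.py, and the helper name `save_reward_plot` below is hypothetical:

```python
import matplotlib.pyplot as plt

# Hypothetical helper: collect the total reward of every training episode in
# `episode_rewards`, then save the curve shown above.
# The real code in reinforce_cartpole.py may differ.
def save_reward_plot(episode_rewards, path="saves/plot_rewards.png"):
    plt.figure()
    plt.plot(episode_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Total reward")
    plt.title("Rewards across episodes")
    plt.savefig(path)
    plt.close()
```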
#### 1.2 Evaluation
- <b>Code:</b> [evaluate_reinforce_cartpole.py](evaluate_reinforce_cartpole.py)
The evaluation has been run on 100 episodes, with the success threshold set at a score of 400.
We finally obtain an evaluation with a 100% success rate:
@@ -26,5 +26,5 @@
Here we set up a complete pipeline to solve the CartPole environment with the A2C algorithm.
Wandb has been set up to track the learning phase:
https://wandb.ai/maximecerise-ecl/cartpole-a2c
![A2C rollout](saves/rollout.png)
@@ -13,15 +13,15 @@ env = DummyVecEnv([lambda: env])
wandb.init(
    entity="maximecerise-ecl",
    project="cartpole-a2c",
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True
)

model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./a2c_tensorboard/")
model.learn(total_timesteps=500000)
model.save("a2c_cartpole")
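The diff above only touches the wandb configuration and the training budget. For context, a minimal end-to-end version of the A2C training script could look like the sketch below, assuming stable-baselines3, wandb and gymnasium; everything outside the lines shown in the diff is illustrative and may differ from the actual file:

```python
# Sketch of a full A2C training run around the diff above (names such as
# "cartpole-a2c" and "a2c_cartpole" mirror the snippet; the rest is illustrative).
import gymnasium as gym
import wandb
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv

# Vectorized single-env wrapper, as in the hunk header above
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

wandb.init(
    entity="maximecerise-ecl",
    project="cartpole-a2c",
    sync_tensorboard=True,   # mirror TensorBoard scalars to Wandb
    monitor_gym=True,
    save_code=True,
)

model = A2C("MlpPolicy", env, verbose=1, tensorboard_log="./a2c_tensorboard/")
model.learn(total_timesteps=500000)
model.save("a2c_cartpole")
wandb.finish()
```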
evaluate_reinforce_cartpole.py (new file)
import gym
import torch
from reinforce_cartpole import PolicyNetwork


def evaluate_reinforce_cpole():
    env = gym.make("CartPole-v1", render_mode="human")
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Load the trained policy
    policy = PolicyNetwork(obs_dim, action_dim)
    policy.load_state_dict(torch.load("saves/policy_cartpole.pth"))
    policy.eval()  # evaluation mode

    num_episodes = 100
    success_threshold = 400  # score required to count an episode as a success
    success_count = 0
    scores = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        state = torch.tensor(state, dtype=torch.float32)
        done = False
        total_reward = 0

        while not done:
            with torch.no_grad():
                action_probs = policy(state)
                action = torch.argmax(action_probs).item()  # greedy: pick the most probable action

            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = torch.tensor(next_state, dtype=torch.float32)
            done = terminated or truncated

        scores.append(total_reward)
        if total_reward >= success_threshold:
            success_count += 1
        print(f"Episode {episode+1}: Score = {total_reward}")

    success_rate = success_count / num_episodes * 100
    print(f"\nSuccess: {success_count}/{num_episodes} ({success_rate:.2f}%)")
    env.close()


if __name__ == "__main__":
    evaluate_reinforce_cpole()
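The script above imports PolicyNetwork from reinforce_cartpole.py, which is not part of this commit. As a rough orientation only, a REINFORCE policy network for CartPole typically looks like the sketch below; the actual definition in reinforce_cartpole.py may use different layer sizes or structure:

```python
import torch.nn as nn

class PolicyNetwork(nn.Module):
    """Illustrative stand-in for the PolicyNetwork defined in reinforce_cartpole.py."""

    def __init__(self, obs_dim, action_dim, hidden_dim=128):  # hidden_dim is an assumption
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1),  # outputs action probabilities, as the evaluation script expects
        )

    def forward(self, x):
        return self.net(x)
```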