diff --git a/README.md b/README.md
index 329e861dfc819107785af595e2a972a79fa051b6..144b9ce436f17c8edd0e876812d8190872c9ecef 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,8 @@ Now that you have trained your model, it is time to evaluate its performance. Ru
 From the openai gym wiki we know that the environment counts as solved when the average reward is greater or equal to 195 for over 100 consecutuve trials.
 From the evaluation script i used the success rate is 1.0 when we allow the maximum number of steps the environment offers.
 
+
+
 ## Familiarization with a complete RL pipeline: Application to training a robotic arm
 
 Stable-Baselines3 (SB3) is a high-level RL library that provides various algorithms and integrated tools to easily train and test reinforcement learning models.
@@ -36,7 +38,8 @@ Stable-Baselines3 (SB3) is a high-level RL library that provides various algorit
 🛠Share the link of the wandb run in the `README.md` file.
 
 wandb: https://wandb.ai/lennartecl-centrale-lyon/sb3?nw=nwuserlennartecl
-hugging: https://huggingface.co/lennartoe/Cartpole-v1/tree/main
+
+huggingface: https://huggingface.co/lennartoe/Cartpole-v1/tree/main
 
 ### Full workflow with panda-gym
 
@@ -46,5 +49,6 @@ hugging: https://huggingface.co/lennartoe/Cartpole-v1/tree/main
 > Share all the code in `a2c_sb3_panda_reach.py`. Share the link of the wandb run and the trained model in the `README.md` file.
 
 wandb: https://wandb.ai/lennartecl-centrale-lyon/pandasgym_sb3?nw=nwuserlennartecl
-hugging: https://huggingface.co/lennartoe/PandaReachJointsDense-v3/tree/main
+
+huggingface: https://huggingface.co/lennartoe/PandaReachJointsDense-v3/tree/main
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index f32222078c69166ff8434803641a1bd7b5dff925..724585316fa14b9aac3a639655727eba35ddd4eb 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -20,7 +20,7 @@ run = wandb.init(
     save_code=True,
 )
 
-env = gym.make("CartPole-v1", render_mode="rgb_array")
+env = gym.make("CartPole-v1")
 
 model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
 #model = A2C("MlpPolicy", env, )
@@ -31,7 +31,6 @@ obs = vec_env.reset()
 for i in range(1000):
     action, _state = model.predict(obs, deterministic=True)
     obs, reward, done, info = vec_env.step(action)
-    vec_env.render("human")
 
 run.finish()
 
diff --git a/a2c_sb3_panda_reach.py b/a2c_sb3_panda_reach.py
index b33f309674b3876692c93361f51a5bfe4d5e9db8..5df3b95d2996f3081a887ecad2ac85198e978a1a 100644
--- a/a2c_sb3_panda_reach.py
+++ b/a2c_sb3_panda_reach.py
@@ -10,7 +10,7 @@ from huggingface_sb3 import package_to_hub
 # from documentation of wandb
 config = {
     "policy_type": "MultiInputPolicy",
-    "total_timesteps": 50000,
+    "total_timesteps": 500000,
     "env_name": "PandaReachJointsDense-v3",
 }
 run = wandb.init(
@@ -21,7 +21,7 @@ run = wandb.init(
     save_code=True,
 )
 
-env = gym.make("PandaReachJointsDense-v3", render_mode="rgb_array")
+env = gym.make("PandaReachJointsDense-v3")
 
 model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
 #model = A2C("MlpPolicy", env, )
@@ -32,10 +32,6 @@ obs = vec_env.reset()
 for i in range(1000):
     action, _state = model.predict(obs, deterministic=True)
     obs, reward, done, info = vec_env.step(action)
-    vec_env.render("human")
-    # VecEnv resets automatically
-    # if done:
-    #     obs = vec_env.reset()
 
 run.finish()
diff --git a/reinforce_cartpole_dr_0.5.png b/reinforce_cartpole_dr_0.5.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6cbcb1eb36523e0abee5b6ac64505149124d211
Binary files /dev/null and b/reinforce_cartpole_dr_0.5.png differ
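The README hunk above refers to the CartPole "solved" criterion (average reward of at least 195 over 100 consecutive episodes) and to an evaluation script. As an illustration only, here is a minimal sketch of such an evaluation loop; it is not the repository's evaluation script, and the saved model filename `a2c_cartpole.zip` is a hypothetical placeholder.

```python
# Illustrative sketch of evaluating a trained SB3 A2C agent on CartPole-v1.
# Assumption: the trained model was saved locally as "a2c_cartpole.zip" (hypothetical name).
import gymnasium as gym
from stable_baselines3 import A2C

env = gym.make("CartPole-v1")
model = A2C.load("a2c_cartpole.zip")

episode_rewards = []
for _ in range(100):  # 100 consecutive episodes, per the gym wiki solved criterion
    obs, _ = env.reset()
    total, done = 0.0, False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(int(action))
        total += reward
        done = terminated or truncated
    episode_rewards.append(total)

# Solved if the average reward is >= 195 over these episodes.
print("average reward:", sum(episode_rewards) / len(episode_rewards))
```

SB3 also ships a helper that does the same bookkeeping, `stable_baselines3.common.evaluation.evaluate_policy(model, env, n_eval_episodes=100)`, which returns the mean and standard deviation of the episode rewards.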