From 08a74dea00a5bf848675057107bbadce8f6c0821 Mon Sep 17 00:00:00 2001
From: number_cruncher <lennart.oestreich@stud.tu-darmstadt.de>
Date: Sun, 16 Mar 2025 19:37:53 +0100
Subject: [PATCH] update readme and comments

---
 README.md                      |  3 ++-
 a2c_sb3_cartpole.py            |  6 ++++--
 a2c_sb3_panda_reach.py         |  4 ++--
 evaluate_reinforce_cartpole.py | 15 +++++++++------
 reinforce_cartpole.py          | 20 +++++++++++---------
 5 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index bedab60..87b6c05 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,8 @@ Now that you have trained your model, it is time to evaluate its performance. Ru
 > 🛠**To be handed in**
 > Implement a script which loads your saved model and use it to solve the cartpole enviroment. Run 100 evaluations and share the final success rate across all evaluations in the `README.md`. Share the code in `evaluate_reinforce_cartpole.py`.
 
-From the openai gym wiki we know that the environment counts as solved when the average reward is greater or equal to 195 for over 100 consecutuve trials.
+Remark: it was not clear to me what "success rate" means in this context, so I made an assumption. The OpenAI Gym wiki states that the environment counts as solved when the average reward over 100 consecutive trials is at least 195. I therefore translate this into a success rate by counting how often, out of 100 evaluation episodes, the summed reward reaches 195 or more.
+With the evaluation script used here, the success rate is 1.0 when episodes are allowed to run for the environment's maximum number of steps.
 
 ## Familiarization with a complete RL pipeline: Application to training a robotic arm
 
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index 7c94adc..701652c 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -20,12 +20,12 @@ run = wandb.init(
     save_code=True,
 )
 
+
+# environment and model setup, adapted from the documentation
 env = gym.make("CartPole-v1", render_mode="rgb_array")
 model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=10_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -34,6 +34,8 @@ for i in range(1000):
 
 run.finish()
 
+
+# push the trained model to the Hugging Face Hub, following the Hugging Face documentation
 package_to_hub(model=model,
                model_name="CartPole-v1",
                model_architecture="A2C",
diff --git a/a2c_sb3_panda_reach.py b/a2c_sb3_panda_reach.py
index 5df3b95..b30b753 100644
--- a/a2c_sb3_panda_reach.py
+++ b/a2c_sb3_panda_reach.py
@@ -21,12 +21,11 @@ run = wandb.init(
     save_code=True,
 )
 
+# environment and model setup, following the documentation
 env = gym.make("PandaReachJointsDense-v3")
 model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=500_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -35,6 +34,7 @@ for i in range(1000):
 
 run.finish()
 
+# push the trained model to the Hugging Face Hub, following the Hugging Face documentation
 package_to_hub(model=model,
                model_name="PandaReachJointsDense-v3",
                model_architecture="A2C",
diff --git a/evaluate_reinforce_cartpole.py b/evaluate_reinforce_cartpole.py
index 1021baf..9b18de6 100644
--- a/evaluate_reinforce_cartpole.py
+++ b/evaluate_reinforce_cartpole.py
@@ -2,8 +2,9 @@ import gymnasium as gym
 import torch
 from reinforce_cartpole import Policy
 
+# evaluate the policy for one episode and return the summed reward
 def eval_policy(eval_length, policy, env):
-    # Reset the environment and get the initial observation
+    # reset the environment and get the initial observation
     observation = env.reset()[0]
 
     rewards = []
@@ -13,12 +14,11 @@
         action = torch.distributions.Categorical(action_probs).sample()
         observation, reward, terminated, truncated, info = env.step(action.numpy())
         rewards.append(reward)
-        # visualize agent behavio
-        #env.render()
         if terminated or truncated:
             break
     return sum(rewards)
-# Create the environment
+
+# create the environment
 env = gym.make("CartPole-v1")
 
 policy = Policy()
@@ -26,15 +26,18 @@ policy = Policy()
 policy.load_state_dict(torch.load('reinforce_cartpole.pth', weights_only=True))
 policy.eval()
 
+# maximum number of steps per evaluation episode
 eval_length = env.spec.max_episode_steps
 num_evals = 100
 number_of_solves = 0
+
 for eval in range(num_evals):
     sum_reward = eval_policy(eval_length, policy, env)
-    print(f"Average reward: {sum_reward}")
+    # count the episode as a success if the summed reward reaches 195
     if sum_reward >= 195:
        number_of_solves += 1
 
-    
+
+# compute the success rate
 success_rate = number_of_solves / num_evals
 print(f"Success rate: {success_rate}")
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index dc01fd7..32da2bc 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -3,8 +3,10 @@ import torch
 import numpy as np
 import matplotlib.pyplot as plt
 
+# modify the dropout rate here
 DROPOUT_RATE = 0.5
 
+# the policy network
 class Policy(torch.nn.Module):
     def __init__(self, input_size=4, output_size=2):
         super(Policy, self).__init__()
@@ -19,34 +21,34 @@ class Policy(torch.nn.Module):
         x = self.relu(x)
         x = self.dropout(x)
         x = self.fc2(x)
-        #print(x)
         x = self.softmax(x)
-        #print(x)
         return x
 
 
 def main():
+
+    # create the policy and optimizer
     policy = Policy()
     optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
 
-    # Create the environment
+    # create the environment
     env = gym.make("CartPole-v1")
 
-    # Reset the environment and get the initial observation
-    gamma = 0.99
 
+    # track the total reward and loss
     total_reward = []
    total_loss = []
     epochs = 500
 
+    # maximum possible steps in the environment
     max_steps = env.spec.max_episode_steps
     for ep in range(epochs):
         print(ep)
 
         observation = env.reset()[0]
-        # rewards = torch.zeros(max_steps)
-        # log_probs = torch.zeros(max_steps)
+
+        # allocate enough space for the rewards and log-probabilities
         rewards = torch.zeros(max_steps)
         log_probs = torch.zeros(max_steps)
 
         for step in range(max_steps):
@@ -55,6 +57,8 @@ def main():
 
             action = torch.distributions.Categorical(action_probs).sample()
             observation, reward, terminated, truncated, info = env.step(action.numpy())
+
+            # store the reward and log-probability for this step
             rewards[step] = reward
             log_probs[step] = torch.log(action_probs[action])
 
@@ -79,8 +83,6 @@ def main():
         loss.backward()
         optimizer.step()
         total_loss.append(loss.detach().numpy())
-        # Render the environment to visualize the agent's behavior
-        #env.render()
 
     # save the model weights
     torch.save(policy.state_dict(), "reinforce_cartpole.pth")
--
GitLab
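
Note on the success-rate definition used in the README remark above: it is the author's stated assumption, not something specified by the assignment. The short sketch below only restates that assumption as a standalone helper; the names success_rate, episode_returns, and threshold are illustrative and are not part of the patched repository.

# illustrative sketch only -- not part of the patch
def success_rate(episode_returns, threshold=195):
    # fraction of episodes whose summed reward reached the "solved" threshold
    return sum(r >= threshold for r in episode_returns) / len(episode_returns)

# example: three of these four episode returns reach 195, so the rate is 0.75
assert success_rate([200.0, 180.0, 500.0, 195.0]) == 0.75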