diff --git a/README.md b/README.md
index bedab60c91db49a191ad723ec4413142f095d35b..87b6c05ac09fdf77eae88f1c41f278ca9114d37f 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,8 @@ Now that you have trained your model, it is time to evaluate its performance. Ru
 
 > 🛠**To be handed in**
 > Implement a script which loads your saved model and use it to solve the cartpole enviroment. Run 100 evaluations and share the final success rate across all evaluations in the `README.md`. Share the code in `evaluate_reinforce_cartpole.py`.
 
-From the openai gym wiki we know that the environment counts as solved when the average reward is greater or equal to 195 for over 100 consecutuve trials.
+Remark: It was not entirely clear to me what "success rate" means in this context, so I made an assumption. The OpenAI Gym wiki states that the environment counts as solved when the average reward over 100 consecutive trials is greater than or equal to 195. To translate this into a success rate, I counted how often, out of 100 evaluation episodes, the summed reward reached at least 195.
+With the evaluation script I used, the success rate is 1.0 when each episode is allowed to run for the maximum number of steps the environment offers.
 
 ## Familiarization with a complete RL pipeline: Application to training a robotic arm
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index 7c94adcbbdf8cb5b3af5c7def98c89e4e7b4bd90..701652c6f414fb742a920f904052bcb51c383be3 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -20,12 +20,12 @@ run = wandb.init(
     save_code=True,
 )
 
 
+
+# environment and A2C model setup, adapted from the documentation
 env = gym.make("CartPole-v1", render_mode="rgb_array")
 model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=10_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -34,6 +34,8 @@ for i in range(1000):
 
 run.finish()
 
+
+# push the trained model to the Hugging Face Hub, adapted from the huggingface_sb3 documentation
 package_to_hub(model=model,
                model_name="CartPole-v1",
                model_architecture="A2C",
diff --git a/a2c_sb3_panda_reach.py b/a2c_sb3_panda_reach.py
index 5df3b95d2996f3081a887ecad2ac85198e978a1a..b30b75381803bc59f5f809f4f79fa00e51888869 100644
--- a/a2c_sb3_panda_reach.py
+++ b/a2c_sb3_panda_reach.py
@@ -21,12 +21,11 @@ run = wandb.init(
     save_code=True,
 )
 
 
+# environment and A2C model setup, adapted from the documentation
 env = gym.make("PandaReachJointsDense-v3")
 model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=500_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -35,6 +34,7 @@ for i in range(1000):
 
 run.finish()
 
+# push the trained model to the Hugging Face Hub, adapted from the huggingface_sb3 documentation
 package_to_hub(model=model,
                model_name="PandaReachJointsDense-v3",
                model_architecture="A2C",
diff --git a/evaluate_reinforce_cartpole.py b/evaluate_reinforce_cartpole.py
index 1021baf702d003289c61a334d91e7d3d87d02e2a..9b18de60a225c8d025b0bbc564749f23e553f743 100644
--- a/evaluate_reinforce_cartpole.py
+++ b/evaluate_reinforce_cartpole.py
@@ -2,8 +2,9 @@ import gymnasium as gym
 import torch
 from reinforce_cartpole import Policy
 
+# roll out one episode with the given policy and return the summed reward
 def eval_policy(eval_length, policy, env):
-    # Reset the environment and get the initial observation
+    # reset the environment and get the initial observation
     observation = env.reset()[0]
     rewards = []
 
@@ -13,12 +14,11 @@ def eval_policy(eval_length, policy, env):
         action = torch.distributions.Categorical(action_probs).sample()
         observation, reward, terminated, truncated, info = env.step(action.numpy())
         rewards.append(reward)
-        # visualize agent behavio
-        #env.render()
         if terminated or truncated:
             break
     return sum(rewards)
 
-# Create the environment
+
+# create the environment
 env = gym.make("CartPole-v1")
 policy = Policy()
@@ -26,15 +26,18 @@ policy = Policy()
 policy.load_state_dict(torch.load('reinforce_cartpole.pth', weights_only=True))
 policy.eval()
 
+# evaluate for at most the environment's maximum number of steps per episode
 eval_length = env.spec.max_episode_steps
 num_evals = 100
 number_of_solves = 0
+
 for eval in range(num_evals):
     sum_reward = eval_policy(eval_length, policy, env)
-    print(f"Average reward: {sum_reward}")
+    # count the episode as solved if the summed reward is at least 195
     if sum_reward >= 195:
         number_of_solves += 1
-
+
+# compute the success rate over all evaluation episodes
 success_rate = number_of_solves / num_evals
 print(f"Success rate: {success_rate}")
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index dc01fd7bc5aad5c71bf1a8115c311dc94b5095e5..32da2bc0bb3f8f9fd2eb2a403de4bebcc2ff65dc 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -3,8 +3,10 @@ import torch
 import numpy as np
 import matplotlib.pyplot as plt
 
+# dropout rate of the policy network; modify it here
 DROPOUT_RATE = 0.5
 
+# policy network: maps a CartPole observation to action probabilities
 class Policy(torch.nn.Module):
     def __init__(self, input_size=4, output_size=2):
         super(Policy, self).__init__()
@@ -19,34 +21,34 @@ class Policy(torch.nn.Module):
         x = self.relu(x)
         x = self.dropout(x)
         x = self.fc2(x)
-        #print(x)
         x = self.softmax(x)
-        #print(x)
         return x
 
 
 def main():
+
+    # create the policy and its optimizer
     policy = Policy()
     optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
 
-    # Create the environment
+    # create the environment
     env = gym.make("CartPole-v1")
 
-    # Reset the environment and get the initial observation
-    gamma = 0.99
+    # track the total reward and loss per episode
     total_reward = []
     total_loss = []
 
     epochs = 500
 
+    # maximum possible number of steps in the environment
     max_steps = env.spec.max_episode_steps
 
     for ep in range(epochs):
         print(ep)
         observation = env.reset()[0]
-        # rewards = torch.zeros(max_steps)
-        # log_probs = torch.zeros(max_steps)
+
+        # allocate enough space for the rewards and log probabilities
         rewards = torch.zeros(max_steps)
         log_probs = torch.zeros(max_steps)
         for step in range(max_steps):
@@ -55,6 +57,8 @@ def main():
 
             action = torch.distributions.Categorical(action_probs).sample()
             observation, reward, terminated, truncated, info = env.step(action.numpy())
+
+            # store the reward and the log probability of the chosen action
             rewards[step] = reward
             log_probs[step] = torch.log(action_probs[action])
 
@@ -79,8 +83,6 @@ def main():
         loss.backward()
         optimizer.step()
         total_loss.append(loss.detach().numpy())
-        # Render the environment to visualize the agent's behavior
-        #env.render()
 
     # save the model weights
     torch.save(policy.state_dict(), "reinforce_cartpole.pth")
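
Note on the success-rate definition adopted in the README change above: the two readings of "solved" can be contrasted in a few lines. The sketch below is illustrative only; `episode_returns` and its placeholder values are hypothetical stand-ins for the summed rewards that `eval_policy` in `evaluate_reinforce_cartpole.py` returns over 100 evaluation episodes.

```python
# Illustrative sketch: `episode_returns` is hypothetical placeholder data
# standing in for the summed rewards of 100 evaluation episodes.
episode_returns = [200.0] * 100

# Gym-wiki reading: the average reward over 100 consecutive trials is >= 195.
solved_by_average = sum(episode_returns) / len(episode_returns) >= 195

# Reading used in evaluate_reinforce_cartpole.py: the fraction of episodes
# whose summed reward reaches at least 195.
success_rate = sum(r >= 195 for r in episode_returns) / len(episode_returns)

print(f"Solved by average-reward criterion: {solved_by_average}")
print(f"Success rate (per-episode criterion): {success_rate}")
```

With the success rate of 1.0 reported in the README, every episode return is at least 195, so the average-reward criterion is satisfied as well and the two readings agree.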