From 08a74dea00a5bf848675057107bbadce8f6c0821 Mon Sep 17 00:00:00 2001
From: number_cruncher <lennart.oestreich@stud.tu-darmstadt.de>
Date: Sun, 16 Mar 2025 19:37:53 +0100
Subject: [PATCH] update readme and comments

---
 README.md                      |  3 ++-
 a2c_sb3_cartpole.py            |  6 ++++--
 a2c_sb3_panda_reach.py         |  4 ++--
 evaluate_reinforce_cartpole.py | 15 +++++++++------
 reinforce_cartpole.py          | 20 +++++++++++---------
 5 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index bedab60..87b6c05 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,8 @@ Now that you have trained your model, it is time to evaluate its performance. Ru
 > 🛠 **To be handed in**
 > Implement a script which loads your saved model and use it to solve the cartpole enviroment. Run 100 evaluations and share the final success rate across all evaluations in the `README.md`. Share the code in `evaluate_reinforce_cartpole.py`.
 
-From the openai gym wiki we know that the environment counts as solved when the average reward is greater or equal to 195 for over 100 consecutuve trials.
+Remark: It was not clear to me what "success rate" means in this context, so I made some assumptions. The OpenAI Gym wiki states that the environment counts as solved when the average reward over 100 consecutive trials is at least 195. To translate this into a success rate, I count how many of the 100 evaluation episodes reach a summed reward of at least 195 and divide by 100.
+
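+In code, the counting looks roughly like this (a minimal sketch mirroring `evaluate_reinforce_cartpole.py`; `eval_policy` runs a single episode and returns its summed reward, and `policy`, `env`, and `eval_length` are set up earlier in that script):
+
+```python
+num_evals = 100
+number_of_solves = 0
+for _ in range(num_evals):
+    episode_return = eval_policy(eval_length, policy, env)  # summed reward of one episode
+    if episode_return >= 195:  # threshold taken from the gym wiki
+        number_of_solves += 1
+success_rate = number_of_solves / num_evals  # fraction of episodes reaching the threshold
+```
+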
 With the evaluation script I used, the success rate is 1.0 when episodes are allowed to run for the maximum number of steps the environment offers.
 
 ## Familiarization with a complete RL pipeline: Application to training a robotic arm
diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
index 7c94adc..701652c 100644
--- a/a2c_sb3_cartpole.py
+++ b/a2c_sb3_cartpole.py
@@ -20,12 +20,12 @@ run = wandb.init(
     save_code=True,
 )
 
+
+# create the environment (setup taken from the documentation)
 env = gym.make("CartPole-v1", render_mode="rgb_array")
 
 model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=10_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -34,6 +34,8 @@ for i in range(1000):
 
 run.finish()
 
+
+# upload the trained model to the Hugging Face Hub (from the Hugging Face documentation)
 package_to_hub(model=model, 
                model_name="CartPole-v1",
                model_architecture="A2C",
diff --git a/a2c_sb3_panda_reach.py b/a2c_sb3_panda_reach.py
index 5df3b95..b30b753 100644
--- a/a2c_sb3_panda_reach.py
+++ b/a2c_sb3_panda_reach.py
@@ -21,12 +21,11 @@ run = wandb.init(
     save_code=True,
 )
 
+# create the environment (setup taken from the documentation)
 env = gym.make("PandaReachJointsDense-v3")
 
 model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
-#model = A2C("MlpPolicy", env, )
 model.learn(total_timesteps=500_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
-#model.learn(total_timesteps=10_000)
 vec_env = model.get_env()
 obs = vec_env.reset()
 for i in range(1000):
@@ -35,6 +34,7 @@ for i in range(1000):
 
 run.finish()
 
+# upload the trained model to the Hugging Face Hub (from the Hugging Face documentation)
 package_to_hub(model=model, 
                model_name="PandaReachJointsDense-v3",
                model_architecture="A2C",
diff --git a/evaluate_reinforce_cartpole.py b/evaluate_reinforce_cartpole.py
index 1021baf..9b18de6 100644
--- a/evaluate_reinforce_cartpole.py
+++ b/evaluate_reinforce_cartpole.py
@@ -2,8 +2,9 @@ import gymnasium as gym
 import torch
 from reinforce_cartpole import Policy
 
+# run one evaluation episode with the given policy and return the summed reward
 def eval_policy(eval_length, policy, env):
-    # Reset the environment and get the initial observation
+    # reset the environment and get the initial observation
     observation = env.reset()[0]
     rewards = []
 
@@ -13,12 +14,11 @@ def eval_policy(eval_length, policy, env):
         action = torch.distributions.Categorical(action_probs).sample()
         observation, reward, terminated, truncated, info = env.step(action.numpy())
         rewards.append(reward)
-        # visualize agent behavio
-        #env.render()
         if terminated or truncated: 
             break
     return sum(rewards)
-# Create the environment
+
+# create the environment
 env = gym.make("CartPole-v1")
 
 policy = Policy()
@@ -26,15 +26,18 @@ policy = Policy()
 policy.load_state_dict(torch.load('reinforce_cartpole.pth', weights_only=True))
 policy.eval()
 
+# maximum number of steps per episode
 eval_length = env.spec.max_episode_steps
 num_evals = 100
 number_of_solves = 0
+
 for eval in range(num_evals):
     sum_reward = eval_policy(eval_length, policy, env)
-    print(f"Average reward: {sum_reward}")
+    # count the episode as solved if its summed reward is at least 195
     if sum_reward >= 195:
         number_of_solves += 1
-    
+
+# compute the success rate over all evaluations
 success_rate = number_of_solves / num_evals
 print(f"Success rate: {success_rate}")
 
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
index dc01fd7..32da2bc 100644
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
@@ -3,8 +3,10 @@ import torch
 import numpy as np
 import matplotlib.pyplot as plt
 
+# dropout rate of the policy network (adjust here if needed)
 DROPOUT_RATE = 0.5
 
+# policy network: a small MLP with dropout that maps the 4-dimensional observation to action probabilities
 class Policy(torch.nn.Module):
     def __init__(self, input_size=4, output_size=2):
         super(Policy, self).__init__()
@@ -19,34 +21,34 @@ class Policy(torch.nn.Module):
         x = self.relu(x)
         x = self.dropout(x)
         x = self.fc2(x)
-        #print(x)
         x = self.softmax(x)
-        #print(x)
         return x
 
 
 def main():
+
+    # instantiate the policy and its optimizer
     policy = Policy()
     optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
 
-    # Create the environment
+    # create the environment
     env = gym.make("CartPole-v1")
 
-    # Reset the environment and get the initial observation
-
     gamma = 0.99
+    # track the loss and summed reward of each episode
     total_reward = []
     total_loss = []
     epochs = 500
 
+    # maximum possible number of steps per episode in the environment
     max_steps = env.spec.max_episode_steps
 
 
     for ep in range(epochs):
         print(ep)
         observation = env.reset()[0]
-        # rewards = torch.zeros(max_steps)
-        # log_probs = torch.zeros(max_steps)
+
+        # allocate space for this episode's rewards and log_probs
         rewards = torch.zeros(max_steps)
         log_probs = torch.zeros(max_steps)
         for step in range(max_steps):
@@ -55,6 +57,8 @@ def main():
 
             action = torch.distributions.Categorical(action_probs).sample()
             observation, reward, terminated, truncated, info = env.step(action.numpy())
+
+            # store the reward and the log-probability of the chosen action
             rewards[step] = reward
             log_probs[step] = torch.log(action_probs[action])
 
@@ -79,8 +83,6 @@ def main():
         loss.backward()
         optimizer.step()
         total_loss.append(loss.detach().numpy())
-        # Render the environment to visualize the agent's behavior
-        #env.render()
 
     # save the model weights
     torch.save(policy.state_dict(), "reinforce_cartpole.pth")
-- 
GitLab