diff --git a/a2c_sb3_cartpole.py b/a2c_sb3_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee1087d0bd9f61ca3a469f271ca00453a3f42ad0
--- /dev/null
+++ b/a2c_sb3_cartpole.py
@@ -0,0 +1,47 @@
+import gymnasium as gym
+
+from stable_baselines3 import A2C
+import wandb
+from wandb.integration.sb3 import WandbCallback
+from huggingface_sb3 import package_to_hub
+
+
+# from documentation of wandb
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 25000,
+    "env_name": "CartPole-v1",
+}
+run = wandb.init(
+    project="sb3",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=True,
+)
+
+env = gym.make("CartPole-v1", render_mode="rgb_array")
+
+model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
+#model = A2C("MlpPolicy", env, )
+model.learn(total_timesteps=10_000, callback=WandbCallback(gradient_save_freq=100,model_save_path=f"models/{run.id}",verbose=2,),)
+#model.learn(total_timesteps=10_000)
+vec_env = model.get_env()
+obs = vec_env.reset()
+for i in range(1000):
+    action, _state = model.predict(obs, deterministic=True)
+    obs, reward, done, info = vec_env.step(action)
+    vec_env.render("human")
+    # VecEnv resets automatically
+    # if done:
+    #   obs = vec_env.reset()
+
+run.finish()
+
+package_to_hub(model=model,
+               model_name="CartPole-v1",
+               model_architecture="A2C",
+               env_id="CartPole-v1",
+               eval_env=env,
+               repo_id="lennartoe/Cartpole-v1",
+               commit_message="First commit")
\ No newline at end of file
diff --git a/evaluate_reinforce_cartpole.py.py b/evaluate_reinforce_cartpole.py.py
new file mode 100644
index 0000000000000000000000000000000000000000..01479f7b7a18bf02be0ba4a09f0d5bb078e093fc
--- /dev/null
+++ b/evaluate_reinforce_cartpole.py.py
@@ -0,0 +1,32 @@
+import gymnasium as gym
+import torch
+from reinforce_cartpole import Policy
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+# Reset the environment and get the initial observation
+observation = env.reset()[0]
+
+policy = Policy()
+# load learned policy
+policy.load_state_dict(torch.load('policy.pth', weights_only=True))
+policy.eval()
+
+for _ in range(200):
+    # sample action from policy
+    print(observation)
+    print(torch.from_numpy(observation).float())
+    action_probs = policy(torch.from_numpy(observation).float())
+    action = torch.distributions.Categorical(action_probs).sample()
+    # Apply the action to the environment
+    # Returns next observation, reward, done signal (indicating
+    # if the episode has ended), and an additional info dictionary
+    observation, reward, terminated, truncated, info = env.step(action.numpy())
+    # Render the environment to visualize the agent's behavior
+    env.render()
+    print(terminated or truncated)
+    if terminated or truncated:
+        # Terminated before max step
+        break
+
+env.close()
diff --git a/policy.pth b/policy.pth
new file mode 100644
index 0000000000000000000000000000000000000000..970f5c065a6275f6d9a5038936d3e7bff9adc6ac
Binary files /dev/null and b/policy.pth differ
diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..50be37c67a30fb2a5c723903fc7cf67482403427
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,121 @@
+import gymnasium as gym
+import torch
+import numpy as np
+
+class Policy(torch.nn.Module):
+    def __init__(self, input_size=4, output_size=2):
+        super(Policy, self).__init__()
+        self.fc1 = torch.nn.Linear(input_size, 128)
+        self.relu = torch.nn.ReLU()
+        self.dropout = torch.nn.Dropout(0.2)
+        self.fc2 = torch.nn.Linear(128, output_size)
+        self.softmax = torch.nn.Softmax(dim=0)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        #print(x)
+        x = self.softmax(x)
+        #print(x)
+        return x
+
+
+def main():
+    policy = Policy()
+    optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
+
+    # Create the environment
+    env = gym.make("CartPole-v1")
+
+    # Reset the environment and get the initial observation
+
+    gamma = 0.99
+    total_reward = []
+    total_loss = []
+    epochs = 500
+
+    max_steps = env.spec.max_episode_steps
+
+    for _ in range(epochs):
+        print(_)
+        # Reset the environment
+        observation = env.reset()[0]
+        # Reset buffer
+        # rewards = torch.zeros(max_steps)
+        # log_probs = torch.zeros(max_steps)
+        rewards = []
+        log_probs = []
+        for step in range(max_steps):
+            # Select a random action from the action space
+            #print(observation)
+            action_probs = policy(torch.from_numpy(observation).float())
+
+            # Sample an action from the action probabilities
+            action = torch.distributions.Categorical(action_probs).sample()
+            #print("Action")
+            #print(action)
+            # Apply the action to the environment
+            observation, reward, terminated, truncated, info = env.step(action.numpy())
+            #print(observation)
+            # env.render()
+            # does this come before adding to the rewards or after
+
+            # rewards[step] = reward
+            # log_probs[step] = torch.log(action_probs[action])
+            rewards.append(torch.tensor(reward))
+            log_probs.append(torch.log(action_probs[action]))
+
+            if terminated or truncated:
+                break
+
+        # apply gamma
+        # transform rewards and log_probs into tensors
+        rewards = torch.stack(rewards)
+        log_probs = torch.stack(log_probs)
+        rewards_length = len(rewards)
+        rewards_tensor = torch.zeros(rewards_length, rewards_length)
+        for i in range(rewards_length):
+            for j in range(rewards_length-i):
+                rewards_tensor[i,j] = rewards[i+j]
+        #print(rewards_tensor)
+        for i in range(rewards_length):
+            for j in range(rewards_length):
+                rewards_tensor[i,j] = rewards_tensor[i,j] * np.pow(gamma,j)
+        #print(rewards_tensor)
+        normalized_rewards = torch.sum(rewards_tensor, dim=1)
+        #print(normalized_rewards)
+        normalized_rewards = normalized_rewards - torch.mean(normalized_rewards)
+        normalized_rewards /= torch.std(normalized_rewards)
+
+
+        loss = -torch.sum(log_probs * normalized_rewards)
+        total_reward.append(sum(rewards))
+        # optimize
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        total_loss.append(loss.detach().numpy())
+        # Render the environment to visualize the agent's behavior
+        #env.render()
+
+    # save the model weights
+    torch.save(policy.state_dict(), "policy.pth")
+
+
+    print(total_reward)
+    print(total_loss)
+    env.close()
+
+    # plot the rewards and the loss side by side
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots(1,2)
+    ax[0].plot(total_reward)
+    ax[1].plot(total_loss)
+    plt.show()
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
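
Note on the return computation in reinforce_cartpole.py: the two nested loops over rewards_tensor build the discounted return G_t = sum_k gamma^k * r_(t+k) for every step t in O(T^2) time and memory. A minimal equivalent sketch, not part of the commit, that accumulates the same returns in a single reverse pass over the same list of scalar reward tensors:

    import torch

    def discounted_returns(rewards, gamma=0.99):
        # Sketch only (not in the repo): G_t = r_t + gamma * G_{t+1}, accumulated
        # back to front; produces the same values as the rewards_tensor double loop.
        returns = torch.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            returns[t] = running
        # same normalization as in main(): zero mean, unit (unbiased) std
        return (returns - returns.mean()) / returns.std()

    # example: five steps of reward 1.0, as CartPole gives per step
    print(discounted_returns([torch.tensor(1.0)] * 5))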