import gym
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
# Define the neural network model
class Policy(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Map an observation to a probability distribution over actions
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
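
# A minimal shape check (a sketch, not part of the training script; the
# 4-dimensional input matches CartPole-v1's observation space and the
# 2 outputs its action space):
#
#     policy = Policy(input_size=4, output_size=2)
#     probs = policy(torch.zeros(1, 4))  # shape (1, 2), each row sums to 1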
def reinforce():
    # Create the environment
    env = gym.make("CartPole-v1")

    # Set up the agent
    policy = Policy(
        input_size=env.observation_space.shape[0],
        output_size=env.action_space.n,
    )
    optimizer = optim.Adam(policy.parameters(), lr=5e-3)

    # Training loop
    num_episodes = 500
    gamma = 0.99
    episode_rewards = []

    for episode in tqdm(range(num_episodes)):
        log_probs = []
        rewards = []

        # Reset the environment and get the initial observation
        observation, info = env.reset()
        terminated = truncated = False
        episode_reward = 0

        while not (terminated or truncated):
            # Compute action probabilities for the current observation
            action_probs = policy(torch.FloatTensor(observation).unsqueeze(0))

            # Sample an action and store its log-probability as a graph tensor,
            # so that gradients can flow back into the policy parameters
            action = torch.multinomial(action_probs, num_samples=1).item()
            log_probs.append(torch.log(action_probs[0, action]))

            # Step the environment with the chosen action
            observation, reward, terminated, truncated, info = env.step(action)
            # env.render()  # only works if the env was made with render_mode="human"

            episode_reward += reward
            rewards.append(reward)

        # Compute the discounted return from each step onward
        returns = []
        discounted_sum = 0.0
        for reward in reversed(rewards):
            discounted_sum = reward + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize the returns to reduce the variance of the gradient estimate
        returns = torch.FloatTensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Compute the REINFORCE policy loss
        policy_loss = -torch.sum(torch.stack(log_probs) * returns)

        # Update the policy
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        episode_rewards.append(episode_reward)

    env.close()
    return episode_rewards
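
# For reference, the update in reinforce() follows the standard REINFORCE
# policy-gradient estimator:
#
#     grad_theta J(theta) ≈ sum_t G_t * grad_theta log pi_theta(a_t | s_t)
#
# where G_t is the discounted return from step t onward, normalized per
# episode to reduce the variance of the gradient estimate.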
def plot_rewards(episode_rewards):
    # Plot the total reward across episodes
    plt.plot(episode_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('REINFORCE: Total Reward across Episodes')
    plt.show()
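
# Raw per-episode rewards are noisy; a moving-average view is often easier to
# read. This is an optional sketch (the default window of 20 episodes is an
# arbitrary choice, not part of the original script):
def plot_smoothed_rewards(episode_rewards, window=20):
    rewards = np.asarray(episode_rewards, dtype=float)
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
    else:
        smoothed = rewards
    offset = len(rewards) - len(smoothed)
    plt.plot(rewards, alpha=0.3, label='per episode')
    plt.plot(np.arange(len(smoothed)) + offset, smoothed,
             label='moving average (window={})'.format(window))
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('REINFORCE: Total Reward across Episodes (smoothed)')
    plt.legend()
    plt.show()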
if __name__ == "__main__":
    episode_rewards = reinforce()
    plot_rewards(episode_rewards)