diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000000000000000000000000000000000000..c732e8588847e62a8fe552b4973550dd754c65a3
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,106 @@
+import gym
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+
+# Define the policy network: maps an observation to action probabilities
+class Policy(nn.Module):
+    def __init__(self, input_size, output_size, hidden_size=128):
+        super(Policy, self).__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.5)
+        self.fc2 = nn.Linear(hidden_size, output_size)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        x = self.softmax(x)
+        return x
+
+
+def reinforce():
+    # Create the environment
+    env = gym.make("CartPole-v1")
+
+    # Set up the agent
+    policy = Policy(
+        input_size=env.observation_space.shape[0],
+        output_size=env.action_space.n
+    )
+    optimizer = optim.Adam(policy.parameters(), lr=5e-3)
+
+    # Training loop
+    num_episodes = 500
+    gamma = 0.99
+    episode_rewards = []
+
+    for episode in tqdm(range(num_episodes)):
+        action_probabilities = []  # probabilities of the actions taken, kept in the autograd graph
+        rewards = []               # raw rewards collected during the episode
+
+        # Reset the environment and get the initial observation
+        observation = env.reset()[0]
+
+        terminated = False
+        truncated = False
+        episode_reward = 0
+
+        while not (terminated or truncated):
+            # Compute action probabilities for the current observation
+            action_probs = policy(torch.FloatTensor(observation).unsqueeze(0))
+
+            # Sample an action from the policy's distribution
+            action = torch.multinomial(action_probs, num_samples=1).item()
+
+            # Step the environment with the sampled action
+            observation, reward, terminated, truncated, info = env.step(action)
+
+            episode_reward += reward
+            rewards.append(reward)
+            # Store the probability of the chosen action (as a tensor, so gradients can flow back to the policy)
+            action_probabilities.append(action_probs[0, action])
+
+        # Compute discounted returns-to-go: G_t = r_t + gamma * G_{t+1}
+        returns = []
+        G = 0.0
+        for reward in reversed(rewards):
+            G = reward + gamma * G
+            returns.insert(0, G)
+        returns = torch.FloatTensor(returns)
+
+        # Normalize the returns to reduce gradient variance
+        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
+
+        # Compute the REINFORCE policy loss: -sum_t log pi(a_t | s_t) * G_t
+        log_probs = torch.log(torch.stack(action_probabilities))
+        policy_loss = -torch.sum(log_probs * returns)
+
+        # Update the policy
+        optimizer.zero_grad()
+        policy_loss.backward()
+        optimizer.step()
+
+        episode_rewards.append(episode_reward)
+
+    return episode_rewards
+
+
+def plot_rewards(episode_rewards):
+    # Plot the total reward across episodes
+    plt.plot(episode_rewards)
+    plt.xlabel('Episode')
+    plt.ylabel('Total Reward')
+    plt.title('REINFORCE: Total Reward across Episodes')
+    plt.show()
+
+
+if __name__ == "__main__":
+    episode_rewards = reinforce()
+    plot_rewards(episode_rewards)
\ No newline at end of file
diff --git a/test.ipynb b/test.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391