"""Setup the CartPole environment
Setup the agent as a simple neural network with:
- One fully connected layer with 128 units and ReLU activation followed by a dropout layer
- One fully connected layer followed by softmax activation
Repeat 500 times:
Reset the environment
Reset the buffer
Repeat until the end of the episode:
Compute action probabilities
Sample the action based on the probabilities and store its probability in the buffer
Step the environment with the action
Compute and store in the buffer the return using gamma=0.99
Normalize the return
Compute the policy loss as -sum(log(prob) * return)
Update the policy using an Adam optimizer and a learning rate of 5e-3
"""
import gym
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
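# Note: this script targets the classic Gym API, where env.reset() returns only the
# observation and env.step() returns (observation, reward, done, info);
# gym>=0.26 and Gymnasium changed both signatures.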
# Create the CartPole environment
env = gym.make("CartPole-v1")
# Reset the environment and get the initial observation
observation = env.reset()
print('observation space:', env.observation_space)
print('action space:', env.action_space)
print('threshold: ', env.spec.reward_threshold)
# Define the agent
# Define model's parameters
observation_size = env.observation_space.shape[0]
hidden_size = 128
n_actions = env.action_space.n
# Define the neural network model
model = nn.Sequential(
    nn.Linear(observation_size, hidden_size),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(hidden_size, n_actions),
    # Softmax over the action dimension (dim=-1 also handles batched inputs)
    nn.Softmax(dim=-1)
)
print(model)
# Agent's parameters
learning_rate = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
gamma = 0.99
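# REINFORCE objective: maximize E[sum_t log(pi(a_t | s_t)) * G_t], where G_t is the
# discounted return from step t; the policy loss below is the negative of this sum.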
final_rewards = []
for _ in range(500):
    # Reset the environment and get the initial observation
    observation = env.reset()
    done = False
    transitions = []
    # Reset the buffer
    buffer = [[], []]
    rewards = []
    # Loop for an episode
    t = 0
    while not done:
        # Compute action probabilities
        # pylint: disable=E1101
        action_proba = model(torch.from_numpy(observation).float())
        # pylint: enable=E1101
        # Sample the action based on the probabilities
        action = np.random.choice(np.array([0, 1]), p=action_proba.data.numpy())
        # Store its probability in the buffer
        buffer[0].append(action_proba.data.numpy())
        # Step the environment with the action
        previous_observation = observation
        observation, reward, done, info = env.step(action)
        transitions.append((previous_observation, action, reward))
        rewards.append(reward)
        t += 1
    # Compute and store in the buffer the discounted return
    # G_t = sum_{i >= t} gamma^(i - t) * r_i
    for t in range(len(transitions)):
        return_agent = 0
        for i in range(t, len(transitions)):
            reward = transitions[i][2]
            return_agent += (gamma ** (i - t)) * reward
        buffer[1].append(return_agent)
    # Normalize the return
    # pylint: disable=E1101
    returns = torch.FloatTensor(buffer[1])
    # pylint: enable=E1101
    returns /= returns.max()
    # Compute the policy loss
    states = torch.Tensor(np.array([state for (state, _, _) in transitions]))
    actions = torch.Tensor([action for (_, action, _) in transitions])
    predictions = model(states)
    # Probability of the action that was actually taken in each state
    probabilities = predictions.gather(dim=1, index=actions.long().view(-1, 1)).squeeze()
    # pylint: disable=E1101
    policy_loss = -torch.sum(torch.log(probabilities) * returns)
    # pylint: enable=E1101
    # Update the policy
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    final_rewards.append(len(rewards))
# Render the environment to visualize the agent's behavior
env.render()
# Plot figure of rewards over episodes
plt.plot(final_rewards)
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.savefig("./images/REINFORCE.png")