Commit 794e74f9 authored by Ghelfi Manon
Upload New File
parent 3bf0c5d0
import gym
import torch
import numpy as np
from torch import nn
import torch.optim as optim
import matplotlib.pyplot as plt
# Create the CartPole environment (classic Gym API, gym < 0.26:
# reset() returns the observation, step() returns 4 values)
env = gym.make("CartPole-v1")

# Setup the agent as a simple neural network with:
# - One fully connected layer with 128 units and ReLU activation followed by a dropout layer
# - One fully connected layer followed by softmax activation
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.layer_1 = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2))
        self.layer_2 = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.Softmax(dim=-1))

    def forward(self, x):
        x = self.layer_1(x)
        x = self.layer_2(x)
        return x
input_size = env.observation_space.shape[0]   # 4 for CartPole
hidden_size = 128
output_size = env.action_space.n               # 2 for CartPole
model = NeuralNetwork(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=5e-3)
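
# Optional sanity check (not part of the original script, illustrative only):
# pass a dummy observation through the untrained policy and confirm the output
# is a valid probability distribution over the 2 actions.
dummy_obs = torch.zeros(input_size)
with torch.no_grad():
    probs = model(dummy_obs)
print("sanity check - action probabilities:", probs.numpy(), "sum =", probs.sum().item())
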
nb_turn = 500   # number of training episodes
ep_max = 500    # maximum number of steps per episode (CartPole-v1 caps episodes at 500)
total_rewards = []
# Repeat 500 times:
for i in range(nb_turn):
    # Reset the environment
    obs = env.reset()
    state = torch.Tensor(obs)
    total_reward = 0
    # Per-step buffers: probability of the action actually taken, and the
    # discounted return of each step (zero-initialized so that += accumulates correctly)
    buffer_proba = torch.zeros(ep_max, 1)
    buffer_return = torch.zeros(ep_max, 1)
    # Repeat until the end of the episode:
    k = 0
    while k < ep_max:
        action_proba = model(state)
        # Sample an action from the policy's probability distribution
        action = np.random.choice(2, p=action_proba.detach().numpy())
        # Store the probability of the action actually taken (needed for the loss)
        buffer_proba[k] = action_proba[action]
        observation, reward, done, info = env.step(action)
        total_reward += reward
        # Incrementally build the discounted return: the reward collected at
        # step k contributes 0.99**(k-j) to the return of every step j <= k
        for j in range(k + 1):
            buffer_return[j] += 0.99 ** (k - j) * reward
        env.render()
        state = torch.Tensor(observation)
        k += 1
        if done:
            break
    total_rewards.append(total_reward)
    print(f"Episode {i}: {k} steps, total reward = {total_reward}")
    # Keep only the first k elements of the buffers (the steps actually played)
    truncated_buffer_return = buffer_return.narrow(0, 0, k)
    truncated_buffer_proba = buffer_proba.narrow(0, 0, k)
    # Normalize the return
    returns_normalize = (truncated_buffer_return - truncated_buffer_return.mean()) / (truncated_buffer_return.std() + 1e-5)
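    # The update below follows the standard REINFORCE policy-gradient estimator
    # (stated here for reference, not taken from the original comments):
    #     loss = - sum_t log( pi_theta(a_t | s_t) ) * G_t
    # where pi_theta(a_t | s_t) is the stored probability of the action taken
    # at step t and G_t is the (normalized) discounted return from step t.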
    # Compute the policy loss as -sum(log(prob) * return)
    log_probs = torch.log(truncated_buffer_proba)
    policy_loss = -torch.sum(log_probs * returns_normalize)
    # Update the policy using an Adam optimizer and a learning rate of 5e-3
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
plt.plot(range(nb_turn), total_rewards)
plt.show()
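
# Optional follow-up (an assumption, not part of the original script): run one
# greedy episode with the trained policy to check what it has learned. The
# argmax action choice and the loop below are a sketch under the same classic
# Gym API as above.
obs = env.reset()
done = False
eval_reward = 0
model.eval()  # disable dropout for evaluation
while not done:
    with torch.no_grad():
        probs = model(torch.Tensor(obs))
    action = int(torch.argmax(probs))
    obs, reward, done, info = env.step(action)
    eval_reward += reward
print("Greedy evaluation episode reward:", eval_reward)
env.close()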