From 794e74f9c793e52889ed172c0aeb9e80ac6016c2 Mon Sep 17 00:00:00 2001
From: Ghelfi Manon <manon.ghelfi@ecl19.ec-lyon.fr>
Date: Tue, 7 Feb 2023 13:47:27 +0000
Subject: [PATCH] Upload New File

---
 reinforce_cartpole.py | 85 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 reinforce_cartpole.py

diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000..e516813
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,85 @@
+import gym
+import torch
+import numpy as np
+from torch import nn
+import torch.optim as optim
+import matplotlib.pyplot as plt
+
+# Create the CartPole environment
+# Note: this script uses the classic Gym API (pre-0.26), where reset() returns
+# only the observation and step() returns (observation, reward, done, info).
+env = gym.make("CartPole-v1")
+
+
+# Setup the agent as a simple neural network with:
+# - One fully connected layer with 128 units and ReLU activation followed by a dropout layer
+# - One fully connected layer followed by softmax activation
+
+class NeuralNetwork(nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(NeuralNetwork, self).__init__()
+        self.layer_1 = nn.Sequential(
+            nn.Linear(input_size, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.2))
+        self.layer_2 = nn.Sequential(
+            nn.Linear(hidden_size, output_size),
+            nn.Softmax(dim=-1))
+
+    def forward(self, x):
+        x = self.layer_1(x)
+        x = self.layer_2(x)
+        return x
+
+input_size = env.observation_space.shape[0]  # 4 observations for CartPole
+hidden_size = 128
+output_size = env.action_space.n  # 2 actions: push left or push right
+model = NeuralNetwork(input_size, hidden_size, output_size)
+# Update the policy using an Adam optimizer and a learning rate of 5e-3
+optimizer = optim.Adam(model.parameters(), lr=5e-3)
+nb_turn = 500  # number of training episodes
+ep_max = 500  # maximum number of steps per episode
+total_rewards = []
+# Repeat 500 times:
+for i in range(nb_turn):
+    # Reset the environment
+    obs = env.reset()
+    state = torch.tensor(obs, dtype=torch.float32)
+    total_reward = 0
+    # Buffers for the probability of the chosen action and the return of each step
+    buffer_proba = torch.zeros(ep_max, 1)
+    buffer_return = torch.zeros(ep_max, 1)
+    # Repeat until the end of the episode:
+    k = 0
+    while k < ep_max:
+        # Compute the action probabilities and sample an action from them
+        action_proba = model(state)
+        action = np.random.choice(output_size, p=action_proba.detach().numpy())
+        # Store the probability of the action actually taken
+        buffer_proba[k] = action_proba[action]
+        observation, reward, done, info = env.step(action)
+        total_reward += reward
+        # Add the discounted reward to the return of every step up to and including this one
+        for j in range(k + 1):
+            buffer_return[j] += 0.99 ** (k - j) * reward
+        env.render()
+        state = torch.tensor(observation, dtype=torch.float32)
+        k += 1
+        if done:
+            break
+    total_rewards.append(total_reward)
+    # Normalize the return
+    # Keep only the first k elements of the buffers (the steps actually played)
+    truncated_buffer_return = buffer_return.narrow(0, 0, k)
+    truncated_buffer_proba = buffer_proba.narrow(0, 0, k)
+    returns_normalize = (truncated_buffer_return - truncated_buffer_return.mean()) / (truncated_buffer_return.std() + 1e-5)
+    # Compute the policy loss as -sum(log(prob) * return)
+    log_probs = torch.log(truncated_buffer_proba)
+    policy_loss = -torch.sum(torch.multiply(log_probs, returns_normalize))
+    # Update the policy
+    optimizer.zero_grad()
+    policy_loss.backward()
+    optimizer.step()
+
+plt.plot(range(nb_turn), total_rewards)
+plt.show()
--
GitLab