Commit 794e74f9 authored by Ghelfi Manon
Upload New File
parent 3bf0c5d0
import gym
import torch
import numpy as np
from torch import nn
import torch.optim as optim
import matplotlib.pyplot as plt
# Create the CartPole environment (classic Gym API, gym < 0.26:
# reset() returns the observation, step() returns 4 values)
env = gym.make("CartPole-v1")

# Setup the agent as a simple neural network with:
# - One fully connected layer with 128 units and ReLU activation followed by a dropout layer
# - One fully connected layer followed by softmax activation
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.layer_1 = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2))
        self.layer_2 = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.Softmax(dim=-1))

    def forward(self, x):
        x = self.layer_1(x)
        x = self.layer_2(x)
        return x
input_size = env.observation_space.shape[0]   # 4 for CartPole
hidden_size = 128
output_size = env.action_space.n               # 2 for CartPole
model = NeuralNetwork(input_size, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=5e-3)
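
# Optional sanity check (not part of the original script, illustrative only):
# pass a dummy observation through the untrained policy and confirm the output
# is a valid probability distribution over the 2 actions.
dummy_obs = torch.zeros(input_size)
with torch.no_grad():
    probs = model(dummy_obs)
print("sanity check - action probabilities:", probs.numpy(), "sum =", probs.sum().item())
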
nb_turn = 500   # number of training episodes
ep_max = 500    # maximum number of steps per episode (CartPole-v1 caps episodes at 500)
total_rewards = []
# Repeat 500 times:
for i in range(nb_turn):
    # Reset the environment
    obs = env.reset()
    state = torch.Tensor(obs)
    total_reward = 0
    # Per-step buffers: probability of the action actually taken, and the
    # discounted return of each step (zero-initialized so that += accumulates correctly)
    buffer_proba = torch.zeros(ep_max, 1)
    buffer_return = torch.zeros(ep_max, 1)
    # Repeat until the end of the episode:
    k = 0
    while k < ep_max:
        action_proba = model(state)
        # Sample an action from the policy's probability distribution
        action = np.random.choice(2, p=action_proba.detach().numpy())
        # Store the probability of the action actually taken (needed for the loss)
        buffer_proba[k] = action_proba[action]
        observation, reward, done, info = env.step(action)
        total_reward += reward
        # Incrementally build the discounted return: the reward collected at
        # step k contributes 0.99**(k-j) to the return of every step j <= k
        for j in range(k + 1):
            buffer_return[j] += 0.99 ** (k - j) * reward
        env.render()
        state = torch.Tensor(observation)
        k += 1
        if done:
            break
    total_rewards.append(total_reward)
    print(f"Episode {i}: {k} steps, total reward = {total_reward}")
    # Keep only the first k elements of the buffers (the steps actually played)
    truncated_buffer_return = buffer_return.narrow(0, 0, k)
    truncated_buffer_proba = buffer_proba.narrow(0, 0, k)
    # Normalize the return
    returns_normalize = (truncated_buffer_return - truncated_buffer_return.mean()) / (truncated_buffer_return.std() + 1e-5)
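    # The update below follows the standard REINFORCE policy-gradient estimator
    # (stated here for reference, not taken from the original comments):
    #     loss = - sum_t log( pi_theta(a_t | s_t) ) * G_t
    # where pi_theta(a_t | s_t) is the stored probability of the action taken
    # at step t and G_t is the (normalized) discounted return from step t.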
    # Compute the policy loss as -sum(log(prob) * return)
    log_probs = torch.log(truncated_buffer_proba)
    policy_loss = -torch.sum(log_probs * returns_normalize)
    # Update the policy using an Adam optimizer and a learning rate of 5e-3
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
plt.plot(range(nb_turn), total_rewards)
plt.show()
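
# Optional follow-up (an assumption, not part of the original script): run one
# greedy episode with the trained policy to check what it has learned. The
# argmax action choice and the loop below are a sketch under the same classic
# Gym API as above.
obs = env.reset()
done = False
eval_reward = 0
model.eval()  # disable dropout for evaluation
while not done:
    with torch.no_grad():
        probs = model(torch.Tensor(obs))
    action = int(torch.argmax(probs))
    obs, reward, done, info = env.step(action)
    eval_reward += reward
print("Greedy evaluation episode reward:", eval_reward)
env.close()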