From 1cc0d50243e7e8a3c4d6c59e12780c8b5b80cbf9 Mon Sep 17 00:00:00 2001
From: Majdi Karim <karim.majdi@etu.ec-lyon.fr>
Date: Tue, 5 Mar 2024 21:31:21 +0000
Subject: [PATCH] Add new file

---
 reinforce_cartpole.py | 76 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 reinforce_cartpole.py

diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000..541eeef
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,76 @@
+# REINFORCE policy-gradient agent for CartPole-v1
+import gymnasium as gym  # provides the (obs, info) reset and 5-value step API used below
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.distributions import Categorical
+
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+state_size = env.observation_space.shape[0]
+action_size = env.action_space.n
+
+# Define the agent neural network model
+class Policy(nn.Module):
+    def __init__(self, state_size, action_size, hidden_size=128):
+        super(Policy, self).__init__()
+        self.fc1 = nn.Linear(state_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
+        self.fc2 = nn.Linear(hidden_size, action_size)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.softmax(x, dim=-1)
+
+policy_model = Policy(state_size, action_size)
+optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
+
+gamma = 0.99
+episodes_rewards = []  # total reward collected in each episode
+
+for i in range(500):
+    # Reset the environment and initialize the per-episode buffers
+    observation, info = env.reset(seed=42)
+    episode_rewards = []
+    log_probabilities = []
+    terminated = False
+    truncated = False
+    # Render the environment to visualize the agent's behavior
+    env.render()
+
+    while not (terminated or truncated):
+        # Get action probabilities from the policy model
+        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
+        action_distribution = Categorical(action_probabilities)
+
+        # Sample an action and keep its log-probability for the policy gradient
+        action = action_distribution.sample()
+        log_probabilities.append(action_distribution.log_prob(action))
+
+        # Take a step in the environment and update the observation
+        observation, reward, terminated, truncated, info = env.step(action.item())
+        episode_rewards.append(reward)
+
+    # Compute the discounted return G_t for every step of the episode
+    returns = []
+    R = 0.0
+    for r in reversed(episode_rewards):
+        R = r + gamma * R
+        returns.insert(0, R)
+
+    # Policy loss: -sum_t log pi(a_t | s_t) * G_t (torch.stack keeps the autograd graph intact)
+    policy_loss = torch.stack([-log_prob * R for log_prob, R in zip(log_probabilities, returns)]).sum()
+    episodes_rewards.append(sum(episode_rewards))
+
+    # Update the policy model
+    optimizer.zero_grad()
+    policy_loss.backward()
+    optimizer.step()
+
+env.close()
--
GitLab