From 1cc0d50243e7e8a3c4d6c59e12780c8b5b80cbf9 Mon Sep 17 00:00:00 2001
From: Majdi Karim <karim.majdi@etu.ec-lyon.fr>
Date: Tue, 5 Mar 2024 21:31:21 +0000
Subject: [PATCH] Add new file

---
 reinforce_cartpole.py | 76 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 reinforce_cartpole.py

diff --git a/reinforce_cartpole.py b/reinforce_cartpole.py
new file mode 100644
index 0000000..541eeef
--- /dev/null
+++ b/reinforce_cartpole.py
@@ -0,0 +1,76 @@
+# REINFORCE policy-gradient agent for CartPole-v1
+import gymnasium as gym  # provides the (obs, info) reset and 5-value step API used below
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torch.distributions import Categorical
+
+# Create the environment
+env = gym.make("CartPole-v1", render_mode="human")
+
+state_size = env.observation_space.shape[0]
+action_size = env.action_space.n
+
+# Define the agent neural network model
+class Policy(nn.Module):
+    def __init__(self, state_size, action_size, hidden_size=128):
+        super(Policy, self).__init__()
+        self.fc1 = nn.Linear(state_size, hidden_size)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
+        self.fc2 = nn.Linear(hidden_size, action_size)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        x = self.fc2(x)
+        return F.softmax(x, dim=-1)
+
+policy_model = Policy(state_size, action_size)
+optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
+
+gamma = 0.99
+episodes_rewards = []  # total reward collected in each episode
+
+for i in range(500):
+    # Reset the environment and initialize the per-episode buffers
+    observation, info = env.reset(seed=42)
+    episode_rewards = []
+    log_probabilities = []
+    terminated = False
+    truncated = False
+    # Render the environment to visualize the agent's behavior
+    env.render()
+
+    while not (terminated or truncated):
+        # Get action probabilities from the policy model
+        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
+        action_distribution = Categorical(action_probabilities)
+
+        # Sample an action and keep its log-probability for the policy gradient
+        action = action_distribution.sample()
+        log_probabilities.append(action_distribution.log_prob(action))
+
+        # Take a step in the environment and update the observation
+        observation, reward, terminated, truncated, info = env.step(action.item())
+        episode_rewards.append(reward)
+
+    # Compute the discounted return G_t for every step of the episode
+    returns = []
+    R = 0.0
+    for r in reversed(episode_rewards):
+        R = r + gamma * R
+        returns.insert(0, R)
+
+    # Policy loss: -sum_t log pi(a_t | s_t) * G_t (torch.stack keeps the autograd graph intact)
+    policy_loss = torch.stack([-log_prob * R for log_prob, R in zip(log_probabilities, returns)]).sum()
+    episodes_rewards.append(sum(episode_rewards))
+
+    # Update the policy model
+    optimizer.zero_grad()
+    policy_loss.backward()
+    optimizer.step()
+
+env.close()
--
GitLab