Add new file

1cc0d502 · Majdi Karim · 3b582a8d · 1cc0d502
Commit 1cc0d502 authored 1 year ago by Majdi Karim
--- a/reinforce_cartpole.py
+++ b/reinforce_cartpole.py
# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment and get the initial observation.
# reset() returns an (observation, info) pair under the API this script uses
# (the training loop unpacks two values from reset() and five from step()),
# so unpack it instead of binding the whole tuple to `observation`.
observation, _ = env.reset()
# Sizes used for the policy network's input layer (length of the observation
# vector) and output layer (number of discrete actions).
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Define the agent neural network model
class Policy(nn.Module):
    """Small MLP policy that maps a state vector to action probabilities.

    Layers, in order: Linear -> ReLU -> Dropout -> Linear -> softmax.

    Args:
        state_size: Number of input features fed to the first linear layer.
        action_size: Number of output units (one probability per action).
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Tensor whose last dimension has size ``state_size``; any
                leading dimensions are treated as batch dimensions.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``action_size`` whose entries sum to 1.
        """
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        # Normalise over the action dimension explicitly. Calling softmax
        # without `dim` is deprecated, and its implicit choice is dim 0 for
        # 3-D inputs, which would normalise across the batch instead.
        return F.softmax(x, dim=-1)
policy_model = Policy(state_size, action_size)
optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
gamma = 0.99  # discount factor applied to future rewards
episodes_rewards = []  # total (undiscounted) reward collected in each episode
for i in range(500):
    # Reset the environment and the per-episode buffers.
    # Seed only the first reset: re-seeding with the same value on every
    # reset would restart each episode from the identical initial state.
    observation, info = env.reset(seed=42 if i == 0 else None)
    episode_rewards = []
    log_probabilities = []
    episode_over = False
    # Render the environment to visualize the agent's behavior
    env.render()
    while not episode_over:
        # Get action probabilities from the policy model
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action_distribution = Categorical(action_probabilities)
        # Sample an action and keep its log-probability; it stays attached to
        # the autograd graph so the loss below can be back-propagated.
        action = action_distribution.sample()
        log_probabilities.append(action_distribution.log_prob(action))
        print(int(action.item()))
        # Take a step in the environment
        observation, reward, terminated, truncated, info = env.step(action.item())
        episode_rewards.append(reward)
        # Stop when the environment reports either end-of-episode flag.
        # Without this update the loop condition never changes and the first
        # episode would run forever.
        episode_over = terminated or truncated
    # Compute the discounted return G_t = r_t + gamma * G_{t+1} for every step,
    # walking the rewards backwards and reversing once at the end (avoids the
    # quadratic cost of inserting at the front of the list).
    returns = []
    R = 0
    for r in reversed(episode_rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()
    # Compute the policy loss: -sum_t log pi(a_t | s_t) * G_t.
    # torch.stack keeps the log-probabilities connected to the graph;
    # wrapping them in torch.tensor([...]) would build a detached tensor and
    # backward() would fail because nothing requires grad.
    returns_tensor = torch.tensor(returns, dtype=torch.float32)
    policy_loss = -(torch.stack(log_probabilities) * returns_tensor).sum()
    # Track the episode's total reward (the list's name says rewards; the
    # previous code stored the negated loss here instead).
    episodes_rewards.append(sum(episode_rewards))
    # Update the policy model
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
env.close()