reinforce_cartpole.py
Forked from Dellandrea Emmanuel / MSO_3_4-TD1
Majdi Karim authored
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment and get the initial observation
observation, info = env.reset(seed=42)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Define the agent neural network model
class Policy(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return F.softmax(x, dim=-1)
policy_model = Policy(state_size, action_size)
optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
gamma = 0.99
policy_losses = []
for i in range(500):
    # Reset the environment and initialise the per-episode buffers
    observation, info = env.reset()
    episode_rewards = []
    log_probabilities = []
    terminated = False
    truncated = False
    # Render the environment to visualize the agent's behavior
    env.render()

    while not (terminated or truncated):
        # Get action probabilities from the policy model
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action_distribution = Categorical(action_probabilities)
        # Sample an action and store its log-probability for the policy-gradient update
        action = action_distribution.sample()
        log_probability = action_distribution.log_prob(action)
        log_probabilities.append(log_probability)

        # Take a step in the environment and update the current observation
        observation, reward, terminated, truncated, info = env.step(action.item())
        episode_rewards.append(reward)
    # Compute the discounted return G_t for each step of the episode
    returns = []
    R = 0
    for r in reversed(episode_rewards):
        R = r + gamma * R
        returns.insert(0, R)

    # Compute the policy loss: sum over the episode of -log pi(a_t | s_t) * G_t
    policy_loss = torch.stack([-log_prob * R for log_prob, R in zip(log_probabilities, returns)]).sum()
    policy_losses.append(policy_loss.item())

    # Update the policy model
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
env.close()
# Plot the policy loss against iterations
plt.plot(range(500), policy_losses)
plt.xlabel('Iterations')
plt.ylabel('Policy Loss')
plt.title('Policy Loss vs. Iterations')
plt.show()
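
# --- Optional: a minimal evaluation sketch (not part of the original script) ---
# After training, the learned policy can be rolled out greedily by taking the
# argmax over the action probabilities instead of sampling. This is only a
# sketch under the assumption that it runs right after the training loop above;
# the names `eval_env` and `total_reward` are introduced here, everything else
# comes from the script itself.
eval_env = gym.make("CartPole-v1", render_mode="human")
policy_model.eval()  # disable dropout for evaluation
observation, info = eval_env.reset()
total_reward = 0.0
terminated = truncated = False
with torch.no_grad():
    while not (terminated or truncated):
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action = torch.argmax(action_probabilities).item()  # greedy action
        observation, reward, terminated, truncated, info = eval_env.step(action)
        total_reward += reward
print(f"Evaluation episode reward: {total_reward}")
eval_env.close()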