reinforce_cartpole.py (2.78 KiB)
  • Forked from Dellandrea Emmanuel / MSO_3_4-TD1 (the source project has limited visibility).

    import gymnasium as gym
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.nn.functional as F
    from torch.distributions import Categorical
    import matplotlib.pyplot as plt
    
    
    # Create the environment
    env = gym.make("CartPole-v1", render_mode="human")
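    # Note: "human" render mode opens a window and draws every step, which slows training;
    # render_mode=None (the default) is the usual choice when no visualization is needed.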
    
    # Reset the environment and get the initial observation
    observation, info = env.reset()
    
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
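    # For CartPole-v1: 4 observation values (cart position/velocity, pole angle/angular velocity)
    # and 2 discrete actions (push the cart left or right).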
    # Define the agent neural network model
    class Policy(nn.Module):
        def __init__(self, state_size, action_size, hidden_size=128):
            super(Policy, self).__init__()
            self.fc1 = nn.Linear(state_size, hidden_size)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
            self.fc2 = nn.Linear(hidden_size, action_size)
    
        def forward(self, x):
            x = self.fc1(x)
            x = self.relu(x)
            x = self.dropout(x)
            x = self.fc2(x)
            return F.softmax(x, dim=-1)  # probability for each action
    
    policy_model = Policy(state_size, action_size)
    optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
    
    gamma = 0.99
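    # REINFORCE objective for one episode: maximize sum_t log pi(a_t | s_t) * G_t,
    # where G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... is the discounted return,
    # i.e. minimize the loss L = -sum_t log pi(a_t | s_t) * G_t computed below.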
    episode_losses = []  # one policy-loss value per episode, for plotting
    
    for i in range(500):
        # Reset the environment and initialize the per-episode buffers
        observation, info = env.reset(seed=42)  # the fixed seed gives the same initial state every episode
        episode_rewards = []
        log_probabilities = []
        terminated = False
        truncated = False
        # Render the environment to visualize the agent's behavior
        env.render()
    
        while not (terminated or truncated):
            # Get action probabilities from the policy model
            action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
            action_distribution = Categorical(action_probabilities)
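            # Categorical turns these probabilities into a discrete distribution over the actions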
    
            # Sample an action from the action distribution
            action = action_distribution.sample()
            log_probability = action_distribution.log_prob(action)
            log_probabilities.append(log_probability)
            # Take a step in the environment
            next_observation, reward, terminated, truncated, info = env.step(action.item())
            episode_rewards.append(reward)
    
            # Update observation
            observation = next_observation
    
    
        # Compute the return for the episode
        returns = []
        R = 0
        for r in reversed(episode_rewards):
            R = r + gamma * R
            returns.insert(0, R)
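        # returns[t] now holds the discounted return G_t for time step t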
    
        # Compute the policy loss: -sum_t log pi(a_t | s_t) * G_t
        policy_loss = torch.stack([-log_prob * G for log_prob, G in zip(log_probabilities, returns)]).sum()
        episode_losses.append(policy_loss.item())
        # Update the policy model
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
    
    
    env.close()
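    # Optional sketch (not in the original script): persist the trained weights so the policy
    # can be reloaded later; the filename "reinforce_cartpole.pth" is only an assumption.
    # torch.save(policy_model.state_dict(), "reinforce_cartpole.pth")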
    
    # Plot the policy loss over training episodes
    plt.plot(range(len(episode_losses)), episode_losses)
    plt.xlabel('Episode')
    plt.ylabel('Policy Loss')
    plt.title('Policy Loss vs. Episode')
    plt.show()