Commit 3b582a8d authored by Majdi Karim

Add new file

parent 1da23e76
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment and get the initial observation
observation, info = env.reset()
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
# Define the agent neural network model
class Policy(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)  # Adjust dropout probability as needed
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        # Specify the softmax dimension explicitly (calling it without dim is deprecated)
        return F.softmax(x, dim=-1)
policy_model = Policy(state_size, action_size)
optimizer = optim.Adam(policy_model.parameters(), lr=5e-3)
gamma = 0.99
episodes_rewards = []
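# Illustrative sanity check (an addition, not part of the original commit):
# even before training, the policy should map a state to a valid probability
# distribution over the environment's actions.
with torch.no_grad():
    example_probs = policy_model(torch.zeros(state_size))
print(example_probs, float(example_probs.sum()))  # two probabilities summing to ~1.0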
for i in range(500):
    # Reset the environment and initialise per-episode buffers.
    # Note: reseeding with the same seed every episode makes each episode start
    # from an identical initial state.
    observation, info = env.reset(seed=42)
    episode_rewards = []
    log_probabilities = []
    terminated = False
    truncated = False
    # With render_mode="human" the environment window is rendered automatically on each step
    while not (terminated or truncated):
        # Get action probabilities from the policy model
        action_probabilities = policy_model(torch.tensor(observation, dtype=torch.float32))
        action_distribution = Categorical(action_probabilities)
        # Sample an action from the action distribution
        action = action_distribution.sample()
        log_probability = action_distribution.log_prob(action)
        log_probabilities.append(log_probability)
        # Take a step in the environment
        next_observation, reward, terminated, truncated, info = env.step(action.item())
        episode_rewards.append(reward)
        # Update observation
        observation = next_observation
    # Compute the discounted return for each step of the episode
    returns = []
    R = 0
    for r in reversed(episode_rewards):
        R = r + gamma * R
        returns.insert(0, R)
    # Compute the policy loss; torch.stack keeps the computation graph intact
    # (wrapping the terms in a new torch.tensor would detach them from the policy)
    policy_loss = torch.stack([-log_prob * R for log_prob, R in zip(log_probabilities, returns)]).sum()
    # Track the total (undiscounted) reward collected in this episode
    episodes_rewards.append(sum(episode_rewards))
    # Update the policy model
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
env.close()
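# matplotlib is imported above but never used; the lines below are a minimal
# sketch (an addition, not part of the original commit) that plots the total
# reward collected per episode as a rough learning curve.
plt.plot(episodes_rewards)
plt.xlabel("Episode")
plt.ylabel("Total reward")
plt.title("REINFORCE on CartPole-v1")
plt.show()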