Commit 36eabce6 authored by number_cruncher

init

parent fd4d64f2
import gymnasium as gym
from stable_baselines3 import A2C
import wandb
from wandb.integration.sb3 import WandbCallback
from huggingface_sb3 import package_to_hub
# from documentation of wandb
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 25000,
    "env_name": "CartPole-v1",
}
run = wandb.init(
    project="sb3",
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)
env = gym.make("CartPole-v1", render_mode="rgb_array")
model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")
#model = A2C("MlpPolicy", env, )
model.learn(
    total_timesteps=10_000,  # note: config["total_timesteps"] (25000) is only logged, not used here
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    ),
)
#model.learn(total_timesteps=10_000)
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")
    # VecEnv resets automatically
    # if done:
    #     obs = vec_env.reset()
run.finish()
package_to_hub(
    model=model,
    model_name="CartPole-v1",
    model_architecture="A2C",
    env_id="CartPole-v1",
    eval_env=env,
    repo_id="lennartoe/Cartpole-v1",
    commit_message="First commit",
)
\ No newline at end of file
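A note on eval_env: the huggingface_sb3 examples wrap the evaluation environment in a DummyVecEnv with render_mode="rgb_array" so that package_to_hub can record a replay video; passing the raw gym.make environment, as above, may not produce one. A minimal sketch of that wrapping, assuming the call otherwise stays as written:

from stable_baselines3.common.vec_env import DummyVecEnv

# Vectorized evaluation env with rgb_array rendering (the pattern shown in the
# huggingface_sb3 examples); it would then be passed as eval_env=eval_env above.
eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1", render_mode="rgb_array")])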
import gymnasium as gym
import torch
from reinforce_cartpole import Policy
# Create the environment
env = gym.make("CartPole-v1", render_mode="human")
# Reset the environment and get the initial observation
observation = env.reset()[0]
policy = Policy()
# load learned policy
policy.load_state_dict(torch.load('policy.pth', weights_only=True))
policy.eval()
for _ in range(200):
    # sample an action from the learned policy
    action_probs = policy(torch.from_numpy(observation).float())
    action = torch.distributions.Categorical(action_probs).sample()
    # Apply the action to the environment
    # Returns next observation, reward, done signal (indicating
    # if the episode has ended), and an additional info dictionary
    observation, reward, terminated, truncated, info = env.step(action.numpy())
    # Render the environment to visualize the agent's behavior
    env.render()
    if terminated or truncated:
        # Episode ended before the step limit
        break
env.close()
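The rendered rollout above gives only a qualitative impression; averaging the undiscounted return over a handful of episodes gives a quick quantitative check. A sketch under the same setup (the run_episode helper and the episode count of 20 are illustrative, not part of the commit):

def run_episode(env, policy, max_steps=500):
    # Roll out one episode, sampling from the policy, and return the total reward.
    observation, _ = env.reset()
    total = 0.0
    for _ in range(max_steps):
        with torch.no_grad():
            action_probs = policy(torch.from_numpy(observation).float())
        action = torch.distributions.Categorical(action_probs).sample()
        observation, reward, terminated, truncated, _ = env.step(action.item())
        total += reward
        if terminated or truncated:
            break
    return total

eval_env = gym.make("CartPole-v1")  # no rendering, for speed
returns = [run_episode(eval_env, policy) for _ in range(20)]
print(f"mean return over 20 episodes: {sum(returns) / len(returns):.1f}")
eval_env.close()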
import gymnasium as gym
import torch
import numpy as np
class Policy(torch.nn.Module):
    def __init__(self, input_size=4, output_size=2):
        super(Policy, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(128, output_size)
        # dim=0 assumes an unbatched 1-D input (a single observation)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
def main():
    policy = Policy()
    optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)
    # Create the environment
    env = gym.make("CartPole-v1")
    gamma = 0.99
    total_reward = []
    total_loss = []
    epochs = 500
    max_steps = env.spec.max_episode_steps
    for epoch in range(epochs):
        print(epoch)
        # Reset the environment and get the initial observation
        observation = env.reset()[0]
        # Reset the per-episode buffers
        rewards = []
        log_probs = []
        for step in range(max_steps):
            # Query the policy for action probabilities at the current observation
            action_probs = policy(torch.from_numpy(observation).float())
            # Sample an action from the action probabilities
            action = torch.distributions.Categorical(action_probs).sample()
            # Apply the action to the environment
            observation, reward, terminated, truncated, info = env.step(action.numpy())
            # Store the reward and the log-probability of the sampled action
            rewards.append(torch.tensor(reward))
            log_probs.append(torch.log(action_probs[action]))
            if terminated or truncated:
                break
        # transform rewards and log_probs into tensors
        rewards = torch.stack(rewards)
        log_probs = torch.stack(log_probs)
        # Build the discounted return for every timestep:
        # row i holds gamma^j * r_{i+j}, so summing over j gives G_i
        rewards_length = len(rewards)
        rewards_tensor = torch.zeros(rewards_length, rewards_length)
        for i in range(rewards_length):
            for j in range(rewards_length - i):
                rewards_tensor[i, j] = rewards[i + j]
        for i in range(rewards_length):
            for j in range(rewards_length):
                rewards_tensor[i, j] = rewards_tensor[i, j] * gamma ** j
        normalized_rewards = torch.sum(rewards_tensor, dim=1)
        # Standardize the returns before weighting the log-probabilities
        normalized_rewards = normalized_rewards - torch.mean(normalized_rewards)
        normalized_rewards /= torch.std(normalized_rewards)
        loss = -torch.sum(log_probs * normalized_rewards)
        total_reward.append(rewards.sum().item())
        # optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.detach().numpy())
    # save the model weights
    torch.save(policy.state_dict(), "policy.pth")
    print(total_reward)
    print(total_loss)
    env.close()
    # plot the rewards and the loss side by side
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 2)
    ax[0].plot(total_reward)
    ax[1].plot(total_loss)
    plt.show()

if __name__ == "__main__":
    main()
\ No newline at end of file
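The two nested loops above build an n×n matrix just to obtain the discounted return of every timestep; the same quantities can be computed in O(n) with a backward accumulation. A sketch of one possible replacement (the compute_returns name is illustrative, not part of the commit); its output would feed the same standardization and loss as before:

def compute_returns(rewards, gamma=0.99):
    # rewards: 1-D tensor of per-step rewards for one episode.
    # Computes G_t = r_t + gamma * G_{t+1} right-to-left, which matches the
    # row sums of rewards_tensor above.
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns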