@@ -176,6 +176,8 @@ Use the documentation of [Stable-Baselines3](https://stable-baselines3.readthedocs.io/)
> ⚠️ **Warning**
> Make sure the run is public! If that is not possible (due to restrictions on your account), create a WandB [report](https://docs.wandb.ai/guides/reports/create-a-report/), add all relevant graphs and any textual descriptions or explanations you find pertinent, then download it as a PDF (landscape format) and upload it along with the code to GitLab. Arrange the plots so they remain readable in the PDF (e.g., one graph per row, correct axes), and specify which report corresponds to which experiment.
LINK: https://wandb.ai/lennartecl-centrale-lyon/sb3?nw=nwuserlennartecl
### Full workflow with panda-gym
[Panda-gym](https://github.com/qgallouedec/panda-gym) is a collection of environments for robotic simulation and control. It provides a range of challenges for training robotic agents in a simulated environment. In this section, you will get familiar with one of the environments provided by panda-gym, the `PandaReachJointsDense-v3`. The objective is to learn how to reach any point in 3D space by directly controlling the robot's articulations.
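Below is a minimal training sketch for this environment, assuming panda-gym (v3) is installed so that importing `panda_gym` registers the environments; the choice of A2C, `MultiInputPolicy` (needed for the dictionary observations), and the timestep budget are illustrative placeholders, not requirements taken from the subject.

import gymnasium as gym
import panda_gym  # assumption: installed; registers the Panda environments on import
from stable_baselines3 import A2C

# Create the dense-reward reach task controlled directly in joint space
env = gym.make("PandaReachJointsDense-v3")

# Train a first agent; algorithm and timestep budget are placeholders
model = A2C("MultiInputPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)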
# Training script: A2C on CartPole-v1 with Weights & Biases logging,
# then upload of the trained model to the Hugging Face Hub.
import gymnasium as gym
import wandb
from huggingface_sb3 import package_to_hub
from stable_baselines3 import A2C
from wandb.integration.sb3 import WandbCallback

# Run configuration, following the wandb SB3 integration documentation
config = {
    "policy_type": "MlpPolicy",
    "total_timesteps": 25000,
    "env_name": "CartPole-v1",
}
run = wandb.init(
    project="sb3",
    config=config,
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)

# Create the environment and the A2C model, logging TensorBoard data for this run
env = gym.make("CartPole-v1", render_mode="rgb_array")
model = A2C("MlpPolicy", env, verbose=1, tensorboard_log=f"runs/{run.id}")

# Train the model and stream metrics to Weights & Biases
model.learn(
    total_timesteps=10_000,
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_path=f"models/{run.id}",
        verbose=2,
    ),
)

# Roll out the trained policy for visual inspection
vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")
    # VecEnv resets automatically when an episode ends

run.finish()

# Upload the trained model to the Hugging Face Hub
package_to_hub(
    model=model,
    model_name="CartPole-v1",
    model_architecture="A2C",
    env_id="CartPole-v1",
    eval_env=env,
    repo_id="lennartoe/Cartpole-v1",
    commit_message="First commit",
)
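Not part of the script above, but as a usage note: a sketch of loading the pushed model back from the Hugging Face Hub with `load_from_hub` from `huggingface_sb3`; the `filename` is an assumption based on the `model_name` used in the upload.

from huggingface_sb3 import load_from_hub
from stable_baselines3 import A2C

# Download the checkpoint (filename assumed to match the model_name used above)
checkpoint = load_from_hub(repo_id="lennartoe/Cartpole-v1", filename="CartPole-v1.zip")
model = A2C.load(checkpoint)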
# Evaluation script: load the trained REINFORCE policy and run one rendered episode.
import gymnasium as gym
import torch

from reinforce_cartpole import Policy

# Create the environment with on-screen rendering
env = gym.make("CartPole-v1", render_mode="human")

# Reset the environment and get the initial observation
observation = env.reset()[0]

# Load the learned policy weights
policy = Policy()
policy.load_state_dict(torch.load("policy.pth", weights_only=True))
policy.eval()

for _ in range(200):
    # Sample an action from the policy
    action_probs = policy(torch.from_numpy(observation).float())
    action = torch.distributions.Categorical(action_probs).sample()
    # Apply the action to the environment: returns the next observation, the reward,
    # the termination/truncation signals, and an additional info dictionary
    observation, reward, terminated, truncated, info = env.step(action.numpy())
    # Render the environment to visualize the agent's behavior
    env.render()
    if terminated or truncated:
        # The episode ended before the step limit
        break

env.close()
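A possible extension of the evaluation above (not in the original file): a sketch that averages the episode return over several non-rendered episodes, reusing the same `Policy` class and `policy.pth` checkpoint; the number of episodes is arbitrary.

import gymnasium as gym
import torch

from reinforce_cartpole import Policy

env = gym.make("CartPole-v1")
policy = Policy()
policy.load_state_dict(torch.load("policy.pth", weights_only=True))
policy.eval()

returns = []
for _ in range(100):
    observation = env.reset()[0]
    episode_return = 0.0
    while True:
        action_probs = policy(torch.from_numpy(observation).float())
        action = torch.distributions.Categorical(action_probs).sample()
        observation, reward, terminated, truncated, _ = env.step(action.numpy())
        episode_return += reward
        if terminated or truncated:
            break
    returns.append(episode_return)

print(f"Mean return over {len(returns)} episodes: {sum(returns) / len(returns):.1f}")
env.close()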
# reinforce_cartpole.py: REINFORCE policy-gradient training on CartPole-v1.
import gymnasium as gym
import torch
import matplotlib.pyplot as plt


class Policy(torch.nn.Module):
    def __init__(self, input_size=4, output_size=2):
        super(Policy, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(128, output_size)
        self.softmax = torch.nn.Softmax(dim=0)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x


def main():
    policy = Policy()
    optimizer = torch.optim.Adam(policy.parameters(), lr=5e-3)

    # Create the environment
    env = gym.make("CartPole-v1")

    gamma = 0.99
    total_reward = []
    total_loss = []
    epochs = 500
    max_steps = env.spec.max_episode_steps

    for episode in range(epochs):
        print(episode)
        # Reset the environment and the episode buffers
        observation = env.reset()[0]
        rewards = []
        log_probs = []

        for step in range(max_steps):
            # Sample an action from the policy's action probabilities
            action_probs = policy(torch.from_numpy(observation).float())
            action = torch.distributions.Categorical(action_probs).sample()
            # Apply the action to the environment
            observation, reward, terminated, truncated, info = env.step(action.numpy())
            # Store the reward and the log-probability of the chosen action
            rewards.append(torch.tensor(reward))
            log_probs.append(torch.log(action_probs[action]))
            if terminated or truncated:
                break

        # Transform rewards and log_probs into tensors
        rewards = torch.stack(rewards)
        log_probs = torch.stack(log_probs)

        # Compute the discounted returns G_t = sum_k gamma^k * r_{t+k} by filling
        # a (T x T) matrix whose row t holds the rewards from step t onward
        rewards_length = len(rewards)
        rewards_tensor = torch.zeros(rewards_length, rewards_length)
        for i in range(rewards_length):
            for j in range(rewards_length - i):
                rewards_tensor[i, j] = rewards[i + j]
        for i in range(rewards_length):
            for j in range(rewards_length):
                rewards_tensor[i, j] = rewards_tensor[i, j] * gamma**j
        returns = torch.sum(rewards_tensor, dim=1)

        # Normalize the returns to stabilize training
        normalized_returns = returns - torch.mean(returns)
        normalized_returns /= torch.std(normalized_returns)

        # REINFORCE loss: negative log-probabilities weighted by the returns
        loss = -torch.sum(log_probs * normalized_returns)
        total_reward.append(rewards.sum().item())

        # Optimize the policy
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())

    # Save the learned policy weights
    torch.save(policy.state_dict(), "policy.pth")
    env.close()

    # Plot the episode returns and the loss side by side
    fig, ax = plt.subplots(1, 2)
    ax[0].plot(total_reward)
    ax[1].plot(total_loss)
    plt.show()


if __name__ == "__main__":
    main()
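For reference, an equivalent way to compute the discounted returns used in the training loop above in a single backward pass, instead of building the T x T reward matrix; this is an illustrative sketch, not part of the submitted file.

import torch

def discounted_returns(rewards: torch.Tensor, gamma: float) -> torch.Tensor:
    """Compute G_t = r_t + gamma * G_{t+1} by iterating backwards over the rewards."""
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns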