Brussart Paul-emile / hands-on-rl / Commits
Commit ec80fa30, authored 2 years ago by Brussart Paul-emile
Updated reinforce_cartpole.py, added figure of the results
parent fc91605b
Showing 2 changed files with 69 additions and 92 deletions:
  reinforce_cartpole.png  (0 additions, 0 deletions)
  reinforce_cartpole.py   (69 additions, 92 deletions)
reinforce_cartpole.png  (new file, mode 0 → 100644, 2.34 KiB; image diff collapsed)
reinforce_cartpole.py  (+69 −92)
 import gym
 import torch
 import matplotlib.pyplot as plt
 import torch.nn as nn
 import torch.optim as optim
 import torch.nn.functional as F
 from torch.distributions import Categorical
-# Number of episodes to run the environment
-N_Episodes = 500
-# Discount factor for future rewards
-Gamma = 0.99
+import torch.optim as optim
+import matplotlib.pyplot as plt
-# Learning rate for the Adam optimizer
-LR = 5e-3
+# setup the environment
+env = gym.make('CartPole-v1')
-# Define the neural network model
-class Net(nn.Module):
+# setup the agent as a neural network
+class Model(nn.Module):
     def __init__(self):
-        super(Net, self).__init__()
-        self.fc1 = nn.Linear(4, 128)    # Using fully connected layers
-        self.fc2 = nn.Linear(128, 2)    # Two possible outputs: right or left
-        self.dropout = nn.Dropout(0.25)
-        self.softmax = nn.Softmax()
+        super(Model, self).__init__()
+        self.fc1 = nn.Linear(4, 128)
+        self.dropout = nn.Dropout(p=0.6)
+        self.fc2 = nn.Linear(128, 2)
     def forward(self, x):
         x = F.relu(self.fc1(x))
         x = self.dropout(x)
-        x = F.relu(self.fc2(x))
-        x = self.softmax(x)    # Apply softmax activation function to get action probabilities
-        return x
-# Initialize the model
-model = Net()
-# Initialize the Adam optimizer
-optimizer = optim.Adam(model.parameters(), LR)
-# Make the CartPole-v1 environment
-env = gym.make("CartPole-v1")
-# Get the maximum number of steps allowed in each episode
-maxSteps = env.spec.max_episode_steps
-# List to store the rewards accumulated through the episodes
-rewardsList = []
-for episode in range(N_Episodes):
-    # Reset the environment for a new episode
-    observation = env.reset()
-    # Initialize the rewards tensor
-    rewards = torch.zeros(maxSteps)
-    # Initialize the buffer tensor
-    buffer = torch.zeros(maxSteps)
-    # Set the done flag to False, indicating the episode has not ended
-    done = False
-    # TrainSize counter to keep track of the number of steps in the episode
-    trainSize = 0
-    # Run the episode until it terminates
-    while not(done):
-        # Pass the current observation through the model to get action probabilities
-        prob = model(torch.tensor(observation))
-        # Sample an action from the action probabilities
-        m = Categorical(prob)
+        x = self.fc2(x)
+        return F.softmax(x, dim=1)
+# initialize the agent
+model = Model()
+optimizer = optim.Adam(model.parameters(), lr=5e-3)
+# Keep track of the number of rewards for each episodes
+rewardsByEpisode = []
+# training loop
+for episode in range(500):
+    # reset the environment
+    state = env.reset()
+    log_probs = []
+    rewards = []
+    # repeat until the end of the episode
+    while True:
+        state = torch.from_numpy(state).float().unsqueeze(0)
+        probs = model(state)
+        m = torch.distributions.Categorical(probs)
+        action = m.sample()
-        # Take the action and get the next state, reward, done flag, and info
-        state, reward, done, info = env.step(action.item())
-        # Store the probability of the action taken in the buffer tensor
-        buffer[trainSize] = prob[action]
-        # Store the reward in the rewards tensor
-        rewards[trainSize] = reward
-        # Accumulate the rewards over time
-        for i in range(trainSize):
-            rewards[i] += Gamma**(trainSize - i) * reward
-        trainSize += 1
-        # Vizualisation of the environment
-        env.render()
-    # Set the size for the list of rewards and the buffer
-    rewards = rewards[0:trainSize]
-    buffer = buffer[0:trainSize]
-    # Normalizing the rewards
-    F.normalize(rewards, dim=0)
-    loss = -torch.sum(torch.multiply(torch.log10(buffer), rewards))
+        log_probs.append(m.log_prob(action))
+        state, reward, done, _ = env.step(action.item())
+        rewards.append(reward)
+        if done:
+            break
+        # Vizualisation of the environment (commented to gain time)
+        #env.render()
+    # normalize the return
+    returns = []
+    discounted_return = 0
+    for reward in rewards[::-1]:
+        discounted_return = reward + 0.99 * discounted_return
+        returns.insert(0, discounted_return)
+    returns = torch.tensor(returns)
+    returns = (returns - returns.mean()) / (returns.std() + 1e-5)
+    rewardsByEpisode.append(len(rewards))
+    # compute the model loss
+    model_loss = []
+    for log_prob, return_ in zip(log_probs, returns):
+        model_loss.append(-log_prob * return_)
+    model_loss = torch.cat(model_loss).sum()
     # update the model
     optimizer.zero_grad()
-    loss.backward()
+    model_loss.backward()
     optimizer.step()
     print("N° de l'épisode : ", episode)
     print("Nombre de rewards : ", len(rewards))
+# X axis :
+x = list(range(len(rewardsByEpisode)))
-    rewardsList.append(trainSize)
 plt.xlabel('Episodes N°')
 plt.ylabel('Number of rewards given')
+plt.plot(x, rewardsByEpisode, '--')
 plt.show()
+plt.savefig('reinforce_cartpole.png')
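For readers skimming the diff, here is a minimal standalone sketch (not part of the commit) of the end-of-episode computation the updated script performs: discounted returns accumulated backwards, normalization, and the REINFORCE loss. The three-step episode, its rewards, and the log-probabilities below are illustrative stand-ins for what a rollout would collect.

import torch

# Toy episode: three steps, each with reward 1.0. The log-probabilities stand in
# for the values gathered via m.log_prob(action) during the rollout.
rewards = [1.0, 1.0, 1.0]
log_probs = [torch.tensor(-0.7, requires_grad=True) for _ in rewards]

# Accumulate discounted returns backwards with gamma = 0.99, as in the new script.
returns = []
discounted_return = 0.0
for reward in rewards[::-1]:
    discounted_return = reward + 0.99 * discounted_return
    returns.insert(0, discounted_return)

# Normalize the returns (zero mean, unit variance, small epsilon for stability).
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + 1e-5)

# REINFORCE loss: sum over the episode of -log_prob * normalized return.
loss = torch.stack([-lp * r for lp, r in zip(log_probs, returns)]).sum()
loss.backward()  # gradients flow back into whatever produced the log-probabilities
print(returns, loss.item())

Subtracting the mean acts like a baseline and dividing by the standard deviation keeps gradient magnitudes comparable across episodes of different lengths, which typically reduces the variance of the policy-gradient estimate.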
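One practical note for reruns: the script in this commit targets the pre-0.26 gym API, where env.reset() returns only the observation and env.step() returns a 4-tuple. A rough sketch of the same interaction loop against the newer gymnasium API (assuming gymnasium is installed; the random action is a placeholder for the policy's m.sample().item()):

import gymnasium as gym

env = gym.make("CartPole-v1")
state, _ = env.reset()                    # reset() now returns (observation, info)
done = False
while not done:
    action = env.action_space.sample()    # placeholder for the policy's sampled action
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated        # step() now returns a 5-tuple
env.close()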