Flatland · Commit 5744260c

Commit 5744260c, authored 5 years ago by Erik Nygren

    added agent examples for testing the code

Parent: 164be269
No related branches, tags, or merge requests found.

Changes: 3 changed files, with 257 additions and 6 deletions

  agents/dqn_agent.py            +189  −0
  agents/model.py                 +62  −0
  examples/temporary_example.py    +6  −6
agents/dqn_agent.py  (new file, mode 0 → 100644)  +189 −0
import numpy as np
import random
from collections import namedtuple, deque
import os
from agent.model import QNetwork, QNetwork2
import torch
import torch.nn.functional as F
import torch.optim as optim
import copy

BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 512        # minibatch size
GAMMA = 0.99            # discount factor 0.99
TAU = 1e-3              # for soft update of target parameters
LR = 0.5e-4             # learning rate 5
UPDATE_EVERY = 10       # how often to update the network
double_dqn = True       # If using double dqn algorithm
input_channels = 5      # Number of Input channels

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)


class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, net_type, seed, double_dqn=True, input_channels=5):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.version = net_type
        self.double_dqn = double_dqn

        # Q-Network
        if self.version == "Conv":
            self.qnetwork_local = QNetwork2(state_size, action_size, seed, input_channels).to(device)
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(torch.load(filename + ".local"))
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(torch.load(filename + ".target"))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN
            q_best_action = self.qnetwork_local(next_states).max(1)[1]
            Q_targets_next = self.qnetwork_target(next_states).gather(1, q_best_action.unsqueeze(-1))
        else:
            # DQN
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
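For orientation, the sketch below shows how this Agent could be exercised end to end with dummy data. Everything outside the diff (the agents.dqn_agent import path, STATE_SIZE, the random stand-in observations) is an assumption for illustration only; note that the file itself imports from agent.model, so the package layout would need to match that for the import to resolve.

# Hypothetical smoke test for the Agent added in this commit (not part of the repo).
# Assumes the file is importable as agents.dqn_agent and that observations are
# flat NumPy vectors of length STATE_SIZE, as expected by the fully connected QNetwork.
import numpy as np
from agents.dqn_agent import Agent

STATE_SIZE, ACTION_SIZE = 105, 4
agent = Agent(STATE_SIZE, ACTION_SIZE, net_type="FC", seed=0)

state = np.random.rand(STATE_SIZE).astype(np.float32)
for t in range(1000):
    action = agent.act(state, eps=0.1)                   # epsilon-greedy action
    next_state = np.random.rand(STATE_SIZE).astype(np.float32)
    reward, done = -1.0, (t % 200 == 199)                # dummy reward / termination
    agent.step(state, action, reward, next_state, done)  # store transition; learns every UPDATE_EVERY steps
    state = next_state

agent.save("dqn_checkpoint")                             # writes .local and .target state dicts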
agents/model.py  (new file, mode 0 → 100644)  +62 −0
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed, hidsize1=128, hidsize2=128):
        super(QNetwork, self).__init__()

        self.fc1_val = nn.Linear(state_size, hidsize1)
        self.fc2_val = nn.Linear(hidsize1, hidsize2)
        self.fc3_val = nn.Linear(hidsize2, 1)

        self.fc1_adv = nn.Linear(state_size, hidsize1)
        self.fc2_adv = nn.Linear(hidsize1, hidsize2)
        self.fc3_adv = nn.Linear(hidsize2, action_size)

    def forward(self, x):
        val = F.relu(self.fc1_val(x))
        val = F.relu(self.fc2_val(val))
        val = self.fc3_val(val)

        # advantage calculation
        adv = F.relu(self.fc1_adv(x))
        adv = F.relu(self.fc2_adv(adv))
        adv = self.fc3_adv(adv)
        return val + adv - adv.mean()


class QNetwork2(nn.Module):
    def __init__(self, state_size, action_size, seed, input_channels, hidsize1=128, hidsize2=64):
        super(QNetwork2, self).__init__()
        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=3, stride=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=3)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=3)
        self.bn3 = nn.BatchNorm2d(64)

        self.fc1_val = nn.Linear(6400, hidsize1)
        self.fc2_val = nn.Linear(hidsize1, hidsize2)
        self.fc3_val = nn.Linear(hidsize2, 1)

        self.fc1_adv = nn.Linear(6400, hidsize1)
        self.fc2_adv = nn.Linear(hidsize1, hidsize2)
        self.fc3_adv = nn.Linear(hidsize2, action_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))

        # value function approximation
        val = F.relu(self.fc1_val(x.view(x.size(0), -1)))
        val = F.relu(self.fc2_val(val))
        val = self.fc3_val(val)

        # advantage calculation
        adv = F.relu(self.fc1_adv(x.view(x.size(0), -1)))
        adv = F.relu(self.fc2_adv(adv))
        adv = self.fc3_adv(adv)
        return val + adv - adv.mean()
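Both classes implement a dueling head: a scalar value stream and a per-action advantage stream are combined as V(s) + A(s,a) - mean(A). A minimal forward-pass shape check is sketched below; the input sizes (state_size=105, the 5x100x100 grid for the convolutional variant) are assumptions chosen to match the hard-coded 6400-unit flatten, not values taken from the diff.

# Hypothetical shape check for the two networks in agents/model.py (illustration only).
import torch
from agents.model import QNetwork, QNetwork2

fc_net = QNetwork(state_size=105, action_size=4, seed=0)
q = fc_net(torch.rand(8, 105))              # dueling heads -> one Q-value per action
assert q.shape == (8, 4)

conv_net = QNetwork2(state_size=105, action_size=4, seed=0, input_channels=5)
q2 = conv_net(torch.rand(2, 5, 100, 100))   # 5x100x100 input flattens to 64*10*10 = 6400
assert q2.shape == (2, 4)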
examples/temporary_example.py  +6 −6
...
@@ -21,12 +21,12 @@ transition_probability = [1.0, # empty cell - Case 0
 """
 transition_probability = [1.0,  # empty cell - Case 0
                           1.0,  # Case 1 - straight
-                          1.0,  # Case 2 - simple switch
-                          1.0,  # Case 3 - diamond drossing
-                          1.0,  # Case 4 - single slip
-                          1.0,  # Case 5 - double slip
-                          1.0,  # Case 6 - symmetrical
-                          1.0]  # Case 7 - dead end
+                          0.5,  # Case 2 - simple switch
+                          0.2,  # Case 3 - diamond drossing
+                          0.5,  # Case 4 - single slip
+                          0.1,  # Case 5 - double slip
+                          0.2,  # Case 6 - symmetrical
+                          0.01]  # Case 7 - dead end
 # Example generate a random rail
 env = RailEnv(width=20,
...