Flatland / neurips2020-flatland-baselines · Commits

Commit 4b39e2f0, authored Jul 20, 2020 by nilabha
Parent: 86b99ebb

    clean up code imitation trainer

Changes: 1 file

imitation_trainer.py (view file @ 4b39e2f0)
@@ -5,7 +5,8 @@ from ray.rllib.agents.dqn import ApexTrainer,DQNTrainer
from ray.rllib.utils.annotations import override
from ray.rllib.agents.ppo.ppo import PPOTrainer
import ray
from ray import tune
import numpy as np
import os
@@ -33,6 +34,7 @@ from utils.argparser import create_parser
from utils.loader import load_envs, load_models
# Custom wandb logger with hotfix to allow custom callbacks
from wandblogger import WandbLogger
import pandas as pd

"""
Note : This implementation has been adapted from :
@@ -58,7 +60,7 @@ from libs.cell_graph_dispatcher import CellGraphDispatcher
def adam_optimizer(policy, config):
-    return tf.train.AdamOptimizer(learning_rate=0.01, epsilon=0.001)
+    return tf.train.AdamOptimizer(learning_rate=config.get('lr', 5e-4),
+                                  epsilon=config.get('adam_epsilon', 1e-8))


def default_execution_plan(workers: WorkerSet, config):
    # Collects experiences in parallel from multiple RolloutWorker actors.
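
Note: the rest of default_execution_plan is collapsed in this diff view; the comment describes RLlib's standard rollout-then-train pipeline. As an illustration only (not part of this commit, assuming RLlib 0.8.x-era execution operators), such a plan typically looks like this:

from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches
from ray.rllib.execution.train_ops import TrainOneStep
from ray.rllib.execution.metric_ops import StandardMetricsReporting

def sketch_execution_plan(workers, config):
    # Collect experiences in parallel from the remote RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    # Concatenate rollouts into train batches, then run one SGD update per batch.
    train_op = rollouts \
        .combine(ConcatBatches(min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers))
    # Emit the standard RLlib training metrics.
    return StandardMetricsReporting(train_op, workers, config)
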
@@ -78,8 +80,6 @@ def default_execution_plan(workers: WorkerSet, config):
def loss_imitation(policy, model, dist_class, train_batch):
    return np.random.randint(5)


# policy = DQNTFPolicy.with_updates(name="ImitPolicy",)
ImitationTFPolicy = build_tf_policy(
    name="ImitationTFPolicy",
    loss_fn=loss_imitation,
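
Note: loss_imitation above is only a stub (it returns a random integer so that build_tf_policy has a loss_fn); the actual supervised update happens in ImitationAgent._train further down. For reference, a hedged sketch of what a real imitation loss_fn could look like if the loss were computed inside the policy instead (illustrative only, names assumed, not part of this commit):

import tensorflow as tf
from ray.rllib.policy.sample_batch import SampleBatch

def imitation_cross_entropy_loss(policy, model, dist_class, train_batch):
    # Forward pass of the policy model on the batched observations.
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # Mean negative log-likelihood of the (expert) actions stored in the batch.
    return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS]))
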
@@ -87,8 +87,6 @@ ImitationTFPolicy = build_tf_policy(
)

# yapf: disable
# __sphinx_doc_begin__
class ImitationAgent(PPOTrainer):
    """Policy that takes random actions and never learns."""
@@ -138,7 +136,6 @@ class ImitationAgent(PPOTrainer):
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
            # while not done["__all__"]:
            for step in range(max_steps):
                for a in range(n_agents):
                    if not done.get(a) and obs.get(a) is not None:
@@ -150,7 +147,6 @@ class ImitationAgent(PPOTrainer):
                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1
                #super()._train()
                for agent, agent_info in info.items():
                    if episode_max_steps == 0:
@@ -177,7 +173,6 @@ class ImitationAgent(PPOTrainer):
    def _train(self):
        import tensorflow as tf
        policy = self.get_policy()
        # optimizer = tf.keras.optimizers.Adam()
        steps = 0
        for _ in range(1):
            env = self.env._env.rail_env
@@ -195,7 +190,6 @@ class ImitationAgent(PPOTrainer):
            episode_num_agents = 0
            episode_score = 0
            episode_done_agents = 0
            # obs = self.env.reset()
            done = {}
            done["__all__"] = False
@@ -203,7 +197,6 @@ class ImitationAgent(PPOTrainer):
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
            # while not done["__all__"]:
            for step in range(max_steps):
                action_dict = dispatcher.step(env._elapsed_steps)
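
Note: action_dict comes from the CellGraphDispatcher expert imported at the top of the file; its entries appear to be Flatland RailEnvActions values, which is why the imitation loop below reads action_dict[a].value. A rough usage sketch (the dispatcher construction is not shown in this diff, so the CellGraphDispatcher(env) call is an assumption):

from libs.cell_graph_dispatcher import CellGraphDispatcher

dispatcher = CellGraphDispatcher(env)               # assumed: built from the RailEnv
action_dict = dispatcher.step(env._elapsed_steps)   # one expert action per agent
expert_action = action_dict[0].value                # integer action value for agent 0
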
@@ -214,26 +207,18 @@ class ImitationAgent(PPOTrainer):
                    if not done.get(a) and obs.get(a) is not None:
                        active_agents += 1
                        expert_action = action_dict[a].value
                        # self.model.custom_loss(tf.constant(expert_action),
                        #     {"obs": tf.cast(tf.expand_dims(obs[a],0),tf.float32)})
                        # self.model.custom_loss(expert_action,{"obs": np.expand_dims(obs[a],0)},)
                        input_dict = {"obs": np.expand_dims(obs[a], 0)}
                        input_dict['obs_flat'] = input_dict['obs']
                        logits, _ = policy.model.forward(input_dict, [], None)
                        model_logits = tf.squeeze(logits)
                        expert_logits = tf.cast(expert_action, tf.int32)
                        # expert_one_hot = tf.one_hot(expert_logits,num_outputs)
                        action_dist = Categorical(logits, policy.model.model_config)
                        imitation_loss += tf.reduce_mean(
                            -action_dist.logp(tf.expand_dims(expert_logits, 0)))

                imitation_loss = imitation_loss / max(active_agents, 1)
                # imitation_loss = tf.nn.softmax_cross_entropy_with_logits(
                #     labels=expert_logits, logits=model_logits)
                gradients = tape.gradient(imitation_loss,
                                          policy.model.trainable_variables())
                # optimizer.apply_gradients(zip(gradients, policy.model.trainable_variables()))
                self.workers.local_worker().apply_gradients(gradients)
                weights = ray.put(self.workers.local_worker().get_weights())
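
Note: the with tf.GradientTape() block that produces tape above sits in context lines collapsed from this diff. As a standalone reference, a minimal sketch of the same eager-TensorFlow behavioural-cloning update (hypothetical helper with a Keras-style model and optimizer; not the code from this commit):

import tensorflow as tf

def supervised_update(model, optimizer, obs_batch, expert_actions):
    with tf.GradientTape() as tape:
        logits = model(obs_batch)  # forward pass on a batch of observations
        # Cross-entropy against the expert actions, i.e. the mean
        # negative log-probability of the expert action under the policy.
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=expert_actions, logits=logits))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss
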
@@ -241,14 +226,8 @@ class ImitationAgent(PPOTrainer):
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)

                # grads_and_vars = optimizer.compute_gradients(lambda :imitation_loss, var_list=variables)
                # grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
                # policy.apply_gradients(grads_and_vars)
                # optimizer.apply_gradients(grads_and_vars)

                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1
                #super()._train()
                for agent, agent_info in info.items():
                    if episode_max_steps == 0:
@@ -267,8 +246,30 @@ class ImitationAgent(PPOTrainer):
            "episode_reward_mean": episode_score,
            "timesteps_this_iter": steps,
        }
# __sphinx_doc_end__
# don't enable yapf after, it's buggy here


def imitation_train_fn(config, reporter=None):
    imitation_trainer = ImitationAgent(config, env="flatland_sparse",)
    eval_results_all = pd.DataFrame.from_dict([{'episode_reward_mean': 0,
                                                'episode_completion_mean': 0,
                                                'timesteps_this_iter': 0}])
    for i in range(1000):
        result = imitation_trainer.train()
        if reporter:
            reporter(**result)
        if i % 10 == 0:
            eval_results = imitation_trainer.eval()
            print("Eval Results:", eval_results)
            eval_results_all = pd.concat(
                [eval_results_all, pd.DataFrame.from_dict([eval_results])])
            eval_results_all.to_csv('EvalResults.csv')
            checkpoint = imitation_trainer.save()
            # TODO: Loads weights but not optimizer state
            # Could be done by overriding _save by using model.save_weight(checkpoint)
            # Also override _restore. Ideally use workers to save/load weights.
            # imitation_trainer.restore(checkpoint)
            print("checkpoint saved at", checkpoint)

    imitation_trainer.stop()
    print("Test: OK")


if __name__ == "__main__":
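
Note: imitation_train_fn has the (config, reporter) signature of a Ray Tune function trainable, which is what the commented-out tune.run call near the end of the file relies on. A usage sketch assembled from lines already present in the __main__ block below (PPOTrainer and _default_config are assumed to be in scope as they are there):

import ray
from ray import tune

ray.init(num_cpus=3, num_gpus=0)
# Reserve the same resources a PPOTrainer would request for this config.
resources = PPOTrainer.default_resource_request(_default_config).to_json()
tune.run(imitation_train_fn,
         resources_per_trial=resources,
         config=_default_config)
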
@@ -292,44 +293,16 @@ if __name__ == "__main__":
    webui_host = "0.0.0.0"
    # TODO should be in exp['config'] directly
    exp['config']['env_config']['yaml_config'] = config_file
    exp['loggers'] = [TBXLogger]
    exp['loggers'] = [WandbLogger, TBXLogger]

    _default_config = with_common_config(exp["config"])

    ray.init(num_cpus=3, num_gpus=0)

    imitation_trainer = ImitationAgent(_default_config,
                                       env="flatland_sparse",)
    # default_policy=ImitationPolicy,
    # get_policy_class=ImitationPolicy)
    # env="CartPole-v0")
    # trainer = ApexTrainer(_default_config,
    #                       env="flatland_sparse",)
    # trainer = PPOTrainer(_default_config,
    #                      env="flatland_sparse",)
    for i in range(10):
        result = imitation_trainer.train()
        if i % 5:
            eval_results = imitation_trainer.eval()
            print("Eval Results:", eval_results)
    checkpoint = imitation_trainer.save()
    # TODO: Loads weights but not optimizer state
    # Could be done by overriding _save by using model.save_weight(checkpoint)
    # Also override _restore. Ideally use workers to save/load weights.
    # imitation_trainer.restore(checkpoint)
    print("checkpoint saved at", checkpoint)
    imitation_trainer.stop()

    # registry.register_trainable('ImitationPolicyTrainer',ImitationAgent)
    # ImitationPolicyTrainer = build_trainer(
    #     name="ImitationPolicyTrainer",
    #     default_policy=ImitationPolicy,
    #     default_config=_default_config,)
    resources = PPOTrainer.default_resource_request(_default_config).to_json()
    imitation_train_fn(_default_config)
    # tune.run(imitation_train_fn, resources_per_trial=resources, config=_default_config)
    print("Test: OK")
\ No newline at end of file
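
Note on the TODO above (checkpoints restore weights but not optimizer state): one possible way to realise the suggested _save/_restore override, sketched here as an assumption only. Method names follow the ray.tune.Trainable API of this Ray version, and model.base_model assumes a Keras-based RLlib model, which may not hold for every Flatland model:

import os

class ImitationAgentCheckpointable(ImitationAgent):
    def _save(self, checkpoint_dir):
        # Persist only the policy network weights, as the TODO suggests.
        path = os.path.join(checkpoint_dir, "model_weights.h5")
        self.get_policy().model.base_model.save_weights(path)
        return path

    def _restore(self, checkpoint_path):
        # Reload the same weights into the (assumed) Keras base model.
        self.get_policy().model.base_model.load_weights(checkpoint_path)
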