Flatland / neurips2020-flatland-baselines / Commits

Commit 4b39e2f0, authored Jul 20, 2020 by nilabha
Parent: 86b99ebb
Changes: 1 file

    clean up code imitation trainer

imitation_trainer.py (view file @ 4b39e2f0)
@@ -5,7 +5,8 @@ from ray.rllib.agents.dqn import ApexTrainer,DQNTrainer
 from ray.rllib.utils.annotations import override
 from ray.rllib.agents.ppo.ppo import PPOTrainer
 import ray
+from ray import tune
 import numpy as np
 import os
@@ -33,6 +34,7 @@ from utils.argparser import create_parser
 from utils.loader import load_envs, load_models
 # Custom wandb logger with hotfix to allow custom callbacks
 from wandblogger import WandbLogger
+import pandas as pd
 
 """
 Note : This implementation has been adapted from :
@@ -58,7 +60,7 @@ from libs.cell_graph_dispatcher import CellGraphDispatcher
 def adam_optimizer(policy, config):
     return tf.train.AdamOptimizer(
-        learning_rate=0.01, epsilon=0.001)
+        learning_rate=config.get('lr', 5e-4), epsilon=config.get('adam_epsilon', 1e-8))
 
 def default_execution_plan(workers: WorkerSet, config):
     # Collects experiences in parallel from multiple RolloutWorker actors.
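The optimizer hyperparameters are now read from the trainer config instead of being hard-coded. A minimal sketch of the fallback behaviour (illustrative only, not part of the commit; 'lr' and 'adam_epsilon' are the keys used in the hunk above):

    config = {"lr": 1e-4}                 # 'adam_epsilon' deliberately not set
    config.get("lr", 5e-4)                # -> 1e-4, taken from the config
    config.get("adam_epsilon", 1e-8)      # -> 1e-8, falls back to the default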
@@ -78,8 +80,6 @@ def default_execution_plan(workers: WorkerSet, config):
 def loss_imitation(policy, model, dist_class, train_batch):
     return np.random.randint(5)
 
-# policy = DQNTFPolicy.with_updates(name="ImitPolicy",)
-
 ImitationTFPolicy = build_tf_policy(
     name="ImitationTFPolicy",
     loss_fn=loss_imitation,
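loss_imitation is still a random placeholder here; the actual imitation update is done inside ImitationAgent._train further down. For reference, a hedged sketch of what a behavioural-cloning loss_fn for build_tf_policy could look like, assuming the RLlib 0.8-era ModelV2/ActionDistribution API this file relies on (the name bc_loss is hypothetical, not part of the commit):

    import tensorflow as tf
    from ray.rllib.policy.sample_batch import SampleBatch

    def bc_loss(policy, model, dist_class, train_batch):
        # forward pass over the sampled batch
        logits, _ = model.from_batch(train_batch)
        # distribution over actions, e.g. Categorical for Flatland's discrete actions
        action_dist = dist_class(logits, model)
        # negative log-likelihood of the (expert) actions stored in the batch
        return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS]))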
@@ -87,8 +87,6 @@ ImitationTFPolicy = build_tf_policy(
 )
 
-# yapf: disable
-# __sphinx_doc_begin__
 class ImitationAgent(PPOTrainer):
     """Policy that takes random actions and never learns."""
@@ -138,7 +136,6 @@ class ImitationAgent(PPOTrainer):
             # batch_size = 2
             # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
-            # while not done["__all__"]:
             for step in range(max_steps):
                 for a in range(n_agents):
                     if not done.get(a) and obs.get(a) is not None:
@@ -150,7 +147,6 @@ class ImitationAgent(PPOTrainer):
                 obs, all_rewards, done, info = self.env.step(action_dict)
                 steps += 1
-                #super()._train()
 
             for agent, agent_info in info.items():
                 if episode_max_steps == 0:
@@ -177,7 +173,6 @@ class ImitationAgent(PPOTrainer):
     def _train(self):
         import tensorflow as tf
         policy = self.get_policy()
-        # optimizer = tf.keras.optimizers.Adam()
         steps = 0
         for _ in range(1):
             env = self.env._env.rail_env
@@ -195,7 +190,6 @@ class ImitationAgent(PPOTrainer):
             episode_num_agents = 0
             episode_score = 0
             episode_done_agents = 0
-            # obs = self.env.reset()
             done = {}
             done["__all__"] = False
@@ -203,7 +197,6 @@ class ImitationAgent(PPOTrainer):
             # batch_size = 2
             # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
-            # while not done["__all__"]:
             for step in range(max_steps):
                 action_dict = dispatcher.step(env._elapsed_steps)
@@ -214,41 +207,27 @@ class ImitationAgent(PPOTrainer):
                     if not done.get(a) and obs.get(a) is not None:
                         active_agents += 1
                         expert_action = action_dict[a].value
-                        # self.model.custom_loss(tf.constant(expert_action),
-                        #     {"obs": tf.cast(tf.expand_dims(obs[a],0),tf.float32)})
-                        # self.model.custom_loss(expert_action,{"obs": np.expand_dims(obs[a],0)},)
                         input_dict = {"obs": np.expand_dims(obs[a], 0)}
                         input_dict['obs_flat'] = input_dict['obs']
                         logits, _ = policy.model.forward(input_dict, [], None)
                         model_logits = tf.squeeze(logits)
                         expert_logits = tf.cast(expert_action, tf.int32)
-                        # expert_one_hot = tf.one_hot(expert_logits,num_outputs)
                         action_dist = Categorical(logits, policy.model.model_config)
                         imitation_loss += tf.reduce_mean(-action_dist.logp(tf.expand_dims(expert_logits, 0)))
 
                 imitation_loss = imitation_loss / max(active_agents, 1)
-                # imitation_loss = tf.nn.softmax_cross_entropy_with_logits(
-                #     labels=expert_logits, logits=model_logits)
                 gradients = tape.gradient(imitation_loss, policy.model.trainable_variables())
-                # optimizer.apply_gradients(zip(gradients, policy.model.trainable_variables()))
                 self.workers.local_worker().apply_gradients(gradients)
                 weights = ray.put(self.workers.local_worker().get_weights())
                 # print(self.workers.local_worker().get_weights())
                 for e in self.workers.remote_workers():
                     e.set_weights.remote(weights)
-                # grads_and_vars = optimizer.compute_gradients(lambda :imitation_loss, var_list=variables)
-                # grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
-                # policy.apply_gradients(grads_and_vars)
-                # optimizer.apply_gradients(grads_and_vars)
 
                 obs, all_rewards, done, info = self.env.step(action_dict)
                 steps += 1
-                #super()._train()
 
             for agent, agent_info in info.items():
                 if episode_max_steps == 0:
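The surviving lines compute a behavioural-cloning loss per agent: the negative log-probability that the policy's Categorical assigns to the dispatcher's expert action, which is the same quantity as a sparse softmax cross-entropy between the expert action and the model logits (as the removed comment hints). A self-contained sketch of that update with a plain Keras model standing in for the RLlib policy model (illustrative only; shapes and names are made up):

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(5)])   # 5 Flatland actions
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)

    obs = np.random.rand(1, 128).astype(np.float32)            # dummy flattened observation
    expert_action = np.array([2])                               # action chosen by the dispatcher

    with tf.GradientTape() as tape:
        logits = model(obs)
        # -log p(expert_action) under a Categorical == sparse softmax cross-entropy
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=expert_action,
                                                           logits=logits))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))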
@@ -267,8 +246,30 @@ class ImitationAgent(PPOTrainer):
             "episode_reward_mean": episode_score,
             "timesteps_this_iter": steps,
         }
-# __sphinx_doc_end__
-# don't enable yapf after, it's buggy here
+
+
+def imitation_train_fn(config, reporter=None):
+    imitation_trainer = ImitationAgent(config, env="flatland_sparse",)
+    eval_results_all = pd.DataFrame.from_dict(
+        [{'episode_reward_mean': 0, 'episode_completion_mean': 0, 'timesteps_this_iter': 0}])
+    for i in range(1000):
+        result = imitation_trainer.train()
+        if reporter:
+            reporter(**result)
+        if i % 10 == 0:
+            eval_results = imitation_trainer.eval()
+            print("Eval Results:", eval_results)
+            eval_results_all = pd.concat([eval_results_all, pd.DataFrame.from_dict([eval_results])])
+            eval_results_all.to_csv('EvalResults.csv')
+            checkpoint = imitation_trainer.save()
+            # TODO: Loads weights but not optimizer state
+            # Could be done by overriding _save by using model.save_weight(checkpoint)
+            # Also override _restore. Ideally use workers to save/load weights.
+            # imitation_trainer.restore(checkpoint)
+            print("checkpoint saved at", checkpoint)
+    imitation_trainer.stop()
+    print("Test: OK")
+
+
 if __name__ == "__main__":
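imitation_train_fn takes an optional reporter so it can double as a Ray Tune function trainable: when launched through tune.run, Tune supplies the reporter and the result dictionary from each train() call is forwarded to it. A hedged sketch of that usage, mirroring the commented-out tune.run call in the __main__ hunk below (assumes the Ray 0.8-era function-trainable API; _default_config and the resource request come from that hunk):

    from ray import tune

    resources = PPOTrainer.default_resource_request(_default_config).to_json()
    tune.run(imitation_train_fn,
             resources_per_trial=resources,
             config=_default_config)   # Tune passes the reporter into the function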
@@ -292,44 +293,16 @@ if __name__ == "__main__":
         webui_host = "0.0.0.0"
         # TODO should be in exp['config'] directly
         exp['config']['env_config']['yaml_config'] = config_file
-        exp['loggers'] = [TBXLogger]
+        exp['loggers'] = [WandbLogger, TBXLogger]
 
         _default_config = with_common_config(
             exp["config"])
         ray.init(num_cpus=3, num_gpus=0)
-        imitation_trainer = ImitationAgent(_default_config,
-            env="flatland_sparse",)
-        # default_policy=ImitationPolicy,
-        # get_policy_class=ImitationPolicy)
-        # env="CartPole-v0")
-        # trainer = ApexTrainer(_default_config,
-        #     env="flatland_sparse",)
-        # trainer = PPOTrainer(_default_config,
-        #     env="flatland_sparse",)
-        for i in range(10):
-            result = imitation_trainer.train()
-            if i % 5:
-                eval_results = imitation_trainer.eval()
-                print("Eval Results:", eval_results)
-        checkpoint = imitation_trainer.save()
-        # TODO: Loads weights but not optimizer state
-        # Could be done by overriding _save by using model.save_weight(checkpoint)
-        # Also override _restore. Ideally use workers to save/load weights.
-        # imitation_trainer.restore(checkpoint)
-        print("checkpoint saved at", checkpoint)
-        imitation_trainer.stop()
+        resources = PPOTrainer.default_resource_request(_default_config).to_json()
+        # registry.register_trainable('ImitationPolicyTrainer',ImitationAgent)
+        # ImitationPolicyTrainer = build_trainer(
+        imitation_train_fn(_default_config)
+        # name="ImitationPolicyTrainer",
+        # tune.run(imitation_train_fn, resources_per_trial=resources, config=_default_config)
+        # default_policy=ImitationPolicy,
+        # default_config=_default_config,)
+        print("Test: OK")
\ No newline at end of file