Commit 4b39e2f0 authored by nilabha

clean up code imitation trainer

parent 86b99ebb
@@ -5,7 +5,8 @@ from ray.rllib.agents.dqn import ApexTrainer,DQNTrainer
from ray.rllib.utils.annotations import override
from ray.rllib.agents.ppo.ppo import PPOTrainer
import ray
+from ray import tune
import numpy as np
import os
@@ -33,6 +34,7 @@ from utils.argparser import create_parser
from utils.loader import load_envs, load_models
# Custom wandb logger with hotfix to allow custom callbacks
from wandblogger import WandbLogger
+import pandas as pd
"""
Note : This implementation has been adapted from :
@@ -58,7 +60,7 @@ from libs.cell_graph_dispatcher import CellGraphDispatcher
def adam_optimizer(policy, config):
    return tf.train.AdamOptimizer(
-        learning_rate=0.01, epsilon=0.001)
+        learning_rate=config.get('lr',5e-4), epsilon=config.get('adam_epsilon',1e-8))
def default_execution_plan(workers: WorkerSet, config):
    # Collects experiences in parallel from multiple RolloutWorker actors.
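Note: with the change above, the Adam hyperparameters are read from the trainer config instead of being hardcoded. A minimal illustrative sketch of the corresponding config keys (the dict name and values are examples, not taken from this commit):

# Illustrative only: adam_optimizer() reads these keys via config.get(...),
# falling back to 5e-4 and 1e-8 when they are absent.
example_config = {
    "lr": 1e-4,
    "adam_epsilon": 1e-5,
}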
@@ -78,8 +80,6 @@ def default_execution_plan(workers: WorkerSet, config):
def loss_imitation(policy, model, dist_class, train_batch):
    return np.random.randint(5)
-# policy = DQNTFPolicy.with_updates(name="ImitPolicy",)
ImitationTFPolicy = build_tf_policy(
    name="ImitationTFPolicy",
    loss_fn=loss_imitation,
@@ -87,8 +87,6 @@ ImitationTFPolicy = build_tf_policy(
)
-# yapf: disable
-# __sphinx_doc_begin__
class ImitationAgent(PPOTrainer):
    """Agent that learns by imitating expert actions (behavioural cloning)."""
@@ -138,7 +136,6 @@ class ImitationAgent(PPOTrainer):
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
-            # while not done["__all__"]:
            for step in range(max_steps):
                for a in range(n_agents):
                    if not done.get(a) and obs.get(a) is not None:
@@ -150,7 +147,6 @@ class ImitationAgent(PPOTrainer):
                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1
-                #super()._train()
                for agent, agent_info in info.items():
                    if episode_max_steps == 0:
@@ -177,7 +173,6 @@ class ImitationAgent(PPOTrainer):
    def _train(self):
        import tensorflow as tf
        policy = self.get_policy()
-        # optimizer = tf.keras.optimizers.Adam()
        steps = 0
        for _ in range(1):
            env = self.env._env.rail_env
@@ -195,7 +190,6 @@ class ImitationAgent(PPOTrainer):
            episode_num_agents = 0
            episode_score = 0
            episode_done_agents = 0
-            # obs = self.env.reset()
            done = {}
            done["__all__"] = False
@@ -203,7 +197,6 @@ class ImitationAgent(PPOTrainer):
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
-            # while not done["__all__"]:
            for step in range(max_steps):
                action_dict = dispatcher.step(env._elapsed_steps)
@@ -214,41 +207,27 @@ class ImitationAgent(PPOTrainer):
                    if not done.get(a) and obs.get(a) is not None:
                        active_agents += 1
                        expert_action = action_dict[a].value
-                        # self.model.custom_loss(tf.constant(expert_action),
-                        # {"obs": tf.cast(tf.expand_dims(obs[a],0),tf.float32)})
-                        # self.model.custom_loss(expert_action,{"obs": np.expand_dims(obs[a],0)},)
                        input_dict = {"obs": np.expand_dims(obs[a],0)}
                        input_dict['obs_flat'] = input_dict['obs']
                        logits, _ = policy.model.forward(input_dict, [], None)
                        model_logits = tf.squeeze(logits)
                        expert_logits = tf.cast(expert_action, tf.int32)
-                        # expert_one_hot = tf.one_hot(expert_logits,num_outputs)
                        action_dist = Categorical(logits, policy.model.model_config)
                        imitation_loss += tf.reduce_mean(-action_dist.logp(tf.expand_dims(expert_logits,0)))
                imitation_loss = imitation_loss/max(active_agents,1)
-                # imitation_loss = tf.nn.softmax_cross_entropy_with_logits(
-                # labels=expert_logits, logits=model_logits)
                gradients = tape.gradient(imitation_loss, policy.model.trainable_variables())
-                # optimizer.apply_gradients(zip(gradients, policy.model.trainable_variables()))
                self.workers.local_worker().apply_gradients(gradients)
                weights = ray.put(self.workers.local_worker().get_weights())
                # print(self.workers.local_worker().get_weights())
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)
-                # grads_and_vars = optimizer.compute_gradients(lambda :imitation_loss, var_list=variables)
-                # grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
-                # policy.apply_gradients(grads_and_vars)
-                # optimizer.apply_gradients(grads_and_vars)
                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1
-                #super()._train()
                for agent, agent_info in info.items():
                    if episode_max_steps == 0:
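Note: the -action_dist.logp(expert_action) term above is exactly sparse softmax cross-entropy against the expert action, which is what the removed commented-out softmax_cross_entropy_with_logits lines were attempting. A minimal standalone sketch of that equivalence for one observation with discrete actions (illustrative names, not part of the commit):

import tensorflow as tf

def behavioural_cloning_loss(logits, expert_action):
    # logits: [num_actions] raw policy outputs for a single observation
    # expert_action: integer id of the expert's action
    # -log softmax(logits)[expert_action], i.e. sparse cross-entropy
    return tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.expand_dims(tf.cast(expert_action, tf.int32), 0),
        logits=tf.expand_dims(logits, 0))[0]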
@@ -267,8 +246,30 @@ class ImitationAgent(PPOTrainer):
            "episode_reward_mean": episode_score,
            "timesteps_this_iter": steps,
        }
-# __sphinx_doc_end__
-# don't enable yapf after, it's buggy here
+def imitation_train_fn(config, reporter=None):
+    imitation_trainer = ImitationAgent(config,
+        env="flatland_sparse",)
+    eval_results_all = pd.DataFrame.from_dict([{'episode_reward_mean': 0, 'episode_completion_mean': 0, 'timesteps_this_iter': 0}])
+    for i in range(1000):
+        result = imitation_trainer.train()
+        if reporter:
+            reporter(**result)
+        if i % 10 == 0:
+            eval_results = imitation_trainer.eval()
+            print("Eval Results:",eval_results)
+            eval_results_all = pd.concat([eval_results_all, pd.DataFrame.from_dict([eval_results])])
+            eval_results_all.to_csv('EvalResults.csv')
+        checkpoint = imitation_trainer.save()
+        # TODO: Loads weights but not optimizer state
+        # Could be done by overriding _save by using model.save_weight(checkpoint)
+        # Also override _restore. Ideally use workers to save/load weights.
+        # imitation_trainer.restore(checkpoint)
+        print("checkpoint saved at", checkpoint)
+    imitation_trainer.stop()
+    print("Test: OK")
if __name__ == "__main__":
@@ -292,44 +293,16 @@ if __name__ == "__main__":
    webui_host = "0.0.0.0"
    # TODO should be in exp['config'] directly
    exp['config']['env_config']['yaml_config'] = config_file
-    exp['loggers'] = [TBXLogger]
+    exp['loggers'] = [WandbLogger,TBXLogger]
    _default_config = with_common_config(
        exp["config"])
    ray.init(num_cpus=3,num_gpus=0)
-    imitation_trainer = ImitationAgent(_default_config,
-        env="flatland_sparse",)
-    # default_policy=ImitationPolicy,
-    # get_policy_class=ImitationPolicy)
-    # env="CartPole-v0")
-    # trainer = ApexTrainer(_default_config,
-    # env="flatland_sparse",)
-    # trainer = PPOTrainer(_default_config,
-    # env="flatland_sparse",)
-    for i in range(10):
-        result = imitation_trainer.train()
-        if i % 5:
-            eval_results = imitation_trainer.eval()
-            print("Eval Results:",eval_results)
-        checkpoint = imitation_trainer.save()
-        # TODO: Loads weights but not optimizer state
-        # Could be done by overriding _save by using model.save_weight(checkpoint)
-        # Also override _restore. Ideally use workers to save/load weights.
-        # imitation_trainer.restore(checkpoint)
-        print("checkpoint saved at", checkpoint)
-    imitation_trainer.stop()
-    # registry.register_trainable('ImitationPolicyTrainer',ImitationAgent)
-    # ImitationPolicyTrainer = build_trainer(
-    # name="ImitationPolicyTrainer",
-    # default_policy=ImitationPolicy,
-    # default_config=_default_config,)
-    print("Test: OK")
+    resources = PPOTrainer.default_resource_request(_default_config).to_json()
+    imitation_train_fn(_default_config)
+    # tune.run(imitation_train_fn, resources_per_trial=resources, config=_default_config)
\ No newline at end of file