Commit 4b39e2f0 authored by nilabha

clean up code imitation trainer

parent 86b99ebb
@@ -5,7 +5,8 @@ from ray.rllib.agents.dqn import ApexTrainer,DQNTrainer
from ray.rllib.utils.annotations import override
from ray.rllib.agents.ppo.ppo import PPOTrainer
import ray
from ray import tune
import numpy as np
import os
@@ -33,6 +34,7 @@ from utils.argparser import create_parser
from utils.loader import load_envs, load_models
# Custom wandb logger with hotfix to allow custom callbacks
from wandblogger import WandbLogger
import pandas as pd
"""
Note : This implementation has been adapted from :
@@ -58,7 +60,7 @@ from libs.cell_graph_dispatcher import CellGraphDispatcher
def adam_optimizer(policy, config):
return tf.train.AdamOptimizer(
learning_rate=0.01, epsilon=0.001)
learning_rate=config.get('lr',5e-4), epsilon=config.get('adam_epsilon',1e-8))
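# NOTE: the Adam hyperparameters are now read from the trainer config, falling back to
# 5e-4 / 1e-8 when 'lr' / 'adam_epsilon' are not set (fallbacks that appear to mirror RLlib's usual DQN defaults).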
def default_execution_plan(workers: WorkerSet, config):
# Collects experiences in parallel from multiple RolloutWorker actors.
@@ -78,8 +80,6 @@ def default_execution_plan(workers: WorkerSet, config):
def loss_imitation(policy, model, dist_class, train_batch):
return np.random.randint(5)
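# NOTE: loss_imitation is a placeholder; the actual imitation loss is computed and applied
# manually in ImitationAgent._train below.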
# policy = DQNTFPolicy.with_updates(name="ImitPolicy",)
ImitationTFPolicy = build_tf_policy(
name="ImitationTFPolicy",
loss_fn=loss_imitation,
@@ -87,8 +87,6 @@ ImitationTFPolicy = build_tf_policy(
)
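# ImitationTFPolicy is assembled with the placeholder loss above; training in this file
# appears to be driven by ImitationAgent._train rather than by this policy's loss_fn.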
# yapf: disable
# __sphinx_doc_begin__
class ImitationAgent(PPOTrainer):
"""Policy that takes random actions and never learns."""
@@ -138,7 +136,6 @@ class ImitationAgent(PPOTrainer):
# batch_size = 2
# logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
# while not done["__all__"]:
for step in range(max_steps):
for a in range(n_agents):
if not done.get(a) and obs.get(a) is not None:
@@ -150,7 +147,6 @@ class ImitationAgent(PPOTrainer):
obs, all_rewards, done, info = self.env.step(action_dict)
steps += 1
#super()._train()
for agent, agent_info in info.items():
if episode_max_steps == 0:
@@ -177,7 +173,6 @@ class ImitationAgent(PPOTrainer):
def _train(self):
import tensorflow as tf
policy = self.get_policy()
# optimizer = tf.keras.optimizers.Adam()
steps = 0
for _ in range(1):
env = self.env._env.rail_env
@@ -195,7 +190,6 @@ class ImitationAgent(PPOTrainer):
episode_num_agents = 0
episode_score = 0
episode_done_agents = 0
# obs = self.env.reset()
done = {}
done["__all__"] = False
@@ -203,7 +197,6 @@ class ImitationAgent(PPOTrainer):
# batch_size = 2
# logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)
# while not done["__all__"]:
for step in range(max_steps):
action_dict = dispatcher.step(env._elapsed_steps)
@@ -214,26 +207,18 @@
if not done.get(a) and obs.get(a) is not None:
active_agents += 1
expert_action = action_dict[a].value
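# action_dict comes from the CellGraphDispatcher; action_dict[a] is presumably a RailEnvActions
# enum, so .value yields the integer action index (assumption).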
# self.model.custom_loss(tf.constant(expert_action),
# {"obs": tf.cast(tf.expand_dims(obs[a],0),tf.float32)})
# self.model.custom_loss(expert_action,{"obs": np.expand_dims(obs[a],0)},)
input_dict = {"obs": np.expand_dims(obs[a],0)}
input_dict['obs_flat'] = input_dict['obs']
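# Build a batch of one observation; the model's forward() also expects an 'obs_flat' entry,
# set here to the same array (assuming an already-flat observation space).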
logits, _ = policy.model.forward(input_dict, [], None)
model_logits = tf.squeeze(logits)
expert_logits = tf.cast(expert_action, tf.int32)
# expert_one_hot = tf.one_hot(expert_logits,num_outputs)
action_dist = Categorical(logits, policy.model.model_config)
imitation_loss += tf.reduce_mean(-action_dist.logp(tf.expand_dims(expert_logits,0)))
imitation_loss = imitation_loss/max(active_agents,1)
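# Imitation loss: mean negative log-likelihood of the expert action under the policy's
# Categorical distribution, normalised by the number of agents still active this step.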
# imitation_loss = tf.nn.softmax_cross_entropy_with_logits(
# labels=expert_logits, logits=model_logits)
gradients = tape.gradient(imitation_loss, policy.model.trainable_variables())
# optimizer.apply_gradients(zip(gradients, policy.model.trainable_variables()))
self.workers.local_worker().apply_gradients(gradients)
weights = ray.put(self.workers.local_worker().get_weights())
@@ -241,14 +226,8 @@ class ImitationAgent(PPOTrainer):
for e in self.workers.remote_workers():
e.set_weights.remote(weights)
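# Gradients are applied on the local worker only; the updated weights are placed in the Ray
# object store and broadcast to every remote rollout worker.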
# grads_and_vars = optimizer.compute_gradients(lambda :imitation_loss, var_list=variables)
# grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
# policy.apply_gradients(grads_and_vars)
# optimizer.apply_gradients(grads_and_vars)
obs, all_rewards, done, info = self.env.step(action_dict)
steps += 1
#super()._train()
for agent, agent_info in info.items():
if episode_max_steps == 0:
@@ -267,8 +246,30 @@ class ImitationAgent(PPOTrainer):
"episode_reward_mean": episode_score,
"timesteps_this_iter": steps,
}
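# Metrics are returned in the standard Trainer result format so that reporter(**result)
# and the configured loggers can consume them.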
# __sphinx_doc_end__
# don't enable yapf after, it's buggy here
def imitation_train_fn(config, reporter=None):
imitation_trainer = ImitationAgent(config,
env="flatland_sparse",)
eval_results_all = pd.DataFrame.from_dict([{'episode_reward_mean': 0, 'episode_completion_mean': 0, 'timesteps_this_iter': 0}])
for i in range(1000):
result = imitation_trainer.train()
if reporter:
reporter(**result)
if i % 10 == 0:
eval_results = imitation_trainer.eval()
print("Eval Results:",eval_results)
eval_results_all = pd.concat([eval_results_all, pd.DataFrame.from_dict([eval_results])])
eval_results_all.to_csv('EvalResults.csv')
checkpoint = imitation_trainer.save()
# TODO: Loads weights but not optimizer state
# Could be done by overriding _save by using model.save_weight(checkpoint)
# Also override _restore. Ideally use workers to save/load weights.
# imitation_trainer.restore(checkpoint)
print("checkpoint saved at", checkpoint)
imitation_trainer.stop()
print("Test: OK")
if __name__ == "__main__":
@@ -292,44 +293,16 @@ if __name__ == "__main__":
webui_host = "0.0.0.0"
# TODO should be in exp['config'] directly
exp['config']['env_config']['yaml_config'] = config_file
exp['loggers'] = [TBXLogger]
exp['loggers'] = [WandbLogger,TBXLogger]
_default_config = with_common_config(
exp["config"])
ray.init(num_cpus=3,num_gpus=0)
imitation_trainer = ImitationAgent(_default_config,
env="flatland_sparse",)
# default_policy=ImitationPolicy,
# get_policy_class=ImitationPolicy)
# env="CartPole-v0")
# trainer = ApexTrainer(_default_config,
# env="flatland_sparse",)
# trainer = PPOTrainer(_default_config,
# env="flatland_sparse",)
for i in range(10):
result = imitation_trainer.train()
if i % 5:
eval_results = imitation_trainer.eval()
print("Eval Results:",eval_results)
checkpoint = imitation_trainer.save()
# TODO: Loads weights but not optimizer state
# Could be done by overriding _save by using model.save_weight(checkpoint)
# Also override _restore. Ideally use workers to save/load weights.
# imitation_trainer.restore(checkpoint)
print("checkpoint saved at", checkpoint)
imitation_trainer.stop()
# registry.register_trainable('ImitationPolicyTrainer',ImitationAgent)
# ImitationPolicyTrainer = build_trainer(
# name="ImitationPolicyTrainer",
# default_policy=ImitationPolicy,
# default_config=_default_config,)
resources = PPOTrainer.default_resource_request(_default_config).to_json()
imitation_train_fn(_default_config)
# tune.run(imitation_train_fn, resources_per_trial=resources, config=_default_config)
print("Test: OK")
\ No newline at end of file