Commit 734be92c authored by nilabha

Add more loss functions, dqfd loss implementation

parent efcb9c75
flatland-random-sparse-small-tree-fc-ppo:
flatland-random-sparse-small-tree-fc-apex-il:
    run: APEX
    env: flatland_sparse
    stop:
@@ -25,15 +25,18 @@ flatland-random-sparse-small-tree-fc-ppo:
            generator_config: small_v0
            wandb:
                project: flatland
                entity: masterscrat
                project: nilabha2007
                entity: neurips2020-flatland-baselines
                tags: ["small_v0", "tree_obs", "APEX_DQfD"]  # TODO should be set programmatically
        model:
            custom_model: custom_loss_model
            fcnet_activation: relu
            fcnet_hiddens: [256, 256]
            vf_share_layers: True  # False
            custom_options:
                input_files: /tmp/flatland-out
                loss: dqfd  # ce (cross entropy), kl (KL divergence) or dqfd (DQfD margin loss)
                lambda1: 1
                lambda2: 1
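For reference, the custom_options above are read back inside the model's custom_loss in the Python diff below. A minimal sketch of the lookups, assuming the same keys (note that the code below also reads an expert_size option that is not shown in this config):

    opts = self.model_config["custom_options"]
    reader = JsonReader(opts["input_files"])              # offline expert demonstrations
    input_ops = reader.tf_input_ops(opts["expert_size"])  # stream of expert transitions
    loss_type = opts.get("loss", "ce")                    # "ce", "kl" or "dqfd"
    total_loss = opts["lambda1"] * policy_loss + opts["lambda2"] * imitation_loss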
@@ -35,7 +35,7 @@ class CustomLossModel(TFModelV2):
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.model_config["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()
        input_ops = reader.tf_input_ops(self.model_config["custom_options"]["expert_size"])
        # define a secondary loss by building a graph copy with weight sharing
        obs = restore_original_dimensions(
@@ -47,15 +47,38 @@ class CustomLossModel(TFModelV2):
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
        # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
        # compute the IL loss
        action_dist = Categorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        total_loss = policy_loss + self.model_config["custom_options"]["lambda1"]\
            * policy_loss + self.model_config["custom_options"]["lambda2"]\
        (action_scores, model_logits, dist) = self.get_q_value_distributions(logits)
        model_logits = tf.squeeze(model_logits)
        action_dist = Categorical(model_logits, self.model_config)
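        # input_ops["actions"] holds the expert actions read from the offline JSON data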
        expert_logits = tf.cast(input_ops["actions"], tf.int32)
        expert_action = tf.math.argmax(expert_logits)
        expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
        model_action = action_dist.deterministic_sample()
        model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
        model_expert = model_action_one_hot * expert_action_one_hot
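        # model_expert is 1.0 only at the index where the model's greedy action matches the expert action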
        imitation_loss = 0
        loss_type = self.model_config["custom_options"].get("loss", "ce")
        if loss_type == "ce":
            imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
        elif loss_type == "kl":
            expert_dist = Categorical(
                tf.one_hot(expert_logits, self.num_outputs), self.model_config)
            imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
        elif loss_type == "dqfd":
            max_value = float("-inf")
            Q_select = model_logits  # TODO: difference in action_scores, dist and logits
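            # take the max over actions of Q plus a margin of 0.8 added at the positions flagged by model_expert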
            for a in range(self.num_outputs):
                max_value = tf.maximum(
                    Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                    max_value)
            imitation_loss = tf.reduce_mean(
                1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))
        self.imitation_loss = imitation_loss
        total_loss = self.model_config["custom_options"]["lambda1"] * policy_loss \
            + self.model_config["custom_options"]["lambda2"] * self.imitation_loss
        return total_loss
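For reference, the dqfd branch is based on the large-margin classification loss from Deep Q-learning from Demonstrations (Hester et al.), usually written as J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E), where the margin l(a_E, a) is 0 for the expert action a_E and a positive constant (0.8 above) otherwise. A minimal, hypothetical vectorized sketch of that textbook form, reusing the tensor names from the diff rather than the exact loop above:

    margin = 0.8 * (1.0 - expert_action_one_hot)                          # zero margin at the expert action
    max_value = tf.reduce_max(Q_select + margin, axis=-1)                 # max_a [Q(s, a) + l(a_E, a)]
    expert_q = tf.reduce_sum(Q_select * expert_action_one_hot, axis=-1)   # Q(s, a_E)
    dqfd_margin_loss = tf.reduce_mean(max_value - expert_q)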