Commit 9a904072 authored by nilabha

Merge branch 'flatland-paper-baselines' into 'master'

Flatland paper baselines

See merge request !17
parents f8d0c32b 8450fb5c
Pipeline #5369 passed in 4 minutes and 14 seconds
from .registry import CUSTOM_ALGORITHMS
import numpy as np
import pandas as pd
import os
import sys
import math
import logging
import yaml
from pathlib import Path

import ray
from ray import tune
from ray.cluster_utils import Cluster
from ray.tune import registry, run_experiments
from ray.tune.trainable import Trainable
from ray.tune.logger import TBXLogger
from ray.tune.resources import resources_to_json
from ray.tune.tune import _make_scheduler

from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn import ApexTrainer, DQNTrainer
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.policy import Policy, TFPolicy
from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.optimizers import PolicyOptimizer, SyncSamplesOptimizer
from ray.rllib.evaluation import MultiAgentEpisode
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches
from ray.rllib.execution.train_ops import TrainOneStep
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch

from flatland.envs.agent_utils import RailAgentStatus

from utils.argparser import create_parser
from utils.loader import load_envs, load_models, load_algorithms
from envs.flatland import get_eval_config
# Custom wandb logger with hotfix to allow custom callbacks
from wandblogger import WandbLogger

"""
Note: This implementation has been adapted from:
https://github.com/ray-project/ray/blob/master/rllib/contrib/random_agent/random_agent.py
"""

tf = try_import_tf()

logger = logging.getLogger(__name__)

class CustomAgent(PPOTrainer):
    """Trainer that simply plays back the action contained in each agent's
    observation (e.g. the `shortest_path_action` observation) and never
    learns. Adapted from RLlib's random_agent example."""

    _name = "CustomAgent"

    @override(Trainer)
    def _init(self, config, env_creator):
        self.env = env_creator(config["env_config"])
        self.state = {}
        action_space = self.env.action_space
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.workers = self._make_workers(
            env_creator, self._policy, config, self.config["num_workers"])

    @override(Trainer)
    def restore(self, checkpoint_path):
        pass

    @override(Trainer)
    def compute_action(self,
                       observation,
                       state=None,
                       prev_action=None,
                       prev_reward=None,
                       info=None,
                       policy_id=DEFAULT_POLICY_ID,
                       full_fetch=False,
                       explore=None):
        # The observation already encodes the expert action, so return it as-is.
        return observation

    @override(Trainer)
    def _train(self):
        import tensorflow as tf
        policy = self.get_policy()
        steps = 0
        n_episodes = 1
        for _ in range(n_episodes):
            env = self.env._env.rail_env
            obs = self.env.reset()
            num_outputs = env.action_space[0]
            n_agents = env.get_num_agents()
            # TODO: Update max_steps as per the latest version
            # https://gitlab.aicrowd.com/flatland/flatland-examples/blob/master/reinforcement_learning/multi_agent_training.py
            # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) - 1
            max_steps = int(4 * 2 * (20 + env.height + env.width))
            episode_steps = 0
            episode_max_steps = 0
            episode_num_agents = 0
            episode_score = 0
            episode_done_agents = 0
            done = {}
            done["__all__"] = False

            for step in range(max_steps):
                # Play the expert action for every agent; default to 2
                # (move forward) when an agent has no observation.
                action_dict = {i: obs.get(i, 2) for i in range(n_agents)}
                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1

                for agent, agent_info in info.items():
                    if agent_info["agent_done"]:
                        episode_done_agents += 1

                if done["__all__"]:
                    for agent, agent_info in info.items():
                        if episode_max_steps == 0:
                            episode_max_steps = agent_info["max_episode_steps"]
                            episode_num_agents = agent_info["num_agents"]
                        episode_steps = max(episode_steps, agent_info["agent_step"])
                        episode_score += agent_info["agent_score"]
                    print(float(episode_done_agents) / episode_num_agents)
                    break

        norm_factor = 1.0 / (episode_max_steps * episode_num_agents)
        result = {
            "expert_episode_reward_mean": episode_score,
            "episode_reward_mean": episode_score,
            "expert_episode_completion_mean": float(episode_done_agents) / episode_num_agents,
            "expert_episode_score_normalized": episode_score * norm_factor,
            "episodes_this_iter": n_episodes,
            "timesteps_this_iter": steps,
        }

        # Code taken from the _train method of trainer_template.py - TODO: Not working
        # res = self.collect_metrics()
        # res = {}
        # res.update(
        #     optimizer_steps_this_iter=steps,
        #     episode_reward_mean=episode_score,
        #     info=res.get("info", {}))
        # res.update(expert_scores=result)

        return result
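
# Hedged alternative sketch: the completion count in `_train` comes from the
# per-agent `info` dicts of the env wrapper. Assuming the Flatland 2.x
# RailAgentStatus API (imported above), the same number could be read directly
# from the underlying rail env; the wrapper-based count is what the trainer
# actually reports:
#
#   rail_env = self.env._env.rail_env
#   done_agents = sum(
#       agent.status in (RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED)
#       for agent in rail_env.agents)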

if __name__ == "__main__":
    # Copy this file to the root folder to run
    from train import on_episode_end

    exp = {}
    exp['run'] = "CustomAgent"
    exp['env'] = "flatland_sparse"
    # exp['stop'] = {"timesteps_total": 15000}
    exp['stop'] = {"iterations": 4}
    exp['checkpoint_freq'] = 2
    # exp['checkpoint_at_end'] = True
    # exp['keep_checkpoints_num'] = 100
    # exp['checkpoint_score_attr'] = "episode_reward_mean"
    # exp['num_samples'] = 3

    config = {
        "num_workers": 1,
        "num_envs_per_worker": 1,
        "num_gpus": 0,
        "clip_rewards": False,
        "vf_clip_param": 500.0,
        "entropy_coeff": 0.01,
        # effective batch_size: train_batch_size * num_agents_in_each_environment [5, 10]
        # see https://github.com/ray-project/ray/issues/4628
        "train_batch_size": 1000,  # 5000
        "rollout_fragment_length": 50,  # 100
        "sgd_minibatch_size": 100,  # 500
        "vf_share_layers": False,
        "env_config": {
            "observation": "shortest_path_action",
            "generator": "sparse_rail_generator",
            "generator_config": "small_v0",
            "render": "human"
        },
        "model": {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [256, 256],
            "vf_share_layers": True
        }
    }

    exp['config'] = config
    exp['config']['callbacks'] = {
        'on_episode_end': on_episode_end,
    }

    eval_configs = get_eval_config(
        exp['config'].get('env_config', {}).get('eval_generator', "default"))
    eval_seed = eval_configs.get('evaluation_config', {}).get('env_config', {}).get('seed')

    # Add the evaluation config to the current config
    exp['config'] = merge_dicts(exp['config'], eval_configs)
    if exp['config'].get('evaluation_config'):
        exp['config']['evaluation_config']['env_config'] = exp['config'].get('env_config')
        eval_env_config = exp['config']['evaluation_config'].get('env_config')
        if eval_seed and eval_env_config:
            # We override the env seed from the evaluation config
            eval_env_config['seed'] = eval_seed

    exp["config"]["eager"] = True
    exp["config"]["use_pytorch"] = False
    exp["config"]["log_level"] = "INFO"
    verbose = 2
    exp["config"]["eager_tracing"] = True
    webui_host = "0.0.0.0"

    # TODO: should be in exp['config'] directly
    exp['config']['env_config']['yaml_config'] = config
    exp['loggers'] = [TBXLogger]

    _default_config = with_common_config(exp["config"])

    ray.init(num_cpus=4, num_gpus=0)

    trainer = CustomAgent(_default_config, env=exp['env'])
    # trainer = PPOTrainer(_default_config, env="flatland_sparse")

    for i in range(exp.get("stop", {}).get("iterations", 5)):
        result = trainer.train()
        print("Results:", result)

    trainer.stop()
    print("Test: OK")
"""
Registry of custom implemented algorithms names
Please refer to the following examples to add your custom algorithms :
- AlphaZero : https://github.com/ray-project/ray/tree/master/rllib/contrib/alpha_zero
- bandits : https://github.com/ray-project/ray/tree/master/rllib/contrib/bandits
- maddpg : https://github.com/ray-project/ray/tree/master/rllib/contrib/maddpg
- random_agent: https://github.com/ray-project/ray/tree/master/rllib/contrib/random_agent
An example integration of the random agent is shown here :
- https://github.com/AIcrowd/neurips2020-procgen-starter-kit/tree/master/algorithms/custom_random_agent
"""
def _import_imitation_trainer():
from .imitation_agent.imitation_trainer import ImitationAgent
return ImitationAgent
def _import_custom_trainer():
from .custom_agent.custom_trainer import CustomAgent
return CustomAgent
CUSTOM_ALGORITHMS = {
"ImitationAgent": _import_imitation_trainer,
"CustomAgent": _import_custom_trainer
}
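
# Hedged sketch of how this registry is typically consumed: a loader along the
# lines of utils.loader.load_algorithms can register every factory with Tune so
# the names become usable in experiment YAMLs ("run: ImitationAgent" /
# "run: CustomAgent"). The exact helper in this repo may differ; the Ray call
# itself (ray.tune.registry.register_trainable) is standard:
#
#   from ray.tune import registry
#
#   def register_custom_algorithms(algorithms=CUSTOM_ALGORITHMS):
#       for name, get_trainer in algorithms.items():
#           registry.register_trainable(name, get_trainer())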
@@ -3,11 +3,13 @@ apex-tree-obs-small-v0:
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7
    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 15
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
@@ -21,8 +23,8 @@ apex-tree-obs-small-v0:
            generator_config: small_v0
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "apex"]  # TODO should be set programmatically
        model:
            ...
@@ -3,11 +3,13 @@ apex-tree-obs-small-v0-skip:
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7
    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 15
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
        gamma: 0.99
@@ -26,8 +28,8 @@ apex-tree-obs-small-v0-skip:
            discounting: 0.99  # TODO set automatically, should be equal to gamma
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "apex", "skip"]  # TODO should be set programmatically
        model:
            ...
@@ -3,11 +3,13 @@ ppo-tree-obs-small-v0:
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7
    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 15
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
@@ -31,8 +33,8 @@ ppo-tree-obs-small-v0:
            generator_config: small_v0
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "ppo"]  # TODO should be set programmatically
        model:
            ...
@@ -3,11 +3,13 @@ sparse-mask-ppo-tree-obs-small-v0:
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7
    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 15
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
        gamma: 0.99
@@ -35,8 +37,8 @@ sparse-mask-ppo-tree-obs-small-v0:
            allow_noop: False
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "ppo", "mask"]  # TODO should be set programmatically
        model:
            ...
@@ -3,11 +3,13 @@ ppo-tree-obs-small-v0-skip:
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7
    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 15
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
        gamma: 0.99
@@ -36,8 +38,8 @@ ppo-tree-obs-small-v0-skip:
            discounting: 0.99  # TODO set automatically, should be equal to gamma
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "ppo", "skip"]  # TODO should be set programmatically
        model:
            ...
@@ -2,13 +2,14 @@ flatland-sparse-small-tree-fc-apex:
    run: APEX
    env: flatland_sparse
    stop:
-       timesteps_total: 5000000  # 5e6
-   checkpoint_freq: 10
+       timesteps_total: 15000000  # 1.5e7
+   checkpoint_freq: 50
    checkpoint_at_end: True
-   keep_checkpoints_num: 5
+   keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean
    num_samples: 3
    config:
-       num_workers: 3
+       num_workers: 13
        num_envs_per_worker: 5
        num_gpus: 0
@@ -22,8 +23,8 @@ flatland-sparse-small-tree-fc-apex:
            generator_config: small_v0
            wandb:
-               project: flatland
-               entity: masterscrat
+               project: flatland-paper
+               entity: aicrowd
                tags: ["small_v0", "tree_obs", "apex"]
        model:
            ...
flatland-random-sparse-small-tree-fc-cctransformer:
    run: CcTransformer
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7

    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean

    num_samples: 3

    config:
        clip_rewards: True
        clip_param: 0.1

        # effective batch_size: train_batch_size * num_agents_in_each_environment [5, 10]
        # see https://github.com/ray-project/ray/issues/4628
        train_batch_size: 1000  # 5000
        rollout_fragment_length: 50  # 100
        sgd_minibatch_size: 100  # 500
        num_sgd_iter: 10
        num_workers: 2
        num_envs_per_worker: 5
        batch_mode: complete_episodes
        vf_share_layers: True
        num_gpus: 1

        env_config:
            observation: tree
            sparse_reward: True
            done_reward: 1
            not_finished_reward: -1
            observation_config:
                max_depth: 2
                shortest_path_max_depth: 30

            generator: sparse_rail_generator
            generator_config: small_v0
            eval_generator: enable_explore

            wandb:
                project: flatland-paper
                entity: aicrowd
                tags: ["small_v0", "tree_obs", "ccppo"]  # TODO should be set programmatically

        model:
            custom_model: cc_transformer
            custom_options:
                max_num_agents: 15
                actor:
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
                critic:
                    centralized: True
                    embedding_size: 32
                    num_heads: 4
                    d_model: 32
                    use_scale: True
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
                embedding:
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
            fcnet_activation: relu
            fcnet_hiddens: [512, 512, 512]
            vf_share_layers: True  # False
flatland-random-sparse-small-tree-fc-cctransformer:
    run: CcConcatenate
    env: flatland_sparse
    stop:
        timesteps_total: 15000000  # 1.5e7

    checkpoint_freq: 50
    checkpoint_at_end: True
    keep_checkpoints_num: 100000000
    checkpoint_score_attr: episode_reward_mean

    num_samples: 3

    config:
        clip_rewards: True
        clip_param: 0.1

        # effective batch_size: train_batch_size * num_agents_in_each_environment [5, 10]
        # see https://github.com/ray-project/ray/issues/4628
        train_batch_size: 1000  # 5000
        rollout_fragment_length: 50  # 100
        sgd_minibatch_size: 100  # 500
        num_sgd_iter: 10
        num_workers: 2
        num_envs_per_worker: 5
        batch_mode: complete_episodes
        vf_share_layers: True
        num_gpus: 1

        env_config:
            observation: tree
            observation_config:
                max_depth: 2
                shortest_path_max_depth: 30

            # skip_no_choice_cells: True
            # accumulate_skipped_rewards: True
            generator: sparse_rail_generator
            generator_config: small_v0
            eval_generator: enable_explore

            wandb:
                project: flatland-paper
                entity: aicrowd
                tags: ["small_v0", "tree_obs", "ccppo", "transformer"]  # TODO should be set programmatically

        model:
            custom_model: cc_concatenate
            custom_options:
                max_num_agents: 15
                actor:
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
                critic:
                    centralized: True
                    embedding_size: 32
                    num_heads: 4
                    d_model: 32
                    use_scale: True
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
                embedding:
                    activation_fn: relu
                    hidden_layers:
                        - 512
                        - 512
                        - 512
            fcnet_activation: relu
            fcnet_hiddens: [512, 512, 512]
            vf_share_layers: True  # False
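
# Hedged usage sketch: experiment YAMLs like the two above are normally passed
# to the repo's train.py, which registers the custom envs, models and the
# CcTransformer/CcConcatenate trainers before handing the spec to Tune. A
# minimal equivalent in plain Ray, assuming that registration has already
# happened and using an illustrative file name, would be:
#
#   import yaml
#   from ray.tune import run_experiments
#
#   with open("ccppo.yaml") as f:              # hypothetical file name
#       experiments = yaml.safe_load(f)
#   for exp in experiments.values():           # Tune expects env inside config
#       exp["config"]["env"] = exp.pop("env")
#   run_experiments(experiments)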