From 8fa14e8211f3f1ac69f7fa9adec26b8a9a7bee2e Mon Sep 17 00:00:00 2001
From: Guillaume Mollard <guillaume.mollard2@gmail.com>
Date: Thu, 13 Jun 2019 23:48:51 +0200
Subject: [PATCH] experiment with predictor and new metrics

---
 RLLib_training/RailEnvRLLibWrapper.py    | 54 ++++++++---------
 RLLib_training/custom_preprocessors.py   | 11 +++-
 .../predictions_test/config.gin          | 13 ++--
 RLLib_training/render_training_result.py | 47 ++++++++++-----
 RLLib_training/train_experiment.py       | 59 ++++++++++++-------
 5 files changed, 112 insertions(+), 72 deletions(-)

diff --git a/RLLib_training/RailEnvRLLibWrapper.py b/RLLib_training/RailEnvRLLibWrapper.py
index 3ae98bf..e36f383 100644
--- a/RLLib_training/RailEnvRLLibWrapper.py
+++ b/RLLib_training/RailEnvRLLibWrapper.py
@@ -52,25 +52,25 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         else:
             obs = self.env.reset()
 
-        predictions = self.env.predict()
-        if predictions != {}:
-            # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
-            # agents at each time step
-            pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
-            pred_dir = [x[:, 2] for x in list(predictions.values())]
+        # predictions = self.env.predict()
+        # if predictions != {}:
+        #     # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
+        #     # agents at each time step
+        #     pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
+        #     pred_dir = [x[:, 2] for x in list(predictions.values())]
 
         o = dict()
 
         for i_agent in range(len(self.env.agents)):
-            if predictions != {}:
-                pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
-
-                agent_id_one_hot = np.zeros(len(self.env.agents))
-                agent_id_one_hot[i_agent] = 1
-                o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
-            else:
-                o[i_agent] = obs[i_agent]
+            # if predictions != {}:
+            #     pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
+            #
+            #     agent_id_one_hot = np.zeros(len(self.env.agents))
+            #     agent_id_one_hot[i_agent] = 1
+            #     o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
+            # else:
+            o[i_agent] = obs[i_agent]
 
         # needed for the renderer
         self.rail = self.env.rail
@@ -95,23 +95,23 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         r = dict()
         o = dict()
 
-        predictions = self.env.predict()
-        if predictions != {}:
-            # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
-            # agents at each time step
-            pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
-            pred_dir = [x[:, 2] for x in list(predictions.values())]
+        # predictions = self.env.predict()
+        # if predictions != {}:
+        #     # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
+        #     # agents at each time step
+        #     pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
+        #     pred_dir = [x[:, 2] for x in list(predictions.values())]
 
         for i_agent in range(len(self.env.agents)):
             if i_agent not in self.agents_done:
-                if predictions != {}:
-                    pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
-                    agent_id_one_hot = np.zeros(len(self.env.agents))
-                    agent_id_one_hot[i_agent] = 1
-                    o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
-                else:
-                    o[i_agent] = obs[i_agent]
+                # if predictions != {}:
+                #     pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
+                #     agent_id_one_hot = np.zeros(len(self.env.agents))
+                #     agent_id_one_hot[i_agent] = 1
+                #     o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
+                # else:
+                o[i_agent] = obs[i_agent]
 
                 r[i_agent] = rewards[i_agent]
                 d[i_agent] = dones[i_agent]
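Note on the block disabled above: before this commit, reset() and step() folded the predictor output into each agent's observation. A minimal standalone sketch of that reshaping, assuming each entry of predictions maps an agent id to a NumPy array of shape (T_pred, k) whose columns 1:3 hold the predicted (x, y) position. The column indices are taken from the code above; pred_dir reads column 2, which overlaps the position slice, which may be one reason the block was parked.

    import numpy as np

    def split_predictions(predictions):
        # Stack the per-agent (x, y) slices into one (n_agents, T_pred, 2) array.
        pred_pos = np.concatenate([[x[:, 1:3]] for x in predictions.values()], axis=0)
        # One direction series per agent (column 2, as in the commented-out code).
        pred_dir = [x[:, 2] for x in predictions.values()]
        return pred_pos, pred_dir

    # Hypothetical usage: two agents, three predicted time steps, five columns each.
    preds = {0: np.arange(15.).reshape(3, 5), 1: np.arange(15., 30.).reshape(3, 5)}
    pos, dirs = split_predictions(preds)
    print(pos.shape)  # (2, 3, 2)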
diff --git a/RLLib_training/custom_preprocessors.py b/RLLib_training/custom_preprocessors.py
index bd45dfd..6f9cad7 100644
--- a/RLLib_training/custom_preprocessors.py
+++ b/RLLib_training/custom_preprocessors.py
@@ -23,7 +23,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -38,7 +38,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returnes normalized and clipped observatoin
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
@@ -49,12 +50,16 @@ class CustomPreprocessor(Preprocessor):
     def _init_shape(self, obs_space, options):
+        return sum([space.shape[0] for space in obs_space]),
         # return (sum([space.shape[0] for space in obs_space]), )
-        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
+        # return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
 
     def transform(self, observation):
+        return norm_obs_clip(observation)
+        return np.concatenate([norm_obs_clip(observation[0]), norm_obs_clip(observation[1])])
         # if len(observation) == 111:
         #     return np.concatenate([norm_obs_clip(obs) for obs in observation])
+        # print('OBSERVATION:', observation, len(observation[0]))
         return np.concatenate([norm_obs_clip(observation[0]), observation[1], observation[
             2].flatten()])  #, norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
         #one_hot = observation[-3:]
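The two normalization tweaks above interact: with min_lt now matching values equal to val, and min_obs capped at max_obs instead of floored at 0, an observation with no finite entries degenerates to max_obs == min_obs and takes the divide-by-max branch instead of producing inf. A self-contained sketch, under the assumption that max_lt/min_lt scan for the largest value strictly below and the smallest value at or above a threshold (their full bodies are only partially visible in this hunk):

    import numpy as np

    def max_lt(seq, val):
        # Largest element of seq strictly below val (assumed counterpart of min_lt).
        best = 0
        for x in seq:
            if x < val and x > best:
                best = x
        return best

    def min_lt(seq, val):
        # Smallest element of seq at or above val (>= as of this commit).
        best = np.inf
        for x in seq:
            if x >= val and x < best:
                best = x
        return best

    obs = [np.inf, np.inf]                      # e.g. a tree branch with no valid entries
    max_obs = max(1, max_lt(obs, 1000))         # 1
    min_obs = min(max_obs, min_lt(obs, 0))      # inf is capped at max_obs ...
    assert max_obs == min_obs                   # ... so the degenerate branch runs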
diff --git a/RLLib_training/experiment_configs/predictions_test/config.gin b/RLLib_training/experiment_configs/predictions_test/config.gin
index 6d66424..b5923df 100644
--- a/RLLib_training/experiment_configs/predictions_test/config.gin
+++ b/RLLib_training/experiment_configs/predictions_test/config.gin
@@ -1,14 +1,14 @@
 run_experiment.name = "memory_experiment_results"
 run_experiment.num_iterations = 2002
 run_experiment.save_every = 50
-run_experiment.hidden_sizes = {"grid_search": [[32, 32], [64, 64], [128, 128]]}
+run_experiment.hidden_sizes = [32, 32]
 
-run_experiment.map_width = 8
+run_experiment.map_width = {"grid_search": [8, 10, 12, 14]}
 run_experiment.map_height = 8
 run_experiment.n_agents = 3
 run_experiment.rail_generator = "complex_rail_generator"
-run_experiment.nr_extra = 5
-run_experiment.policy_folder_name = "ppo_policy_with_pred_hidden_size_{config[hidden_sizes][0]}_entropy_coeff_{config[entropy_coeff]}_"
+run_experiment.nr_extra = 1
+run_experiment.policy_folder_name = "ppo_policy_with_pred_map_size_{config[map_width]}"
 
 run_experiment.horizon = 50
 run_experiment.seed = 123
@@ -18,10 +18,11 @@
 run_experiment.conv_model = False
 run_experiment.obs_builder = @TreeObsForRailEnv()
 TreeObsForRailEnv.max_depth = 2
+TreeObsForRailEnv.predictor = @DummyPredictorForRailEnv()
 LocalObsForRailEnv.view_radius = 5
 
-run_experiment.entropy_coeff = {"grid_search": [1e-4, 1e-3, 1e-2]}
+run_experiment.entropy_coeff = 1e-3
 run_experiment.kl_coeff = 0.2
 run_experiment.lambda_gae = 0.9
-run_experiment.predictor = "dummy_predictor"
+#run_experiment.predictor = "dummy_predictor"
 run_experiment.step_memory = 1
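The sweep moves from network size and entropy coefficient to map size. For reference, a {"grid_search": [...]} value in the gin file is Ray Tune's dict form of a grid search, so the config above expands to four trials. A rough plain-Tune equivalent (names mirror the gin bindings above):

    from ray import tune

    config = {
        "map_width": tune.grid_search([8, 10, 12, 14]),  # swept, one trial each
        "hidden_sizes": [32, 32],                        # fixed in this commit
        "entropy_coeff": 1e-3,                           # fixed in this commit
    }
    # tune.run(train, config=config, ...) then launches the four-trial sweep;
    # map_height is copied from map_width inside train() (see train_experiment.py).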
diff --git a/RLLib_training/render_training_result.py b/RLLib_training/render_training_result.py
index 5f0159f..1719f44 100644
--- a/RLLib_training/render_training_result.py
+++ b/RLLib_training/render_training_result.py
@@ -50,11 +50,11 @@ ModelCatalog.register_custom_model("conv_model", ConvModelGlobalObs)
 
 ray.init()#object_store_memory=150000000000, redis_max_memory=30000000000)
 
-CHECKPOINT_PATH = '/home/guillaume/Desktop/distMAgent/env_complexity_benchmark/' \
-                  'ppo_policy_nr_extra_10_0qxx0qy_/checkpoint_1001/checkpoint-1001'
+CHECKPOINT_PATH = '/home/guillaume/Desktop/distMAgent/experiment_agent_memory/' \
+                  'ppo_policy_hidden_size_32_entropy_coeff_0.0001_mu413rlu/checkpoint_201/checkpoint-201'
 
 N_EPISODES = 10
-N_STEPS_PER_EPISODE = 80
+N_STEPS_PER_EPISODE = 50
 
 
 def render_training_result(config):
@@ -65,16 +65,24 @@ def render_training_result(config):
 
     # Example configuration to generate a random rail
     env_config = {"width": config['map_width'],
                   "height": config['map_height'],
-                  "rail_generator": config["rail_generator"],
+                  "rail_generator": "load_env",#config["rail_generator"],
                   "nr_extra": config["nr_extra"],
                   "number_of_agents": config['n_agents'],
                   "seed": config['seed'],
-                  "obs_builder": config['obs_builder']}
-
+                  "obs_builder": config['obs_builder'],
+                  "predictor": config["predictor"],
+                  "step_memory": config["step_memory"]}
 
     # Observation space and action space definitions
     if isinstance(config["obs_builder"], TreeObsForRailEnv):
-        obs_space = gym.spaces.Box(low=-1, high=1, shape=(147,))
+        if config['predictor'] is None:
+            obs_space = gym.spaces.Tuple(
+                (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
+        else:
+            obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
+                                          gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+                                          gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+                                             'step_memory'])
         preprocessor = "tree_obs_prep"
 
     elif isinstance(config["obs_builder"], GlobalObsForRailEnv):
@@ -109,7 +117,7 @@ def render_training_result(config):
     else:
         raise ValueError("Undefined observation space")
 
-    act_space = gym.spaces.Discrete(4)
+    act_space = gym.spaces.Discrete(5)
 
     # Dict with the different policies to train
     policy_graphs = {
@@ -131,10 +139,11 @@ def render_training_result(config):
                                    "policies_to_train": list(policy_graphs.keys())}
 
     trainer_config["horizon"] = config['horizon']
+
     trainer_config["num_workers"] = 0
-    trainer_config["num_cpus_per_worker"] = 3
-    trainer_config["num_gpus"] = 0
-    trainer_config["num_gpus_per_worker"] = 0
+    trainer_config["num_cpus_per_worker"] = 4
+    trainer_config["num_gpus"] = 0.2
+    trainer_config["num_gpus_per_worker"] = 0.2
     trainer_config["num_cpus_for_driver"] = 1
     trainer_config["num_envs_per_worker"] = 1
     trainer_config['entropy_coeff'] = config['entropy_coeff']
@@ -145,17 +154,20 @@ def render_training_result(config):
     trainer_config['log_level'] = 'WARN'
     trainer_config['num_sgd_iter'] = 10
     trainer_config['clip_param'] = 0.2
+    trainer_config['kl_coeff'] = config['kl_coeff']
+    trainer_config['lambda'] = config['lambda_gae']
 
     env = RailEnvRLLibWrapper(env_config)
 
     trainer = Trainer(env=RailEnvRLLibWrapper, config=trainer_config)
+    print('hidden sizes:', config['hidden_sizes'])
 
     trainer.restore(CHECKPOINT_PATH)
 
     policy = trainer.get_policy(config['policy_folder_name'].format(**locals()))
 
-    preprocessor = CustomPreprocessor(gym.spaces.Box(low=-1, high=1, shape=(147,)))
+    preprocessor = CustomPreprocessor(obs_space)
     env_renderer = RenderTool(env, gl="PIL")
     for episode in range(N_EPISODES):
         observation = env.reset()
@@ -184,7 +196,8 @@ def render_training_result(config):
 @gin.configurable
 def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                    map_width, map_height, horizon, policy_folder_name, local_dir, obs_builder,
-                   entropy_coeff, seed, conv_model, rail_generator, nr_extra):
+                   entropy_coeff, seed, conv_model, rail_generator, nr_extra, kl_coeff,
+                   lambda_gae, predictor, step_memory):
 
     render_training_result(
         config={"n_agents": n_agents,
@@ -200,13 +213,17 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                 "seed": seed,
                 "conv_model": conv_model,
                 "rail_generator": rail_generator,
-                "nr_extra": 10# nr_extra
+                "nr_extra": nr_extra,
+                "kl_coeff": kl_coeff,
+                "lambda_gae": lambda_gae,
+                "predictor": predictor,
+                "step_memory": step_memory
                 }
     )
 
 
 if __name__ == '__main__':
     gin.external_configurable(tune.grid_search)
-    dir = '/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/env_complexity_benchmark'  # To Modify
+    dir = '/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/experiment_agent_memory'  # To Modify
     gin.parse_config_file(dir + '/config.gin')
     run_experiment(local_dir=dir)
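For readers reconstructing the shapes: the Tuple space above concatenates, per remembered step, the 147-entry tree observation, an agent one-hot of length n_agents, and a 20 x n_agents prediction grid (20 appears to be the predictor's time horizon; that reading is an assumption). A compact sketch of the same construction:

    import gym

    def make_obs_space(n_agents, step_memory, predictor):
        tree = gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,))
        if predictor is None:
            return gym.spaces.Tuple((tree,) * step_memory)
        one_hot = gym.spaces.Box(low=0, high=1, shape=(n_agents,))
        pred = gym.spaces.Box(low=0, high=1, shape=(20, n_agents))  # assumed 20-step horizon
        return gym.spaces.Tuple((tree, one_hot, pred) * step_memory)

    space = make_obs_space(n_agents=3, step_memory=1, predictor="dummy")
    print(len(space.spaces))  # 3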
diff --git a/RLLib_training/train_experiment.py b/RLLib_training/train_experiment.py
index 5e4c2bc..b674d40 100644
--- a/RLLib_training/train_experiment.py
+++ b/RLLib_training/train_experiment.py
@@ -34,6 +34,7 @@ gin.external_configurable(LocalObsForRailEnv)
 gin.external_configurable(GlobalObsForRailEnvDirectionDependent)
 
 from ray.rllib.models.preprocessors import TupleFlatteningPreprocessor
+import numpy as np
 
 ModelCatalog.register_custom_preprocessor("tree_obs_prep", CustomPreprocessor)
 ModelCatalog.register_custom_preprocessor("global_obs_prep", TupleFlatteningPreprocessor)
@@ -51,18 +52,25 @@ def on_episode_start(info):
     episode.horizon = map_width + map_height
 
 
-def on_episode_step(info):
-    episode = info['episode']
+# def on_episode_step(info):
+#     episode = info['episode']
+#     print('#########################', episode._agent_reward_history)
+#     # print(ds)
 
 
 def on_episode_end(info):
     episode = info['episode']
-
+    score = 0
+    for k, v in episode._agent_reward_history.items():
+        score += np.mean(v)
+    score /= (len(episode._agent_reward_history) * 1.5 * episode.horizon)
+    episode.custom_metrics["score"] = score
 
 
 def train(config, reporter):
     print('Init Env')
 
     set_seed(config['seed'], config['seed'], config['seed'])
+    config['map_height'] = config['map_width']
 
     # Example configuration to generate a random rail
     env_config = {"width": config['map_width'],
@@ -72,19 +80,23 @@ def train(config, reporter):
                   "number_of_agents": config['n_agents'],
                   "seed": config['seed'],
                   "obs_builder": config['obs_builder'],
-                  "predictor": config["predictor"],
+                  # "predictor": config["predictor"],
                   "step_memory": config["step_memory"]}
 
     # Observation space and action space definitions
     if isinstance(config["obs_builder"], TreeObsForRailEnv):
-        if config['predictor'] is None:
-            obs_space = gym.spaces.Tuple(
-                (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
-        else:
-            obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
-                                          gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
-                                          gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
-                                             'step_memory'])
+        obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(168,)), ))
+        #                               gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+        #                               gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+        #                                  'step_memory'])
+        # if config['predictor'] is None:
+        #     obs_space = gym.spaces.Tuple(
+        #         (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
+        # else:
+        #     obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
+        #                                   gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+        #                                   gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+        #                                      'step_memory'])
         preprocessor = "tree_obs_prep"
 
     elif isinstance(config["obs_builder"], GlobalObsForRailEnv):
@@ -139,7 +151,7 @@ def train(config, reporter):
     trainer_config['multiagent'] = {"policy_graphs": policy_graphs,
                                     "policy_mapping_fn": policy_mapping_fn,
                                     "policies_to_train": list(policy_graphs.keys())}
-    trainer_config["horizon"] = config['horizon']
+    trainer_config["horizon"] = 1.5 * (config['map_width'] + config['map_height'])#config['horizon']
 
     trainer_config["num_workers"] = 0
     trainer_config["num_cpus_per_worker"] = 4
@@ -157,6 +169,10 @@ def train(config, reporter):
     trainer_config['clip_param'] = 0.2
     trainer_config['kl_coeff'] = config['kl_coeff']
     trainer_config['lambda'] = config['lambda_gae']
+    trainer_config['callbacks'] = {
+        "on_episode_start": tune.function(on_episode_start),
+        "on_episode_end": tune.function(on_episode_end)
+    }
 
     def logger_creator(conf):
         """Creates a Unified logger with a default logdir prefix
@@ -187,7 +203,7 @@ def train(config, reporter):
 def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                    map_width, map_height, horizon, policy_folder_name, local_dir, obs_builder,
                    entropy_coeff, seed, conv_model, rail_generator, nr_extra, kl_coeff, lambda_gae,
-                   predictor, step_memory):
+                   step_memory):
     tune.run(
         train,
         name=name,
@@ -208,12 +224,12 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
             "nr_extra": nr_extra,
             "kl_coeff": kl_coeff,
             "lambda_gae": lambda_gae,
-            "predictor": predictor,
+            # "predictor": predictor,
             "step_memory": step_memory
         },
         resources_per_trial={
-            "cpu": 5,
-            "gpu": 0.2
+            "cpu": 2,
+            "gpu": 0
         },
         verbose=2,
         local_dir=local_dir
@@ -222,8 +238,9 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
 
 if __name__ == '__main__':
     gin.external_configurable(tune.grid_search)
-    with path('RLLib_training.experiment_configs.experiment_agent_memory', 'config.gin') as f:
-        gin.parse_config_file(f)
-
-    dir = os.path.join(__file_dirname__, 'experiment_configs', 'experiment_agent_memory')
+    # with path('RLLib_training.experiment_configs.n_agents_experiment', 'config.gin') as f:
+    #     gin.parse_config_file(f)
+    gin.parse_config_file('/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/predictions_test/config.gin')
+    dir = '/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/predictions_test'
+    # dir = os.path.join(__file_dirname__, 'experiment_configs', 'experiment_agent_memory')
     run_experiment(local_dir=dir)
-- 
GitLab
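The commit's new metric, for reference: on_episode_end sums each agent's mean per-step reward and divides by n_agents * 1.5 * episode.horizon, where on_episode_start set episode.horizon = map_width + map_height. A worked example (the reward values are made up; _agent_reward_history is RLlib's internal per-agent reward log):

    import numpy as np

    # Hypothetical 3-step episode on an 8x8 map (horizon = 8 + 8 = 16).
    agent_reward_history = {0: [-1, -1, 0], 1: [-1, 0, 0]}
    horizon = 16

    score = sum(np.mean(v) for v in agent_reward_history.values())
    score /= len(agent_reward_history) * 1.5 * horizon
    print(round(score, 4))  # -0.0208; closer to 0 means higher average per-step reward

Because the trainer's rollout horizon is set to 1.5 * (map_width + map_height) in train(), this normalization makes the "score" comparable across the map sizes swept in config.gin.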