diff --git a/RLLib_training/RailEnvRLLibWrapper.py b/RLLib_training/RailEnvRLLibWrapper.py
index 6d34e95e6b1e0fdada612fffcbd00c6e18a5ba8e..0fd0f59727bb384a4545b8189e7844d8540367b1 100644
--- a/RLLib_training/RailEnvRLLibWrapper.py
+++ b/RLLib_training/RailEnvRLLibWrapper.py
@@ -5,6 +5,8 @@
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.utils.seed import seed as set_seed
 
+
+
 class RailEnvRLLibWrapper(MultiAgentEnv):
 
     def __init__(self, config):
@@ -18,13 +20,15 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         self.predefined_env = False
 
         if config['rail_generator'] == "complex_rail_generator":
-            self.rail_generator = complex_rail_generator(nr_start_goal=config['number_of_agents'], min_dist=5,
+            self.rail_generator = complex_rail_generator(nr_start_goal=config['number_of_agents'],
+                                                         min_dist=config['min_dist'],
                                                          nr_extra=config['nr_extra'],
                                                          seed=config['seed'] * (1 + vector_index))
         elif config['rail_generator'] == "random_rail_generator":
             self.rail_generator = random_rail_generator()
         elif config['rail_generator'] == "load_env":
             self.predefined_env = True
+            self.rail_generator = random_rail_generator()
         else:
             raise (ValueError, f'Unknown rail generator: {config["rail_generator"]}')
 
@@ -32,12 +36,11 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         set_seed(config['seed'] * (1 + vector_index))
         self.env = RailEnv(width=config["width"], height=config["height"],
                            number_of_agents=config["number_of_agents"],
-                           obs_builder_object=config['obs_builder'], rail_generator=self.rail_generator,
-                           prediction_builder_object=config['predictor'])
+                           obs_builder_object=config['obs_builder'], rail_generator=self.rail_generator)
 
         if self.predefined_env:
-            self.env.load(config['load_env_path'])
-            self.env.load_resource('torch_training.railway', config['load_env_path'])
+            # self.env.load(config['load_env_path'])
+            self.env.load_resource('torch_training.railway', 'complex_scene.pkl')
 
         self.width = self.env.width
         self.height = self.env.height
@@ -50,25 +53,28 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         else:
             obs = self.env.reset()
 
-        predictions = self.env.predict()
-        if predictions != {}:
-            # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
-            # agents at each time step
-            pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
-            pred_dir = [x[:, 2] for x in list(predictions.values())]
+
+        # predictions = self.env.predict()
+        # if predictions != {}:
+        #     # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
+        #     # agents at each time step
+        #     pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
+        #     pred_dir = [x[:, 2] for x in list(predictions.values())]
 
         o = dict()
 
         for i_agent in range(len(self.env.agents)):
-
-            if predictions != {}:
-                pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
-
-                agent_id_one_hot = np.zeros(len(self.env.agents))
-                agent_id_one_hot[i_agent] = 1
-                o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
-            else:
-                o[i_agent] = obs[i_agent]
+            data, distance, agent_data = self.env.obs_builder.split_tree(tree=np.array(obs[i_agent]),
+                                                                         num_features_per_node=8, current_depth=0)
+            # if predictions != {}:
+            #     pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
+            #
+            #     agent_id_one_hot = np.zeros(len(self.env.agents))
+            #     agent_id_one_hot[i_agent] = 1
+            #     o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
+            # else:
+
+            o[i_agent] = [data, distance, agent_data]
 
         # needed for the renderer
         self.rail = self.env.rail
@@ -93,23 +99,25 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         r = dict()
         o = dict()
 
-        predictions = self.env.predict()
-        if predictions != {}:
-            # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
-            # agents at each time step
-            pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
-            pred_dir = [x[:, 2] for x in list(predictions.values())]
+        # predictions = self.env.predict()
+        # if predictions != {}:
+        #     # pred_pos is a 3 dimensions array (N_Agents, T_pred, 2) containing x and y coordinates of
+        #     # agents at each time step
+        #     pred_pos = np.concatenate([[x[:, 1:3]] for x in list(predictions.values())], axis=0)
+        #     pred_dir = [x[:, 2] for x in list(predictions.values())]
 
         for i_agent in range(len(self.env.agents)):
             if i_agent not in self.agents_done:
-
-                if predictions != {}:
-                    pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
-                    agent_id_one_hot = np.zeros(len(self.env.agents))
-                    agent_id_one_hot[i_agent] = 1
-                    o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
-                else:
-                    o[i_agent] = obs[i_agent]
+                data, distance, agent_data = self.env.obs_builder.split_tree(tree=np.array(obs[i_agent]),
+                                                                             num_features_per_node=8, current_depth=0)
+
+                # if predictions != {}:
+                #     pred_obs = self.get_prediction_as_observation(pred_pos, pred_dir, i_agent)
+                #     agent_id_one_hot = np.zeros(len(self.env.agents))
+                #     agent_id_one_hot[i_agent] = 1
+                #     o[i_agent] = [obs[i_agent], agent_id_one_hot, pred_obs]
+                # else:
+                o[i_agent] = [data, distance, agent_data]
 
                 r[i_agent] = rewards[i_agent]
                 d[i_agent] = dones[i_agent]
diff --git a/RLLib_training/custom_preprocessors.py b/RLLib_training/custom_preprocessors.py
index bd45dfd2bd8620c2866ac3d079a9e82dd4c20c7e..41e895bee95cde7bc59220ce2eee4eaabd458651 100644
--- a/RLLib_training/custom_preprocessors.py
+++ b/RLLib_training/custom_preprocessors.py
@@ -23,7 +23,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -38,7 +38,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returnes normalized and clipped observatoin
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
@@ -49,12 +50,22 @@ class CustomPreprocessor(Preprocessor):
     def _init_shape(self, obs_space, options):
+        return sum([space.shape[0] for space in obs_space]),
         # return (sum([space.shape[0] for space in obs_space]), )
-        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
+        # return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
 
     def transform(self, observation):
+        print('OBSSSSSSSSSSSSSSSSSs', observation, observation.shape)
+        data = norm_obs_clip(observation[0])
+        distance = norm_obs_clip(observation[1])
+        agent_data = np.clip(observation[2], -1, 1)
+
+        return np.concatenate((np.concatenate((data, distance)), agent_data))
+
         return norm_obs_clip(observation)
         return np.concatenate([norm_obs_clip(observation[0]), norm_obs_clip(observation[1])])
         # if len(observation) == 111:
         #     return np.concatenate([norm_obs_clip(obs) for obs in observation])
+        # print('OBSERVATION:', observation, len(observation[0]))
         return np.concatenate([norm_obs_clip(observation[0]), observation[1], observation[2].flatten()])
         #, norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
         #one_hot = observation[-3:]
diff --git a/RLLib_training/experiment_configs/predictions_test/config.gin b/RLLib_training/experiment_configs/predictions_test/config.gin
new file mode 100644
index 0000000000000000000000000000000000000000..b5923dfa1de6aacff716dc3054e2b81e4f66a6ab
--- /dev/null
+++ b/RLLib_training/experiment_configs/predictions_test/config.gin
@@ -0,0 +1,28 @@
+run_experiment.name = "memory_experiment_results"
+run_experiment.num_iterations = 2002
+run_experiment.save_every = 50
+run_experiment.hidden_sizes = [32, 32]
+
+run_experiment.map_width = {"grid_search": [8, 10, 12, 14]}
+run_experiment.map_height = 8
+run_experiment.n_agents = 3
+run_experiment.rail_generator = "complex_rail_generator"
+run_experiment.nr_extra = 1
+run_experiment.policy_folder_name = "ppo_policy_with_pred_map_size_{config[map_width]}"
+
+run_experiment.horizon = 50
+run_experiment.seed = 123
+
+#run_experiment.conv_model = {"grid_search": [True, False]}
+run_experiment.conv_model = False
+
+run_experiment.obs_builder = @TreeObsForRailEnv()
+TreeObsForRailEnv.max_depth = 2
+TreeObsForRailEnv.predictor = @DummyPredictorForRailEnv()
+LocalObsForRailEnv.view_radius = 5
+
+run_experiment.entropy_coeff = 1e-3
+run_experiment.kl_coeff = 0.2
+run_experiment.lambda_gae = 0.9
+#run_experiment.predictor = "dummy_predictor"
+run_experiment.step_memory = 1
diff --git a/RLLib_training/render_training_result.py b/RLLib_training/render_training_result.py
index 5f0159fa70e975b80a17a296a82139470a645f48..1719f4403316355ebfb063fc1b16baea66d476c4 100644
--- a/RLLib_training/render_training_result.py
+++ b/RLLib_training/render_training_result.py
@@ -50,11 +50,11 @@ ModelCatalog.register_custom_model("conv_model", ConvModelGlobalObs)
 
 ray.init()#object_store_memory=150000000000, redis_max_memory=30000000000)
 
-CHECKPOINT_PATH = '/home/guillaume/Desktop/distMAgent/env_complexity_benchmark/' \
-                  'ppo_policy_nr_extra_10_0qxx0qy_/checkpoint_1001/checkpoint-1001'
+CHECKPOINT_PATH = '/home/guillaume/Desktop/distMAgent/experiment_agent_memory/' \
+                  'ppo_policy_hidden_size_32_entropy_coeff_0.0001_mu413rlu/checkpoint_201/checkpoint-201'
 
 N_EPISODES = 10
-N_STEPS_PER_EPISODE = 80
+N_STEPS_PER_EPISODE = 50
 
 
 def render_training_result(config):
@@ -65,16 +65,24 @@ def render_training_result(config):
     # Example configuration to generate a random rail
     env_config = {"width": config['map_width'],
                   "height": config['map_height'],
-                  "rail_generator": config["rail_generator"],
+                  "rail_generator": "load_env",  # config["rail_generator"],
                   "nr_extra": config["nr_extra"],
                   "number_of_agents": config['n_agents'],
                   "seed": config['seed'],
-                  "obs_builder": config['obs_builder']}
-
+                  "obs_builder": config['obs_builder'],
+                  "predictor": config["predictor"],
+                  "step_memory": config["step_memory"]}
 
     # Observation space and action space definitions
     if isinstance(config["obs_builder"], TreeObsForRailEnv):
-        obs_space = gym.spaces.Box(low=-1, high=1, shape=(147,))
+        if config['predictor'] is None:
+            obs_space = gym.spaces.Tuple(
+                (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
+        else:
+            obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
+                                          gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+                                          gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+                                             'step_memory'])
         preprocessor = "tree_obs_prep"
 
     elif isinstance(config["obs_builder"], GlobalObsForRailEnv):
@@ -109,7 +117,7 @@ def render_training_result(config):
     else:
         raise ValueError("Undefined observation space")
 
-    act_space = gym.spaces.Discrete(4)
+    act_space = gym.spaces.Discrete(5)
 
     # Dict with the different policies to train
     policy_graphs = {
@@ -131,10 +139,11 @@ def render_training_result(config):
                                     "policies_to_train": list(policy_graphs.keys())}
 
     trainer_config["horizon"] = config['horizon']
+
    trainer_config["num_workers"] = 0
-    trainer_config["num_cpus_per_worker"] = 3
-    trainer_config["num_gpus"] = 0
-    trainer_config["num_gpus_per_worker"] = 0
+    trainer_config["num_cpus_per_worker"] = 4
+    trainer_config["num_gpus"] = 0.2
+    trainer_config["num_gpus_per_worker"] = 0.2
     trainer_config["num_cpus_for_driver"] = 1
     trainer_config["num_envs_per_worker"] = 1
     trainer_config['entropy_coeff'] = config['entropy_coeff']
@@ -145,17 +154,20 @@ def render_training_result(config):
     trainer_config['log_level'] = 'WARN'
     trainer_config['num_sgd_iter'] = 10
     trainer_config['clip_param'] = 0.2
+    trainer_config['kl_coeff'] = config['kl_coeff']
+    trainer_config['lambda'] = config['lambda_gae']
 
     env = RailEnvRLLibWrapper(env_config)
 
     trainer = Trainer(env=RailEnvRLLibWrapper, config=trainer_config)
+    print('hidden sizes:', config['hidden_sizes'])
 
     trainer.restore(CHECKPOINT_PATH)
 
     policy = trainer.get_policy(config['policy_folder_name'].format(**locals()))
 
-    preprocessor = CustomPreprocessor(gym.spaces.Box(low=-1, high=1, shape=(147,)))
+    preprocessor = CustomPreprocessor(obs_space)
 
     env_renderer = RenderTool(env, gl="PIL")
     for episode in range(N_EPISODES):
         observation = env.reset()
@@ -184,7 +196,8 @@ def render_training_result(config):
 @gin.configurable
 def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                    map_width, map_height, horizon, policy_folder_name, local_dir, obs_builder,
-                   entropy_coeff, seed, conv_model, rail_generator, nr_extra):
+                   entropy_coeff, seed, conv_model, rail_generator, nr_extra, kl_coeff,
+                   lambda_gae, predictor, step_memory):
 
     render_training_result(
         config={"n_agents": n_agents,
@@ -200,13 +213,17 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                 "seed": seed,
                 "conv_model": conv_model,
                 "rail_generator": rail_generator,
-                "nr_extra": 10# nr_extra
+                "nr_extra": nr_extra,
+                "kl_coeff": kl_coeff,
+                "lambda_gae": lambda_gae,
+                "predictor": predictor,
+                "step_memory": step_memory
                 }
     )
 
 
 if __name__ == '__main__':
     gin.external_configurable(tune.grid_search)
-    dir = '/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/env_complexity_benchmark'  # To Modify
+    dir = '/home/guillaume/EPFL/Master_Thesis/flatland/baselines/RLLib_training/experiment_configs/experiment_agent_memory'  # To Modify
     gin.parse_config_file(dir + '/config.gin')
 
     run_experiment(local_dir=dir)
diff --git a/RLLib_training/train_experiment.py b/RLLib_training/train_experiment.py
index 006bab121a3e9aab7d9f014a466ad77c1aced82f..cc8debe1b89a315624c99560b63858e66f2dea1e 100644
--- a/RLLib_training/train_experiment.py
+++ b/RLLib_training/train_experiment.py
@@ -34,6 +34,7 @@ gin.external_configurable(LocalObsForRailEnv)
 gin.external_configurable(GlobalObsForRailEnvDirectionDependent)
 
 from ray.rllib.models.preprocessors import TupleFlatteningPreprocessor
+import numpy as np
 
 ModelCatalog.register_custom_preprocessor("tree_obs_prep", CustomPreprocessor)
 ModelCatalog.register_custom_preprocessor("global_obs_prep", TupleFlatteningPreprocessor)
@@ -44,10 +45,32 @@ ray.init()  # object_store_memory=150000000000, redis_max_memory=30000000000)
 __file_dirname__ = os.path.dirname(os.path.realpath(__file__))
 
 
+def on_episode_start(info):
+    episode = info['episode']
+    map_width = info['env'].envs[0].width
+    map_height = info['env'].envs[0].height
+    episode.horizon = map_width + map_height
+
+
+# def on_episode_step(info):
+#     episode = info['episode']
+#     print('#########################', episode._agent_reward_history)
+#     # print(ds)
+
+
+def on_episode_end(info):
+    episode = info['episode']
+    score = 0
+    for k, v in episode._agent_reward_history.items():
+        score += np.sum(v)
+    score /= (len(episode._agent_reward_history) * 3 * episode.horizon)
+    episode.custom_metrics["score"] = score
+
+
 def train(config, reporter):
     print('Init Env')
 
     set_seed(config['seed'], config['seed'], config['seed'])
+    config['map_height'] = config['map_width']
 
     # Example configuration to generate a random rail
     env_config = {"width": config['map_width'],
@@ -57,19 +80,24 @@ def train(config, reporter):
                   "number_of_agents": config['n_agents'],
                   "seed": config['seed'],
                   "obs_builder": config['obs_builder'],
-                  "predictor": config["predictor"],
+                  "min_dist": config['min_dist'],
+                  # "predictor": config["predictor"],
                   "step_memory": config["step_memory"]}
 
     # Observation space and action space definitions
     if isinstance(config["obs_builder"], TreeObsForRailEnv):
-        if config['predictor'] is None:
-            obs_space = gym.spaces.Tuple(
-                (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
-        else:
-            obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
-                                          gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
-                                          gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
-                                             'step_memory'])
+        obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(168,)), ))
+        # gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+        # gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+        # 'step_memory'])
+        # if config['predictor'] is None:
+        #     obs_space = gym.spaces.Tuple(
+        #         (gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),) * config['step_memory'])
+        # else:
+        #     obs_space = gym.spaces.Tuple((gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(147,)),
+        #                                   gym.spaces.Box(low=0, high=1, shape=(config['n_agents'],)),
+        #                                   gym.spaces.Box(low=0, high=1, shape=(20, config['n_agents'])),) * config[
+        #                                      'step_memory'])
         preprocessor = "tree_obs_prep"
 
     elif isinstance(config["obs_builder"], GlobalObsForRailEnv):
@@ -124,12 +152,12 @@ def train(config, reporter):
     trainer_config['multiagent'] = {"policy_graphs": policy_graphs,
                                     "policy_mapping_fn": policy_mapping_fn,
                                     "policies_to_train": list(policy_graphs.keys())}
 
-    trainer_config["horizon"] = config['horizon']
+    trainer_config["horizon"] = 1.5 * (config['map_width'] + config['map_height'])  # config['horizon']
 
     trainer_config["num_workers"] = 0
-    trainer_config["num_cpus_per_worker"] = 4
-    trainer_config["num_gpus"] = 0.2
-    trainer_config["num_gpus_per_worker"] = 0.2
+    trainer_config["num_cpus_per_worker"] = 7
+    trainer_config["num_gpus"] = 0.0
+    trainer_config["num_gpus_per_worker"] = 0.0
     trainer_config["num_cpus_for_driver"] = 1
     trainer_config["num_envs_per_worker"] = 1
     trainer_config['entropy_coeff'] = config['entropy_coeff']
@@ -142,6 +170,10 @@ def train(config, reporter):
     trainer_config['clip_param'] = 0.2
     trainer_config['kl_coeff'] = config['kl_coeff']
     trainer_config['lambda'] = config['lambda_gae']
+    trainer_config['callbacks'] = {
+        "on_episode_start": tune.function(on_episode_start),
+        "on_episode_end": tune.function(on_episode_end)
+    }
 
     def logger_creator(conf):
         """Creates a Unified logger with a default logdir prefix
@@ -172,7 +204,7 @@ def train(config, reporter):
 def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                    map_width, map_height, horizon, policy_folder_name, local_dir, obs_builder,
                    entropy_coeff, seed, conv_model, rail_generator, nr_extra, kl_coeff, lambda_gae,
-                   predictor, step_memory):
+                   step_memory, min_dist):
     tune.run(
         train,
         name=name,
@@ -193,12 +225,13 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
                 "nr_extra": nr_extra,
                 "kl_coeff": kl_coeff,
                 "lambda_gae": lambda_gae,
-                "predictor": predictor,
+                "min_dist": min_dist,
+                # "predictor": predictor,
                 "step_memory": step_memory
                 },
         resources_per_trial={
-            "cpu": 5,
-            "gpu": 0.2
+            "cpu": 8,
+            "gpu": 0
         },
         verbose=2,
         local_dir=local_dir
@@ -207,8 +240,9 @@ def run_experiment(name, num_iterations, n_agents, hidden_sizes, save_every,
 if __name__ == '__main__':
     gin.external_configurable(tune.grid_search)
-    with path('RLLib_training.experiment_configs.experiment_agent_memory', 'config.gin') as f:
-        gin.parse_config_file(f)
-
-    dir = os.path.join(__file_dirname__, 'experiment_configs', 'experiment_agent_memory')
+    # with path('RLLib_training.experiment_configs.n_agents_experiment', 'config.gin') as f:
+    #     gin.parse_config_file(f)
+    gin.parse_config_file('/home/guillaume/flatland/baselines/RLLib_training/experiment_configs/score_metric_test/config.gin')
+    dir = '/home/guillaume/flatland/baselines/RLLib_training/experiment_configs/score_metric_test'
+    # dir = os.path.join(__file_dirname__, 'experiment_configs', 'experiment_agent_memory')
 
     run_experiment(local_dir=dir)