diff --git a/RLLib_training/RailEnvRLLibWrapper.py b/RLLib_training/RailEnvRLLibWrapper.py
index 4cba2f36fe5f92edfd609c6434dec0d23aa9aa34..6d34e95e6b1e0fdada612fffcbd00c6e18a5ba8e 100644
--- a/RLLib_training/RailEnvRLLibWrapper.py
+++ b/RLLib_training/RailEnvRLLibWrapper.py
@@ -1,9 +1,8 @@
 import numpy as np
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.utils.seed import seed as set_seed
-
 from flatland.envs.generators import complex_rail_generator, random_rail_generator
 from flatland.envs.rail_env import RailEnv
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
+from ray.rllib.utils.seed import seed as set_seed
 
 
 class RailEnvRLLibWrapper(MultiAgentEnv):
diff --git a/RLLib_training/custom_preprocessors.py b/RLLib_training/custom_preprocessors.py
index 86c159d3bbc11bc4c0fdd321d2c6ff6838488f4d..bd45dfd2bd8620c2866ac3d079a9e82dd4c20c7e 100644
--- a/RLLib_training/custom_preprocessors.py
+++ b/RLLib_training/custom_preprocessors.py
@@ -49,13 +49,14 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
 
 class CustomPreprocessor(Preprocessor):
     def _init_shape(self, obs_space, options):
-        #return (sum([space.shape[0] for space in obs_space]), )
-        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0]*obs_space[2].shape[1]),)
+        # return (sum([space.shape[0] for space in obs_space]), )
+        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
 
     def transform(self, observation):
         # if len(observation) == 111:
-        #return np.concatenate([norm_obs_clip(obs) for obs in observation])
-        return np.concatenate([norm_obs_clip(observation[0]), observation[1], observation[2].flatten()])#, norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
+        # return np.concatenate([norm_obs_clip(obs) for obs in observation])
+        return np.concatenate([norm_obs_clip(observation[0]), observation[1], observation[
+            2].flatten()])  #, norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
         #one_hot = observation[-3:]
         #return np.append(obs, one_hot)
         # else:
diff --git a/RLLib_training/train.py b/RLLib_training/train.py
index 154620515157679fd8eb7947f54796ddd316710e..ba5f4eab43f5173dd410bc6d9b306d90e0e21ffc 100644
--- a/RLLib_training/train.py
+++ b/RLLib_training/train.py
@@ -5,13 +5,13 @@ import numpy as np
 import ray
 import ray.rllib.agents.ppo.ppo as ppo
 from RailEnvRLLibWrapper import RailEnvRLLibWrapper
+from flatland.envs.generators import complex_rail_generator
 from ray.rllib.agents.ppo.ppo import PPOTrainer
 from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
 from ray.rllib.models import ModelCatalog
 from ray.tune.logger import pretty_print
 
 from RLLib_training.custom_preprocessors import CustomPreprocessor
-from flatland.envs.generators import complex_rail_generator
 
 ModelCatalog.register_custom_preprocessor("my_prep", CustomPreprocessor)
 ray.init()
diff --git a/RLLib_training/train_experiment.py b/RLLib_training/train_experiment.py
index 28530717304734154a9fbe2111b5982b30e50744..006bab121a3e9aab7d9f014a466ad77c1aced82f 100644
--- a/RLLib_training/train_experiment.py
+++ b/RLLib_training/train_experiment.py
@@ -2,6 +2,7 @@ import os
 import gin
 import gym
+from flatland.envs.predictions import DummyPredictorForRailEnv
 from importlib_resources import path
 
 # Import PPO trainer: we can replace these imports by any other trainer from RLLib.
 from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
@@ -9,8 +10,6 @@ from ray.rllib.agents.ppo.ppo import PPOTrainer as Trainer
 from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph as PolicyGraph
 from ray.rllib.models import ModelCatalog
 
-from flatland.envs.predictions import DummyPredictorForRailEnv
-
 gin.external_configurable(DummyPredictorForRailEnv)
 
 import ray
diff --git a/setup.py b/setup.py
index 20ddab1f27a7b6b3b1e4df10bf90c47c86ecb82c..723e1a6f701150f5853a0199057bc234137c2aa2 100644
--- a/setup.py
+++ b/setup.py
@@ -4,11 +4,12 @@ from setuptools import setup, find_packages
 
 # TODO: setup does not support installation from url, move to requirements*.txt
 # TODO: @master as soon as mr is merged on flatland.
-os.system('pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
+os.system(
+    'pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
 
 install_reqs = []
 # TODO: include requirements_RLLib_training.txt
-requirements_paths = ['requirements_torch_training.txt'] #, 'requirements_RLLib_training.txt']
+requirements_paths = ['requirements_torch_training.txt']  # , 'requirements_RLLib_training.txt']
 for requirements_path in requirements_paths:
     with open(requirements_path, 'r') as f:
         install_reqs += [
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 1c747c27e255ea11493ad7d7cea65ed986f3bbc1..96593864a31cf45864b3cf8f52c29a0d5a241ef0 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -1,22 +1,18 @@
-import os
 import random
 from collections import deque
 
 import numpy as np
 import torch
-
+from dueling_double_dqn import Agent
 from flatland.envs.generators import complex_rail_generator
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import DummyPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.utils.rendertools import RenderTool
-from torch_training.dueling_double_dqn import Agent
 
 random.seed(1)
 np.random.seed(1)
 
-__file_dirname__ = os.path.dirname(os.path.realpath(__file__))
-
 # Example generate a rail given a manual specification,
 # a map of tuples (cell_type, rotation)
 transition_probability = [15,  # empty cell - Case 0
@@ -47,22 +43,24 @@ env = RailEnv(width=15,
 
 env = RailEnv(width=10,
               height=20)
-env.load_resource('torch_training.railway', "complex_scene.pkl")
+env.load("./railway/complex_scene.pkl")
 """
 
-env = RailEnv(width=20,
-              height=20,
-              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=0),
+env = RailEnv(width=8,
+              height=8,
+              rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=1, min_dist=4, max_dist=99999, seed=0),
               obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=DummyPredictorForRailEnv()),
-              number_of_agents=10)
+              number_of_agents=3)
+
 env.reset(True, True)
 
 env_renderer = RenderTool(env, gl="PILSVG")
 handle = env.get_agent_handles()
 
-state_size = 147 * 2
+state_size = 168 * 2
 action_size = 5
 n_trials = 15000
+max_steps = int(1.5 * (env.height + env.width))
 eps = 1.
 eps_end = 0.005
 eps_decay = 0.9995
@@ -77,10 +75,9 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-agent.qnetwork_local.load_state_dict(torch.load(os.path.join(__file_dirname__, 'Nets', 'avoid_checkpoint15000.pth')))
-
-demo = True
+# agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
+demo = False
 
 
 def max_lt(seq, val):
     """
@@ -104,7 +101,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -119,7 +116,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returnes normalized and clipped observatoin
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
@@ -136,14 +134,14 @@ for trials in range(1, n_trials + 1):
         env_renderer.set_new_rail()
     final_obs = obs.copy()
     final_obs_next = obs.copy()
-
     for a in range(env.get_num_agents()):
-        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=7,
+        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=8,
                                                                 current_depth=0)
         data = norm_obs_clip(data)
         distance = norm_obs_clip(distance)
         agent_data = np.clip(agent_data, -1, 1)
         obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
     for i in range(2):
         time_obs.append(obs)
     # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
@@ -153,14 +151,14 @@ for trials in range(1, n_trials + 1):
     score = 0
     env_done = 0
     # Run episode
-    for step in range(env.height * env.width):
+    for step in range(max_steps):
         if demo:
             env_renderer.renderEnv(show=True, show_observations=False)
         # print(step)
         # Action
         for a in range(env.get_num_agents()):
             if demo:
-                eps = 1
+                eps = 0
             # action = agent.act(np.array(obs[a]), eps=eps)
             action = agent.act(agent_obs[a], eps=eps)
             action_prob[action] += 1
@@ -169,13 +167,12 @@ for trials in range(1, n_trials + 1):
         next_obs, all_rewards, done, _ = env.step(action_dict)
 
         for a in range(env.get_num_agents()):
-            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=7,
-                                                                current_depth=0)
+            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=8,
+                                                                    current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
             agent_data = np.clip(agent_data, -1, 1)
             next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
         time_obs.append(next_obs)
 
         # Update replay buffer and train agent
@@ -187,7 +184,7 @@ for trials in range(1, n_trials + 1):
                 final_action_dict.update({a: action_dict[a]})
             if not demo and not done[a]:
                 agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-            score += all_rewards[a]
+            score += all_rewards[a] / env.get_num_agents()
 
         agent_obs = agent_next_obs.copy()
         if done['__all__']:
@@ -199,21 +196,21 @@ for trials in range(1, n_trials + 1):
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
 
        done_window.append(env_done)
-       scores_window.append(score)  # save most recent score
+       scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))
 
        print(
-           '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-           env.get_num_agents(),
-           trials,
-           np.mean(scores_window),
-           100 * np.mean(done_window),
-           eps, action_prob / np.sum(action_prob)), end=" ")
+           '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+               env.get_num_agents(),
+               trials,
+               np.mean(scores_window),
+               100 * np.mean(done_window),
+               eps, action_prob / np.sum(action_prob)), end=" ")
 
        if trials % 100 == 0:
            print(
-               '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+               '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(),
                trials,
                np.mean(scores_window),
@@ -221,5 +218,5 @@ for trials in range(1, n_trials + 1):
                eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
-                      os.path.join(__file_dirname__, 'Nets', 'avoid_checkpoint' + str(trials) + '.pth'))
+                      './Nets/avoid_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size
 
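For orientation (not part of the patch): the two `norm_obs_clip` hunks above converge on the same normalization scheme. The sketch below is a minimal standalone Python rendering of that logic; the helpers are simplified stand-ins for `max_lt`/`min_lt` in `torch_training/training_navigation.py`, and the final scaling line is an assumption, since it is not visible in the hunk.

```python
import numpy as np


def max_lt(seq, val):
    # Simplified stand-in for max_lt in training_navigation.py:
    # greatest item strictly below val, 0 if there is none.
    return max((x for x in seq if x < val), default=0)


def min_lt(seq, val):
    # Simplified stand-in for the patched min_lt (note the >= comparison):
    # smallest item greater than or equal to val, np.inf if there is none.
    return min((x for x in seq if x >= val), default=np.inf)


def norm_obs_clip(obs, clip_min=-1, clip_max=1):
    # Mirrors the patched hunks: min_obs is capped at max_obs, and the
    # degenerate max_obs == min_obs case falls back to dividing by max_obs.
    max_obs = max(1, max_lt(obs, 1000))
    min_obs = min(max_obs, min_lt(obs, 0))

    if max_obs == min_obs:
        return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
    norm = np.abs(max_obs - min_obs)
    # Assumed final scaling step; this line is not visible in the hunk.
    return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)


print(norm_obs_clip([0.0, 5.0, 250.0, np.inf]))  # -> [0.   0.02 1.   1.  ]
```

Capping `min_obs` at `max_obs` keeps the normalization range from inverting when all tree-observation entries are large, and the equality fallback avoids dividing by a zero `norm`.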