diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 2d513ed84ae6ecc392aca195851db60e400d0523..2e74d689f2d19b05b75946388216908d1dfcfda1 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -22,7 +22,8 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 from reinforcement_learning.deadlockavoidance_with_decision_agent import DeadLockAvoidanceWithDecisionAgent
 from reinforcement_learning.multi_decision_agent import MultiDecisionAgent
 from reinforcement_learning.ppo_agent import PPOPolicy
-from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action
+from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action, \
+    set_action_size_reduced, set_action_size_full, map_action_policy
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 
 base_dir = Path(__file__).resolve().parent.parent
@@ -169,6 +170,11 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     scores_window = deque(maxlen=checkpoint_interval)  # todo smooth when rendering instead
     completion_window = deque(maxlen=checkpoint_interval)
 
+    if train_params.action_size == "reduced":
+        set_action_size_reduced()
+    else:
+        set_action_size_full()
+
     # Double Dueling DQN policy
     if train_params.policy == "DDDQN":
         policy = DDDQNPolicy(state_size, get_action_size(), train_params)
@@ -212,7 +218,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             hdd.free / (2 ** 30)))
 
     # TensorBoard writer
-    writer = SummaryWriter()
+    writer = SummaryWriter(comment="_" + train_params.policy + "_" + train_params.action_size)
 
     training_timer = Timer()
     training_timer.start()
@@ -313,7 +319,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
                     learn_timer.start()
                     policy.step(agent_handle,
                                 agent_prev_obs[agent_handle],
-                                agent_prev_action[agent_handle] - 1,
+                                map_action_policy(agent_prev_action[agent_handle]),
                                 all_rewards[agent_handle],
                                 agent_obs[agent_handle],
                                 done[agent_handle])
@@ -540,7 +546,8 @@ if __name__ == "__main__":
     parser.add_argument("--max_depth", help="max depth", default=2, type=int)
     parser.add_argument("--policy",
                         help="policy name [DDDQN, PPO, DeadLockAvoidance, DeadLockAvoidanceWithDecision, MultiDecision]",
-                        default="ppo")
+                        default="DeadLockAvoidance")
+    parser.add_argument("--action_size", help="define the action size [reduced,full]", default="full", type=str)
 
     training_params = parser.parse_args()
     env_params = [
diff --git a/utils/agent_action_config.py b/utils/agent_action_config.py
index 4c1f83fadfa959c24ae4be4ff342c8508aa3d29e..9c2af58404e79b2b16430eed2cc71978420e987c 100644
--- a/utils/agent_action_config.py
+++ b/utils/agent_action_config.py
@@ -1,26 +1,47 @@
 from flatland.envs.rail_env import RailEnvActions
 
+# global action size
+global _agent_action_config_action_size
+_agent_action_config_action_size = 5
+
 
 def get_flatland_full_action_size():
     # The action space of flatland is 5 discrete actions
     return 5
 
 
+def set_action_size_full():
+    # The agents (DDDQN, PPO, ...) have this action space
+    global _agent_action_config_action_size
+    _agent_action_config_action_size = 5
+
+
+def set_action_size_reduced():
+    # The agents (DDDQN, PPO, ...) have this action space
+    global _agent_action_config_action_size
+    _agent_action_config_action_size = 4
+
+
 def get_action_size():
     # The agents (DDDQN, PPO, ... ) have this actions space
-    return 4
+    return _agent_action_config_action_size
 
 
 def map_actions(actions):
     # Map the
-    if get_action_size() == get_flatland_full_action_size():
-        return actions
-    for key in actions:
-        value = actions.get(key, 0)
-        actions.update({key: map_action(value)})
+    if get_action_size() != get_flatland_full_action_size():
+        for key in actions:
+            value = actions.get(key, 0)
+            actions.update({key: map_action(value)})
     return actions
 
 
+def map_action_policy(action):
+    if get_action_size() != get_flatland_full_action_size():
+        return action - 1
+    return action
+
+
 def map_action(action):
     if get_action_size() == get_flatland_full_action_size():
         return action
diff --git a/utils/dead_lock_avoidance_agent.py b/utils/dead_lock_avoidance_agent.py
index ac7fd0c744227a5c4ef3506f7a5aa74ddd7bfb1c..4c4c9033d83a6ab578f183c985df308e914d002e 100644
--- a/utils/dead_lock_avoidance_agent.py
+++ b/utils/dead_lock_avoidance_agent.py
@@ -67,7 +67,8 @@ class DeadlockAvoidanceShortestDistanceWalker(ShortestDistanceWalker):
         self.full_shortest_distance_agent_map[(handle, position[0], position[1])] = 1
 
 class DeadLockAvoidanceAgent(HeuristicPolicy):
-    def __init__(self, env: RailEnv, action_size, show_debug_plot=False):
+    def __init__(self, env: RailEnv, action_size, enable_eps=False, show_debug_plot=False):
+        print(">> DeadLockAvoidance")
         self.env = env
         self.memory = DummyMemory()
         self.loss = 0
@@ -76,14 +77,16 @@ class DeadLockAvoidanceAgent(HeuristicPolicy):
         self.agent_can_move_value = {}
         self.switches = {}
         self.show_debug_plot = show_debug_plot
+        self.enable_eps = enable_eps
 
     def step(self, handle, state, action, reward, next_state, done):
         pass
 
     def act(self, handle, state, eps=0.):
         # Epsilon-greedy action selection
-        if np.random.random() < eps:
-            return np.random.choice(np.arange(self.action_size))
+        if self.enable_eps:
+            if np.random.random() < eps:
+                return np.random.choice(np.arange(self.action_size))
 
         # agent = self.env.agents[state[0]]
         check = self.agent_can_move.get(handle, None)
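
Note on the action-size switch: with --action_size reduced, the policy learns over 4 actions while the environment still receives Flatland's 5-valued actions; map_action converts policy actions to environment actions and map_action_policy converts them back when transitions are stored, which is why the training loop no longer hard-codes "- 1". The standalone sketch below mirrors that round trip. It does not import the repository or Flatland, and the reduced-case body of map_action (a shift by +1, i.e. dropping DO_NOTHING) is an assumption, since the diff only shows the first two lines of that function.

# Standalone sketch of the reduced/full action-size switch; names mirror
# utils/agent_action_config.py, the +1 mapping in map_action is assumed.
_action_size = 5  # module-level switch, like _agent_action_config_action_size


def get_flatland_full_action_size():
    # Flatland's discrete actions: DO_NOTHING, MOVE_LEFT, MOVE_FORWARD, MOVE_RIGHT, STOP_MOVING
    return 5


def set_action_size_full():
    global _action_size
    _action_size = 5


def set_action_size_reduced():
    global _action_size
    _action_size = 4  # policy chooses only among LEFT/FORWARD/RIGHT/STOP


def get_action_size():
    return _action_size


def map_action(action):
    # Policy action -> Flatland action (assumed shift by +1 in the reduced case).
    if get_action_size() == get_flatland_full_action_size():
        return action
    return action + 1


def map_action_policy(action):
    # Flatland action -> policy action, used when storing transitions for learning.
    if get_action_size() != get_flatland_full_action_size():
        return action - 1
    return action


if __name__ == "__main__":
    set_action_size_reduced()
    policy_action = 2                        # e.g. the network's argmax over 4 outputs
    env_action = map_action(policy_action)   # 3, i.e. MOVE_RIGHT in Flatland's numbering
    assert map_action_policy(env_action) == policy_action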