From 0e7aa90c71ca88945f20f154ab1be22b74ed5fee Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Fri, 27 Nov 2020 20:44:33 +0100
Subject: [PATCH] bug fixed

---
 .../multi_agent_training.py | 44 ++++++++++++------
 run.py                      | 45 +++++++++++++++----
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index ea4d6e2..040a294 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -268,8 +268,9 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):

         # Reset environment
         reset_timer.start()
-        number_of_agents = min(1 + round(n_agents * (1.0 - 0.9985 ** episode_idx)), n_agents)
+        number_of_agents = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
         train_env_params.n_agents = episode_idx % number_of_agents + 1
+
         train_env = create_rail_env(train_env_params, tree_observation)
         obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
         policy.reset()
@@ -343,14 +344,29 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             step_timer.start()
             next_obs, all_rewards, done, info = train_env.step(action_dict)

-            # Dead-lock found -> rewards shaping
-            agent_positions = get_agent_positions(train_env)
-            for agent_handle in train_env.get_agent_handles():
-                agent = train_env.agents[agent_handle]
-                act = action_dict.get(agent_handle, RailEnvActions.MOVE_FORWARD)
-                if agent.status == RailAgentStatus.ACTIVE:
-                    if check_for_dealock(agent_handle, train_env, agent_positions):
+            # Reward shaping .Dead-lock .NotMoving .NotStarted
+            if False:
+                agent_positions = get_agent_positions(train_env)
+                for agent_handle in train_env.get_agent_handles():
+                    agent = train_env.agents[agent_handle]
+
+                    act = action_dict.get(agent_handle, RailEnvActions.MOVE_FORWARD)
+                    if agent.status == RailAgentStatus.ACTIVE:
+                        pos = agent.position
+                        dir = agent.direction
+                        possible_transitions = train_env.rail.get_transitions(*pos, dir)
+                        num_transitions = fast_count_nonzero(possible_transitions)
+                        if act == RailEnvActions.STOP_MOVING:
+                            all_rewards[agent_handle] -= 2.0
+
+                        if num_transitions == 1:
+                            if act != RailEnvActions.MOVE_FORWARD:
+                                all_rewards[agent_handle] -= 1.0
+                        if check_for_dealock(agent_handle, train_env, agent_positions):
+                            all_rewards[agent_handle] -= 5.0
+                    elif agent.status == RailAgentStatus.READY_TO_DEPART:
                         all_rewards[agent_handle] -= 5.0
+
             step_timer.end()

             # Render an episode at some interval
@@ -563,14 +579,14 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=2000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                         type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=2, type=int)
+    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=5, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
-    parser.add_argument("--eps_start", help="max exploration", default=0.1, type=float)
-    parser.add_argument("--eps_end", help="min exploration", default=0.01, type=float)
+    parser.add_argument("--eps_start", help="max exploration", default=1.0, type=float)
+    parser.add_argument("--eps_end", help="min exploration", default=0.05, type=float)
     parser.add_argument("--eps_decay", help="exploration decay", default=0.9975, type=float)
     parser.add_argument("--buffer_size", help="replay buffer size", default=int(1e7), type=int)
     parser.add_argument("--buffer_min_size", help="min buffer size to start training", default=0, type=int)
@@ -618,8 +634,8 @@ if __name__ == "__main__":
         {
             # Test_2
             "n_agents": 20,
-            "x_dim": 30,
-            "y_dim": 30,
+            "x_dim": 35,
+            "y_dim": 35,
             "n_cities": 3,
             "max_rails_between_cities": 2,
             "max_rails_in_city": 3,
diff --git a/run.py b/run.py
index b637195..55e31ea 100644
--- a/run.py
+++ b/run.py
@@ -18,7 +18,6 @@ EPSILON = 0.500 # Sum Normalized Reward : 3.754660231871272 (primary score)
 EPSILON = 1.000 # Sum Normalized Reward : 1.397180159192391 (primary score)
 '''
-
 import sys
 import time
 from argparse import Namespace
@@ -26,6 +25,7 @@ from pathlib import Path

 import numpy as np
 from flatland.core.env_observation_builder import DummyObservationBuilder
+from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.evaluators.client import FlatlandRemoteClient
 from flatland.evaluators.client import TimeoutException
@@ -33,6 +33,7 @@ from flatland.evaluators.client import TimeoutException
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 from utils.deadlock_check import check_if_all_blocked
 from utils.fast_tree_obs import FastTreeObs
+from utils.observation_utils import normalize_observation

 base_dir = Path(__file__).resolve().parent.parent
 sys.path.append(str(base_dir))
@@ -44,18 +45,22 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy

 # Print per-step logs
 VERBOSE = True
+USE_FAST_TREE_OBS = True

 # Checkpoint to use (remember to push it!)
 checkpoint = "./checkpoints/201124171810-7800.pth" # 18.249244799876152 DEPTH=2 AGENTS=10
+# checkpoint = "./checkpoints/201126150143-5200.pth" # 18.249244799876152 DEPTH=2 AGENTS=10
+# checkpoint = "./checkpoints/201126160144-2000.pth" # 18.249244799876152 DEPTH=2 AGENTS=10
+checkpoint = "./checkpoints/201127160352-2000.pth"

-EPSILON = 0.01
+EPSILON = 0.005

 # Use last action cache
 USE_ACTION_CACHE = False
-USE_DEAD_LOCK_AVOIDANCE_AGENT = False # 21.54485505223213
+USE_DEAD_LOCK_AVOIDANCE_AGENT = False  # 21.54485505223213

 # Observation parameters (must match training parameters!)
-observation_tree_depth = 2
+observation_tree_depth = 1
 observation_radius = 10
 observation_max_path_depth = 30

@@ -65,10 +70,30 @@ remote_client = FlatlandRemoteClient()

 # Observation builder
 predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
-tree_observation = FastTreeObs(max_depth=observation_tree_depth)
+if USE_FAST_TREE_OBS:
+    def check_is_observation_valid(observation):
+        return True
+
+    def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
+        return observation
+
+    tree_observation = FastTreeObs(max_depth=observation_tree_depth)
+    state_size = tree_observation.observation_dim
+else:
+    def check_is_observation_valid(observation):
+        return observation
+
+
+    def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
+        return normalize_observation(observation, tree_depth, observation_radius)
+
+
+    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
+    # Calculate the state size given the depth of the tree observation and the number of features
+    n_features_per_node = tree_observation.observation_dim
+    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
+    state_size = n_features_per_node * n_nodes

-# Calculates state and action sizes
-state_size = tree_observation.observation_dim
 action_size = 5

 # Creates the policy. No GPU on evaluation server.
@@ -159,7 +184,11 @@ while True:
                     action = agent_last_action[agent_handle]
                     nb_hit += 1
                 else:
-                    action = policy.act(observation[agent_handle], eps=EPSILON)
+                    normalized_observation = get_normalized_observation(observation[agent_handle],
+                                                                        observation_tree_depth,
+                                                                        observation_radius=observation_radius)
+
+                    action = policy.act(normalized_observation, eps=EPSILON)

                 action_dict[agent_handle] = action

--
GitLab
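Note on the agent-count schedule changed in multi_agent_training.py above: the previous schedule ramped the cap smoothly with 1 + round(n_agents * (1.0 - 0.9985 ** episode_idx)), while the patched version raises the cap by one agent every 200 episodes and then cycles the per-episode agent count between 1 and that cap. A minimal standalone sketch of the two schedules, assuming a cap of 10 agents for illustration (in the script the cap comes from the selected training config):

    import numpy as np

    n_agents = 10  # assumed cap for illustration; taken from the env config in the script

    def old_cap(episode_idx):
        # pre-patch: smooth exponential ramp toward n_agents
        return min(1 + round(n_agents * (1.0 - 0.9985 ** episode_idx)), n_agents)

    def new_cap(episode_idx):
        # post-patch: one additional agent every 200 episodes, capped at n_agents
        return int(min(n_agents, 1 + np.floor(episode_idx / 200)))

    for episode_idx in (0, 100, 200, 500, 1000, 2000):
        cap = new_cap(episode_idx)
        env_agents = episode_idx % cap + 1  # agent count actually used for this episode
        print(episode_idx, old_cap(episode_idx), cap, env_agents)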
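Note on the observation switch added to run.py above: when USE_FAST_TREE_OBS is disabled, the policy input size is derived from the per-node feature count and the node count of a 4-ary tree of the configured depth, and each raw observation is normalized before policy.act is called. A small sketch of that size calculation, where the per-node feature count of 11 is an assumption for illustration (run.py reads it from tree_observation.observation_dim):

    import numpy as np

    observation_tree_depth = 1  # value set by this patch
    n_features_per_node = 11    # assumed; run.py uses tree_observation.observation_dim

    # A tree of depth d has 4**0 + 4**1 + ... + 4**d nodes (branching factor 4).
    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
    state_size = n_features_per_node * n_nodes

    print(n_nodes, state_size)  # depth 1 -> 5 nodes -> state_size = 55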