From 0e7aa90c71ca88945f20f154ab1be22b74ed5fee Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Fri, 27 Nov 2020 20:44:33 +0100
Subject: [PATCH] Fix observation handling in run.py and adjust training setup
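
Replace the dead-lock reward shaping with an extended shaping block
(stop, straight-track, dead-lock and not-departed penalties) that is
currently disabled, and switch the agent-count curriculum to a step
schedule whose cap grows by one agent every 200 episodes. Training
defaults (exploration schedule, evaluation episodes, default training
config, Test_2 map size) are updated as well.

In run.py, allow switching between FastTreeObs and the standard
TreeObsForRailEnv, normalize tree observations before they are passed
to policy.act(), and use the latest checkpoint.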

---
 .../multi_agent_training.py                   | 44 ++++++++++++------
 run.py                                        | 45 +++++++++++++++----
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index ea4d6e2..040a294 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -268,8 +268,9 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
 
         # Reset environment
         reset_timer.start()
-        number_of_agents = min(1 + round(n_agents * (1.0 - 0.9985 ** episode_idx)), n_agents)
+        number_of_agents = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
         train_env_params.n_agents = episode_idx % number_of_agents + 1
+
         train_env = create_rail_env(train_env_params, tree_observation)
         obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
         policy.reset()
@@ -343,14 +344,29 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             step_timer.start()
             next_obs, all_rewards, done, info = train_env.step(action_dict)
 
-            # Dead-lock found -> rewards shaping
-            agent_positions = get_agent_positions(train_env)
-            for agent_handle in train_env.get_agent_handles():
-                agent = train_env.agents[agent_handle]
-                act = action_dict.get(agent_handle, RailEnvActions.MOVE_FORWARD)
-                if agent.status == RailAgentStatus.ACTIVE:
-                    if check_for_dealock(agent_handle, train_env, agent_positions):
+            # Reward shaping: penalize dead-locks, needless stops and not departing
+            if False:  # shaping currently disabled
+                agent_positions = get_agent_positions(train_env)
+                for agent_handle in train_env.get_agent_handles():
+                    agent = train_env.agents[agent_handle]
+
+                    act = action_dict.get(agent_handle, RailEnvActions.MOVE_FORWARD)
+                    if agent.status == RailAgentStatus.ACTIVE:
+                        pos = agent.position
+                        direction = agent.direction
+                        possible_transitions = train_env.rail.get_transitions(*pos, direction)
+                        num_transitions = fast_count_nonzero(possible_transitions)
+                        if act == RailEnvActions.STOP_MOVING:
+                            all_rewards[agent_handle] -= 2.0
+
+                        if num_transitions == 1:
+                            if act != RailEnvActions.MOVE_FORWARD:
+                                all_rewards[agent_handle] -= 1.0
+                        if check_for_dealock(agent_handle, train_env, agent_positions):
+                            all_rewards[agent_handle] -= 5.0
+                    elif agent.status == RailAgentStatus.READY_TO_DEPART:
                         all_rewards[agent_handle] -= 5.0
+
             step_timer.end()
 
             # Render an episode at some interval
@@ -563,14 +579,14 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=2000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                         type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=2, type=int)
+    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=5, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
-    parser.add_argument("--eps_start", help="max exploration", default=0.1, type=float)
-    parser.add_argument("--eps_end", help="min exploration", default=0.01, type=float)
+    parser.add_argument("--eps_start", help="max exploration", default=1.0, type=float)
+    parser.add_argument("--eps_end", help="min exploration", default=0.05, type=float)
     parser.add_argument("--eps_decay", help="exploration decay", default=0.9975, type=float)
     parser.add_argument("--buffer_size", help="replay buffer size", default=int(1e7), type=int)
     parser.add_argument("--buffer_min_size", help="min buffer size to start training", default=0, type=int)
@@ -618,8 +634,8 @@ if __name__ == "__main__":
         {
             # Test_2
             "n_agents": 20,
-            "x_dim": 30,
-            "y_dim": 30,
+            "x_dim": 35,
+            "y_dim": 35,
             "n_cities": 3,
             "max_rails_between_cities": 2,
             "max_rails_in_city": 3,
diff --git a/run.py b/run.py
index b637195..55e31ea 100644
--- a/run.py
+++ b/run.py
@@ -18,7 +18,6 @@ EPSILON = 0.500 # Sum Normalized Reward :  3.754660231871272 (primary score)
 EPSILON = 1.000 # Sum Normalized Reward :  1.397180159192391 (primary score)
 '''
 
-
 import sys
 import time
 from argparse import Namespace
@@ -26,6 +25,7 @@ from pathlib import Path
 
 import numpy as np
 from flatland.core.env_observation_builder import DummyObservationBuilder
+from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.evaluators.client import FlatlandRemoteClient
 from flatland.evaluators.client import TimeoutException
@@ -33,6 +33,7 @@ from flatland.evaluators.client import TimeoutException
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 from utils.deadlock_check import check_if_all_blocked
 from utils.fast_tree_obs import FastTreeObs
+from utils.observation_utils import normalize_observation
 
 base_dir = Path(__file__).resolve().parent.parent
 sys.path.append(str(base_dir))
@@ -44,18 +45,22 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 
 # Print per-step logs
 VERBOSE = True
+USE_FAST_TREE_OBS = True
 
 # Checkpoint to use (remember to push it!)
 checkpoint = "./checkpoints/201124171810-7800.pth"  # 18.249244799876152 DEPTH=2 AGENTS=10
+# checkpoint = "./checkpoints/201126150143-5200.pth"  # 18.249244799876152 DEPTH=2 AGENTS=10
+# checkpoint = "./checkpoints/201126160144-2000.pth"  # 18.249244799876152 DEPTH=2 AGENTS=10
+checkpoint = "./checkpoints/201127160352-2000.pth"
 
-EPSILON = 0.01
+EPSILON = 0.005
 
 # Use last action cache
 USE_ACTION_CACHE = False
-USE_DEAD_LOCK_AVOIDANCE_AGENT = False # 21.54485505223213
+USE_DEAD_LOCK_AVOIDANCE_AGENT = False  # 21.54485505223213
 
 # Observation parameters (must match training parameters!)
-observation_tree_depth = 2
+observation_tree_depth = 1
 observation_radius = 10
 observation_max_path_depth = 30
 
@@ -65,10 +70,30 @@ remote_client = FlatlandRemoteClient()
 
 # Observation builder
 predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
-tree_observation = FastTreeObs(max_depth=observation_tree_depth)
+if USE_FAST_TREE_OBS:
+    def check_is_observation_valid(observation):
+        return True
+
+    def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
+        return observation
+
+    tree_observation = FastTreeObs(max_depth=observation_tree_depth)
+    state_size = tree_observation.observation_dim
+else:
+    def check_is_observation_valid(observation):
+        return observation
+
+
+    def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
+        return normalize_observation(observation, tree_depth, observation_radius)
+
+
+    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
+    # Calculate the state size given the depth of the tree observation and the number of features
+    n_features_per_node = tree_observation.observation_dim
+    n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
+    state_size = n_features_per_node * n_nodes
 
-# Calculates state and action sizes
-state_size = tree_observation.observation_dim
 action_size = 5
 
 # Creates the policy. No GPU on evaluation server.
@@ -159,7 +184,11 @@ while True:
                             action = agent_last_action[agent_handle]
                             nb_hit += 1
                         else:
-                            action = policy.act(observation[agent_handle], eps=EPSILON)
+                            normalized_observation = get_normalized_observation(observation[agent_handle],
+                                                                                observation_tree_depth,
+                                                                                observation_radius=observation_radius)
+
+                            action = policy.act(normalized_observation, eps=EPSILON)
 
                     action_dict[agent_handle] = action
 
-- 
GitLab