From 4bee50ba29d73d5f2067e60e49b8b4245b321a1a Mon Sep 17 00:00:00 2001
From: Erik Nygren <erik.nygren@sbb.ch>
Date: Thu, 13 Jun 2019 18:38:02 +0200
Subject: [PATCH] Fix normalizing function

---
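Notes (commentary between the "---" marker and the diff is ignored by
`git am`; the arithmetic here is inferred from the changes below, not
stated by the author):

* TreeObsForRailEnv(max_depth=2) branches four ways per node, giving
  1 + 4 + 16 = 21 tree nodes. Raising num_features_per_node from 7 to 8
  grows a flattened observation from 21 * 7 = 147 to 21 * 8 = 168 values;
  state_size doubles this because time_obs stacks two consecutive frames
  per agent.
* Rewards are now divided by env.get_num_agents() and the episode score
  by max_steps = int(1.5 * (env.height + env.width)), so the logged
  Average Score is comparable across agent counts and grid sizes; the
  log format changes from {:.0f} to {:.3f} to match.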
 torch_training/training_navigation.py | 40 +++++++++++++++------------
 1 file changed, 22 insertions(+), 18 deletions(-)
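
A minimal, self-contained sketch of the normalization helpers as patched,
for reference while reviewing (max_lt's body and norm_obs_clip's final
return line are reconstructed assumptions, since the hunks below cut off
before them; the sample values are illustrative only):

import numpy as np

def max_lt(seq, val):
    """Return the largest non-negative element of seq strictly below val,
    or 0 if there is none. Body reconstructed; not shown in the hunks."""
    largest = 0
    idx = len(seq) - 1
    while idx >= 0:
        if seq[idx] < val and seq[idx] >= 0 and seq[idx] > largest:
            largest = seq[idx]
        idx -= 1
    return largest

def min_lt(seq, val):
    """Return the smallest element of seq that is at least val, or np.inf
    if there is none, matching the patched '>=' comparison."""
    smallest = np.inf
    idx = len(seq) - 1
    while idx >= 0:
        if seq[idx] >= val and seq[idx] < smallest:
            smallest = seq[idx]
        idx -= 1
    return smallest

def norm_obs_clip(obs, clip_min=-1, clip_max=1):
    """Min-max normalize obs, then clip to [clip_min, clip_max]."""
    max_obs = max(1, max_lt(obs, 1000))
    # The fix: bound min_obs by max_obs instead of forcing it to be >= 0,
    # so min_obs can never exceed max_obs or stay at np.inf.
    min_obs = min(max_obs, min_lt(obs, 0))
    if max_obs == min_obs:
        return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
    norm = np.abs(max_obs - min_obs)
    if norm == 0:
        norm = 1.0
    return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)

if __name__ == "__main__":
    # np.inf marks unreachable branches in the tree observation and is
    # skipped by both helpers.
    sample = [3.0, 7.0, np.inf, 12.0]
    print(norm_obs_clip(sample))  # finite values mapped into [0, 1]

Before the fix, an observation with no element above the threshold (for
example one made up only of zeros and np.inf sentinels) left min_lt at
np.inf, so min_obs became infinite and NaNs propagated through the
normalization; taking min(max_obs, min_lt(obs, 0)) together with the ">="
comparison keeps min_obs finite and never above max_obs.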

diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index f16a2c4..9659386 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -46,19 +46,21 @@ env = RailEnv(width=10,
 env.load("./railway/complex_scene.pkl")
 """
 
-env = RailEnv(width=20,
-              height=20,
-              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=0),
+env = RailEnv(width=8,
+              height=8,
+              rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=1, min_dist=4, max_dist=99999, seed=0),
               obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=DummyPredictorForRailEnv()),
-              number_of_agents=10)
+              number_of_agents=3)
+
 env.reset(True, True)
 
 env_renderer = RenderTool(env, gl="PILSVG")
 handle = env.get_agent_handles()
 
-state_size = 147 * 2
+state_size = 168 * 2
 action_size = 5
 n_trials = 15000
+max_steps = int(1.5 * (env.height + env.width))
 eps = 1.
 eps_end = 0.005
 eps_decay = 0.9995
@@ -73,9 +75,9 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
+# agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
 
-demo = True
+demo = False
 
 def max_lt(seq, val):
     """
@@ -99,7 +101,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -114,7 +116,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returns normalized and clipped observation
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
@@ -131,13 +134,14 @@ for trials in range(1, n_trials + 1):
         env_renderer.set_new_rail()
     final_obs = obs.copy()
     final_obs_next = obs.copy()
-
     for a in range(env.get_num_agents()):
-        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=7, current_depth=0)
+        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=8,
+                                                                current_depth=0)
         data = norm_obs_clip(data)
         distance = norm_obs_clip(distance)
         agent_data = np.clip(agent_data, -1, 1)
         obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
     for i in range(2):
         time_obs.append(obs)
     # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
@@ -147,7 +151,7 @@ for trials in range(1, n_trials + 1):
     score = 0
     env_done = 0
     # Run episode
-    for step in range(env.height * env.width):
+    for step in range(max_steps):
         if demo:
             env_renderer.renderEnv(show=True, show_observations=False)
         # print(step)
@@ -163,13 +167,12 @@ for trials in range(1, n_trials + 1):
 
         next_obs, all_rewards, done, _ = env.step(action_dict)
         for a in range(env.get_num_agents()):
-            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=7,
+            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=8,
                                                         current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
             agent_data = np.clip(agent_data, -1, 1)
             next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
         time_obs.append(next_obs)
 
         # Update replay buffer and train agent
@@ -181,7 +184,7 @@ for trials in range(1, n_trials + 1):
                 final_action_dict.update({a: action_dict[a]})
             if not demo and not done[a]:
                 agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-            score += all_rewards[a]
+            score += all_rewards[a] / env.get_num_agents()
 
         agent_obs = agent_next_obs.copy()
         if done['__all__']:
@@ -193,11 +196,12 @@ for trials in range(1, n_trials + 1):
     eps = max(eps_end, eps_decay * eps)  # decrease epsilon
 
     done_window.append(env_done)
-    scores_window.append(score)  # save most recent score
+    scores_window.append(score / max_steps)  # save most recent score
     scores.append(np.mean(scores_window))
     dones_list.append((np.mean(done_window)))
 
-    print('\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+    print(
+        '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
               env.get_num_agents(),
               trials,
               np.mean(scores_window),
@@ -206,7 +210,7 @@ for trials in range(1, n_trials + 1):
 
     if trials % 100 == 0:
         print(
-            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                 env.get_num_agents(),
                 trials,
                 np.mean(scores_window),
-- 
GitLab