From 23e98b594dcb2c54832413317de422f1a7d2628e Mon Sep 17 00:00:00 2001
From: Erik Nygren <erik.nygren@sbb.ch>
Date: Sun, 1 Sep 2019 13:58:39 -0400
Subject: [PATCH] Only store observations in the buffer when the agent is
 allowed to take an action

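The training loop only lets an agent choose a new action while it is not
in the middle of a cell, i.e. while speed_data['position_fraction'] is
(numerically) zero. Storing a transition in the replay buffer on every
environment step therefore also records steps on which the chosen action
could not be executed. This patch restricts the buffer updates to steps
where the agent was actually allowed to act, and relaxes the exact float
comparison on position_fraction to a small tolerance (< 0.001).

A minimal, self-contained sketch of the gating idea; the names below
(replay_buffer, maybe_store, can_act) are illustrative only, not the
project's API:

    import random
    from collections import deque

    replay_buffer = deque(maxlen=10_000)

    def maybe_store(buffer, transition, can_act):
        """Append a transition only for steps where the agent could act."""
        if can_act:
            buffer.append(transition)

    # Toy rollout: the agent may only act when its fractional cell
    # position is (numerically) zero, which here happens every 4th step.
    position_fraction = 0.0
    for step in range(20):
        can_act = position_fraction < 1e-3
        transition = ([0.0], random.randrange(5), -1.0, [0.0], False)
        maybe_store(replay_buffer, transition, can_act)
        position_fraction = (position_fraction + 0.25) % 1.0

    print(len(replay_buffer))  # 5 of the 20 steps are stored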
---
 .../observation_builders/observations.py      |  2 +-
 torch_training/render_agent_behavior.py       |  8 +--
 torch_training/training_navigation.py         | 58 +------------------
 utils/observation_utils.py                    |  3 +-
 4 files changed, 8 insertions(+), 63 deletions(-)
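
Note on the normalization change in utils/observation_utils.py: the agent
features are now clipped to [-1, 1] instead of [-1, 20], matching the
clipping the (now removed) rendering code in training_navigation.py already
applied to agent_data. A small illustration, assuming only numpy:

    import numpy as np

    agent_data = np.array([-3.0, 0.5, 7.0, 19.0])
    print(np.clip(agent_data, -1, 1))   # new bound: 7.0 and 19.0 are capped at 1.0
    print(np.clip(agent_data, -1, 20))  # old bound: 7.0 and 19.0 pass through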

diff --git a/torch_training/observation_builders/observations.py b/torch_training/observation_builders/observations.py
index 10bd1f0..70e5840 100644
--- a/torch_training/observation_builders/observations.py
+++ b/torch_training/observation_builders/observations.py
@@ -346,7 +346,7 @@ class TreeObsForRailEnv(ObservationBuilder):
         unusable_switch = np.inf
         other_agent_same_direction = 0
         other_agent_opposite_direction = 0
-        malfunctioning_agent = 0
+        malfunctioning_agent = 0.
         min_fractional_speed = 1.
         num_steps = 1
         while exploring:
diff --git a/torch_training/render_agent_behavior.py b/torch_training/render_agent_behavior.py
index fc0e067..651ec3e 100644
--- a/torch_training/render_agent_behavior.py
+++ b/torch_training/render_agent_behavior.py
@@ -103,7 +103,7 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-with path(torch_training.Nets, "navigator_checkpoint1200.pth") as file_in:
+with path(torch_training.Nets, "navigator_checkpoint500.pth") as file_in:
     agent.qnetwork_local.load_state_dict(torch.load(file_in))
 
 record_images = False
@@ -126,14 +126,12 @@ for trials in range(1, n_trials + 1):
 
         # Action
         for a in range(env.get_num_agents()):
-            action = agent.act(agent_obs[a], eps=0)
+            action = agent.act(agent_obs[a], eps=0.)
             action_prob[action] += 1
             action_dict.update({a: action})
-
         # Environment step
         obs, all_rewards, done, _ = env.step(action_dict)
-
-        env_renderer.render_env(show=True, show_predictions=False, show_observations=False)
+        env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
         # Build agent specific observations and normalize
         for a in range(env.get_num_agents()):
             agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 25b8c14..2417746 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -71,11 +71,9 @@ def main(argv):
                   number_of_agents=n_agents,
                   stochastic_data=stochastic_data,  # Malfunction data generator
                   obs_builder_object=TreeObservation)
-    env.reset(True, True)
 
     # After training we want to render the results so we also load a renderer
     env_renderer = RenderTool(env, gl="PILSVG", )
-
     # Given the depth of the tree observation and the number of features per node we get the following state_size
     num_features_per_node = env.obs_builder.observation_dim
     tree_depth = 2
@@ -104,7 +102,6 @@ def main(argv):
     final_action_dict = dict()
     scores_window = deque(maxlen=100)
     done_window = deque(maxlen=100)
-    time_obs = deque(maxlen=2)
     scores = []
     dones_list = []
     action_prob = [0] * action_size
@@ -114,8 +111,6 @@ def main(argv):
     # Now we load a Double dueling DQN agent
     agent = Agent(state_size, action_size, "FC", 0)
 
-    Training = True
-
     for trials in range(1, n_trials + 1):
 
         # Reset environment
@@ -126,19 +121,17 @@ def main(argv):
 
         # Build agent specific observations
         for a in range(env.get_num_agents()):
-            agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
+            agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
 
         # Reset score and done
         score = 0
         env_done = 0
 
-
         # Run episode
         for step in range(max_steps):
-
             # Action
             for a in range(env.get_num_agents()):
-                if env.agents[a].speed_data['position_fraction'] == 0.:
+                if env.agents[a].speed_data['position_fraction'] < 0.001:
                     register_action_state[a] = True
                 else:
                     register_action_state[a] = False
@@ -166,7 +159,6 @@ def main(argv):
 
             # Copy observation
             agent_obs = agent_next_obs.copy()
-
             if done['__all__']:
                 env_done = 1
                 for a in range(env.get_num_agents()):
@@ -206,52 +198,6 @@ def main(argv):
                        './Nets/navigator_checkpoint' + str(trials) + '.pth')
             action_prob = [1] * action_size
 
-    # Render the trained agent
-
-    # Reset environment
-    obs = env.reset(True, True)
-    env_renderer.set_new_rail()
-
-    # Split the observation tree into its parts and normalize the observation using the utility functions.
-    # Build agent specific local observation
-    for a in range(env.get_num_agents()):
-        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
-                                                          num_features_per_node=num_features_per_node,
-                                                          current_depth=0)
-        rail_data = norm_obs_clip(rail_data)
-        distance_data = norm_obs_clip(distance_data)
-        agent_data = np.clip(agent_data, -1, 1)
-        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-    # Reset score and done
-    score = 0
-    env_done = 0
-
-    # Run episode
-    for step in range(max_steps):
-        env_renderer.render_env(show=True, show_observations=False)
-
-        # Chose the actions
-        for a in range(env.get_num_agents()):
-            eps = 0
-            action = agent.act(agent_obs[a], eps=eps)
-            action_dict.update({a: action})
-
-        # Environment step
-        next_obs, all_rewards, done, _ = env.step(action_dict)
-
-        for a in range(env.get_num_agents()):
-            rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                              num_features_per_node=num_features_per_node,
-                                                              current_depth=0)
-            rail_data = norm_obs_clip(rail_data)
-            distance_data = norm_obs_clip(distance_data)
-            agent_data = np.clip(agent_data, -1, 1)
-            agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-        agent_obs = agent_next_obs.copy()
-        if done['__all__']:
-            break
     # Plot overall training progress at the end
     plt.plot(scores)
     plt.show()
diff --git a/utils/observation_utils.py b/utils/observation_utils.py
index b4badeb..7352601 100644
--- a/utils/observation_utils.py
+++ b/utils/observation_utils.py
@@ -95,6 +95,7 @@ def split_tree(tree, num_features_per_node, current_depth=0):
             tree_data.extend(tmp_tree_data)
             distance_data.extend(tmp_distance_data)
             agent_data.extend(tmp_agent_data)
+
     return tree_data, distance_data, agent_data
 
 
@@ -103,6 +104,6 @@ def normalize_observation(observation, num_features_per_node=11, observation_rad
                                             current_depth=0)
     data = norm_obs_clip(data, fixed_radius=observation_radius)
     distance = norm_obs_clip(distance, normalize_to_range=True)
-    agent_data = np.clip(agent_data, -1, 20)
+    agent_data = np.clip(agent_data, -1, 1)
     normalized_obs = np.concatenate((np.concatenate((data, distance)), agent_data))
     return normalized_obs
-- 
GitLab