Commit 23e98b59 authored by Erik Nygren

only store observations in buffer when you are allowed to take an action!

parent 08d70296
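
The change the commit title describes is easiest to read as a sketch of the inner training loop: an experience tuple should only be pushed to the replay buffer on steps where the agent was actually allowed to pick an action, i.e. when it sits at a cell boundary. The sketch below uses the names that appear in the hunks that follow; the Agent.step(state, action, reward, next_state, done) replay call is an assumption about the baseline agent's API and is not part of this diff.

# Sketch of the gated replay-buffer update (one step of the training loop).
for a in range(env.get_num_agents()):
    # An agent can only choose a new action at a cell boundary, i.e. when its
    # fractional position inside the current cell is (numerically) zero.
    register_action_state[a] = env.agents[a].speed_data['position_fraction'] < 0.001
    action_dict[a] = agent.act(agent_obs[a], eps=eps)

next_obs, all_rewards, done, _ = env.step(action_dict)

for a in range(env.get_num_agents()):
    agent_next_obs[a] = normalize_observation(next_obs[a], observation_radius=10)
    if register_action_state[a]:
        # Only store the transition when the chosen action could actually be applied
        # (assumed Agent.step(...) adds the tuple to the buffer and trains periodically).
        agent.step(agent_obs[a], action_dict[a], all_rewards[a],
                   agent_next_obs[a], done[a])

agent_obs = agent_next_obs.copy()
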
@@ -346,7 +346,7 @@ class TreeObsForRailEnv(ObservationBuilder):
unusable_switch = np.inf
other_agent_same_direction = 0
other_agent_opposite_direction = 0
- malfunctioning_agent = 0
+ malfunctioning_agent = 0.
min_fractional_speed = 1.
num_steps = 1
while exploring:
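
For orientation: the counters initialized in this hunk end up as per-node features of the tree observation, and the change keeps the malfunction field float-typed like its neighbours. A hypothetical re-creation of those initial values and their packing into one feature slice (the real TreeObsForRailEnv node carries more fields and a fixed ordering):

import numpy as np

# Hypothetical: the initial values from the hunk above, packed into one float vector.
unusable_switch = np.inf
other_agent_same_direction = 0
other_agent_opposite_direction = 0
malfunctioning_agent = 0.
min_fractional_speed = 1.

node_features = np.array([unusable_switch, other_agent_same_direction,
                          other_agent_opposite_direction, malfunctioning_agent,
                          min_fractional_speed], dtype=float)
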
@@ -103,7 +103,7 @@ action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size, "FC", 0)
- with path(torch_training.Nets, "navigator_checkpoint1200.pth") as file_in:
+ with path(torch_training.Nets, "navigator_checkpoint500.pth") as file_in:
agent.qnetwork_local.load_state_dict(torch.load(file_in))
record_images = False
@@ -126,14 +126,12 @@ for trials in range(1, n_trials + 1):
# Action
for a in range(env.get_num_agents()):
- action = agent.act(agent_obs[a], eps=0)
+ action = agent.act(agent_obs[a], eps=0.)
action_prob[action] += 1
action_dict.update({a: action})
# Environment step
obs, all_rewards, done, _ = env.step(action_dict)
- env_renderer.render_env(show=True, show_predictions=False, show_observations=False)
+ env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
# Build agent specific observations and normalize
for a in range(env.get_num_agents()):
agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
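
In this evaluation script the agent acts greedily (eps=0., no exploration). For reference, a generic epsilon-greedy selection looks roughly like the following sketch; the repository's Agent.act may differ in its details, and the qnetwork/state shapes here are assumptions:

import random
import numpy as np
import torch

def epsilon_greedy_act(qnetwork, state, action_size, eps=0.):
    """Pick an action; with eps=0. this is fully greedy (pure evaluation)."""
    if random.random() >= eps:
        with torch.no_grad():
            q_values = qnetwork(torch.from_numpy(state).float().unsqueeze(0))
        return int(np.argmax(q_values.cpu().numpy()))
    return random.randrange(action_size)
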
@@ -71,11 +71,9 @@ def main(argv):
number_of_agents=n_agents,
stochastic_data=stochastic_data, # Malfunction data generator
obs_builder_object=TreeObservation)
env.reset(True, True)
# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", )
# Given the depth of the tree observation and the number of features per node we get the following state_size
num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
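
The comment in this hunk refers to the usual state-size computation for the tree observation: each node has four children (one per possible transition direction), so a tree of depth tree_depth has sum(4**i) nodes and the flattened observation holds num_features_per_node values per node. A sketch of that calculation (the exact line is not shown in this diff):

# Number of nodes in a 4-ary tree of the given depth: 1 + 4 + 16 = 21 for depth 2.
num_nodes = sum(4 ** level for level in range(tree_depth + 1))
state_size = num_features_per_node * num_nodes
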
@@ -104,7 +102,6 @@ def main(argv):
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
time_obs = deque(maxlen=2)
scores = []
dones_list = []
action_prob = [0] * action_size
@@ -114,8 +111,6 @@ def main(argv):
# Now we load a Double dueling DQN agent
agent = Agent(state_size, action_size, "FC", 0)
Training = True
for trials in range(1, n_trials + 1):
# Reset environment
@@ -126,19 +121,17 @@ def main(argv):
# Build agent specific observations
for a in range(env.get_num_agents()):
- agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
+ agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
# Reset score and done
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
# Action
for a in range(env.get_num_agents()):
- if env.agents[a].speed_data['position_fraction'] == 0.:
+ if env.agents[a].speed_data['position_fraction'] < 0.001:
register_action_state[a] = True
else:
register_action_state[a] = False
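
Replacing the exact == 0. test with < 0.001 presumably guards against floating-point round-off when position_fraction is accumulated from a fractional speed; an explicit tolerance expresses the same intent. A hypothetical helper, not part of the diff:

def can_register_action(position_fraction, tol=1e-3):
    """True when the agent sits at a cell boundary and may choose a new action.

    position_fraction is advanced by the agent's fractional speed each step, so it
    can carry round-off; an exact == 0. comparison may therefore miss valid steps.
    An equivalent test is math.isclose(position_fraction, 0.0, abs_tol=tol).
    """
    return position_fraction < tol
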
@@ -166,7 +159,6 @@ def main(argv):
# Copy observation
agent_obs = agent_next_obs.copy()
if done['__all__']:
env_done = 1
for a in range(env.get_num_agents()):
@@ -206,52 +198,6 @@ def main(argv):
'./Nets/navigator_checkpoint' + str(trials) + '.pth')
action_prob = [1] * action_size
- # Render the trained agent
- # Reset environment
- obs = env.reset(True, True)
- env_renderer.set_new_rail()
- # Split the observation tree into its parts and normalize the observation using the utility functions.
- # Build agent specific local observation
- for a in range(env.get_num_agents()):
- rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
- num_features_per_node=num_features_per_node,
- current_depth=0)
- rail_data = norm_obs_clip(rail_data)
- distance_data = norm_obs_clip(distance_data)
- agent_data = np.clip(agent_data, -1, 1)
- agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
- # Reset score and done
- score = 0
- env_done = 0
- # Run episode
- for step in range(max_steps):
- env_renderer.render_env(show=True, show_observations=False)
- # Chose the actions
- for a in range(env.get_num_agents()):
- eps = 0
- action = agent.act(agent_obs[a], eps=eps)
- action_dict.update({a: action})
- # Environment step
- next_obs, all_rewards, done, _ = env.step(action_dict)
- for a in range(env.get_num_agents()):
- rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
- num_features_per_node=num_features_per_node,
- current_depth=0)
- rail_data = norm_obs_clip(rail_data)
- distance_data = norm_obs_clip(distance_data)
- agent_data = np.clip(agent_data, -1, 1)
- agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
- agent_obs = agent_next_obs.copy()
- if done['__all__']:
- break
# Plot overall training progress at the end
plt.plot(scores)
plt.show()
@@ -95,6 +95,7 @@ def split_tree(tree, num_features_per_node, current_depth=0):
tree_data.extend(tmp_tree_data)
distance_data.extend(tmp_distance_data)
agent_data.extend(tmp_agent_data)
return tree_data, distance_data, agent_data
@@ -103,6 +104,6 @@ def normalize_observation(observation, num_features_per_node=11, observation_rad
current_depth=0)
data = norm_obs_clip(data, fixed_radius=observation_radius)
distance = norm_obs_clip(distance, normalize_to_range=True)
- agent_data = np.clip(agent_data, -1, 20)
+ agent_data = np.clip(agent_data, -1, 1)
normalized_obs = np.concatenate((np.concatenate((data, distance)), agent_data))
return normalized_obs
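
For context, normalize_observation splits the tree into rail, distance and agent parts, rescales the first two with norm_obs_clip, and now clips agent_data to [-1, 1] instead of [-1, 20], so all features the network sees end up on the same scale. The following is only a plausible sketch of what such a norm-and-clip step can look like; the actual norm_obs_clip in this repository may differ:

import numpy as np

def norm_and_clip(obs, clip_min=-1.0, clip_max=1.0, fixed_radius=0, normalize_to_range=False):
    """Sketch: rescale a 1-D observation slice into [clip_min, clip_max].

    Non-finite entries (e.g. np.inf for unreachable branches) saturate at clip_min;
    finite values are divided by a fixed radius if given, otherwise by their maximum.
    """
    obs = np.asarray(obs, dtype=float)
    finite = obs[np.isfinite(obs)]
    max_obs = fixed_radius if fixed_radius > 0 else (finite.max() if finite.size else 1.0)
    min_obs = finite.min() if (normalize_to_range and finite.size) else 0.0
    if max_obs <= min_obs:
        max_obs = min_obs + 1.0
    norm = (obs - min_obs) / (max_obs - min_obs)
    norm[~np.isfinite(obs)] = clip_min
    return np.clip(norm, clip_min, clip_max)

The tighter clip bound in the hunk above keeps agent_data on the same [-1, 1] scale as the other normalized features.
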