Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

@@ -2,18 +2,25 @@ import getopt
import random
import sys
from collections import deque
# make sure the root path is in the system path
from pathlib import Path
from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
import matplotlib.pyplot as plt
import numpy as np
import torch
from dueling_double_dqn import Agent
from flatland.envs.generators import complex_rail_generator
from flatland.envs.observations import TreeObsForRailEnv
from torch_training.dueling_double_dqn import Agent
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from utils.observation_utils import norm_obs_clip, split_tree
from utils.observation_utils import normalize_observation
from flatland.envs.observations import TreeObsForRailEnv
def main(argv):
try:
@@ -29,28 +36,44 @@ def main(argv):
np.random.seed(1)
# Parameters for the Environment
x_dim = 10
y_dim = 10
x_dim = 35
y_dim = 35
n_agents = 1
n_goals = 5
min_dist = 5
# We are training an Agent using the Tree Observation with depth 2
observation_builder = TreeObsForRailEnv(max_depth=2)
# Load the Environment
# Use the malfunction generator to break agents from time to time
stochastic_data = MalfunctionParameters(malfunction_rate=1./10000, # Rate of malfunction occurrence
min_duration=15, # Minimal duration of malfunction
max_duration=50 # Max duration of malfunction
)
# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2)
# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 0., # Fast passenger train
1. / 2.: 1.0, # Fast freight train
1. / 3.: 0.0, # Slow commuter train
1. / 4.: 0.0} # Slow freight train
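# The map values are the fractions of agents assigned each speed; they are used as a probability distribution, so they should sum to 1.0.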
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=0),
obs_builder_object=observation_builder,
number_of_agents=n_agents)
env.reset(True, True)
rail_generator=sparse_rail_generator(max_num_cities=3,
# Number of cities in map (where train stations are)
seed=1, # Random seed
grid_mode=False,
max_rails_between_cities=2,
max_rails_in_city=3),
schedule_generator=sparse_schedule_generator(speed_ration_map),
number_of_agents=n_agents,
malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
# Malfunction data generator
obs_builder_object=TreeObservation)
# Reset env
env.reset(True, True)
# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", )
# Given the depth of the tree observation and the number of features per node we get the following state_size
num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
@@ -64,7 +87,7 @@ def main(argv):
# We set the number of episodes we would like to train on
if 'n_trials' not in locals():
n_trials = 6000
n_trials = 15000
# And the max number of steps we want to take per episode
max_steps = int(3 * (env.height + env.width))
@@ -79,35 +102,28 @@ def main(argv):
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
time_obs = deque(maxlen=2)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent_obs_buffer = [None] * env.get_num_agents()
agent_action_buffer = [2] * env.get_num_agents()
cummulated_reward = np.zeros(env.get_num_agents())
update_values = False
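# The buffers hold the last observation and action for which each agent actually had to decide
# (action 2 = MOVE_FORWARD by default); update_values flags whether a new experience tuple
# should be stored after the next environment step.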
# Now we load a Dueling Double DQN agent
agent = Agent(state_size, action_size, "FC", 0)
Training = True
agent = Agent(state_size, action_size)
for trials in range(1, n_trials + 1):
# Reset environment
obs = env.reset(True, True)
if not Training:
env_renderer.set_new_rail()
# Split the observation tree into its parts and normalize the observation using the utility functions.
# Build agent specific local observation
obs, info = env.reset(True, True)
env_renderer.reset()
# Build agent specific observations
for a in range(env.get_num_agents()):
rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
num_features_per_node=num_features_per_node,
current_depth=0)
rail_data = norm_obs_clip(rail_data)
distance_data = norm_obs_clip(distance_data)
agent_data = np.clip(agent_data, -1, 1)
agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
if obs[a]:
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
agent_obs_buffer[a] = agent_obs[a].copy()
# Reset score and done
score = 0
@@ -115,45 +131,36 @@ def main(argv):
# Run episode
for step in range(max_steps):
# Only render when not training
if not Training:
env_renderer.renderEnv(show=True, show_observations=True)
# Choose the actions
# Action
for a in range(env.get_num_agents()):
if not Training:
eps = 0
action = agent.act(agent_obs[a], eps=eps)
if info['action_required'][a]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
action = agent.act(agent_obs[a], eps=eps)
action_prob[action] += 1
else:
update_values = False
action = 0
action_dict.update({a: action})
# Count number of actions taken for statistics
action_prob[action] += 1
# Environment step
next_obs, all_rewards, done, _ = env.step(action_dict)
for a in range(env.get_num_agents()):
rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
num_features_per_node=num_features_per_node,
current_depth=0)
rail_data = norm_obs_clip(rail_data)
distance_data = norm_obs_clip(distance_data)
agent_data = np.clip(agent_data, -1, 1)
agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
next_obs, all_rewards, done, info = env.step(action_dict)
# Update replay buffer and train agent
for a in range(env.get_num_agents()):
# Only update the values when we are done or when an action was taken and thus relevant information is present
if update_values or done[a]:
agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
agent_obs[a], done[a])
cummulated_reward[a] = 0.
# Remember and train agent
if Training:
agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
agent_obs_buffer[a] = agent_obs[a].copy()
agent_action_buffer[a] = action_dict[a]
if next_obs[a]:
agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
# Update the current score
score += all_rewards[a] / env.get_num_agents()
agent_obs = agent_next_obs.copy()
# Copy observation
if done['__all__']:
env_done = 1
break
@@ -161,8 +168,12 @@ def main(argv):
# Epsilon decay
eps = max(eps_end, eps_decay * eps) # decrease epsilon
# Store the information about training progress
done_window.append(env_done)
# Collect information about training
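# done_window stores the fraction of agents that reached their target in this episode.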
tasks_finished = 0
for _idx in range(env.get_num_agents()):
if done[_idx] == 1:
tasks_finished += 1
done_window.append(tasks_finished / max(1, env.get_num_agents()))
scores_window.append(score / max_steps) # save most recent score
scores.append(np.mean(scores_window))
dones_list.append((np.mean(done_window)))
@@ -187,52 +198,6 @@ def main(argv):
'./Nets/navigator_checkpoint' + str(trials) + '.pth')
action_prob = [1] * action_size
# Render the trained agent
# Reset environment
obs = env.reset(True, True)
env_renderer.set_new_rail()
# Split the observation tree into its parts and normalize the observation using the utility functions.
# Build agent specific local observation
for a in range(env.get_num_agents()):
rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
num_features_per_node=num_features_per_node,
current_depth=0)
rail_data = norm_obs_clip(rail_data)
distance_data = norm_obs_clip(distance_data)
agent_data = np.clip(agent_data, -1, 1)
agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
# Reset score and done
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
env_renderer.renderEnv(show=True, show_observations=False)
# Choose the actions
for a in range(env.get_num_agents()):
eps = 0
action = agent.act(agent_obs[a], eps=eps)
action_dict.update({a: action})
# Environment step
next_obs, all_rewards, done, _ = env.step(action_dict)
for a in range(env.get_num_agents()):
rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
num_features_per_node=num_features_per_node,
current_depth=0)
rail_data = norm_obs_clip(rail_data)
distance_data = norm_obs_clip(distance_data)
agent_data = np.clip(agent_data, -1, 1)
agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
agent_obs = agent_next_obs.copy()
if done['__all__']:
break
# Plot overall training progress at the end
plt.plot(scores)
plt.show()
@@ -3,15 +3,16 @@ import time
from collections import deque
import numpy as np
from flatland.envs.generators import complex_rail_generator
from flatland.envs.observations import GlobalObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from line_profiler import LineProfiler
from utils.observation_utils import norm_obs_clip, split_tree
from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '*'):
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='*'):
"""
Call in a loop to create terminal progress bar
@params:
@@ -31,13 +32,14 @@ def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1,
if iteration == total:
print('')
class RandomAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
def act(self, state, eps = 0):
def act(self, state, eps=0):
"""
:param state: input is the observation of the agent
:return: returns an action
@@ -87,6 +89,7 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
rail_generator=complex_rail_generator(nr_start_goal=nr_paths, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=parameters[3]),
schedule_generator=complex_schedule_generator(),
obs_builder_object=GlobalObsForRailEnv(),
number_of_agents=parameters[2])
max_steps = int(3 * (env.height + env.width))
@@ -99,10 +102,9 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
# Reset the env
lp_reset(True, True)
obs = env.reset(True, True)
obs, info = env.reset(True, True)
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree(tree=np.array(obs[a]),
current_depth=0)
data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
@@ -126,8 +128,7 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
next_obs, all_rewards, done, _ = lp_step(action_dict)
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
current_depth=0)
data, distance, agent_data = split_tree_into_feature_groups(next_obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
def max_lt(seq, val):
@@ -15,7 +16,7 @@ def max_lt(seq, val):
return max
def min_lt(seq, val):
def min_gt(seq, val):
"""
Return smallest item in seq for which item > val applies.
None is returned if seq was empty or all items in seq were >= val.
@@ -29,7 +30,7 @@ def min_lt(seq, val):
return min
def norm_obs_clip(obs, clip_min=-1, clip_max=1, fixed_radius=0):
def norm_obs_clip(obs, clip_min=-1, clip_max=1, fixed_radius=0, normalize_to_range=False):
"""
This function normalizes an observation by the spread between its min and max values (or a fixed radius) and clips the result
:param obs: Observation that should be normalized
@@ -42,58 +43,84 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1, fixed_radius=0):
else:
max_obs = max(1, max_lt(obs, 1000)) + 1
min_obs = 0 # min(max_obs, min_lt(obs, 0))
min_obs = 0 # min(max_obs, min_gt(obs, 0))
if normalize_to_range:
min_obs = min_gt(obs, 0)
if min_obs > max_obs:
min_obs = max_obs
if max_obs == min_obs:
return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
norm = np.abs(max_obs - min_obs)
if norm == 0:
norm = 1.
return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)
def split_tree(tree, num_features_per_node, current_depth=0):
def _split_node_into_feature_groups(node: TreeObsForRailEnv.Node) -> (np.ndarray, np.ndarray, np.ndarray):
data = np.zeros(6)
distance = np.zeros(1)
agent_data = np.zeros(4)
data[0] = node.dist_own_target_encountered
data[1] = node.dist_other_target_encountered
data[2] = node.dist_other_agent_encountered
data[3] = node.dist_potential_conflict
data[4] = node.dist_unusable_switch
data[5] = node.dist_to_next_branch
distance[0] = node.dist_min_to_target
agent_data[0] = node.num_agents_same_direction
agent_data[1] = node.num_agents_opposite_direction
agent_data[2] = node.num_agents_malfunctioning
agent_data[3] = node.speed_min_fractional
return data, distance, agent_data
def _split_subtree_into_feature_groups(node: TreeObsForRailEnv.Node, current_tree_depth: int, max_tree_depth: int) -> (np.ndarray, np.ndarray, np.ndarray):
if node == -np.inf:
remaining_depth = max_tree_depth - current_tree_depth
# reference: https://stackoverflow.com/questions/515214/total-number-of-nodes-in-a-tree-data-structure
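# A complete 4-ary subtree with `remaining_depth` further levels contains
# (4**(remaining_depth + 1) - 1) / (4 - 1) nodes, so that many placeholder entries are filled with -inf.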
num_remaining_nodes = int((4**(remaining_depth+1) - 1) / (4 - 1))
return [-np.inf] * num_remaining_nodes*6, [-np.inf] * num_remaining_nodes, [-np.inf] * num_remaining_nodes*4
data, distance, agent_data = _split_node_into_feature_groups(node)
if not node.childs:
return data, distance, agent_data
for direction in TreeObsForRailEnv.tree_explored_actions_char:
sub_data, sub_distance, sub_agent_data = _split_subtree_into_feature_groups(node.childs[direction], current_tree_depth + 1, max_tree_depth)
data = np.concatenate((data, sub_data))
distance = np.concatenate((distance, sub_distance))
agent_data = np.concatenate((agent_data, sub_agent_data))
return data, distance, agent_data
def split_tree_into_feature_groups(tree: TreeObsForRailEnv.Node, max_tree_depth: int) -> (np.ndarray, np.ndarray, np.ndarray):
"""
Splits the tree observation into different sub groups that need the same normalization.
This is necessary because the tree observation includes two different distances as well as binary data:
1. Distance from the agent --> This is measured in cells from the current agent location
2. Distance to target --> This is measured as the distance from a cell to the agent's target
3. Binary data --> Contains information about presence of object --> No normalization necessary
Number 1. will depend on the depth and size of the tree search
Number 2. will depend on the size of the map and thus the max distance on the map
Number 3. Is independent of tree depth and map size and thus must be handled differently
Therefore we split the tree into these classes for better normalization.
:param tree: Tree that needs to be split
:param num_features_per_node: Features per node ATTENTION! this parameter is vital to correct splitting of the tree.
:param current_depth: Keeping track of the current depth in the tree
:return: Returns the three different groups of distance and binary values.
This function splits the tree into three different arrays of values
"""
if len(tree) < num_features_per_node:
return [], [], []
depth = 0
tmp = len(tree) / num_features_per_node - 1
pow4 = 4
while tmp > 0:
tmp -= pow4
depth += 1
pow4 *= 4
child_size = (len(tree) - num_features_per_node) // 4
data, distance, agent_data = _split_node_into_feature_groups(tree)
for direction in TreeObsForRailEnv.tree_explored_actions_char:
sub_data, sub_distance, sub_agent_data = _split_subtree_into_feature_groups(tree.childs[direction], 1, max_tree_depth)
data = np.concatenate((data, sub_data))
distance = np.concatenate((distance, sub_distance))
agent_data = np.concatenate((agent_data, sub_agent_data))
return data, distance, agent_data
def normalize_observation(observation: TreeObsForRailEnv.Node, tree_depth: int, observation_radius=0):
"""
Here we split the node features into the different classes of distances and binary values.
Pay close attention to this part if you modify any of the features in the tree observation.
This function normalizes the observation used by the RL algorithm
"""
tree_data = tree[:6].tolist()
distance_data = [tree[6]]
agent_data = tree[7:num_features_per_node].tolist()
# Split each child of the current node and continue to next depth level
for children in range(4):
child_tree = tree[(num_features_per_node + children * child_size):
(num_features_per_node + (children + 1) * child_size)]
tmp_tree_data, tmp_distance_data, tmp_agent_data = split_tree(child_tree, num_features_per_node,
current_depth=current_depth + 1)
if len(tmp_tree_data) > 0:
tree_data.extend(tmp_tree_data)
distance_data.extend(tmp_distance_data)
agent_data.extend(tmp_agent_data)
return tree_data, distance_data, agent_data
data, distance, agent_data = split_tree_into_feature_groups(observation, tree_depth)
data = norm_obs_clip(data, fixed_radius=observation_radius)
distance = norm_obs_clip(distance, normalize_to_range=True)
agent_data = np.clip(agent_data, -1, 1)
normalized_obs = np.concatenate((np.concatenate((data, distance)), agent_data))
return normalized_obs
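For reference, a minimal usage sketch of the refactored observation helpers. This is an illustration only; it assumes the flatland-rl 2.x imports used above resolve and that this module is importable as utils.observation_utils.

# Minimal usage sketch: build a small sparse environment and normalize each
# agent's tree observation, mirroring what the training loop above does.
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from utils.observation_utils import normalize_observation

tree_depth = 2
env = RailEnv(width=35,
              height=35,
              rail_generator=sparse_rail_generator(max_num_cities=3,
                                                   seed=1,
                                                   grid_mode=False,
                                                   max_rails_between_cities=2,
                                                   max_rails_in_city=3),
              schedule_generator=sparse_schedule_generator(),
              number_of_agents=1,
              obs_builder_object=TreeObsForRailEnv(max_depth=tree_depth))
obs, info = env.reset(True, True)

for a in range(env.get_num_agents()):
    if obs[a]:
        # Flat numpy array (231 entries for tree_depth = 2) ready to be fed
        # to the DQN agent as its state vector.
        state = normalize_observation(obs[a], tree_depth, observation_radius=10)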