Compare revisions

c977124a · c977124a · c977124a · c977124a · c977124a · c977124a
--- a/RLLib_training/__init__.py
+++ b/RLLib_training/__init__.py
--- a/scoring/utils/misc_utils.py
+++ b/scoring/utils/misc_utils.py
+import random
+import time
+
+import numpy as np
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import complex_rail_generator, rail_from_file
+from flatland.envs.schedule_generators import complex_schedule_generator
+from flatland.utils.rendertools import RenderTool
+
+# Time factor to test the max time allowed for an env.
+max_time_factor = 1
+
+
+def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='*'):
+    """
+    Call in a loop to create terminal progress bar
+    @params:
+        iteration   - Required  : current iteration (Int)
+        total       - Required  : total iterations (Int)
+        prefix      - Optional  : prefix string (Str)
+        suffix      - Optional  : suffix string (Str)
+        decimals    - Optional  : positive number of decimals in percent complete (Int)
+        length      - Optional  : character length of bar (Int)
+        fill        - Optional  : bar fill character (Str)
+    """
+    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
+    filledLength = int(length * iteration // total)
+    bar = fill * filledLength + '_' * (length - filledLength)
+    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=" ")
+    # Print New Line on Complete
+    if iteration == total:
+        print('')
+
+
+def run_test(parameters, agent, observation_builder=None, observation_wrapper=None, test_nr=0, nr_trials_per_test=100):
+    # Parameter initialization
+    features_per_node = 9
+    start_time_scoring = time.time()
+    action_dict = dict()
+
+    print('Running {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(test_nr, parameters[0], parameters[1],
+                                                                          parameters[2]))
+    if observation_builder == None:
+        print("No observation defined!")
+        return
+    # Reset all measurements
+    test_scores = []
+    test_dones = []
+
+    # Reset environment
+    random.seed(parameters[3])
+    np.random.seed(parameters[3])
+
+    printProgressBar(0, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+    for trial in range(nr_trials_per_test):
+        # Reset the env
+        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
+
+        env = RailEnv(width=3,
+                      height=3,
+                      rail_generator=rail_from_file(file_name),
+                      obs_builder_object=observation_builder,
+                      number_of_agents=1,
+                      )
+
+        obs, info = env.reset()
+
+        if observation_wrapper is not None:
+            for a in range(env.get_num_agents()):
+                obs[a] = observation_wrapper(obs[a])
+
+        # Run episode
+        trial_score = 0
+        max_steps = int(max_time_factor * (env.height + env.width))
+        for step in range(max_steps):
+
+            for a in range(env.get_num_agents()):
+                action = agent.act(obs[a], eps=0)
+                action_dict.update({a: action})
+
+            # Environment step
+            obs, all_rewards, done, _ = env.step(action_dict)
+
+            for a in range(env.get_num_agents()):
+                if observation_wrapper is not None:
+                    obs[a] = observation_wrapper(obs[a])
+                trial_score += np.mean(all_rewards[a])
+
+            if done['__all__']:
+                break
+        test_scores.append(trial_score / max_steps)
+        test_dones.append(done['__all__'])
+        printProgressBar(trial + 1, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+    end_time_scoring = time.time()
+    tot_test_time = end_time_scoring - start_time_scoring
+    return test_scores, test_dones, tot_test_time
+
+
+def create_testfiles(parameters, test_nr=0, nr_trials_per_test=100):
+    # Parameter initialization
+    print('Creating {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(test_nr, parameters[0], parameters[1],
+                                                                           parameters[2]))
+    # Reset environment
+    random.seed(parameters[3])
+    np.random.seed(parameters[3])
+    nr_paths = max(4, parameters[2] + int(0.5 * parameters[2]))
+    min_dist = int(min([parameters[0], parameters[1]]) * 0.75)
+    env = RailEnv(width=parameters[0],
+                  height=parameters[1],
+                  rail_generator=complex_rail_generator(nr_start_goal=nr_paths, nr_extra=5, min_dist=min_dist,
+                                                        max_dist=99999,
+                                                        seed=parameters[3]),
+                  schedule_generator=complex_schedule_generator(),
+                  obs_builder_object=TreeObsForRailEnv(max_depth=2),
+                  number_of_agents=parameters[2])
+    printProgressBar(0, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+    for trial in range(nr_trials_per_test):
+        # Reset the env
+        env.reset(True, True)
+        env.save("./Tests/{}/Level_{}.pkl".format(test_nr, trial))
+        printProgressBar(trial + 1, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+
+    return
+
+
+def render_test(parameters, test_nr=0, nr_examples=5):
+    for trial in range(nr_examples):
+        # Reset the env
+        print('Showing {} Level {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(test_nr, trial, parameters[0],
+                                                                                       parameters[1],
+                                                                                       parameters[2]))
+        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
+
+        env = RailEnv(width=1,
+                      height=1,
+                      rail_generator=rail_from_file(file_name),
+                      obs_builder_object=TreeObsForRailEnv(max_depth=2),
+                      number_of_agents=1,
+                      )
+        env_renderer = RenderTool(env, gl="PILSVG", )
+        env_renderer.set_new_rail()
+
+        env.reset(False, False)
+        env_renderer.render_env(show=True, show_observations=False)
+
+        time.sleep(0.1)
+        env_renderer.close_window()
+    return
+
+
+def run_test_sequential(parameters, agent, test_nr=0, tree_depth=3):
+    # Parameter initialization
+    features_per_node = 9
+    start_time_scoring = time.time()
+    action_dict = dict()
+    nr_trials_per_test = 100
+    print('Running {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(test_nr, parameters[0], parameters[1],
+                                                                          parameters[2]))
+
+    # Reset all measurements
+    test_scores = []
+    test_dones = []
+
+    # Reset environment
+    random.seed(parameters[3])
+    np.random.seed(parameters[3])
+
+    printProgressBar(0, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+    for trial in range(nr_trials_per_test):
+        # Reset the env
+        file_name = "./Tests/{}/Level_{}.pkl".format(test_nr, trial)
+
+        env = RailEnv(width=3,
+                      height=3,
+                      rail_generator=rail_from_file(file_name),
+                      obs_builder_object=TreeObsForRailEnv(max_depth=tree_depth,
+                                                           predictor=ShortestPathPredictorForRailEnv()),
+                      number_of_agents=1,
+                      )
+
+        obs, info = env.reset()
+        done = env.dones
+        # Run episode
+        trial_score = 0
+        max_steps = int(max_time_factor * (env.height + env.width))
+        for step in range(max_steps):
+
+            # Action
+            acting_agent = 0
+            for a in range(env.get_num_agents()):
+                if done[a]:
+                    acting_agent += 1
+                if acting_agent == a:
+                    action = agent.act(obs[acting_agent], eps=0)
+                else:
+                    action = 0
+                action_dict.update({a: action})
+
+            # Environment step
+
+            obs, all_rewards, done, _ = env.step(action_dict)
+            for a in range(env.get_num_agents()):
+                trial_score += np.mean(all_rewards[a])
+            if done['__all__']:
+                break
+        test_scores.append(trial_score / max_steps)
+        test_dones.append(done['__all__'])
+        printProgressBar(trial + 1, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
+    end_time_scoring = time.time()
+    tot_test_time = end_time_scoring - start_time_scoring
+    return test_scores, test_dones, tot_test_time
--- a/RLLib_training/experiment_configs/__init__.py
+++ b/RLLib_training/experiment_configs/__init__.py
--- a/sequential_agent/run_test.py
+++ b/sequential_agent/run_test.py
+import numpy as np
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import complex_rail_generator
+from flatland.envs.schedule_generators import complex_schedule_generator
+from flatland.utils.rendertools import RenderTool
+
+from sequential_agent.simple_order_agent import OrderedAgent
+
+# Fix the NumPy seed so the random draw for n_goals below is reproducible.
+np.random.seed(2)
+# The string literal below is a disabled alternative setup that loads a saved scene;
+# it is never executed.
+"""
+file_name = "../torch_training/railway/complex_scene.pkl"
+env = RailEnv(width=10,
+              height=20,
+              rail_generator=rail_from_file(file_name),
+              obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()))
+x_dim = env.width
+y_dim = env.height
+
+"""
+
+x_dim = 20  # np.random.randint(8, 20)
+y_dim = 20  # np.random.randint(8, 20)
+n_agents = 10  # np.random.randint(3, 8)
+# Zero to two more start/goal pairs than agents.
+n_goals = n_agents + np.random.randint(0, 3)
+min_dist = int(0.75 * min(x_dim, y_dim))
+
+env = RailEnv(width=x_dim,
+              height=y_dim,
+              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                    max_dist=99999,
+                                                    seed=0),
+              schedule_generator=complex_schedule_generator(),
+              obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
+              number_of_agents=n_agents)
+env.reset(True, True)
+
+tree_depth = 1
+# NOTE(review): observation_helper and handle are not used in the visible remainder of this script.
+observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
+env_renderer = RenderTool(env, gl="PILSVG", )
+handle = env.get_agent_handles()
+n_trials = 1
+max_steps = 100 * (env.height + env.width)
+# Set to True to save every rendered frame under ./Images/.
+record_images = False
+agent = OrderedAgent()
+action_dict = dict()
+
+for trials in range(1, n_trials + 1):
+
+    # Reset environment
+    obs, info = env.reset(True, True)
+    done = env.dones
+    env_renderer.reset()
+    frame_step = 0
+    # Run episode
+    for step in range(max_steps):
+        env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
+
+        if record_images:
+            env_renderer.gl.save_image("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
+            frame_step += 1
+
+        # Action
+        # acting_agent counts the finished agents seen so far; only the agent whose handle
+        # equals that count asks the policy for an action.
+        acting_agent = 0
+        for a in range(env.get_num_agents()):
+            if done[a]:
+                acting_agent += 1
+            if a == acting_agent:
+                action = agent.act(obs[a], eps=0)
+                print(action)
+            else:
+                # Every other agent gets action 4 (listed as "Stop" in the tutorial's action table).
+                action = 4
+            action_dict.update({a: action})
+
+        # Environment step
+
+        obs, all_rewards, done, _ = env.step(action_dict)
+
+        if done['__all__']:
+            break
--- a/sequential_agent/simple_order_agent.py
+++ b/sequential_agent/simple_order_agent.py
+import numpy as np
+from utils.observation_utils import split_tree_into_feature_groups, min_gt
+
+
+class OrderedAgent:
+
+    def __init__(self):
+        self.action_size = 5
+
+    def act(self, state, eps=0):
+        """
+        :param state: input is the observation of the agent
+        :return: returns an action
+        """
+        _, distance, _ = split_tree_into_feature_groups(state, 1)
+        distance = distance[1:]
+        min_dist = min_gt(distance, 0)
+        min_direction = np.where(distance == min_dist)
+        if len(min_direction[0]) > 1:
+            return min_direction[0][-1] + 1
+        return min_direction[0] + 1
+
+    def step(self, memories):
+        """
+        Step function to improve agent by adjusting policy given the observations
+
+        :param memories: SARS Tuple to be
+        :return:
+        """
+        return
+
+    def save(self, filename):
+        # Store the current policy
+        return
+
+    def load(self, filename):
+        # Load a policy
+        return
--- a/setup.py
+++ b/setup.py
-import os
-
 from setuptools import setup, find_packages

-# TODO: setup does not support installation from url, move to requirements*.txt
-# TODO: @master as soon as mr is merged on flatland.
-os.system(
-    'pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
-
 install_reqs = []
-# TODO: include requirements_RLLib_training.txt
-requirements_paths = ['requirements_torch_training.txt']  # , 'requirements_RLLib_training.txt']
+dependency_links = []
+requirements_paths = ['requirements_torch_training.txt']
 for requirements_path in requirements_paths:
    with open(requirements_path, 'r') as f:
        install_reqs += [
            s for s in [
                line.strip(' \n') for line in f
-            ] if not s.startswith('#') and s != ''
+            ] if not s.startswith('#') and s != '' and not s.startswith('git+')
        ]
+with open(requirements_path, 'r') as f:
+    dependency_links += [
+        s for s in [
+            line.strip(' \n') for line in f
+        ] if s.startswith('git+')
+    ]
+
 requirements = install_reqs
 setup_requirements = install_reqs
 test_requirements = install_reqs
@@ -47,6 +47,7 @@ setup(
    setup_requires=setup_requirements,
    test_suite='tests',
    tests_require=test_requirements,
+    dependency_links=dependency_links,
    url='https://gitlab.aicrowd.com/flatland/baselines',
    version='0.1.1',
    zip_safe=False,

--- a/tests/test_normalize_features.py
+++ b/tests/test_normalize_features.py
+import random
+
+import numpy as np
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import complex_rail_generator
+from flatland.envs.schedule_generators import complex_schedule_generator
+
+from utils.observation_utils import normalize_observation
+
+
+def test_normalize_features():
+    """Check that normalize_observation reproduces the stored reference arrays for 10 seeded levels."""
+
+    # Fixed seeds: the reference CSV files depend on this exact random sequence.
+    random.seed(1)
+    np.random.seed(1)
+    max_depth = 4
+
+    for i in range(10):
+        tree_observer = TreeObsForRailEnv(max_depth=max_depth)
+        # Seed for this level's rail generator, drawn from the seeded `random` stream.
+        next_rand_number = random.randint(0, 100)
+
+        env = RailEnv(width=10,
+                      height=10,
+                      rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999,
+                                                            seed=next_rand_number),
+                      schedule_generator=complex_schedule_generator(),
+                      number_of_agents=1,
+                      obs_builder_object=tree_observer)
+
+        # NOTE(review): step() is called without an explicit env.reset() - confirm this is intended.
+        obs, all_rewards, done, _ = env.step({0: 0})
+
+        obs_new = tree_observer.get()
+        # data, distance, agent_data = split_tree(tree=np.array(obs_old), num_features_per_node=11)
+        data_normalized = normalize_observation(obs_new, max_depth, observation_radius=10)
+
+        # Relative path: presumably the test must be run from the tests/ directory - verify.
+        filename = 'testdata/test_array_{}.csv'.format(i)
+        data_loaded = np.loadtxt(filename, delimiter=',')
+
+        assert np.allclose(data_loaded, data_normalized)
+
--- a/tests/testdata/test_array_0.csv
+++ b/tests/testdata/test_array_0.csv
--- a/tests/testdata/test_array_1.csv
+++ b/tests/testdata/test_array_1.csv
--- a/tests/testdata/test_array_2.csv
+++ b/tests/testdata/test_array_2.csv
--- a/tests/testdata/test_array_3.csv
+++ b/tests/testdata/test_array_3.csv
--- a/tests/testdata/test_array_4.csv
+++ b/tests/testdata/test_array_4.csv
--- a/tests/testdata/test_array_5.csv
+++ b/tests/testdata/test_array_5.csv
--- a/tests/testdata/test_array_6.csv
+++ b/tests/testdata/test_array_6.csv
--- a/tests/testdata/test_array_7.csv
+++ b/tests/testdata/test_array_7.csv
--- a/tests/testdata/test_array_8.csv
+++ b/tests/testdata/test_array_8.csv
--- a/tests/testdata/test_array_9.csv
+++ b/tests/testdata/test_array_9.csv
--- a/torch_training/Getting_Started_Training.md
+++ b/torch_training/Getting_Started_Training.md
+# How to train an Agent on Flatland
+Quick introduction on how to train a simple DQN agent using Flatland and Pytorch. At the end of this Tutorial you should be able to train a single agent to navigate in Flatland.
+We use the `training_navigation.py` ([here](https://gitlab.aicrowd.com/flatland/baselines/blob/master/torch_training/training_navigation.py)) file to train a simple agent with the tree observation to solve the navigation task.
+
+## Actions in Flatland
+Flatland is a railway simulation. Thus the actions of an agent are strongly limited to the railway network. This means that in many cases not all actions are valid.
+The possible actions of an agent are
+
+- 0 *Do Nothing*:  If the agent is moving it continues moving, if it is stopped it stays stopped
+- 1 *Deviate Left*: This action is only valid at cells where the agent can change direction towards left. If action is chosen, the left transition and a rotation of the agent orientation to the left is executed. If the agent is stopped at any position, this action will cause it to start moving in any cell where forward or left is allowed!
+- 2 *Go Forward*: This action will start the agent when stopped. At switches this will choose the forward direction.
+- 3 *Deviate Right*: Exactly the same as deviate left but for right turns.
+- 4 *Stop*: This action causes the agent to stop, this is necessary to avoid conflicts in multi agent setups (Not needed for navigation).
+
+## Tree Observation
+Flatland offers three basic observations from the beginning. We encourage you to develop your own observations that are better suited for this specific task.
+
+For the navigation training we start with the Tree Observation as agents will learn the task very quickly using this observation.
+The tree observation exploits the fact that a railway network is a graph and thus the observation is only built along allowed transitions in the graph.
+
+Here is a small example of a railway network with an agent in the top left corner. The tree observation is built by following the allowed transitions for that agent.
+
+![Small_Network](https://i.imgur.com/utqMx08.png)
+
+As we move along the allowed transitions we build up a tree where a new node is created at every cell where the agent has different possibilities (Switch), dead-end or the target is reached.
+It is important to note that the tree observation is always built according to the orientation of the agent at a given node. This means that each node always has 4 branches coming from it in the directions *Left, Forward, Right and Backward*. These are illustrated with different colors in the figure below. The tree is built from the example rail above. Nodes where there are no possibilities are filled with `-inf` and are not all shown here for simplicity. The tree however, always has the same number of nodes for a given tree depth.
+
+![Tree_Observation](https://i.imgur.com/VsUQOQz.png)
+
+### Node Information
+Each node is filled with information gathered along the path to the node. Currently each node contains 9 features:
+
+- 1: if own target lies on the explored branch the current distance from the agent in number of cells is stored.
+
+- 2: if another agent's target is detected the distance in number of cells from current agent position is stored.
+
+- 3: if another agent is detected the distance in number of cells from current agent position is stored.
+
+- 4: possible conflict detected (This only works when we use a predictor and will not be important in this tutorial)
+
+
+- 5: if an unusable switch (for the agent) is detected we store the distance. An unusable switch is a switch where the agent does not have any choice of path, but other agents coming from different directions might.
+
+
+- 6: This feature stores the distance (in number of cells) to the next node (e.g. switch or target or dead-end)
+
+- 7: minimum remaining travel distance from node to the agent's target given the direction of the agent if this path is chosen
+
+
+- 8: agent in the same direction found on path to node
+    - n = number of agents present same direction (possible future use: number of other agents in the same direction in this branch)
+    - 0 = no agent present same direction
+
+- 9: agent in the opposite direction on path to node
+    - n = number of agents present other direction than myself
+    - 0 = no agent present other direction than myself
+
+For training purposes the tree is flattened into a single array.
+
+## Training
+### Setting up the environment
+Before you get started with the training make sure that you have [pytorch](https://pytorch.org/get-started/locally/) installed.
+Let us now train a simple double dueling DQN agent to navigate to its target on Flatland. We start by importing Flatland:
+
+```
+from flatland.envs.generators import complex_rail_generator
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.rail_env import RailEnv
+from flatland.utils.rendertools import RenderTool
+from utils.observation_utils import norm_obs_clip, split_tree
+```
+
+For this simple example we want to train on randomly generated levels using the `complex_rail_generator`. We use the following parameter for our first experiment:
+
+```
+# Parameters for the Environment
+x_dim = 10
+y_dim = 10
+n_agents = 1
+n_goals = 5
+min_dist = 5
+```
+
+As mentioned above, for this experiment we are going to use the tree observation and thus we load the observation builder:
+
+```
+# We are training an Agent using the Tree Observation with depth 2
+observation_builder = TreeObsForRailEnv(max_depth=2)
+```
+
+And pass it as an argument to the environment setup
+
+```
+env = RailEnv(width=x_dim,
+              height=y_dim,
+              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                    max_dist=99999,
+                                                    seed=0),
+              obs_builder_object=observation_builder,
+              number_of_agents=n_agents)
+```
+
+We have now successfully set up the environment for training. To visualize it, we also initialize the renderer:
+
+```
+env_renderer = RenderTool(env, gl="PILSVG", )
+```
+
+### Setting up the agent
+
+To set up an appropriate agent we need the state and action space sizes. From the discussion above about the tree observation we end up with:
+
+[**Adrian**: I just wonder why this is not done in a separate method in the observation (`get_state_size`); then we would not have to write down as much, and the user would not need to
+understand anything about the observation. I suggest moving this into the observation, with the base ObservationBuilder declaring it as an abstract method. ... ]
+
+```
+# Given the depth of the tree observation and the number of features per node we get the following state_size
+features_per_node = 9
+tree_depth = 2
+nr_nodes = 0
+for i in range(tree_depth + 1):
+    nr_nodes += np.power(4, i)
+state_size = features_per_node * nr_nodes
+
+# The action space of flatland is 5 discrete actions
+action_size = 5
+```
+
+In the `training_navigation.py` file you will find further variables that we initialize in order to keep track of the training progress.
+Below you see an example code to train an agent. It is important to note that we reshape and normalize the tree observation provided by the environment to facilitate training.
+To do so, we use the utility functions `split_tree(tree=np.array(obs[a]), num_features_per_node=features_per_node, current_depth=0)` and `norm_obs_clip()`. Feel free to modify the normalization as you see fit.
+
+```
+# Split the observation tree into its parts and normalize the observation using the utility functions.
+    # Build agent specific local observation
+    for a in range(env.get_num_agents()):
+        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
+                                                          num_features_per_node=features_per_node,
+                                                          current_depth=0)
+        rail_data = norm_obs_clip(rail_data)
+        distance_data = norm_obs_clip(distance_data)
+        agent_data = np.clip(agent_data, -1, 1)
+        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+```
+
+We now use the normalized `agent_obs` for our training loop:
+[**Adrian**: Same question as above, why not done in the observation class?]
+
+```
+for trials in range(1, n_trials + 1):
+
+    # Reset environment
+    obs, info = env.reset(True, True)
+    if not Training:
+        env_renderer.set_new_rail()
+
+    # Split the observation tree into its parts and normalize the observation using the utility functions.
+    # Build agent specific local observation
+    for a in range(env.get_num_agents()):
+        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
+                                                          num_features_per_node=features_per_node,
+                                                          current_depth=0)
+        rail_data = norm_obs_clip(rail_data)
+        distance_data = norm_obs_clip(distance_data)
+        agent_data = np.clip(agent_data, -1, 1)
+        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+
+    # Reset score and done
+    score = 0
+    env_done = 0
+
+    # Run episode
+    for step in range(max_steps):
+
+        # Only render when not training
+        if not Training:
+            env_renderer.renderEnv(show=True, show_observations=True)
+
+        # Chose the actions
+        for a in range(env.get_num_agents()):
+            if not Training:
+                eps = 0
+
+            action = agent.act(agent_obs[a], eps=eps)
+            action_dict.update({a: action})
+
+            # Count number of actions takes for statistics
+            action_prob[action] += 1
+
+        # Environment step
+        next_obs, all_rewards, done, _ = env.step(action_dict)
+
+        for a in range(env.get_num_agents()):
+            rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
+                                                              num_features_per_node=features_per_node,
+                                                              current_depth=0)
+            rail_data = norm_obs_clip(rail_data)
+            distance_data = norm_obs_clip(distance_data)
+            agent_data = np.clip(agent_data, -1, 1)
+            agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+
+        # Update replay buffer and train agent
+        for a in range(env.get_num_agents()):
+
+            # Remember and train agent
+            if Training:
+                agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+
+            # Update the current score
+            score += all_rewards[a] / env.get_num_agents()
+
+        agent_obs = agent_next_obs.copy()
+        if done['__all__']:
+            env_done = 1
+            break
+
+    # Epsilon decay
+    eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+```
+
+Running the `training_navigation.py` file trains a simple agent to navigate to any random target within the railway network. After running you should see a learning curve similar to this one:
+
+![Learning_curve](https://i.imgur.com/yVGXpUy.png)
+
+and the agent behavior should look like this:
+
+![Single_Agent_Navigation](https://i.imgur.com/t5ULr4L.gif)
+
--- a/torch_training/Multi_Agent_Training_Intro.md
+++ b/torch_training/Multi_Agent_Training_Intro.md
+# How to train multiple Agents on Flatland
+Quick introduction on how to train simple DQN agents using Flatland and Pytorch. At the end of this tutorial you should be able to train multiple agents to avoid conflicts in Flatland.
+We use the `multi_agent_training.py` ([here](https://gitlab.aicrowd.com/flatland/baselines/blob/master/torch_training/multi_agent_training.py)) file to train multiple agents on the avoid conflicts task.
+
+## Actions in Flatland
+Flatland is a railway simulation. Thus the actions of an agent are strongly limited to the railway network. This means that in many cases not all actions are valid.
+The possible actions of an agent are
+
+- 0 *Do Nothing*:  If the agent is moving it continues moving, if it is stopped it stays stopped
+- 1 *Deviate Left*: This action is only valid at cells where the agent can change direction towards left. If action is chosen, the left transition and a rotation of the agent orientation to the left is executed. If the agent is stopped at any position, this action will cause it to start moving in any cell where forward or left is allowed!
+- 2 *Go Forward*: This action will start the agent when stopped. At switches this will choose the forward direction.
+- 3 *Deviate Right*: Exactly the same as deviate left but for right turns.
+- 4 *Stop*: This action causes the agent to stop, this is necessary to avoid conflicts in multi agent setups (Not needed for navigation).
+
+## Shortest path predictor
+With multiple agents a lot of conflicts will arise on the railway network. These conflicts arise because different agents want to occupy the same cells at the same time. Due to the nature of the railway network and the dynamics of the railway agents (can't turn around), the conflicts have to be detected in advance in order to avoid them. If agents are facing each other and don't have any options to deviate from their path it is called a *deadlock*.
+Therefore we introduce a simple prediction function that predicts the most likely (here shortest) path of all the agents. Furthermore, the prediction is withdrawn if an agent stops and replaced by a prediction that the agent will stay put. The predictions allow the agents to detect possible conflicts before they happen and thus perform countermeasures.
+*ATTENTION*: This is a very basic implementation of a predictor. It will not solve all the problems because it always predicts shortest paths and not alternative routes. It is up to you to come up with much more clever predictors to avoid conflicts!
+
+## Tree Observation
+Flatland offers three basic observations from the beginning. We encourage you to develop your own observations that are better suited for this specific task.
+
+For the navigation training we start with the Tree Observation as agents will learn the task very quickly using this observation.
+The tree observation exploits the fact that a railway network is a graph and thus the observation is only built along allowed transitions in the graph.
+
+Here is a small example of a railway network with an agent in the top left corner. The tree observation is built by following the allowed transitions for that agent.
+
+![Small_Network](https://i.imgur.com/utqMx08.png)
+
+As we move along the allowed transitions we build up a tree where a new node is created at every cell where the agent has different possibilities (Switch), dead-end or the target is reached.
+It is important to note that the tree observation is always built according to the orientation of the agent at a given node. This means that each node always has 4 branches coming from it in the directions *Left, Forward, Right and Backward*. These are illustrated with different colors in the figure below. The tree is built from the example rail above. Nodes where there are no possibilities are filled with `-inf` and are not all shown here for simplicity. The tree, however, always has the same number of nodes for a given tree depth.
+
+![Tree_Observation](https://i.imgur.com/VsUQOQz.png)
+
+### Node Information
+Each node is filled with information gathered along the path to the node. Currently each node contains 9 features:
+
+- 1: if own target lies on the explored branch the current distance from the agent in number of cells is stored.
+
+- 2: if another agent's target is detected the distance in number of cells from current agent position is stored.
+
+- 3: if another agent is detected the distance in number of cells from current agent position is stored.
+
+- 4: possible conflict detected (This only works when we use a predictor and will not be important in this tutorial)
+
+
+- 5: if an unusable switch (for the agent) is detected we store the distance. An unusable switch is a switch where the agent does not have any choice of path, but other agents coming from different directions might.
+
+
+- 6: This feature stores the distance (in number of cells) to the next node (e.g. switch or target or dead-end)
+
+- 7: minimum remaining travel distance from node to the agent's target given the direction of the agent if this path is chosen
+
+
+- 8: agent in the same direction found on path to node
+    - n = number of agents present same direction (possible future use: number of other agents in the same direction in this branch)
+    - 0 = no agent present same direction
+
+- 9: agent in the opposite direction on path to node
+    - n = number of agents present other direction than myself
+    - 0 = no agent present other direction than myself
+
+For training purposes the tree is flattened into a single array.
+
+
+## Training
+### Setting up the environment
+Let us now train a simple double dueling DQN agent to find its target and try to avoid conflicts on Flatland. We start by importing the necessary packages from Flatland. Note that we now also import a predictor from `flatland.envs.predictions`.
+
+```
+from flatland.envs.generators import complex_rail_generator
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnv
+from utils.observation_utils import norm_obs_clip, split_tree
+```
+
+For this simple example we want to train on randomly generated levels using the `complex_rail_generator`. The training curriculum will use different sets of parameters throughout training to enhance generalizability of the solution.
+
+```
+# Initialize a random map with a random number of agents
+x_dim = np.random.randint(8, 20)
+y_dim = np.random.randint(8, 20)
+n_agents = np.random.randint(3, 8)
+n_goals = n_agents + np.random.randint(0, 3)
+min_dist = int(0.75 * min(x_dim, y_dim))
+tree_depth = 3
+```
+
+As mentioned above, for this experiment we are going to use the tree observation and thus we load the observation builder. Also we are now using the predictor as well which is passed to the observation builder.
+
+```
+"""
+ Get an observation builder and predictor:
+ The predictor will always predict the shortest path from the current location of the agent.
+ This is used to warn for potential conflicts --> Should be enhanced to get better performance!
+"""
+predictor = ShortestPathPredictorForRailEnv()
+observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)
+```
+
+And pass it as an argument to the environment setup
+
+```
+env = RailEnv(width=x_dim,
+              height=y_dim,
+              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                    max_dist=99999,
+                                                    seed=0),
+              obs_builder_object=observation_builder,
+              number_of_agents=n_agents)
+```
+
+We have now successfully set up the environment for training. To visualize it, we can also initialize the renderer.
+
+### Setting up the agent
+
+To set up an appropriate agent we need the state and action space sizes. From the discussion above about the tree observation we end up with:
+
+
+```
+num_features_per_node = env.obs_builder.observation_dim
+nr_nodes = 0
+for i in range(tree_depth + 1):
+    nr_nodes += np.power(4, i)
+state_size = num_features_per_node * nr_nodes
+action_size = 5
+```
+
+In the `multi_agent_training.py` file you will find further variables that we initialize in order to keep track of the training progress.
+Below you see an example code to train an agent. It is important to note that we reshape and normalize the tree observation provided by the environment to facilitate training.
+To do so, we use the utility functions `split_tree(tree=np.array(obs[a]), num_features_per_node=features_per_node, current_depth=0)` and `norm_obs_clip()`. Feel free to modify the normalization as you see fit.
+
+```
+# Split the observation tree into its parts and normalize the observation using the utility functions.
+    # Build agent specific local observation
+    for a in range(env.get_num_agents()):
+        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
+                                                          num_features_per_node=features_per_node,
+                                                          current_depth=0)
+        rail_data = norm_obs_clip(rail_data)
+        distance_data = norm_obs_clip(distance_data)
+        agent_data = np.clip(agent_data, -1, 1)
+        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+```
+
+We now use the normalized `agent_obs` for our training loop:
+
+
+```
+# Do training over n_episodes
+    for episodes in range(1, n_episodes + 1):
+        """
+        Training Curriculum: In order to get good generalization we change the number of agents
+        and the size of the levels every 50 episodes.
+        """
+        if episodes % 50 == 0:
+            x_dim = np.random.randint(8, 20)
+            y_dim = np.random.randint(8, 20)
+            n_agents = np.random.randint(3, 8)
+            n_goals = n_agents + np.random.randint(0, 3)
+            min_dist = int(0.75 * min(x_dim, y_dim))
+            env = RailEnv(width=x_dim,
+                          height=y_dim,
+                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                                max_dist=99999,
+                                                                seed=0),
+                          obs_builder_object=observation_helper,
+                          number_of_agents=n_agents)
+
+            # Adjust the parameters according to the new env.
+            max_steps = int(3 * (env.height + env.width))
+            agent_obs = [None] * env.get_num_agents()
+            agent_next_obs = [None] * env.get_num_agents()
+
+        # Reset environment
+        obs, info = env.reset(True, True)
+
+        # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
+        # different times during an episode
+        final_obs = agent_obs.copy()
+        final_obs_next = agent_next_obs.copy()
+
+        # Build agent specific observations
+        for a in range(env.get_num_agents()):
+            data, distance, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=num_features_per_node,
+                                                    current_depth=0)
+            data = norm_obs_clip(data, fixed_radius=observation_radius)
+            distance = norm_obs_clip(distance)
+            agent_data = np.clip(agent_data, -1, 1)
+            agent_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
+        score = 0
+        env_done = 0
+
+        # Run episode
+        for step in range(max_steps):
+
+            # Action
+            for a in range(env.get_num_agents()):
+                action = agent.act(agent_obs[a], eps=eps)
+                action_prob[action] += 1
+                action_dict.update({a: action})
+
+            # Environment step
+            next_obs, all_rewards, done, _ = env.step(action_dict)
+
+            # Build agent specific observations and normalize
+            for a in range(env.get_num_agents()):
+                data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
+                                                        num_features_per_node=num_features_per_node, current_depth=0)
+                data = norm_obs_clip(data, fixed_radius=observation_radius)
+                distance = norm_obs_clip(distance)
+                agent_data = np.clip(agent_data, -1, 1)
+                agent_next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
+            # Update replay buffer and train agent
+            for a in range(env.get_num_agents()):
+                if done[a]:
+                    final_obs[a] = agent_obs[a].copy()
+                    final_obs_next[a] = agent_next_obs[a].copy()
+                    final_action_dict.update({a: action_dict[a]})
+                if not done[a]:
+                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+                score += all_rewards[a] / env.get_num_agents()
+
+            # Copy observation
+            agent_obs = agent_next_obs.copy()
+
+            if done['__all__']:
+                env_done = 1
+                for a in range(env.get_num_agents()):
+                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
+                break
+
+        # Epsilon decay
+        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+
+        # Collection information about training
+        done_window.append(env_done)
+        scores_window.append(score / max_steps)  # save most recent score
+        scores.append(np.mean(scores_window))
+        dones_list.append((np.mean(done_window)))
+```
+
+Running the `multi_agent_training.py` file trains a simple agent to navigate to any random target within the railway network. After running you should see a learning curve similar to this one:
+
+![Learning_Curve](https://i.imgur.com/Po4j4yK.png)
+
+and the agent behavior should look like this:
+
+![Conflict_Avoidence](https://i.imgur.com/AvBHKaD.gif)
--- a/torch_training/Nets/avoid_checkpoint15000.pth
+++ b/torch_training/Nets/avoid_checkpoint15000.pth
No results found