diff --git a/RLLib_training/RailEnvRLLibWrapper.py b/RLLib_training/RailEnvRLLibWrapper.py
index d06506383544df5ef03b4128c37213d0a1a2ac69..989800044250d60b68c68b5d2e702b5625964024 100644
--- a/RLLib_training/RailEnvRLLibWrapper.py
+++ b/RLLib_training/RailEnvRLLibWrapper.py
@@ -1,9 +1,10 @@
 import numpy as np
-from flatland.envs.generators import complex_rail_generator, random_rail_generator
-from flatland.envs.rail_env import RailEnv
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.utils.seed import seed as set_seed
 
+from flatland.envs.generators import complex_rail_generator, random_rail_generator
+from flatland.envs.rail_env import RailEnv
+
 
 class RailEnvRLLibWrapper(MultiAgentEnv):
 
@@ -63,7 +64,7 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
 
         for i_agent in range(len(self.env.agents)):
             data, distance, agent_data = self.env.obs_builder.split_tree(tree=np.array(obs[i_agent]),
-                                                                         num_features_per_node=8, current_depth=0)
+                                                                         current_depth=0)
             o[i_agent] = [data, distance, agent_data]
 
         # needed for the renderer
@@ -72,8 +73,6 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         self.agents_static = self.env.agents_static
         self.dev_obs_dict = self.env.dev_obs_dict
 
-
-
         # If step_memory > 1, we need to concatenate it the observations in memory, only works for
         # step_memory = 1 or 2 for the moment
         if self.step_memory < 2:
@@ -96,7 +95,7 @@ class RailEnvRLLibWrapper(MultiAgentEnv):
         for i_agent in range(len(self.env.agents)):
             if i_agent not in self.agents_done:
                 data, distance, agent_data = self.env.obs_builder.split_tree(tree=np.array(obs[i_agent]),
-                                                                             num_features_per_node=8, current_depth=0)
+                                                                             current_depth=0)
 
                 o[i_agent] = [data, distance, agent_data]
                 r[i_agent] = rewards[i_agent]
diff --git a/requirements_torch_training.txt b/requirements_torch_training.txt
index 2bce630587233b4c771ef7a43bc3aaf7f78fbb07..c2b65e16a361157ef265ec9412025da516531334 100644
--- a/requirements_torch_training.txt
+++ b/requirements_torch_training.txt
@@ -1 +1,4 @@
-torch==1.1.0
\ No newline at end of file
+git+https://gitlab.aicrowd.com/flatland/flatland.git@42-run-baselines-in-ci
+importlib-metadata>=0.17
+importlib_resources>=1.0.2
+torch>=1.1.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 723e1a6f701150f5853a0199057bc234137c2aa2..2b9b731ea02a0c9bdbea7602ea1dfa2ad6e194e2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,7 @@
-import os
-
 from setuptools import setup, find_packages
 
-# TODO: setup does not support installation from url, move to requirements*.txt
-# TODO: @master as soon as mr is merged on flatland.
-os.system(
-    'pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
-
 install_reqs = []
+dependency_links = []
 # TODO: include requirements_RLLib_training.txt
 requirements_paths = ['requirements_torch_training.txt']  # , 'requirements_RLLib_training.txt']
 for requirements_path in requirements_paths:
@@ -15,8 +9,15 @@ for requirements_path in requirements_paths:
         install_reqs += [
             s for s in [
                 line.strip(' \n') for line in f
-            ] if not s.startswith('#') and s != ''
+            ] if not s.startswith('#') and s != '' and not s.startswith('git+')
         ]
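+    # git+ requirement lines are not valid install_requires entries, so they are
+    # collected separately below and passed to setup() as dependency_links.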
+    with open(requirements_path, 'r') as f:
+        dependency_links += [
+            s for s in [
+                line.strip(' \n') for line in f
+            ] if s.startswith('git+')
+        ]
+
 requirements = install_reqs
 setup_requirements = install_reqs
 test_requirements = install_reqs
@@ -47,6 +48,7 @@ setup(
     setup_requires=setup_requirements,
     test_suite='tests',
     tests_require=test_requirements,
+    dependency_links=dependency_links,
     url='https://gitlab.aicrowd.com/flatland/baselines',
     version='0.1.1',
     zip_safe=False,
diff --git a/torch_training/bla.py b/torch_training/bla.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ec308c2b6bc5498d9198e2a03e562b02e7c96d
--- /dev/null
+++ b/torch_training/bla.py
@@ -0,0 +1,225 @@
+import getopt
+import random
+import sys
+from collections import deque
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from importlib_resources import path
+
+import torch_training.Nets
+from flatland.envs.generators import complex_rail_generator
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnv
+from flatland.utils.rendertools import RenderTool
+from torch_training.dueling_double_dqn import Agent
+from utils.observation_utils import norm_obs_clip, split_tree
+
+print("multi_agent_trainging.py (1)")
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
+    except getopt.GetoptError:
+        print('bla.py -n <n_trials>')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ('-n', '--n_trials'):
+            n_trials = int(arg)
+    print("main1")
+    random.seed(1)
+    np.random.seed(1)
+
+    """
+    env = RailEnv(width=10,
+                  height=20, obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()))
+    env.load("./railway/complex_scene.pkl")
+    file_load = True
+    """
+
+    x_dim = np.random.randint(8, 20)
+    y_dim = np.random.randint(8, 20)
+    n_agents = np.random.randint(3, 8)
+    n_goals = n_agents + np.random.randint(0, 3)
+    min_dist = int(0.75 * min(x_dim, y_dim))
+    print("main2")
+
+    env = RailEnv(width=x_dim,
+                  height=y_dim,
+                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                        max_dist=99999,
+                                                        seed=0),
+                  obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()),
+                  number_of_agents=n_agents)
+    env.reset(True, True)
+    file_load = False
+    observation_helper = TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv())
+    env_renderer = RenderTool(env, gl="PILSVG", )
+    handle = env.get_agent_handles()
+    features_per_node = 9
+    state_size = features_per_node * 85 * 2
+    action_size = 5
+
+    print("main3")
+
+    # We set the number of episodes we would like to train on
+    if 'n_trials' not in locals():
+        n_trials = 30000
+    max_steps = int(3 * (env.height + env.width))
+    eps = 1.
+    eps_end = 0.005
+    eps_decay = 0.9995
+    action_dict = dict()
+    final_action_dict = dict()
+    scores_window = deque(maxlen=100)
+    done_window = deque(maxlen=100)
+    time_obs = deque(maxlen=2)
+    scores = []
+    dones_list = []
+    action_prob = [0] * action_size
+    agent_obs = [None] * env.get_num_agents()
+    agent_next_obs = [None] * env.get_num_agents()
+    agent = Agent(state_size, action_size, "FC", 0)
+    with path(torch_training.Nets, "avoid_checkpoint30000.pth") as file_in:
+        agent.qnetwork_local.load_state_dict(torch.load(file_in))
+
+    demo = False
+    record_images = False
+    frame_step = 0
+
+    print("Going to run training for {} trials...".format(n_trials))
+    for trials in range(1, n_trials + 1):
+
+        if trials % 50 == 0 and not demo:
+            x_dim = np.random.randint(8, 20)
+            y_dim = np.random.randint(8, 20)
+            n_agents = np.random.randint(3, 8)
+            n_goals = n_agents + np.random.randint(0, 3)
+            min_dist = int(0.75 * min(x_dim, y_dim))
+            env = RailEnv(width=x_dim,
+                          height=y_dim,
+                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                                max_dist=99999,
+                                                                seed=0),
+                          obs_builder_object=TreeObsForRailEnv(max_depth=3,
+                                                               predictor=ShortestPathPredictorForRailEnv()),
+                          number_of_agents=n_agents)
+            env.reset(True, True)
+            max_steps = int(3 * (env.height + env.width))
+            agent_obs = [None] * env.get_num_agents()
+            agent_next_obs = [None] * env.get_num_agents()
+        # # Reset environment
+        # if file_load:
+        #     obs = env.reset(False, False)
+        # else:
+        #     obs = env.reset(True, True)
+        # if demo:
+        #     env_renderer.set_new_rail()
+        # obs_original = obs.copy()
+        # final_obs = obs.copy()
+        # final_obs_next = obs.copy()
+        # for a in range(env.get_num_agents()):
+        #     data, distance, agent_data = split_tree(tree=np.array(obs[a]),
+        #                                             current_depth=0)
+        #     data = norm_obs_clip(data)
+        #     distance = norm_obs_clip(distance)
+        #     agent_data = np.clip(agent_data, -1, 1)
+        #     obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+        #     agent_data = env.agents[a]
+        #     speed = 1  # np.random.randint(1,5)
+        #     agent_data.speed_data['speed'] = 1. / speed
+        #
+        # for i in range(2):
+        #     time_obs.append(obs)
+        # # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
+        # for a in range(env.get_num_agents()):
+        #     agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+        #
+        # score = 0
+        # env_done = 0
+        # # Run episode
+        # for step in range(max_steps):
+        #     if demo:
+        #         env_renderer.renderEnv(show=True, show_observations=False)
+        #         # observation_helper.util_print_obs_subtree(obs_original[0])
+        #         if record_images:
+        #             env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
+        #             frame_step += 1
+        #     # print(step)
+        #     # Action
+        #     for a in range(env.get_num_agents()):
+        #         if demo:
+        #             eps = 0
+        #         # action = agent.act(np.array(obs[a]), eps=eps)
+        #         action = agent.act(agent_obs[a], eps=eps)
+        #         action_prob[action] += 1
+        #         action_dict.update({a: action})
+        #     # Environment step
+        #
+        #     next_obs, all_rewards, done, _ = env.step(action_dict)
+        #     # print(all_rewards,action)
+        #     obs_original = next_obs.copy()
+        #     for a in range(env.get_num_agents()):
+        #         data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
+        #                                                 current_depth=0)
+        #         data = norm_obs_clip(data)
+        #         distance = norm_obs_clip(distance)
+        #         agent_data = np.clip(agent_data, -1, 1)
+        #         next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+        #     time_obs.append(next_obs)
+        #
+        #     # Update replay buffer and train agent
+        #     for a in range(env.get_num_agents()):
+        #         agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+        #         if done[a]:
+        #             final_obs[a] = agent_obs[a].copy()
+        #             final_obs_next[a] = agent_next_obs[a].copy()
+        #             final_action_dict.update({a: action_dict[a]})
+        #         if not demo and not done[a]:
+        #             agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+        #         score += all_rewards[a] / env.get_num_agents()
+        #
+        #     agent_obs = agent_next_obs.copy()
+        #     if done['__all__']:
+        #         env_done = 1
+        #         for a in range(env.get_num_agents()):
+        #             agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
+        #         break
+        # # Epsilon decay
+        # eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+        #
+        # done_window.append(env_done)
+        # scores_window.append(score / max_steps)  # save most recent score
+        # scores.append(np.mean(scores_window))
+        # dones_list.append((np.mean(done_window)))
+
+        print(
+            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                env.get_num_agents(), x_dim, y_dim,
+                trials,
+                np.mean(scores_window),
+                100 * np.mean(done_window),
+                eps, action_prob / np.sum(action_prob)), end=" ")
+
+        if trials % 100 == 0:
+            print(
+                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                    env.get_num_agents(),
+                    trials,
+                    np.mean(scores_window),
+                    100 * np.mean(done_window),
+                    eps,
+                    action_prob / np.sum(action_prob)))
+            torch.save(agent.qnetwork_local.state_dict(),
+                       './Nets/avoid_checkpoint' + str(trials) + '.pth')
+            action_prob = [1] * action_size
+
+print("multi_agent_trainging.py (2)")
+
+if __name__ == '__main__':
+    print("main")
+    main(sys.argv[1:])
+
+print("multi_agent_trainging.py (3)")
\ No newline at end of file
diff --git a/torch_training/multi_agent_training.py b/torch_training/multi_agent_training.py
index 79355763c74b8270430770b3f5613f1068f20284..4f823be331850668e18bfdf66d35a915e3f6ccdc 100644
--- a/torch_training/multi_agent_training.py
+++ b/torch_training/multi_agent_training.py
@@ -1,195 +1,230 @@
+import getopt
 import random
+import sys
 from collections import deque
 
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
-from dueling_double_dqn import Agent
+from importlib_resources import path
+
+import torch_training.Nets
 from flatland.envs.generators import complex_rail_generator
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.utils.rendertools import RenderTool
-
+from torch_training.dueling_double_dqn import Agent
 from utils.observation_utils import norm_obs_clip, split_tree
 
-random.seed(1)
-np.random.seed(1)
-
-"""
-env = RailEnv(width=10,
-              height=20, obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()))
-env.load("./railway/complex_scene.pkl")
-file_load = True
-"""
-
-x_dim = np.random.randint(8, 20)
-y_dim = np.random.randint(8, 20)
-n_agents = np.random.randint(3, 8)
-n_goals = n_agents + np.random.randint(0, 3)
-min_dist = int(0.75 * min(x_dim, y_dim))
-env = RailEnv(width=x_dim,
-              height=y_dim,
-              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                    max_dist=99999,
-                                                    seed=0),
-              obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()),
-              number_of_agents=n_agents)
-env.reset(True, True)
-file_load = False
-"""
-
-"""
-observation_helper = TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv())
-env_renderer = RenderTool(env, gl="PILSVG", )
-handle = env.get_agent_handles()
-features_per_node = 9
-state_size = features_per_node * 85 * 2
-action_size = 5
-n_trials = 30000
-max_steps = int(3 * (env.height + env.width))
-eps = 1.
-eps_end = 0.005
-eps_decay = 0.9995
-action_dict = dict()
-final_action_dict = dict()
-scores_window = deque(maxlen=100)
-done_window = deque(maxlen=100)
-time_obs = deque(maxlen=2)
-scores = []
-dones_list = []
-action_prob = [0] * action_size
-agent_obs = [None] * env.get_num_agents()
-agent_next_obs = [None] * env.get_num_agents()
-agent = Agent(state_size, action_size, "FC", 0)
-agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint30000.pth'))
-
-demo = False
-record_images = False
-frame_step = 0
-for trials in range(1, n_trials + 1):
-
-    if trials % 50 == 0 and not demo:
-        x_dim = np.random.randint(8, 20)
-        y_dim = np.random.randint(8, 20)
-        n_agents = np.random.randint(3, 8)
-        n_goals = n_agents + np.random.randint(0, 3)
-        min_dist = int(0.75 * min(x_dim, y_dim))
-        env = RailEnv(width=x_dim,
-                      height=y_dim,
-                      rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                            max_dist=99999,
-                                                            seed=0),
-                      obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()),
-                      number_of_agents=n_agents)
-        env.reset(True, True)
-        max_steps = int(3 * (env.height + env.width))
-        agent_obs = [None] * env.get_num_agents()
-        agent_next_obs = [None] * env.get_num_agents()
-    # Reset environment
-    if file_load:
-        obs = env.reset(False, False)
-    else:
-        obs = env.reset(True, True)
-    if demo:
-        env_renderer.set_new_rail()
-    obs_original = obs.copy()
-    final_obs = obs.copy()
-    final_obs_next = obs.copy()
-    for a in range(env.get_num_agents()):
-        data, distance, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=features_per_node,
-                                                current_depth=0)
-        data = norm_obs_clip(data)
-        distance = norm_obs_clip(distance)
-        agent_data = np.clip(agent_data, -1, 1)
-        obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-        agent_data = env.agents[a]
-        speed = 1  # np.random.randint(1,5)
-        agent_data.speed_data['speed'] = 1. / speed
-
-    for i in range(2):
-        time_obs.append(obs)
-    # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
-    for a in range(env.get_num_agents()):
-        agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
-
-    score = 0
-    env_done = 0
-    # Run episode
-    for step in range(max_steps):
+print("multi_agent_trainging.py (1)")
+
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
+    except getopt.GetoptError:
+        print('multi_agent_training.py -n <n_trials>')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ('-n', '--n_trials'):
+            n_trials = int(arg)
+    print("main1")
+    random.seed(1)
+    np.random.seed(1)
+
+    """
+    env = RailEnv(width=10,
+                  height=20, obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()))
+    env.load("./railway/complex_scene.pkl")
+    file_load = True
+    """
+
+    x_dim = np.random.randint(8, 20)
+    y_dim = np.random.randint(8, 20)
+    n_agents = np.random.randint(3, 8)
+    n_goals = n_agents + np.random.randint(0, 3)
+    min_dist = int(0.75 * min(x_dim, y_dim))
+    print("main2")
+
+    env = RailEnv(width=x_dim,
+                  height=y_dim,
+                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                        max_dist=99999,
+                                                        seed=0),
+                  obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()),
+                  number_of_agents=n_agents)
+    env.reset(True, True)
+    file_load = False
+
+    observation_helper = TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv())
+    env_renderer = RenderTool(env, gl="PILSVG", )
+    handle = env.get_agent_handles()
+    features_per_node = 9
+    state_size = features_per_node * 85 * 2
+    action_size = 5
+
+    print("main3")
+
+    # We set the number of episodes we would like to train on
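+    # If no -n/--n_trials argument was given above, fall back to a default value.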
+    if 'n_trials' not in locals():
+        n_trials = 30000
+    max_steps = int(3 * (env.height + env.width))
+    eps = 1.
+    eps_end = 0.005
+    eps_decay = 0.9995
+    action_dict = dict()
+    final_action_dict = dict()
+    scores_window = deque(maxlen=100)
+    done_window = deque(maxlen=100)
+    time_obs = deque(maxlen=2)
+    scores = []
+    dones_list = []
+    action_prob = [0] * action_size
+    agent_obs = [None] * env.get_num_agents()
+    agent_next_obs = [None] * env.get_num_agents()
+    agent = Agent(state_size, action_size, "FC", 0)
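+    # Load the pretrained checkpoint bundled with the torch_training.Nets package
+    # via importlib_resources, so it is found independently of the working directory.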
+    with path(torch_training.Nets, "avoid_checkpoint30000.pth") as file_in:
+        agent.qnetwork_local.load_state_dict(torch.load(file_in))
+
+    demo = False
+    record_images = False
+    frame_step = 0
+
+    print("Going to run training for {} trials...".format(n_trials))
+    for trials in range(1, n_trials + 1):
+
+        if trials % 50 == 0 and not demo:
+            x_dim = np.random.randint(8, 20)
+            y_dim = np.random.randint(8, 20)
+            n_agents = np.random.randint(3, 8)
+            n_goals = n_agents + np.random.randint(0, 3)
+            min_dist = int(0.75 * min(x_dim, y_dim))
+            env = RailEnv(width=x_dim,
+                          height=y_dim,
+                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                                max_dist=99999,
+                                                                seed=0),
+                          obs_builder_object=TreeObsForRailEnv(max_depth=3,
+                                                               predictor=ShortestPathPredictorForRailEnv()),
+                          number_of_agents=n_agents)
+            env.reset(True, True)
+            max_steps = int(3 * (env.height + env.width))
+            agent_obs = [None] * env.get_num_agents()
+            agent_next_obs = [None] * env.get_num_agents()
+        # Reset environment
+        if file_load:
+            obs = env.reset(False, False)
+        else:
+            obs = env.reset(True, True)
         if demo:
-            env_renderer.renderEnv(show=True, show_observations=False)
-            # observation_helper.util_print_obs_subtree(obs_original[0])
-            if record_images:
-                env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
-                frame_step += 1
-        # print(step)
-        # Action
+            env_renderer.set_new_rail()
+        obs_original = obs.copy()
+        final_obs = obs.copy()
+        final_obs_next = obs.copy()
         for a in range(env.get_num_agents()):
-            if demo:
-                eps = 0
-            # action = agent.act(np.array(obs[a]), eps=eps)
-            action = agent.act(agent_obs[a], eps=eps)
-            action_prob[action] += 1
-            action_dict.update({a: action})
-        # Environment step
-
-        next_obs, all_rewards, done, _ = env.step(action_dict)
-        # print(all_rewards,action)
-        obs_original = next_obs.copy()
-        for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(next_obs[a]), num_features_per_node=features_per_node,
+            data, distance, agent_data = split_tree(tree=np.array(obs[a]),
                                                     current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
             agent_data = np.clip(agent_data, -1, 1)
-            next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-        time_obs.append(next_obs)
-
-        # Update replay buffer and train agent
+            obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+            agent_data = env.agents[a]
+            speed = 1  # np.random.randint(1,5)
+            agent_data.speed_data['speed'] = 1. / speed
+
+        for i in range(2):
+            time_obs.append(obs)
+        # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
         for a in range(env.get_num_agents()):
-            agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
-            if done[a]:
-                final_obs[a] = agent_obs[a].copy()
-                final_obs_next[a] = agent_next_obs[a].copy()
-                final_action_dict.update({a: action_dict[a]})
-            if not demo and not done[a]:
-                agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-            score += all_rewards[a] / env.get_num_agents()
-
-        agent_obs = agent_next_obs.copy()
-        if done['__all__']:
-            env_done = 1
+            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+
+        score = 0
+        env_done = 0
+        # Run episode
+        for step in range(max_steps):
+            if demo:
+                env_renderer.renderEnv(show=True, show_observations=False)
+                # observation_helper.util_print_obs_subtree(obs_original[0])
+                if record_images:
+                    env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
+                    frame_step += 1
+            # print(step)
+            # Action
+            for a in range(env.get_num_agents()):
+                if demo:
+                    eps = 0
+                # action = agent.act(np.array(obs[a]), eps=eps)
+                action = agent.act(agent_obs[a], eps=eps)
+                action_prob[action] += 1
+                action_dict.update({a: action})
+            # Environment step
+
+            next_obs, all_rewards, done, _ = env.step(action_dict)
+            # print(all_rewards,action)
+            obs_original = next_obs.copy()
             for a in range(env.get_num_agents()):
-                agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
-            break
-    # Epsilon decay
-    eps = max(eps_end, eps_decay * eps)  # decrease epsilon
-
-    done_window.append(env_done)
-    scores_window.append(score / max_steps)  # save most recent score
-    scores.append(np.mean(scores_window))
-    dones_list.append((np.mean(done_window)))
-
-    print(
-        '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-            env.get_num_agents(), x_dim, y_dim,
-            trials,
-            np.mean(scores_window),
-            100 * np.mean(done_window),
-            eps, action_prob / np.sum(action_prob)), end=" ")
-
-    if trials % 100 == 0:
+                data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
+                                                        current_depth=0)
+                data = norm_obs_clip(data)
+                distance = norm_obs_clip(distance)
+                agent_data = np.clip(agent_data, -1, 1)
+                next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+            time_obs.append(next_obs)
+
+            # Update replay buffer and train agent
+            for a in range(env.get_num_agents()):
+                agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+                if done[a]:
+                    final_obs[a] = agent_obs[a].copy()
+                    final_obs_next[a] = agent_next_obs[a].copy()
+                    final_action_dict.update({a: action_dict[a]})
+                if not demo and not done[a]:
+                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+                score += all_rewards[a] / env.get_num_agents()
+
+            agent_obs = agent_next_obs.copy()
+            if done['__all__']:
+                env_done = 1
+                for a in range(env.get_num_agents()):
+                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
+                break
+        # Epsilon decay
+        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+
+        done_window.append(env_done)
+        scores_window.append(score / max_steps)  # save most recent score
+        scores.append(np.mean(scores_window))
+        dones_list.append((np.mean(done_window)))
+
         print(
-            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-                env.get_num_agents(),
+            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                env.get_num_agents(), x_dim, y_dim,
                 trials,
                 np.mean(scores_window),
                 100 * np.mean(done_window),
-                eps,
-                action_prob / np.sum(action_prob)))
-        torch.save(agent.qnetwork_local.state_dict(),
-                   './Nets/avoid_checkpoint' + str(trials) + '.pth')
-        action_prob = [1] * action_size
-plt.plot(scores)
-plt.show()
+                eps, action_prob / np.sum(action_prob)), end=" ")
+
+        if trials % 100 == 0:
+            print(
+                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                    env.get_num_agents(),
+                    trials,
+                    np.mean(scores_window),
+                    100 * np.mean(done_window),
+                    eps,
+                    action_prob / np.sum(action_prob)))
+            torch.save(agent.qnetwork_local.state_dict(),
+                       './Nets/avoid_checkpoint' + str(trials) + '.pth')
+            action_prob = [1] * action_size
+    plt.plot(scores)
+    plt.show()
+
+
+print("multi_agent_trainging.py (2)")
+
+if __name__ == '__main__':
+    print("main")
+    main(sys.argv[1:])
+
+print("multi_agent_trainging.py (3)")
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index dd4d4799b9e72a825be040f722d078d04afacbdf..f575510f6e0e32eeaee2d29b7f5da0ced852fb81 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -1,8 +1,10 @@
+import getopt
 import random
+import sys
 from collections import deque
+
 import matplotlib.pyplot as plt
 import numpy as np
-
 import torch
 from dueling_double_dqn import Agent
 
@@ -12,83 +14,187 @@ from flatland.envs.rail_env import RailEnv
 from flatland.utils.rendertools import RenderTool
 from utils.observation_utils import norm_obs_clip, split_tree
 
-random.seed(1)
-np.random.seed(1)
-
-# Parameters for the Environment
-x_dim = 10
-y_dim = 10
-n_agents = 1
-n_goals = 5
-min_dist = 5
-
-# We are training an Agent using the Tree Observation with depth 2
-observation_builder = TreeObsForRailEnv(max_depth=2)
-
-# Load the Environment
-env = RailEnv(width=x_dim,
-              height=y_dim,
-              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                    max_dist=99999,
-                                                    seed=0),
-              obs_builder_object=observation_builder,
-              number_of_agents=n_agents)
-env.reset(True, True)
-
-# After training we want to render the results so we also load a renderer
-env_renderer = RenderTool(env, gl="PILSVG", )
-
-# Given the depth of the tree observation and the number of features per node we get the following state_size
-features_per_node = 9
-tree_depth = 2
-nr_nodes = 0
-for i in range(tree_depth + 1):
-    nr_nodes += np.power(4, i)
-state_size = features_per_node * nr_nodes
-
-# The action space of flatland is 5 discrete actions
-action_size = 5
-
-# We set the number of episodes we would like to train on
-n_trials = 6000
-
-# And the max number of steps we want to take per episode
-max_steps = int(3 * (env.height + env.width))
-
-# Define training parameters
-eps = 1.
-eps_end = 0.005
-eps_decay = 0.998
-
-# And some variables to keep track of the progress
-action_dict = dict()
-final_action_dict = dict()
-scores_window = deque(maxlen=100)
-done_window = deque(maxlen=100)
-time_obs = deque(maxlen=2)
-scores = []
-dones_list = []
-action_prob = [0] * action_size
-agent_obs = [None] * env.get_num_agents()
-agent_next_obs = [None] * env.get_num_agents()
-
-# Now we load a Double dueling DQN agent
-agent = Agent(state_size, action_size, "FC", 0)
-
-Training = True
-
-for trials in range(1, n_trials + 1):
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
+    except getopt.GetoptError:
+        print('training_navigation.py -n <n_trials>')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt in ('-n', '--n_trials'):
+            n_trials = int(arg)
+
+    random.seed(1)
+    np.random.seed(1)
+
+    # Parameters for the Environment
+    x_dim = 10
+    y_dim = 10
+    n_agents = 1
+    n_goals = 5
+    min_dist = 5
+
+    # We are training an Agent using the Tree Observation with depth 2
+    observation_builder = TreeObsForRailEnv(max_depth=2)
+
+    # Load the Environment
+    env = RailEnv(width=x_dim,
+                  height=y_dim,
+                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+                                                        max_dist=99999,
+                                                        seed=0),
+                  obs_builder_object=observation_builder,
+                  number_of_agents=n_agents)
+    env.reset(True, True)
+
+    # After training we want to render the results so we also load a renderer
+    env_renderer = RenderTool(env, gl="PILSVG", )
+
+    # Given the depth of the tree observation and the number of features per node we get the following state_size
+    features_per_node = 9
+    tree_depth = 2
+    nr_nodes = 0
+    for i in range(tree_depth + 1):
+        nr_nodes += np.power(4, i)
+    state_size = features_per_node * nr_nodes
+
+    # The action space of flatland is 5 discrete actions
+    action_size = 5
+
+    # We set the number of episodes we would like to train on
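+    # If no -n/--n_trials argument was given above, fall back to a default value.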
+    if 'n_trials' not in locals():
+        n_trials = 6000
+
+    # And the max number of steps we want to take per episode
+    max_steps = int(3 * (env.height + env.width))
+
+    # Define training parameters
+    eps = 1.
+    eps_end = 0.005
+    eps_decay = 0.998
+
+    # And some variables to keep track of the progress
+    action_dict = dict()
+    final_action_dict = dict()
+    scores_window = deque(maxlen=100)
+    done_window = deque(maxlen=100)
+    time_obs = deque(maxlen=2)
+    scores = []
+    dones_list = []
+    action_prob = [0] * action_size
+    agent_obs = [None] * env.get_num_agents()
+    agent_next_obs = [None] * env.get_num_agents()
+
+    # Now we load a Double dueling DQN agent
+    agent = Agent(state_size, action_size, "FC", 0)
+
+    Training = True
+
+    for trials in range(1, n_trials + 1):
+
+        # Reset environment
+        obs = env.reset(True, True)
+        if not Training:
+            env_renderer.set_new_rail()
+
+        # Split the observation tree into its parts and normalize the observation using the utility functions.
+        # Build agent specific local observation
+        for a in range(env.get_num_agents()):
+            rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
+                                                              current_depth=0)
+            rail_data = norm_obs_clip(rail_data)
+            distance_data = norm_obs_clip(distance_data)
+            agent_data = np.clip(agent_data, -1, 1)
+            agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+
+        # Reset score and done
+        score = 0
+        env_done = 0
+
+        # Run episode
+        for step in range(max_steps):
+
+            # Only render when not training
+            if not Training:
+                env_renderer.renderEnv(show=True, show_observations=True)
+
+            # Choose the actions
+            for a in range(env.get_num_agents()):
+                if not Training:
+                    eps = 0
+
+                action = agent.act(agent_obs[a], eps=eps)
+                action_dict.update({a: action})
+
+                # Count the number of actions taken for statistics
+                action_prob[action] += 1
+
+            # Environment step
+            next_obs, all_rewards, done, _ = env.step(action_dict)
+
+            for a in range(env.get_num_agents()):
+                rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
+                                                                  current_depth=0)
+                rail_data = norm_obs_clip(rail_data)
+                distance_data = norm_obs_clip(distance_data)
+                agent_data = np.clip(agent_data, -1, 1)
+                agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
+
+            # Update replay buffer and train agent
+            for a in range(env.get_num_agents()):
+
+                # Remember and train agent
+                if Training:
+                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+
+                # Update the current score
+                score += all_rewards[a] / env.get_num_agents()
+
+            agent_obs = agent_next_obs.copy()
+            if done['__all__']:
+                env_done = 1
+                break
+
+        # Epsilon decay
+        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
+
+        # Store the information about training progress
+        done_window.append(env_done)
+        scores_window.append(score / max_steps)  # save most recent score
+        scores.append(np.mean(scores_window))
+        dones_list.append((np.mean(done_window)))
+
+        print(
+            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                env.get_num_agents(), x_dim, y_dim,
+                trials,
+                np.mean(scores_window),
+                100 * np.mean(done_window),
+                eps, action_prob / np.sum(action_prob)), end=" ")
+
+        if trials % 100 == 0:
+            print(
+                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                    env.get_num_agents(), x_dim, y_dim,
+                    trials,
+                    np.mean(scores_window),
+                    100 * np.mean(done_window),
+                    eps, action_prob / np.sum(action_prob)))
+            torch.save(agent.qnetwork_local.state_dict(),
+                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
+            action_prob = [1] * action_size
+
+    # Render the trained agent
 
     # Reset environment
     obs = env.reset(True, True)
-    if not Training:
-        env_renderer.set_new_rail()
+    env_renderer.set_new_rail()
 
     # Split the observation tree into its parts and normalize the observation using the utility functions.
     # Build agent specific local observation
     for a in range(env.get_num_agents()):
         rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
-                                                          num_features_per_node=features_per_node,
                                                           current_depth=0)
         rail_data = norm_obs_clip(rail_data)
         distance_data = norm_obs_clip(distance_data)
@@ -101,123 +207,32 @@ for trials in range(1, n_trials + 1):
 
     # Run episode
     for step in range(max_steps):
-
-        # Only render when not triaing
-        if not Training:
-            env_renderer.renderEnv(show=True, show_observations=True)
+        env_renderer.renderEnv(show=True, show_observations=False)
 
         # Chose the actions
         for a in range(env.get_num_agents()):
-            if not Training:
-                eps = 0
-
+            eps = 0
             action = agent.act(agent_obs[a], eps=eps)
             action_dict.update({a: action})
 
-            # Count number of actions takes for statistics
-            action_prob[action] += 1
-
         # Environment step
         next_obs, all_rewards, done, _ = env.step(action_dict)
 
         for a in range(env.get_num_agents()):
             rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                              num_features_per_node=features_per_node,
                                                               current_depth=0)
             rail_data = norm_obs_clip(rail_data)
             distance_data = norm_obs_clip(distance_data)
             agent_data = np.clip(agent_data, -1, 1)
             agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
 
-        # Update replay buffer and train agent
-        for a in range(env.get_num_agents()):
-
-            # Remember and train agent
-            if Training:
-                agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-
-            # Update the current score
-            score += all_rewards[a] / env.get_num_agents()
-
         agent_obs = agent_next_obs.copy()
         if done['__all__']:
-            env_done = 1
             break
+    # Plot overall training progress at the end
+    plt.plot(scores)
+    plt.show()
 
-    # Epsilon decay
-    eps = max(eps_end, eps_decay * eps)  # decrease epsilon
-
-    # Store the information about training progress
-    done_window.append(env_done)
-    scores_window.append(score / max_steps)  # save most recent score
-    scores.append(np.mean(scores_window))
-    dones_list.append((np.mean(done_window)))
-
-    print(
-        '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-            env.get_num_agents(), x_dim, y_dim,
-            trials,
-            np.mean(scores_window),
-            100 * np.mean(done_window),
-            eps, action_prob / np.sum(action_prob)), end=" ")
 
-    if trials % 100 == 0:
-        print(
-            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-                env.get_num_agents(), x_dim, y_dim,
-                trials,
-                np.mean(scores_window),
-                100 * np.mean(done_window),
-                eps, action_prob / np.sum(action_prob)))
-        torch.save(agent.qnetwork_local.state_dict(),
-                   './Nets/navigator_checkpoint' + str(trials) + '.pth')
-        action_prob = [1] * action_size
-
-# Render the trained agent
-
-# Reset environment
-obs = env.reset(True, True)
-env_renderer.set_new_rail()
-
-# Split the observation tree into its parts and normalize the observation using the utility functions.
-# Build agent specific local observation
-for a in range(env.get_num_agents()):
-    rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=features_per_node,
-                                                      current_depth=0)
-    rail_data = norm_obs_clip(rail_data)
-    distance_data = norm_obs_clip(distance_data)
-    agent_data = np.clip(agent_data, -1, 1)
-    agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-# Reset score and done
-score = 0
-env_done = 0
-
-# Run episode
-for step in range(max_steps):
-    env_renderer.renderEnv(show=True, show_observations=False)
-
-    # Chose the actions
-    for a in range(env.get_num_agents()):
-        eps = 0
-        action = agent.act(agent_obs[a], eps=eps)
-        action_dict.update({a: action})
-
-    # Environment step
-    next_obs, all_rewards, done, _ = env.step(action_dict)
-
-    for a in range(env.get_num_agents()):
-        rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                          num_features_per_node=features_per_node,
-                                                          current_depth=0)
-        rail_data = norm_obs_clip(rail_data)
-        distance_data = norm_obs_clip(distance_data)
-        agent_data = np.clip(agent_data, -1, 1)
-        agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-    agent_obs = agent_next_obs.copy()
-    if done['__all__']:
-        break
-# Plot overall training progress at the end
-plt.plot(scores)
-plt.show()
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/tox.ini b/tox.ini
index 3c22b56780ffa59d41f64cad3f9698c3f62a204d..36b7c10a081b629375b202ad1e642e7366db2abe 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,13 +15,15 @@ setenv =
     PYTHONPATH = {toxinidir}
 passenv =
     DISPLAY
+    XAUTHORITY
 ; HTTP_PROXY+HTTPS_PROXY required behind corporate proxies
     HTTP_PROXY
     HTTPS_PROXY
 deps =
     -r{toxinidir}/requirements_torch_training.txt
 commands =
-    python torch_training/training_navigation.py
+    python -m pip install -r requirements_torch_training.txt
+    python torch_training/bla.py --n_trials=10
 
 [flake8]
 max-line-length = 120
@@ -29,7 +31,12 @@ ignore = E121 E126 E123 E128 E133 E226 E241 E242 E704 W291 W293 W391 W503 W504 W
 
 [testenv:flake8]
 basepython = python
-passenv = DISPLAY
+passenv =
+    DISPLAY
+    XAUTHORITY
+; HTTP_PROXY+HTTPS_PROXY required behind corporate proxies
+    HTTP_PROXY
+    HTTPS_PROXY
 deps =
     -r{toxinidir}/requirements_torch_training.txt
 commands =
diff --git a/utils/misc_utils.py b/utils/misc_utils.py
index 03c9fdde9368bf324f7e10841b2d30b993858fd6..5b29c6b15f61b46062bac8d4fb6c4130fe61c6ec 100644
--- a/utils/misc_utils.py
+++ b/utils/misc_utils.py
@@ -101,7 +101,7 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
         lp_reset(True, True)
         obs = env.reset(True, True)
         for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=9,
+            data, distance, agent_data = split_tree(tree=np.array(obs[a]),
                                                     current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
@@ -127,7 +127,6 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
 
             for a in range(env.get_num_agents()):
                 data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                        num_features_per_node=features_per_node,
                                                         current_depth=0)
                 data = norm_obs_clip(data)
                 distance = norm_obs_clip(distance)
diff --git a/utils/observation_utils.py b/utils/observation_utils.py
index fda5b530fb4f473915b0f043d901d3e8cb4fe727..fff6701693e00c5e660e169c8714c27d217cf6c7 100644
--- a/utils/observation_utils.py
+++ b/utils/observation_utils.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+from flatland.envs.observations import TreeObsForRailEnv
+
 
 def max_lt(seq, val):
     """
@@ -52,7 +54,7 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1, fixed_radius=0):
     return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)
 
 
-def split_tree(tree, num_features_per_node=9, current_depth=0):
+def split_tree(tree, current_depth=0):
     """
     Splits the tree observation into different sub groups that need the same normalization.
     This is necessary because the tree observation includes two different distance:
@@ -68,6 +70,7 @@ def split_tree(tree, num_features_per_node=9, current_depth=0):
     :param current_depth: Keeping track of the current depth in the tree
     :return: Returns the three different groups of distance and binary values.
     """
+    num_features_per_node = TreeObsForRailEnv.observation_dim
 
     if len(tree) < num_features_per_node:
         return [], [], []
@@ -92,7 +95,6 @@ def split_tree(tree, num_features_per_node=9, current_depth=0):
         child_tree = tree[(num_features_per_node + children * child_size):
                           (num_features_per_node + (children + 1) * child_size)]
         tmp_tree_data, tmp_distance_data, tmp_agent_data = split_tree(child_tree,
-                                                                      num_features_per_node,
                                                                       current_depth=current_depth + 1)
         if len(tmp_tree_data) > 0:
             tree_data.extend(tmp_tree_data)