diff --git a/RLLib_training/RailEnvRLLibWrapper.py b/RLLib_training/RailEnvRLLibWrapper.py
index 4cba2f36fe5f92edfd609c6434dec0d23aa9aa34..6d34e95e6b1e0fdada612fffcbd00c6e18a5ba8e 100644
--- a/RLLib_training/RailEnvRLLibWrapper.py
+++ b/RLLib_training/RailEnvRLLibWrapper.py
@@ -1,9 +1,8 @@
 import numpy as np
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.rllib.utils.seed import seed as set_seed
-
 from flatland.envs.generators import complex_rail_generator, random_rail_generator
 from flatland.envs.rail_env import RailEnv
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
+from ray.rllib.utils.seed import seed as set_seed
 
 
 class RailEnvRLLibWrapper(MultiAgentEnv):
diff --git a/RLLib_training/custom_preprocessors.py b/RLLib_training/custom_preprocessors.py
index 86c159d3bbc11bc4c0fdd321d2c6ff6838488f4d..bd45dfd2bd8620c2866ac3d079a9e82dd4c20c7e 100644
--- a/RLLib_training/custom_preprocessors.py
+++ b/RLLib_training/custom_preprocessors.py
@@ -49,13 +49,14 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
 
 class CustomPreprocessor(Preprocessor):
     def _init_shape(self, obs_space, options):
-        #return (sum([space.shape[0] for space in obs_space]), )
-        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0]*obs_space[2].shape[1]),)
+        # return (sum([space.shape[0] for space in obs_space]), )
+        return ((sum([space.shape[0] for space in obs_space[:2]]) + obs_space[2].shape[0] * obs_space[2].shape[1]),)
 
     def transform(self, observation):
         # if len(observation) == 111:
-        #return np.concatenate([norm_obs_clip(obs) for obs in observation])
-        return np.concatenate([norm_obs_clip(observation[0]), observation[1], observation[2].flatten()])#, norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
+        # return np.concatenate([norm_obs_clip(obs) for obs in observation])
+        return np.concatenate(
+            [norm_obs_clip(observation[0]), observation[1], observation[2].flatten()])  # , norm_obs_clip(observation[1]), observation[2], observation[3].flatten()])
         #one_hot = observation[-3:]
         #return np.append(obs, one_hot)
         # else:
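For readers of this hunk: the reshaped `_init_shape`/`transform` pair flattens a three-part tuple observation into one vector, where the first two parts are 1-D and the third is 2-D and gets flattened. Below is a minimal numpy-only sketch of that shape bookkeeping; the component sizes are made up and `norm_obs_clip` is skipped, purely for illustration.

```python
import numpy as np

# Hypothetical component sizes, not the real observation space.
tree_part = np.random.uniform(-5, 5, size=(168,))  # observation[0]: tree features
extra_part = np.zeros(10)                          # observation[1]: assumed 1-D component
grid_part = np.zeros((4, 7))                       # observation[2]: assumed 2-D component

# What transform() does: concatenate the pieces, flattening the 2-D one.
flat = np.concatenate([tree_part, extra_part, grid_part.flatten()])

# What _init_shape() computes: sum of the 1-D lengths plus rows*cols of the 2-D part.
expected_len = tree_part.shape[0] + extra_part.shape[0] + grid_part.shape[0] * grid_part.shape[1]
assert flat.shape == (expected_len,)
```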
diff --git a/RLLib_training/train.py b/RLLib_training/train.py
index 154620515157679fd8eb7947f54796ddd316710e..ba5f4eab43f5173dd410bc6d9b306d90e0e21ffc 100644
--- a/RLLib_training/train.py
+++ b/RLLib_training/train.py
@@ -5,13 +5,13 @@ import numpy as np
 import ray
 import ray.rllib.agents.ppo.ppo as ppo
 from RailEnvRLLibWrapper import RailEnvRLLibWrapper
+from flatland.envs.generators import complex_rail_generator
 from ray.rllib.agents.ppo.ppo import PPOTrainer
 from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
 from ray.rllib.models import ModelCatalog
 from ray.tune.logger import pretty_print
 
 from RLLib_training.custom_preprocessors import CustomPreprocessor
-from flatland.envs.generators import complex_rail_generator
 
 ModelCatalog.register_custom_preprocessor("my_prep", CustomPreprocessor)
 ray.init()
diff --git a/RLLib_training/train_experiment.py b/RLLib_training/train_experiment.py
index 28530717304734154a9fbe2111b5982b30e50744..006bab121a3e9aab7d9f014a466ad77c1aced82f 100644
--- a/RLLib_training/train_experiment.py
+++ b/RLLib_training/train_experiment.py
@@ -2,6 +2,7 @@ import os
 
 import gin
 import gym
+from flatland.envs.predictions import DummyPredictorForRailEnv
 from importlib_resources import path
 # Import PPO trainer: we can replace these imports with those of any other trainer from RLLib.
 from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
@@ -9,8 +10,6 @@ from ray.rllib.agents.ppo.ppo import PPOTrainer as Trainer
 from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph as PolicyGraph
 from ray.rllib.models import ModelCatalog
 
-from flatland.envs.predictions import DummyPredictorForRailEnv
-
 gin.external_configurable(DummyPredictorForRailEnv)
 
 import ray
diff --git a/setup.py b/setup.py
index 20ddab1f27a7b6b3b1e4df10bf90c47c86ecb82c..723e1a6f701150f5853a0199057bc234137c2aa2 100644
--- a/setup.py
+++ b/setup.py
@@ -4,11 +4,12 @@ from setuptools import setup, find_packages
 
 # TODO: setup does not support installation from url, move to requirements*.txt
 # TODO: @master as soon as mr is merged on flatland.
-os.system('pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
+os.system(
+    'pip install git+https://gitlab.aicrowd.com/flatland/flatland.git@57-access-resources-through-importlib_resources')
 
 install_reqs = []
 # TODO: include requirements_RLLib_training.txt
-requirements_paths = ['requirements_torch_training.txt'] #, 'requirements_RLLib_training.txt']
+requirements_paths = ['requirements_torch_training.txt']  # , 'requirements_RLLib_training.txt']
 for requirements_path in requirements_paths:
     with open(requirements_path, 'r') as f:
         install_reqs += [
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 1c747c27e255ea11493ad7d7cea65ed986f3bbc1..96593864a31cf45864b3cf8f52c29a0d5a241ef0 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -1,22 +1,18 @@
-import os
 import random
 from collections import deque
 
 import numpy as np
 import torch
-
+from dueling_double_dqn import Agent
 from flatland.envs.generators import complex_rail_generator
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import DummyPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.utils.rendertools import RenderTool
-from torch_training.dueling_double_dqn import Agent
 
 random.seed(1)
 np.random.seed(1)
 
-__file_dirname__ = os.path.dirname(os.path.realpath(__file__))
-
 # Example generate a rail given a manual specification,
 # a map of tuples (cell_type, rotation)
 transition_probability = [15,  # empty cell - Case 0
@@ -47,22 +43,24 @@ env = RailEnv(width=15,
 
 env = RailEnv(width=10,
               height=20)
-env.load_resource('torch_training.railway', "complex_scene.pkl")
+env.load("./railway/complex_scene.pkl")
 """
 
-env = RailEnv(width=20,
-              height=20,
-              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=0),
+env = RailEnv(width=8,
+              height=8,
+              rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=1, min_dist=4, max_dist=99999, seed=0),
               obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=DummyPredictorForRailEnv()),
-              number_of_agents=10)
+              number_of_agents=3)
+
 env.reset(True, True)
 
 env_renderer = RenderTool(env, gl="PILSVG")
 handle = env.get_agent_handles()
 
-state_size = 147 * 2
+state_size = 168 * 2
 action_size = 5
 n_trials = 15000
+max_steps = int(1.5 * (env.height + env.width))
 eps = 1.
 eps_end = 0.005
 eps_decay = 0.9995
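A quick back-of-the-envelope check of the constants in this hunk, assuming the tree observation branches four ways per node, uses the `num_features_per_node=8` seen in the later hunks, and that `state_size` doubles because two consecutive observations are stacked via `time_obs`; all of these are assumptions for illustration.

```python
# Hypothetical sanity check of the magic numbers above.
nodes = 1 + 4 + 4 ** 2              # tree of max_depth=2 with branching factor 4 -> 21 nodes
features_per_obs = nodes * 8        # num_features_per_node=8 -> 168 values per observation
state_size = features_per_obs * 2   # two stacked time steps -> 336
max_steps = int(1.5 * (8 + 8))      # for the 8x8 environment above -> 24
print(nodes, features_per_obs, state_size, max_steps)
```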
@@ -77,10 +75,9 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-agent.qnetwork_local.load_state_dict(torch.load(os.path.join(__file_dirname__, 'Nets', 'avoid_checkpoint15000.pth')))
-
-demo = True
+# agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
 
+demo = False
 
 def max_lt(seq, val):
     """
@@ -104,7 +101,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -119,7 +116,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returns normalized and clipped observation
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
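Since only fragments of the normalization code appear in this diff, here is a standalone sketch of how the relaxed comparison plays out. `min_lt` follows the hunk above; `max_lt`'s body and the final return of `norm_obs_clip` are not shown in the diff and are filled in here as assumptions, for illustration only.

```python
import numpy as np

def min_lt(seq, val):
    # Smallest element of seq that is >= val (comparison relaxed from > to >= above).
    min_ = np.inf
    idx = len(seq) - 1
    while idx >= 0:
        if seq[idx] >= val and seq[idx] < min_:
            min_ = seq[idx]
        idx -= 1
    return min_

def max_lt(seq, val):
    # Assumed counterpart: largest non-negative element of seq that is < val.
    max_ = 0
    idx = len(seq) - 1
    while idx >= 0:
        if 0 <= seq[idx] < val and seq[idx] > max_:
            max_ = seq[idx]
        idx -= 1
    return max_

def norm_obs_clip(obs, clip_min=-1, clip_max=1):
    max_obs = max(1, max_lt(obs, 1000))
    min_obs = min(max_obs, min_lt(obs, 0))
    if max_obs == min_obs:
        return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
    norm = np.abs(max_obs - min_obs)
    # Assumed final step: shift, rescale, then clip.
    return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)

print(norm_obs_clip(np.array([0., 5., 20., np.inf])))  # [0.  0.25 1.  1.]
```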
@@ -136,14 +134,14 @@ for trials in range(1, n_trials + 1):
         env_renderer.set_new_rail()
     final_obs = obs.copy()
     final_obs_next = obs.copy()
-
     for a in range(env.get_num_agents()):
-        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=7,
+        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=8,
                                                                 current_depth=0)
         data = norm_obs_clip(data)
         distance = norm_obs_clip(distance)
         agent_data = np.clip(agent_data, -1, 1)
         obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
     for i in range(2):
         time_obs.append(obs)
     # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
@@ -153,14 +151,14 @@ for trials in range(1, n_trials + 1):
     score = 0
     env_done = 0
     # Run episode
-    for step in range(env.height * env.width):
+    for step in range(max_steps):
         if demo:
             env_renderer.renderEnv(show=True, show_observations=False)
         # print(step)
         # Action
         for a in range(env.get_num_agents()):
             if demo:
-                eps = 1
+                eps = 0
             # action = agent.act(np.array(obs[a]), eps=eps)
             action = agent.act(agent_obs[a], eps=eps)
             action_prob[action] += 1
@@ -169,13 +167,12 @@ for trials in range(1, n_trials + 1):
 
         next_obs, all_rewards, done, _ = env.step(action_dict)
         for a in range(env.get_num_agents()):
-            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=7,
-                                                                    current_depth=0)
+            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=8,
+                                                                    current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
             agent_data = np.clip(agent_data, -1, 1)
             next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
         time_obs.append(next_obs)
 
         # Update replay buffer and train agent
@@ -187,7 +184,7 @@ for trials in range(1, n_trials + 1):
                 final_action_dict.update({a: action_dict[a]})
             if not demo and not done[a]:
                 agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-            score += all_rewards[a]
+            score += all_rewards[a] / env.get_num_agents()
 
         agent_obs = agent_next_obs.copy()
         if done['__all__']:
@@ -199,21 +196,21 @@ for trials in range(1, n_trials + 1):
     eps = max(eps_end, eps_decay * eps)  # decrease epsilon
 
     done_window.append(env_done)
-    scores_window.append(score)  # save most recent score
+    scores_window.append(score / max_steps)  # save most recent score
     scores.append(np.mean(scores_window))
     dones_list.append((np.mean(done_window)))
 
     print(
-        '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-            env.get_num_agents(),
-            trials,
-            np.mean(scores_window),
-            100 * np.mean(done_window),
-            eps, action_prob / np.sum(action_prob)), end=" ")
+        '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+            env.get_num_agents(),
+            trials,
+            np.mean(scores_window),
+            100 * np.mean(done_window),
+            eps, action_prob / np.sum(action_prob)), end=" ")
 
     if trials % 100 == 0:
         print(
-            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                 env.get_num_agents(),
                 trials,
                 np.mean(scores_window),
@@ -221,5 +218,5 @@ for trials in range(1, n_trials + 1):
                 eps,
                 action_prob / np.sum(action_prob)))
         torch.save(agent.qnetwork_local.state_dict(),
-                   os.path.join(__file_dirname__, 'Nets', 'avoid_checkpoint' + str(trials) + '.pth'))
+                   './Nets/avoid_checkpoint' + str(trials) + '.pth')
         action_prob = [1] * action_size
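The reward bookkeeping changed above divides each step reward by the number of agents and the episode total by `max_steps`, so the logged "Average Score" is roughly the mean reward per agent per step. A tiny illustration with made-up numbers:

```python
# Hypothetical values implied by the 8x8 setup above; the flat step penalty is assumed.
num_agents, max_steps = 3, 24
per_step_reward = -1.0
score = sum(per_step_reward / num_agents   # what the training loop accumulates each step
            for _ in range(max_steps)
            for _ in range(num_agents))
print(score / max_steps)                   # ~ -1.0, the value appended to scores_window
```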