diff --git a/scoring/utils/misc_utils.py b/scoring/utils/misc_utils.py
index dee5f47f7f8f09f253dfc3f8e3d48931df94efe7..6f10af999ff08f9f1559eb09cf69021a3086165d 100644
--- a/scoring/utils/misc_utils.py
+++ b/scoring/utils/misc_utils.py
@@ -2,7 +2,6 @@ import random
 import time
 
 import numpy as np
-
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
@@ -66,7 +65,7 @@ def run_test(parameters, agent, observation_builder=None, observation_wrapper=No
                       number_of_agents=1,
                       )
 
-        obs = env.reset()
+        obs, info = env.reset()
 
         if observation_wrapper is not None:
             for a in range(env.get_num_agents()):
@@ -181,7 +180,7 @@ def run_test_sequential(parameters, agent, test_nr=0, tree_depth=3):
                       number_of_agents=1,
                       )
 
-        obs = env.reset()
+        obs, info = env.reset()
         done = env.dones
         # Run episode
         trial_score = 0
diff --git a/sequential_agent/run_test.py b/sequential_agent/run_test.py
index a8c0bbec93282ecac1f65bb35870805cf9e9e298..92e814571a83be09f86ada389e18d34cca94402d 100644
--- a/sequential_agent/run_test.py
+++ b/sequential_agent/run_test.py
@@ -1,11 +1,11 @@
 import numpy as np
-
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import complex_rail_generator
 from flatland.envs.schedule_generators import complex_schedule_generator
 from flatland.utils.rendertools import RenderTool
+
 from sequential_agent.simple_order_agent import OrderedAgent
 
 np.random.seed(2)
@@ -49,7 +49,7 @@ action_dict = dict()
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     done = env.dones
     env_renderer.reset()
     frame_step = 0
diff --git a/torch_training/Getting_Started_Training.md b/torch_training/Getting_Started_Training.md
index 8610bfd15c2e5cad0d4ec19db883f4ceb9407963..cbf4a3cd294ed3e2018e4386b2b76bb3b4d7bb1b 100644
--- a/torch_training/Getting_Started_Training.md
+++ b/torch_training/Getting_Started_Training.md
@@ -150,7 +150,7 @@ We now use the normalized `agent_obs` for our training loop:
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     if not Training:
         env_renderer.set_new_rail()
 
diff --git a/torch_training/Multi_Agent_Training_Intro.md b/torch_training/Multi_Agent_Training_Intro.md
index d4eefae068601b9ea568759b49f95b6985cafac7..69f89aa987e94b10fa2b18f9260779f06b6fc4bf 100644
--- a/torch_training/Multi_Agent_Training_Intro.md
+++ b/torch_training/Multi_Agent_Training_Intro.md
@@ -174,7 +174,7 @@ We now use the normalized `agent_obs` for our training loop:
             agent_next_obs = [None] * env.get_num_agents()
 
         # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
 
         # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
         # different times during an episode
diff --git a/torch_training/multi_agent_inference.py b/torch_training/multi_agent_inference.py
index 580886b1db73ba34d539e14968deea384b5b98be..b376623e7ecde3bbfcf01a5eeb11e8a76c132cc7 100644
--- a/torch_training/multi_agent_inference.py
+++ b/torch_training/multi_agent_inference.py
@@ -3,16 +3,15 @@ from collections import deque
 
 import numpy as np
 import torch
-from importlib_resources import path
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
-
-import torch_training.Nets
 from flatland.envs.rail_env import RailEnv
-from flatland.envs.rail_generators import rail_from_file, sparse_rail_generator
-from flatland.envs.schedule_generators import schedule_from_file, sparse_schedule_generator
-
+from flatland.envs.rail_generators import sparse_rail_generator
+from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
+from importlib_resources import path
+
+import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
 from utils.observation_utils import normalize_observation
 
@@ -97,7 +96,7 @@ frame_step = 0
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
 
     env_renderer.reset()
 
diff --git a/torch_training/multi_agent_training.py b/torch_training/multi_agent_training.py
index e8ed93f052b0073b790e59a3de2f900934e690dc..ec8ac964a2319f99a85cae87a08c27f9a731ff1f 100644
--- a/torch_training/multi_agent_training.py
+++ b/torch_training/multi_agent_training.py
@@ -162,7 +162,7 @@ def main(argv):
             agent_next_obs = [None] * env.get_num_agents()
 
         # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
 
         # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
         # different times during an episode
diff --git a/torch_training/multi_agent_two_time_step_training.py b/torch_training/multi_agent_two_time_step_training.py
index d02e4b221b30f58b89faa540ea0c505506a3c454..466ddf52dc7e937dab36e72d407aa5d09f8fd445 100644
--- a/torch_training/multi_agent_two_time_step_training.py
+++ b/torch_training/multi_agent_two_time_step_training.py
@@ -121,7 +121,7 @@ def main(argv):
             agent_next_obs = [None] * env.get_num_agents()
 
         # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
 
         # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
         # different times during an episode
diff --git a/torch_training/render_agent_behavior.py b/torch_training/render_agent_behavior.py
index 2649a2367367e17e39328ca8c28cc9c2f1fc0172..969b7e92ea1f3a3eca961fa7400b8b0f36321d3e 100644
--- a/torch_training/render_agent_behavior.py
+++ b/torch_training/render_agent_behavior.py
@@ -3,15 +3,15 @@ from collections import deque
 
 import numpy as np
 import torch
-from importlib_resources import path
-
-import torch_training.Nets
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import sparse_rail_generator
 from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
+from importlib_resources import path
+
+import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
 from utils.observation_utils import normalize_observation
 
@@ -111,7 +111,7 @@ frame_step = 0
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     env_renderer.reset()
     # Build agent specific observations
     for a in range(env.get_num_agents()):
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 607206ec8f423c421d2e57397cce3a2a1900679e..4f82c523474f6880a8595fe8c9dc3e4565e4902b 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -133,13 +133,11 @@ def main(argv):
             # Action
             for a in range(env.get_num_agents()):
                 if info['action_required'][a]:
-                    register_action_state[a] = True
                     action = agent.act(agent_obs[a], eps=eps)
                     action_prob[action] += 1
                     if step == 0:
                         agent_action_buffer[a] = action
                 else:
-                    register_action_state[a] = False
                     action = 0
                 action_dict.update({a: action})
 
@@ -151,24 +149,21 @@ def main(argv):
                 # Penalize waiting in order to get agent to move
                 if env.agents[a].status == 0:
                     all_rewards[a] -= 1
+
                 agent_next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
                 cummulated_reward[a] += all_rewards[a]
 
             # Update replay buffer and train agent
             for a in range(env.get_num_agents()):
-                if done[a]:
-                    final_obs[a] = agent_obs_buffer[a]
-                    final_obs_next[a] = agent_next_obs[a].copy()
-                    final_action_dict.update({a: agent_action_buffer[a]})
-                if not done[a]:
-                    if agent_obs_buffer[a] is not None and register_action_state[a]:
-                        agent_delayed_next = agent_obs[a].copy()
-                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
-                                   agent_delayed_next, done[a])
-                        cummulated_reward[a] = 0.
-                    if register_action_state[a]:
-                        agent_obs_buffer[a] = agent_obs[a].copy()
-                        agent_action_buffer[a] = action_dict[a]
+                if (agent_obs_buffer[a] is not None and info['action_required'][a] and env.agents[a].status != 3) or \
+                        env.agents[a].status == 2:  # status 2 = DONE, 3 = DONE_REMOVED
+                    agent_delayed_next = agent_obs[a].copy()
+                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
+                               agent_delayed_next, done[a])
+                    cummulated_reward[a] = 0.
+                if info['action_required'][a]:
+                    agent_obs_buffer[a] = agent_obs[a].copy()
+                    agent_action_buffer[a] = action_dict[a]
 
                 score += all_rewards[a] / env.get_num_agents()
 
@@ -176,8 +171,6 @@ def main(argv):
             agent_obs = agent_next_obs.copy()
             if done['__all__']:
                 env_done = 1
-                for a in range(env.get_num_agents()):
-                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
                 break
 
         # Epsilon decay
diff --git a/utils/misc_utils.py b/utils/misc_utils.py
index 4702c82e7ec8722924de9bce367f72b16bcda0fc..e4962ca1097b141c6397736480ed67a618789538 100644
--- a/utils/misc_utils.py
+++ b/utils/misc_utils.py
@@ -3,12 +3,12 @@ import time
 from collections import deque
 
 import numpy as np
-from line_profiler import LineProfiler
-
 from flatland.envs.observations import GlobalObsForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import complex_rail_generator
 from flatland.envs.schedule_generators import complex_schedule_generator
+from line_profiler import LineProfiler
+
 from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups
 
 
@@ -102,7 +102,7 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
         # Reset the env
 
         lp_reset(True, True)
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
         for a in range(env.get_num_agents()):
             data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
             data = norm_obs_clip(data)