diff --git a/checkpoints/201106090621-3300.pth.local b/checkpoints/201106090621-3300.pth.local
new file mode 100644
index 0000000000000000000000000000000000000000..453e1cdb7f0166eb52de7e89a257b49be88e36f8
Binary files /dev/null and b/checkpoints/201106090621-3300.pth.local differ
diff --git a/checkpoints/201106090621-3300.pth.target b/checkpoints/201106090621-3300.pth.target
new file mode 100644
index 0000000000000000000000000000000000000000..f94422b5b2c56cf46dae8cbbba7189d558581fa4
Binary files /dev/null and b/checkpoints/201106090621-3300.pth.target differ
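
The two binaries above add a new DDDQN checkpoint pair: the policy apparently stores its online ("local") and target Q-networks under `.pth.local` / `.pth.target` suffixes, alongside the `<stem>.policy` / `<stem>.optimizer` convention the PPO agent uses further down. A minimal sketch of that save/load convention, assuming hypothetical `qnetwork_local` / `qnetwork_target` attributes rather than the repo's exact API:

```python
import torch

# Hedged sketch of the checkpoint convention implied by the new binaries:
# a DDDQN-style policy keeps an online ("local") and a target network and
# writes them as <stem>.local / <stem>.target. The attribute names below
# are assumptions, not the repo's exact API.
def save_checkpoint(policy, filename):
    torch.save(policy.qnetwork_local.state_dict(), filename + ".local")
    torch.save(policy.qnetwork_target.state_dict(), filename + ".target")

def load_checkpoint(policy, filename, device="cpu"):
    policy.qnetwork_local.load_state_dict(
        torch.load(filename + ".local", map_location=device))
    policy.qnetwork_target.load_state_dict(
        torch.load(filename + ".target", map_location=device))
```
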
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 6f250a83fb830afef842037c312629b99cdb78c1..db4b1a805e4d3fa5a3c29bc58224ec88d2d4b2f1 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -18,6 +18,7 @@ from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
 from torch.utils.tensorboard import SummaryWriter
 
+from reinforcement_learning.dddqn_policy import DDDQNPolicy
 from reinforcement_learning.ppo.ppo_agent import PPOAgent
 
 base_dir = Path(__file__).resolve().parent.parent
@@ -172,8 +173,8 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     completion_window = deque(maxlen=checkpoint_interval)
 
     # Double Dueling DQN policy
-    # policy = DDDQNPolicy(state_size, action_size, train_params)
-    policy = PPOAgent(state_size, action_size, n_agents)
+    policy = DDDQNPolicy(state_size, action_size, train_params)
+    # policy = PPOAgent(state_size, action_size, n_agents)
     # Load existing policy
     if train_params.load_policy is not "":
         policy.load(train_params.load_policy)
@@ -480,7 +481,7 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=5400, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1, type=int)
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2, type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0,
                         type=int)
     parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=5, type=int)
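
The hunk above switches training back from `PPOAgent` to `DDDQNPolicy` by swapping which constructor line is commented out. A small sketch of doing the same selection through a command-line flag instead; the `--policy` argument and the `make_policy` helper are hypothetical, not part of the current script:

```python
# Hedged sketch: choosing the training policy from a CLI flag instead of
# commenting constructor lines in and out.
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo.ppo_agent import PPOAgent

def make_policy(name, state_size, action_size, train_params, n_agents):
    if name == "dddqn":
        return DDDQNPolicy(state_size, action_size, train_params)
    if name == "ppo":
        return PPOAgent(state_size, action_size, n_agents)
    raise ValueError(f"unknown policy: {name}")

# Usage inside train_agent(), assuming the parser gained a hypothetical flag:
#   parser.add_argument("--policy", choices=["dddqn", "ppo"], default="dddqn")
#   policy = make_policy(train_params.policy, state_size, action_size,
#                        train_params, n_agents)
```
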
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 663a05acb42fd5a919f4eff0c3c45146d9bbd471..a7431f85201def6f189ccdc6101a89428b598e47 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -1,5 +1,4 @@
 import os
-import random
 
 import numpy as np
 import torch
@@ -17,6 +16,7 @@ CLIP_FACTOR = .005
 UPDATE_EVERY = 30
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print("device:", device)
 
 
 class PPOAgent(Policy):
@@ -31,7 +31,6 @@ class PPOAgent(Policy):
         self.memory = ReplayBuffer(BUFFER_SIZE)
         self.t_step = 0
         self.loss = 0
-        self.num_agents = num_agents
 
     def reset(self):
         self.finished = [False] * len(self.episodes)
@@ -43,7 +42,8 @@ class PPOAgent(Policy):
         self.policy.eval()
         with torch.no_grad():
             output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
-            return Categorical(output).sample().item()
+            ret = Categorical(output).sample().item()
+            return ret
 
     # Record the results of the agent's action and update the model
     def step(self, handle, state, action, reward, next_state, done):
@@ -118,14 +118,14 @@ class PPOAgent(Policy):
         if os.path.exists(filename + ".policy"):
             print(' >> ', filename + ".policy")
             try:
-                self.policy.load_state_dict(torch.load(filename + ".policy"))
+                self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
             except:
                 print(" >> failed!")
                 pass
         if os.path.exists(filename + ".optimizer"):
             print(' >> ', filename + ".optimizer")
             try:
-                self.optimizer.load_state_dict(torch.load(filename + ".optimizer"))
+                self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
             except:
                 print(" >> failed!")
                 pass
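
The `map_location=device` additions above let a checkpoint written on a CUDA machine be restored on a CPU-only host (such as the evaluation server). A minimal standalone illustration using only standard `torch` calls; the file name is a placeholder:

```python
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Without map_location, a state dict saved on a GPU machine fails to load on a
# CPU-only host ("Attempting to deserialize object on a CUDA device ...").
# Mapping storages to the current device at load time avoids that:
state_dict = torch.load("checkpoint.policy", map_location=device)
# model.load_state_dict(state_dict)  # then restore as usual
```
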
diff --git a/run.py b/run.py
index ac6a3cb3589a93c43e44e68fe6faf9796d31235e..06405868c06c8463eae756a38f063571421a7b8b 100644
--- a/run.py
+++ b/run.py
@@ -6,6 +6,7 @@ from pathlib import Path
 import numpy as np
 from flatland.core.env_observation_builder import DummyObservationBuilder
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnvActions
 from flatland.evaluators.client import FlatlandRemoteClient
 from flatland.evaluators.client import TimeoutException
 
@@ -25,10 +26,12 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 VERBOSE = True
 
 # Checkpoint to use (remember to push it!)
-checkpoint = "./checkpoints/201105173637-4700.pth" # 18.50097663335293 : Depth = 1
+checkpoint = "./checkpoints/201105222046-5400.pth"  # 17.66104361971127 Depth 1
+checkpoint = "./checkpoints/201106073658-4300.pth"  # 15.64082361736683 Depth 1
+checkpoint = "./checkpoints/201106090621-3300.pth"  # 15.64082361736683 Depth 1
 
 # Use last action cache
-USE_ACTION_CACHE = True
+USE_ACTION_CACHE = False
 USE_DEAD_LOCK_AVOIDANCE_AGENT = False
 
 # Observation parameters (must match training parameters!)
@@ -50,6 +53,7 @@ action_size = 5
 
 # Creates the policy. No GPU on evaluation server.
 policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
+# policy = PPOAgent(state_size, action_size, 10)
 policy.load(checkpoint)
 
 #####################################################################
@@ -134,7 +138,10 @@ while True:
                             action = agent_last_action[agent]
                             nb_hit += 1
                         else:
-                            action = policy.act(observation[agent], eps=0.0)
+                            action = policy.act(observation[agent], eps=0.01)
+
+                        if observation[agent][26] == 1:
+                            action = RailEnvActions.STOP_MOVING
 
                         action_dict[agent] = action
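
The final hunk forces `STOP_MOVING` whenever feature 26 of the agent's observation is set, overriding whatever the policy sampled. The diff does not say what that feature encodes, so the sketch below only mirrors the pattern, with the index pulled into a named constant; the constant name and the interpretation in the comment are assumptions:

```python
from flatland.envs.rail_env import RailEnvActions

# Mirrors the hard-coded index in run.py; what the feature means is an
# assumption here (some "better to wait" flag produced by the observation
# builder).
OBS_STOP_FLAG_INDEX = 26

def choose_action(policy, obs, eps=0.01):
    action = policy.act(obs, eps=eps)
    if obs[OBS_STOP_FLAG_INDEX] == 1:
        # The observation flag wins over the learned policy: hold the agent.
        action = RailEnvActions.STOP_MOVING
    return action
```
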