diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 94c6208ddde96a1bd9aea3728bb2a73f3f4ab667..a3a2b1a3c69921e15715115e7db50ca5239a67f1 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -22,7 +22,8 @@ from torch.utils.tensorboard import SummaryWriter
 from reinforcement_learning.dddqn_policy import DDDQNPolicy
 from reinforcement_learning.ppo_agent import PPOPolicy
 from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent
-from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action
+from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action, \
+    map_rail_env_action
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 from utils.deadlock_check import get_agent_positions, check_for_deadlock
 
@@ -173,7 +174,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     # Double Dueling DQN policy
     policy = DDDQNPolicy(state_size, get_action_size(), train_params)
     if True:
-        policy = PPOPolicy(state_size, get_action_size())
+        policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=True, in_parameters=train_params)
     if False:
         policy = DeadLockAvoidanceAgent(train_env, get_action_size())
     if False:
@@ -517,9 +518,9 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=12000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
     parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=10, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index 4072ecc190361f3a08aa150c80db17e66675017c..d467cfeb018811fdfc3b8cda0ed6a1c6519d376f 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -1,7 +1,6 @@
 import copy
 import os
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -11,10 +10,6 @@ from torch.distributions import Categorical
 from reinforcement_learning.policy import LearningPolicy
 from reinforcement_learning.replay_buffer import ReplayBuffer
 
-device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
-print("device:", device)
-
-
 # https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
 
 class EpisodeBuffers:
@@ -96,27 +91,46 @@ class ActorCriticModel(nn.Module):
 
 
 class PPOPolicy(LearningPolicy):
-    def __init__(self, state_size, action_size, use_replay_buffer=False):
+    def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
         print(">> PPOPolicy")
         super(PPOPolicy, self).__init__()
         # parameters
-        self.learning_rate = 1.0e-3
-        self.gamma = 0.95
+        self.ppo_parameters = in_parameters
+        if self.ppo_parameters is not None:
+            self.hidsize = self.ppo_parameters.hidden_size
+            self.buffer_size = self.ppo_parameters.buffer_size
+            self.batch_size = self.ppo_parameters.batch_size
+            self.learning_rate = self.ppo_parameters.learning_rate
+            self.gamma = self.ppo_parameters.gamma
+            # Device
+            if self.ppo_parameters.use_gpu and torch.cuda.is_available():
+                self.device = torch.device("cuda:0")
+                # print("🐇 Using GPU")
+            else:
+                self.device = torch.device("cpu")
+                # print("🐢 Using CPU")
+        else:
+            self.hidsize = 128
+            self.learning_rate = 1.0e-3
+            self.gamma = 0.95
+            self.buffer_size = 32_000
+            self.batch_size = 1024
+            self.device = torch.device("cpu")
+
         self.surrogate_eps_clip = 0.1
         self.K_epoch = 10
         self.weight_loss = 0.5
         self.weight_entropy = 0.01
 
-        self.buffer_size = 32_000
-        self.batch_size = 1024
         self.buffer_min_size = 0
         self.use_replay_buffer = use_replay_buffer
-        self.device = device
 
         self.current_episode_memory = EpisodeBuffers()
         self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
         self.loss = 0
-        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
+        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
+                                                   hidsize1=self.hidsize,
+                                                   hidsize2=self.hidsize)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
         self.loss_function = nn.MSELoss()  # nn.SmoothL1Loss()