diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 640db58c376c776004f1865309c787f31a0c2c34..bc1dc7f1c2febe8b9992a7721787e00617ab6ffd 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -257,7 +257,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
         # Reset environment
         reset_timer.start()
         number_of_agents = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
-        train_env_params.n_agents = 1  # episode_idx % number_of_agents + 1
+        train_env_params.n_agents = episode_idx % number_of_agents + 1
 
         train_env = create_rail_env(train_env_params, tree_observation)
         obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
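
The hunk above replaces the hard-coded single agent with a curriculum: the cap on the agent count grows by one every 200 episodes (up to n_agents), and each episode cycles through 1..cap via the modulo. A minimal standalone sketch of that schedule, where the helper name and the demo loop are illustrative only and not part of the repository:

    import numpy as np

    def scheduled_n_agents(episode_idx: int, n_agents: int) -> int:
        # Cap grows by one every 200 episodes, up to the configured n_agents.
        cap = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
        # The count actually used this episode cycles through 1..cap.
        return episode_idx % cap + 1

    # With n_agents=5: episodes 0-199 always use 1 agent, episodes 200-399
    # alternate between 1 and 2 agents, and so on up to 5 agents.
    for episode_idx in (0, 199, 200, 201, 400, 401, 402, 1000):
        print(episode_idx, scheduled_n_agents(episode_idx, n_agents=5))
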
@@ -314,7 +314,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             next_obs, all_rewards, done, info = train_env.step(action_dict)
 
             # Reward shaping: Dead-lock, NotMoving, NotStarted
-            if False:
+            if True:
                 agent_positions = get_agent_positions(train_env)
                 for agent_handle in train_env.get_agent_handles():
                     agent = train_env.agents[agent_handle]
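
Flipping `if False:` to `if True:` re-enables the reward-shaping block that penalizes dead-locked, non-moving and not-yet-started agents. The body of that block is not part of this hunk; the sketch below only illustrates the general shape of such shaping, and the penalty values and the is_deadlocked() helper are assumptions, not the project's actual code:

    # Hypothetical shaping sketch -- penalty values and is_deadlocked()
    # are placeholders, not the code enabled by the hunk above.
    DEADLOCK_PENALTY = -5.0
    NOT_MOVING_PENALTY = -1.0

    def shape_rewards(train_env, all_rewards, agent_positions, is_deadlocked):
        for agent_handle in train_env.get_agent_handles():
            agent = train_env.agents[agent_handle]
            if is_deadlocked(agent, agent_positions):
                all_rewards[agent_handle] += DEADLOCK_PENALTY
            elif agent.position is not None and agent.old_position == agent.position:
                # Agent is on the grid but did not move this step.
                all_rewards[agent_handle] += NOT_MOVING_PENALTY
        return all_rewards
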
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 3322c822a1fed208d49ed1648e04913551073b83..3240e6f5dafa93b97d06e621f95b790f1fb54dd1 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -12,11 +12,11 @@ from reinforcement_learning.policy import Policy
 
 LEARNING_RATE = 0.1e-4
 GAMMA = 0.98
-LMBDA = 0.9
-EPS_CLIP = 0.1
+LAMBDA = 0.9
+SURROGATE_EPS_CLIP = 0.01
 K_EPOCH = 3
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cpu")#"cuda:0" if torch.cuda.is_available() else "cpu")
 print("device:", device)
 
 
@@ -215,7 +215,7 @@ class PPOAgent(Policy):
                     advantage_list = []
                     advantage_value = 0.0
                     for difference_to_expected_value_t in difference_to_expected_value_deltas[::-1]:
-                        advantage_value = LMBDA * advantage_value + difference_to_expected_value_t[0]
+                        advantage_value = LAMBDA * advantage_value + difference_to_expected_value_t[0]
                         advantage_list.append([advantage_value])
                     advantage_list.reverse()
                     advantages = torch.tensor(advantage_list, dtype=torch.float)
@@ -227,9 +227,11 @@ class PPOAgent(Policy):
                     # Normal Policy Gradient objective
                     surrogate_objective = ratios * advantages
                     # clipped version of Normal Policy Gradient objective
-                    clipped_surrogate_objective = torch.clamp(ratios * advantages, 1 - EPS_CLIP, 1 + EPS_CLIP)
+                    clipped_surrogate_objective = torch.clamp(ratios,
+                                                              1 - SURROGATE_EPS_CLIP,
+                                                              1 + SURROGATE_EPS_CLIP) * advantages
                     # create value loss function
-                    value_loss = F.mse_loss(self.value_network(states),
+                    value_loss = F.smooth_l1_loss(self.value_network(states),
                                             estimated_target_value.detach())
                     # create final loss function
                     loss = -torch.min(surrogate_objective, clipped_surrogate_objective) + value_loss
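
Taken together, the ppo_agent.py hunks rename the lambda and clipping constants, tighten the clipping range from 0.1 to 0.01, pin the device to CPU, and swap the value loss from MSE to smooth L1. Below is a compact, self-contained sketch of the resulting loss computation, assuming dummy tensor names (ratios, deltas, values, target_values); the real method builds these from the stored transitions and the policy/value networks:

    import torch
    import torch.nn.functional as F

    LAMBDA = 0.9
    SURROGATE_EPS_CLIP = 0.01

    def ppo_loss(ratios, deltas, values, target_values):
        # Backward lambda-weighted accumulation of the TD residuals,
        # mirroring the reversed loop in the hunk above.
        advantage_value = 0.0
        advantage_list = []
        for delta_t in reversed(deltas.tolist()):
            advantage_value = LAMBDA * advantage_value + delta_t
            advantage_list.append(advantage_value)
        advantage_list.reverse()
        advantages = torch.tensor(advantage_list, dtype=torch.float)

        # Clipped surrogate objective: clamp the ratio, then scale by the advantage.
        surrogate = ratios * advantages
        clipped_surrogate = torch.clamp(ratios,
                                        1 - SURROGATE_EPS_CLIP,
                                        1 + SURROGATE_EPS_CLIP) * advantages
        # Smooth L1 value loss against the (detached) bootstrapped target.
        value_loss = F.smooth_l1_loss(values, target_values.detach())
        return (-torch.min(surrogate, clipped_surrogate) + value_loss).mean()

    # Example with dummy data:
    ratios = torch.tensor([1.02, 0.97, 1.01])
    deltas = torch.tensor([0.5, -0.2, 0.1])
    values = torch.tensor([1.0, 0.8, 0.6])
    target_values = torch.tensor([1.1, 0.7, 0.7])
    print(ppo_loss(ratios, deltas, values, target_values))
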