diff --git a/reinforcement_learning/dddqn_policy.py b/reinforcement_learning/dddqn_policy.py
index 7a3525d903d323487507f991e6bcb099a436b35e..ecd3e463ba6dfe98c87ea7eb33d418d4f11b4b73 100644
--- a/reinforcement_learning/dddqn_policy.py
+++ b/reinforcement_learning/dddqn_policy.py
@@ -2,7 +2,6 @@ import copy
 import os
 import pickle
 import random
-from collections import namedtuple, deque, Iterable
 
 import numpy as np
 import torch
@@ -11,6 +10,7 @@ import torch.optim as optim
 
 from reinforcement_learning.model import DuelingQNetwork
 from reinforcement_learning.policy import Policy
+from reinforcement_learning.replay_buffer import ReplayBuffer
 
 
 class DDDQNPolicy(Policy):
@@ -90,7 +90,7 @@ class DDDQNPolicy(Policy):
 
     def _learn(self):
         experiences = self.memory.sample()
-        states, actions, rewards, next_states, dones = experiences
+        states, actions, rewards, next_states, dones, _ = experiences
 
         # Get expected Q values from local model
         q_expected = self.qnetwork_local(states).gather(1, actions)
@@ -161,55 +161,3 @@ class DDDQNPolicy(Policy):
         me.qnetwork_target = copy.deepcopy(self.qnetwork_local)
         me.qnetwork_target = copy.deepcopy(self.qnetwork_target)
         return me
-
-
-Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
-
-
-class ReplayBuffer:
-    """Fixed-size buffer to store experience tuples."""
-
-    def __init__(self, action_size, buffer_size, batch_size, device):
-        """Initialize a ReplayBuffer object.
-
-        Params
-        ======
-            action_size (int): dimension of each action
-            buffer_size (int): maximum size of buffer
-            batch_size (int): size of each training batch
-        """
-        self.action_size = action_size
-        self.memory = deque(maxlen=buffer_size)
-        self.batch_size = batch_size
-        self.device = device
-
-    def add(self, state, action, reward, next_state, done):
-        """Add a new experience to memory."""
-        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
-        self.memory.append(e)
-
-    def sample(self):
-        """Randomly sample a batch of experiences from memory."""
-        experiences = random.sample(self.memory, k=self.batch_size)
-
-        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
-            .float().to(self.device)
-        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
-            .long().to(self.device)
-        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
-            .float().to(self.device)
-        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
-            .float().to(self.device)
-        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
-            .float().to(self.device)
-
-        return states, actions, rewards, next_states, dones
-
-    def __len__(self):
-        """Return the current size of internal memory."""
-        return len(self.memory)
-
-    def __v_stack_impr(self, states):
-        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
-        np_states = np.reshape(np.array(states), (len(states), sub_dim))
-        return np_states
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 5cb6ba1d633e8997b255447521a2938270d6d173..c12750356efcccf7225bbe51e0c13fbfd44ac41b 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -283,28 +283,26 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             next_obs, all_rewards, done, info = train_env.step(map_actions(action_dict))
 
             # Reward shaping .Dead-lock .NotMoving .NotStarted
-            if False:
+            if True:
                 agent_positions = get_agent_positions(train_env)
                 for agent_handle in train_env.get_agent_handles():
                     agent = train_env.agents[agent_handle]
-                    act = action_dict.get(agent_handle, RailEnvActions.DO_NOTHING)
-                    act = map_action(act)
+                    act = map_action(action_dict.get(agent_handle, RailEnvActions.DO_NOTHING))
                     if agent.status == RailAgentStatus.ACTIVE:
-                        all_rewards[agent_handle] = 0.0
                         if done[agent_handle] == False:
                             if check_for_deadlock(agent_handle, train_env, agent_positions):
-                                all_rewards[agent_handle] = -5.0
+                                all_rewards[agent_handle] -= 5.0
                             else:
                                 pos = agent.position
                                 possible_transitions = train_env.rail.get_transitions(*pos, agent.direction)
                                 num_transitions = fast_count_nonzero(possible_transitions)
                                 if num_transitions < 2 and ((act != RailEnvActions.MOVE_FORWARD) or
                                                             (act != RailEnvActions.STOP_MOVING)):
-                                    all_rewards[agent_handle] = -0.5
+                                    all_rewards[agent_handle] -= 0.5
                                 else:
-                                    all_rewards[agent_handle] = -0.01
+                                    all_rewards[agent_handle] -= 0.01
                         else:
-                            all_rewards[agent_handle] *= 10.0
+                            all_rewards[agent_handle] *= 9.0
                             all_rewards[agent_handle] += 1.0
 
             step_timer.end()
@@ -523,9 +521,9 @@ if __name__ == "__main__":
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=12000, type=int)
     parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=3,
                         type=int)
-    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=10, type=int)
+    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=1, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
     parser.add_argument("--eps_start", help="max exploration", default=0.1, type=float)
     parser.add_argument("--eps_end", help="min exploration", default=0.005, type=float)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index a4e74ec884fd99288e45353a56d52cf31a27555f..16206e9f6330e8cfe1b96ea2b797e4ea4f1ecdaf 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -8,6 +8,7 @@ from torch.distributions import Categorical
 
 # Hyperparameters
 from reinforcement_learning.policy import Policy
+from reinforcement_learning.replay_buffer import ReplayBuffer
 
 device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
 print("device:", device)
@@ -15,7 +16,7 @@ print("device:", device)
 
 # https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
 
-class DataBuffers:
+class EpisodeBuffers:
     def __init__(self):
         self.reset()
 
@@ -37,8 +38,9 @@ class DataBuffers:
 
 class ActorCriticModel(nn.Module):
 
-    def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128):
+    def __init__(self, state_size, action_size, device, hidsize1=128, hidsize2=128):
         super(ActorCriticModel, self).__init__()
+        self.device = device
         self.actor = nn.Sequential(
             nn.Linear(state_size, hidsize1),
             nn.Tanh(),
@@ -46,7 +48,7 @@ class ActorCriticModel(nn.Module):
             nn.Tanh(),
             nn.Linear(hidsize2, action_size),
             nn.Softmax(dim=-1)
-        ).to(device)
+        ).to(self.device)
 
         self.critic = nn.Sequential(
             nn.Linear(state_size, hidsize1),
@@ -54,7 +56,7 @@ class ActorCriticModel(nn.Module):
             nn.Linear(hidsize1, hidsize2),
             nn.Tanh(),
             nn.Linear(hidsize2, 1)
-        ).to(device)
+        ).to(self.device)
 
     def forward(self, x):
         raise NotImplementedError
@@ -81,7 +83,7 @@ class ActorCriticModel(nn.Module):
         if os.path.exists(filename):
             print(' >> ', filename)
             try:
-                obj.load_state_dict(torch.load(filename, map_location=device))
+                obj.load_state_dict(torch.load(filename, map_location=self.device))
             except:
                 print(" >> failed!")
         return obj
@@ -104,10 +106,17 @@ class PPOAgent(Policy):
         self.weight_loss = 0.5
         self.weight_entropy = 0.01
 
-        # objects
-        self.memory = DataBuffers()
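+        # Replay-buffer settings: minibatches of batch_size transitions are drawn once at least buffer_min_size are stored.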
+        self.buffer_size = 32_000
+        self.batch_size = 512
+        self.buffer_min_size = 8_000
+        self.use_replay_buffer = True
+        self.device = device
+
+        self.current_episode_memory = EpisodeBuffers()
+        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
         self.loss = 0
-        self.actor_critic_model = ActorCriticModel(state_size, action_size)
+        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
         self.loss_function = nn.SmoothL1Loss()  # nn.MSELoss()
 
@@ -116,20 +124,20 @@ class PPOAgent(Policy):
 
     def act(self, handle, state, eps=None):
         # sample a action to take
-        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
         dist = self.actor_critic_model.get_actor_dist(torch_state)
         action = dist.sample()
         return action.item()
 
     def step(self, handle, state, action, reward, next_state, done):
         # record transitions ([state] -> [action] -> [reward, next_state, done])
-        torch_action = torch.tensor(action, dtype=torch.float).to(device)
-        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
+        torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
         # evaluate actor
         dist = self.actor_critic_model.get_actor_dist(torch_state)
         action_logprobs = dist.log_prob(torch_action)
         transition = (state, action, reward, next_state, action_logprobs.item(), done)
-        self.memory.push_transition(handle, transition)
+        self.current_episode_memory.push_transition(handle, transition)
 
     def _convert_transitions_to_torch_tensors(self, transitions_array):
         # build empty lists(arrays)
@@ -155,14 +163,24 @@
             state_next_list.insert(0, state_next_i)
             prob_a_list.insert(0, prob_action_i)
 
+            if self.use_replay_buffer:
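+                # Also store the discounted transition (with its action log-prob) so it can be re-sampled off-policy later.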
+                self.memory.add(state_i, action_i, discounted_reward, state_next_i, done_i, prob_action_i)
+
+        if self.use_replay_buffer:
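+            # Learn from a random minibatch once the buffer holds more than buffer_min_size (and batch_size) transitions.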
+            if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
+                states, actions, rewards, next_states, dones, prob_actions = self.memory.sample()
+                return states, actions, rewards, next_states, dones, prob_actions
+
         # convert data to torch tensors
         states, actions, rewards, states_next, dones, prob_actions = \
-            torch.tensor(state_list, dtype=torch.float).to(device), \
-            torch.tensor(action_list).to(device), \
-            torch.tensor(reward_list, dtype=torch.float).to(device), \
-            torch.tensor(state_next_list, dtype=torch.float).to(device), \
-            torch.tensor(done_list, dtype=torch.float).to(device), \
-            torch.tensor(prob_a_list).to(device)
+            torch.tensor(state_list, dtype=torch.float).to(self.device), \
+            torch.tensor(action_list).to(self.device), \
+            torch.tensor(reward_list, dtype=torch.float).to(self.device), \
+            torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
+            torch.tensor(done_list, dtype=torch.float).to(self.device), \
+            torch.tensor(prob_a_list).to(self.device)
 
         # standard-normalize rewards
         # rewards = (rewards - rewards.mean()) / (rewards.std() + 1.e-5)
@@ -171,9 +187,9 @@ class PPOAgent(Policy):
 
     def train_net(self):
         # All agents have to propagate their experiences made during past episode
-        for handle in range(len(self.memory)):
+        for handle in range(len(self.current_episode_memory)):
             # Extract agent's episode history (list of all transitions)
-            agent_episode_history = self.memory.get_transitions(handle)
+            agent_episode_history = self.current_episode_memory.get_transitions(handle)
             if len(agent_episode_history) > 0:
                 # Convert the replay buffer to torch tensors (arrays)
                 states, actions, rewards, states_next, dones, probs_action = \
@@ -209,7 +225,7 @@ class PPOAgent(Policy):
 
         self.K_epoch = max(3, self.K_epoch - 0.01)
         # Reset all collect transition data
-        self.memory.reset()
+        self.current_episode_memory.reset()
 
     def end_episode(self, train):
         if train:
@@ -225,7 +241,7 @@ class PPOAgent(Policy):
         if os.path.exists(filename):
             print(' >> ', filename)
             try:
-                obj.load_state_dict(torch.load(filename, map_location=device))
+                obj.load_state_dict(torch.load(filename, map_location=self.device))
             except:
                 print(" >> failed!")
         return obj
diff --git a/reinforcement_learning/replay_buffer.py b/reinforcement_learning/replay_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28810398cdc8243e5e72c700678b642a7bc568a
--- /dev/null
+++ b/reinforcement_learning/replay_buffer.py
@@ -0,0 +1,60 @@
+import random
+from collections import namedtuple, deque
+from collections.abc import Iterable
+
+import numpy as np
+import torch
+
+Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
+
+
+class ReplayBuffer:
+    """Fixed-size buffer to store experience tuples."""
+
+    def __init__(self, action_size, buffer_size, batch_size, device):
+        """Initialize a ReplayBuffer object.
+
+        Params
+        ======
+            action_size (int): dimension of each action
+            buffer_size (int): maximum size of buffer
+            batch_size (int): size of each training batch
+        """
+        self.action_size = action_size
+        self.memory = deque(maxlen=buffer_size)
+        self.batch_size = batch_size
+        self.device = device
+
+    def add(self, state, action, reward, next_state, done, action_prob=0.0):
+        """Add a new experience to memory."""
+        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
+        self.memory.append(e)
+
+    def sample(self):
+        """Randomly sample a batch of experiences from memory."""
+        experiences = random.sample(self.memory, k=self.batch_size)
+
+        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
+            .float().to(self.device)
+        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
+            .long().to(self.device)
+        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
+            .float().to(self.device)
+        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
+            .float().to(self.device)
+        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
+            .float().to(self.device)
+        action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
+            .float().to(self.device)
+
+        return states, actions, rewards, next_states, dones, action_probs
+
+    def __len__(self):
+        """Return the current size of internal memory."""
+        return len(self.memory)
+
+    def __v_stack_impr(self, states):
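+        # Stack entries into a (batch, sub_dim) array; scalar fields such as actions, rewards and dones become a single column.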
+        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
+        np_states = np.reshape(np.array(states), (len(states), sub_dim))
+        return np_states