diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index 7b7d65bdeae385382dbe38e6cc791011a2fe486e..44c57010b80b5c073bc074e71b6af9eeff1024ca 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -165,38 +165,45 @@ class PPOAgent(Policy):
         return states, actions, rewards, states_next, dones, prob_actions
 
     def train_net(self):
-        for handle in range(len(self.memory)):
-            agent_episode_history = self.memory.get_transitions(handle)
-            if len(agent_episode_history) > 0:
-                # convert the replay buffer to torch tensors (arrays)
-                states, actions, rewards, states_next, dones, probs_action = \
-                    self._convert_transitions_to_torch_tensors(agent_episode_history)
-
-                # Optimize policy for K epochs:
-                for _ in range(self.K_epoch):
-                    # evaluating actions (actor) and values (critic)
+        # Optimize the policy for K epochs:
+        for _ in range(self.K_epoch):
+            # Every agent has to propagate the experiences it collected during the past episode
+            for handle in range(len(self.memory)):
+                # Extract the agent's episode history (a list of all transitions)
+                agent_episode_history = self.memory.get_transitions(handle)
+                if len(agent_episode_history) > 0:
+                    # Convert the replay buffer to torch tensors (arrays)
+                    states, actions, rewards, states_next, dones, probs_action = \
+                        self._convert_transitions_to_torch_tensors(agent_episode_history)
+
+                    # Evaluate actions (actor) and values (critic)
                     logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
 
-                    # finding the ratios (pi_thetas / pi_thetas_replayed):
+                    # Compute the probability ratios (pi_theta / pi_theta_old):
                     ratios = torch.exp(logprobs - probs_action.detach())
 
-                    # finding Surrogate Loss:
+                    # Compute the clipped surrogate loss:
                     advantages = rewards - state_values.detach()
                     surr1 = ratios * advantages
                     surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
+
+                    # The loss combines the clipped surrogate objective with a value-function term and an
+                    # entropy bonus. The entropy bonus penalizes the loss when the policy becomes deterministic,
+                    # because in that case the gradient flattens out and no longer provides a useful signal.
                     loss = \
                         -torch.min(surr1, surr2) \
                         + self.weight_loss * self.loss_function(state_values, rewards) \
                         - self.weight_entropy * dist_entropy
 
-                    # make a gradient step
+                    # Make a gradient step
                     self.optimizer.zero_grad()
                     loss.mean().backward()
                     self.optimizer.step()
 
-                    # store current loss to the agent
+                    # Store the current loss on the agent, for debugging purposes only
                     self.loss = loss.mean().detach().numpy()
 
+        # Reset all collected transition data
         self.memory.reset()
 
     def end_episode(self, train):
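
For reference, the following is a minimal, self-contained sketch of the clipped PPO update that the restructured train_net loop performs. It uses dummy tensors instead of the replay buffer; eps_clip, c_value and c_entropy are illustrative stand-ins for self.surrogate_eps_clip, self.weight_loss and self.weight_entropy, and smooth_l1_loss is only an assumed choice for self.loss_function, not necessarily what the repository uses:

import torch
import torch.nn.functional as F

# Hypothetical coefficients standing in for self.surrogate_eps_clip,
# self.weight_loss and self.weight_entropy from the diff above.
eps_clip, c_value, c_entropy = 0.2, 0.5, 0.01

# Dummy tensors in place of the actor-critic outputs and the replayed transitions.
logprobs = torch.randn(8, requires_grad=True)      # log pi_theta(a|s) of the current policy
old_logprobs = torch.randn(8)                      # log-probs stored when the actions were taken
state_values = torch.randn(8, requires_grad=True)  # critic estimates V(s)
returns = torch.randn(8)                           # discounted rewards used as value targets
dist_entropy = torch.rand(8)                       # per-sample entropy of the action distribution

# Probability ratios pi_theta / pi_theta_old, computed from log-probabilities.
ratios = torch.exp(logprobs - old_logprobs.detach())

# Clipped surrogate objective with a simple advantage estimate (returns minus detached values).
advantages = returns - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - eps_clip, 1. + eps_clip) * advantages

# Value loss (smooth L1 is an assumption here) plus the entropy bonus.
value_loss = F.smooth_l1_loss(state_values, returns, reduction='none')
loss = -torch.min(surr1, surr2) + c_value * value_loss - c_entropy * dist_entropy

# One gradient step on the mean loss, as in train_net.
loss.mean().backward()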