From 91c77d25638f169c7566b2694b819983ade62b70 Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Fri, 18 Dec 2020 10:54:40 +0100
Subject: [PATCH] refactored and added new test (gym - CartPole)

---
 reinforcement_learning/dddqn_policy.py        |  1 +
 .../multi_agent_training.py                   |  6 +-
 reinforcement_learning/multi_policy.py        |  4 +-
 reinforcement_learning/ppo_agent.py           | 78 ++++++++++++-----
 .../ppo_deadlockavoidance_agent.py            |  5 +-
 reinforcement_learning/replay_buffer.py       |  1 -
 reinforcement_learning/rl_agent_test.py       | 84 +++++++++++++++++++
 run.py                                        | 10 +--
 utils/agent_action_config.py                  | 31 ++++++-
 utils/dead_lock_avoidance_agent.py            |  5 +-
 10 files changed, 183 insertions(+), 42 deletions(-)
 create mode 100644 reinforcement_learning/rl_agent_test.py

diff --git a/reinforcement_learning/dddqn_policy.py b/reinforcement_learning/dddqn_policy.py
index ecd3e46..9864ca6 100644
--- a/reinforcement_learning/dddqn_policy.py
+++ b/reinforcement_learning/dddqn_policy.py
@@ -17,6 +17,7 @@ class DDDQNPolicy(Policy):
     """Dueling Double DQN policy"""
 
     def __init__(self, state_size, action_size, in_parameters, evaluation_mode=False):
+        print(">> DDDQNPolicy")
         super(Policy, self).__init__()
 
         self.ddqn_parameters = in_parameters
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 1092d84..b9a819a 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -20,7 +20,7 @@ from flatland.utils.rendertools import RenderTool
 from torch.utils.tensorboard import SummaryWriter
 
 from reinforcement_learning.dddqn_policy import DDDQNPolicy
-from reinforcement_learning.ppo_agent import PPOAgent
+from reinforcement_learning.ppo_agent import PPOPolicy
 from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent
 from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
@@ -173,7 +173,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     # Double Dueling DQN policy
     policy = DDDQNPolicy(state_size, get_action_size(), train_params)
     if True:
-        policy = PPOAgent(state_size, get_action_size())
+        policy = PPOPolicy(state_size, get_action_size())
     if False:
         policy = DeadLockAvoidanceAgent(train_env, get_action_size())
     if False:
@@ -283,7 +283,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             next_obs, all_rewards, done, info = train_env.step(map_actions(action_dict))
 
             # Reward shaping .Dead-lock .NotMoving .NotStarted
-            if True:
+            if False:
                 agent_positions = get_agent_positions(train_env)
                 for agent_handle in train_env.get_agent_handles():
                     agent = train_env.agents[agent_handle]
diff --git a/reinforcement_learning/multi_policy.py b/reinforcement_learning/multi_policy.py
index 0c2ae32..5ee8cb4 100644
--- a/reinforcement_learning/multi_policy.py
+++ b/reinforcement_learning/multi_policy.py
@@ -2,7 +2,7 @@ import numpy as np
 from flatland.envs.rail_env import RailEnv
 
 from reinforcement_learning.policy import Policy
-from reinforcement_learning.ppo_agent import PPOAgent
+from reinforcement_learning.ppo_agent import PPOPolicy
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 
 
@@ -13,7 +13,7 @@ class MultiPolicy(Policy):
         self.memory = []
         self.loss = 0
         self.deadlock_avoidance_policy = DeadLockAvoidanceAgent(env, action_size, False)
-        self.ppo_policy = PPOAgent(state_size + action_size, action_size)
+        self.ppo_policy = PPOPolicy(state_size + action_size, action_size)
 
     def load(self, filename):
         self.ppo_policy.load(filename)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index f80bea8..43de9f7 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -1,6 +1,7 @@
 import copy
 import os
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -94,21 +95,21 @@ class ActorCriticModel(nn.Module):
         self.critic = self._load(self.critic, filename + ".critic")
 
 
-class PPOAgent(Policy):
+class PPOPolicy(Policy):
     def __init__(self, state_size, action_size):
-        super(PPOAgent, self).__init__()
-
+        print(">> PPOPolicy")
+        super(PPOPolicy, self).__init__()
         # parameters
-        self.learning_rate = 1.0e-5
+        self.learning_rate = 1.0e-3
         self.gamma = 0.95
-        self.surrogate_eps_clip = 0.1
-        self.K_epoch = 50
-        self.weight_loss = 0.5
+        self.surrogate_eps_clip = 0.01
+        self.K_epoch = 5
+        self.weight_loss = 0.25
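+        # surrogate_eps_clip is the PPO clipping range; K_epoch is the number of
+        # optimisation passes run per collected agent episode (see train_net)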
         self.weight_entropy = 0.01
 
-        self.buffer_size = 32_000
-        self.batch_size = 512
-        self.buffer_min_size = 8_000
+        self.buffer_size = 2_000
+        self.batch_size = 64
+        self.buffer_min_size = 0
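+        # with buffer_min_size = 0, mini-batches are sampled as soon as the buffer
+        # holds more than one batch of transitions (see _get_transitions_from_replay_buffer)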
         self.use_replay_buffer = True
         self.device = device
 
@@ -117,7 +118,7 @@ class PPOAgent(Policy):
         self.loss = 0
         self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
-        self.loss_function = nn.SmoothL1Loss()  # nn.MSELoss()
+        self.loss_function = nn.MSELoss()  # nn.SmoothL1Loss()
 
     def reset(self, env):
         pass
@@ -139,6 +140,22 @@ class PPOAgent(Policy):
         transition = (state, action, reward, next_state, action_logprobs.item(), done)
         self.current_episode_memory.push_transition(handle, transition)
 
+    def _push_transitions_to_replay_buffer(self,
+                                           state_list,
+                                           action_list,
+                                           reward_list,
+                                           state_next_list,
+                                           done_list,
+                                           prob_a_list):
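+        # store each transition together with the old action log-probability,
+        # which the PPO probability ratio needs during the update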
+        for idx in range(len(reward_list)):
+            state_i = state_list[idx]
+            action_i = action_list[idx]
+            reward_i = reward_list[idx]
+            state_next_i = state_next_list[idx]
+            done_i = done_list[idx]
+            prob_action_i = prob_a_list[idx]
+            self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
+
     def _convert_transitions_to_torch_tensors(self, transitions_array):
         # build empty lists(arrays)
         state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
@@ -153,18 +170,23 @@ class PPOAgent(Policy):
             if done_i:
                 discounted_reward = 0
                 done_list.insert(0, 1)
-                reward_i = 1
             else:
                 done_list.insert(0, 0)
-                reward_i = 0
 
             discounted_reward = reward_i + self.gamma * discounted_reward
             reward_list.insert(0, discounted_reward)
             state_next_list.insert(0, state_next_i)
             prob_a_list.insert(0, prob_action_i)
 
-            if self.use_replay_buffer:
-                self.memory.add(state_i, action_i, discounted_reward, state_next_i, done_i)
+        # standard-normalize rewards
+        reward_list = np.array(reward_list)
+        reward_list = (reward_list - reward_list.mean()) / (reward_list.std() + 1.e-5)
+
+        if self.use_replay_buffer:
+            self._push_transitions_to_replay_buffer(state_list, action_list,
+                                                    reward_list, state_next_list,
+                                                    done_list, prob_a_list)
 
         # convert data to torch tensors
         states, actions, rewards, states_next, dones, prob_actions = \
@@ -175,11 +197,18 @@ class PPOAgent(Policy):
             torch.tensor(done_list, dtype=torch.float).to(self.device), \
             torch.tensor(prob_a_list).to(self.device)
 
-        # standard-normalize rewards
-        # rewards = (rewards - rewards.mean()) / (rewards.std() + 1.e-5)
-
         return states, actions, rewards, states_next, dones, prob_actions
 
+    def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
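+        # sample a random mini-batch once the buffer holds more than batch_size
+        # transitions; otherwise return the freshly collected episode tensors unchanged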
+        if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
+            states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
+            actions = torch.squeeze(actions)
+            rewards = torch.squeeze(rewards)
+            states_next = torch.squeeze(states_next)
+            dones = torch.squeeze(dones)
+            probs_action = torch.squeeze(probs_action)
+        return states, actions, rewards, states_next, dones, probs_action
+
     def train_net(self):
         # All agents have to propagate their experiences made during past episode
         for handle in range(len(self.current_episode_memory)):
@@ -189,11 +218,15 @@ class PPOAgent(Policy):
                 # Convert the replay buffer to torch tensors (arrays)
                 states, actions, rewards, states_next, dones, probs_action = \
                     self._convert_transitions_to_torch_tensors(agent_episode_history)
+
                 # Optimize policy for K epochs:
                 for k_loop in range(int(self.K_epoch)):
-                    if self.use_replay_buffer and k_loop > 0:
-                        if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
-                            states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
+
+                    if self.use_replay_buffer:
+                        states, actions, rewards, states_next, dones, probs_action = \
+                            self._get_transitions_from_replay_buffer(
+                                states, actions, rewards, states_next, dones, probs_action
+                            )
 
                     # Evaluating actions (actor) and values (critic)
                     logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
@@ -222,7 +255,6 @@ class PPOAgent(Policy):
                     # Transfer the current loss to the agents loss (information) for debug purpose only
                     self.loss = loss.mean().detach().cpu().numpy()
 
-        self.K_epoch = max(3, self.K_epoch - 0.01)
         # Reset all collect transition data
         self.current_episode_memory.reset()
 
@@ -252,7 +284,7 @@ class PPOAgent(Policy):
         self.optimizer = self._load(self.optimizer, filename + ".optimizer")
 
     def clone(self):
-        policy = PPOAgent(self.state_size, self.action_size)
+        policy = PPOPolicy(self.state_size, self.action_size)
         policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
         policy.optimizer = copy.deepcopy(self.optimizer)
-        return self
+        return policy
diff --git a/reinforcement_learning/ppo_deadlockavoidance_agent.py b/reinforcement_learning/ppo_deadlockavoidance_agent.py
index e344d8e..737634c 100644
--- a/reinforcement_learning/ppo_deadlockavoidance_agent.py
+++ b/reinforcement_learning/ppo_deadlockavoidance_agent.py
@@ -2,6 +2,7 @@ from flatland.envs.agent_utils import RailAgentStatus
 from flatland.envs.rail_env import RailEnv, RailEnvActions
 
 from reinforcement_learning.policy import Policy
+from utils.agent_action_config import map_rail_env_action
 from utils.agent_can_choose_helper import AgentCanChooseHelper
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 
@@ -36,9 +37,7 @@ class MultiDecisionAgent(Policy):
                 return self.learning_agent.act(handle, state, eps)
             else:
                 act = self.dead_lock_avoidance_agent.act(handle, state, -1.0)
-                if self.action_size == 4:
-                    act = max(act - 1, 0)
-                return act
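+                # map the RailEnvActions value returned by the deadlock-avoidance
+                # agent into the (possibly reduced) action space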
+                return map_rail_env_action(act)
         # Agent is still at target cell
         return RailEnvActions.DO_NOTHING
 
diff --git a/reinforcement_learning/replay_buffer.py b/reinforcement_learning/replay_buffer.py
index c288103..2ba147d 100644
--- a/reinforcement_learning/replay_buffer.py
+++ b/reinforcement_learning/replay_buffer.py
@@ -32,7 +32,6 @@ class ReplayBuffer:
     def sample(self):
         """Randomly sample a batch of experiences from memory."""
         experiences = random.sample(self.memory, k=self.batch_size)
-
         states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
             .float().to(self.device)
         actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
diff --git a/reinforcement_learning/rl_agent_test.py b/reinforcement_learning/rl_agent_test.py
new file mode 100644
index 0000000..d764b8c
--- /dev/null
+++ b/reinforcement_learning/rl_agent_test.py
@@ -0,0 +1,84 @@
+from collections import deque
+from collections import namedtuple
+
+import gym
+import numpy as np
+
+from reinforcement_learning.dddqn_policy import DDDQNPolicy
+from reinforcement_learning.ppo_agent import PPOPolicy
+
+dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
+                                            'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
+dddqn_param = dddqn_param_nt(hidden_size=128,
+                             buffer_size=1000,
+                             batch_size=64,
+                             update_every=10,
+                             learning_rate=1.e-3,
+                             tau=1.e-2,
+                             gamma=0.95,
+                             buffer_min_size=0,
+                             use_gpu=False)
+
+
+def cartpole(use_dddqn=False):
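+    """Quick sanity check: train PPOPolicy (or DDDQNPolicy) on gym's CartPole-v1 via the shared Policy interface."""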
+    eps = 1.0
+    eps_decay = 0.99
+    min_eps = 0.01
+    training_mode = True
+
+    env = gym.make("CartPole-v1")
+    observation_space = env.observation_space.shape[0]
+    action_space = env.action_space.n
+    if not use_dddqn:
+        policy = PPOPolicy(observation_space, action_space)
+    else:
+        policy = DDDQNPolicy(observation_space, action_space, dddqn_param)
+    episode = 0
+    checkpoint_interval = 20
+    scores_window = deque(maxlen=checkpoint_interval)
+    while True:
+        episode += 1
+        state = env.reset()
+        policy.reset(env)
+        handle = 0
+        tot_reward = 0
+
+        policy.start_episode(train=training_mode)
+        while True:
+            env.render()
+            policy.start_step(train=training_mode)
+            action = policy.act(handle, state, eps)
+            state_next, reward, terminal, info = env.step(action)
+            policy.end_step(train=training_mode)
+            tot_reward += reward
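+            # the learner only sees a terminal penalty (-1); the raw +1 per step is kept in tot_reward for logging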
+            # reward = reward if not terminal else -reward
+            reward = 0 if not terminal else -1
+            policy.step(handle, state, action, reward, state_next, terminal)
+            state = np.copy(state_next)
+            if terminal:
+                break
+
+        policy.end_episode(train=training_mode)
+        eps = max(min_eps, eps * eps_decay)
+        scores_window.append(tot_reward)
+        log = '\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
+            episode, tot_reward, np.mean(scores_window), eps, len(policy.memory))
+        if episode % checkpoint_interval == 0:
+            print(log)
+        else:
+            print(log, end=" ")
+
+
+if __name__ == "__main__":
+    cartpole()
diff --git a/run.py b/run.py
index 8e0535b..a3e116c 100644
--- a/run.py
+++ b/run.py
@@ -30,7 +30,7 @@ from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.evaluators.client import FlatlandRemoteClient
 from flatland.evaluators.client import TimeoutException
 
-from reinforcement_learning.ppo_agent import PPOAgent
+from reinforcement_learning.ppo_agent import PPOPolicy
 from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent
 from utils.agent_action_config import get_action_size, map_actions
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
@@ -49,10 +49,10 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 # Print per-step logs
 VERBOSE = True
 USE_FAST_TREEOBS = True
-USE_PPO_AGENT = False
+USE_PPO_AGENT = True
 
 # Checkpoint to use (remember to push it!)
-checkpoint = "./checkpoints/201215160226-12000.pth"  #
+checkpoint = "./checkpoints/201217163219-6500.pth"  #
 # checkpoint = "./checkpoints/201215212134-12000.pth"  #
 
 EPSILON = 0.0
@@ -60,7 +60,7 @@ EPSILON = 0.0
 # Use last action cache
 USE_ACTION_CACHE = False
 USE_DEAD_LOCK_AVOIDANCE_AGENT = False  # 21.54485505223213
-USE_MULTI_DECISION_AGENT = True
+USE_MULTI_DECISION_AGENT = False
 
 # Observation parameters (must match training parameters!)
 observation_tree_depth = 2
@@ -105,7 +105,7 @@ action_size = get_action_size()
 if not USE_PPO_AGENT:
     trained_policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
 else:
-    trained_policy = PPOAgent(state_size, action_size)
+    trained_policy = PPOPolicy(state_size, action_size)
 trained_policy.load(checkpoint)
 
 #####################################################################
diff --git a/utils/agent_action_config.py b/utils/agent_action_config.py
index 3a84875..4c1f83f 100644
--- a/utils/agent_action_config.py
+++ b/utils/agent_action_config.py
@@ -1,3 +1,6 @@
+from flatland.envs.rail_env import RailEnvActions
+
+
 def get_flatland_full_action_size():
     # The action space of flatland is 5 discrete actions
     return 5
@@ -14,11 +17,35 @@ def map_actions(actions):
         return actions
     for key in actions:
         value = actions.get(key, 0)
-        actions.update({key: (value + 1)})
+        actions.update({key: map_action(value)})
     return actions
 
 
 def map_action(action):
     if get_action_size() == get_flatland_full_action_size():
         return action
-    return action + 1
+
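+    # reduced 4-action space: DO_NOTHING is dropped; indices 0..3 map to LEFT / FORWARD / RIGHT / STOP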
+    if action == 0:
+        return RailEnvActions.MOVE_LEFT
+    if action == 1:
+        return RailEnvActions.MOVE_FORWARD
+    if action == 2:
+        return RailEnvActions.MOVE_RIGHT
+    if action == 3:
+        return RailEnvActions.STOP_MOVING
+
+
+def map_rail_env_action(action):
+    if get_action_size() == get_flatland_full_action_size():
+        return action
+
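+    # inverse of map_action: convert a RailEnvActions value back to the reduced 4-action index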
+    if action == RailEnvActions.MOVE_LEFT:
+        return 0
+    elif action == RailEnvActions.MOVE_FORWARD:
+        return 1
+    elif action == RailEnvActions.MOVE_RIGHT:
+        return 2
+    elif action == RailEnvActions.STOP_MOVING:
+        return 3
+    # RailEnvActions.DO_NOTHING and any other unmapped action fall back to STOP_MOVING (index 3)
+    return 3
diff --git a/utils/dead_lock_avoidance_agent.py b/utils/dead_lock_avoidance_agent.py
index 87f7e28..cad3b74 100644
--- a/utils/dead_lock_avoidance_agent.py
+++ b/utils/dead_lock_avoidance_agent.py
@@ -7,6 +7,7 @@ from flatland.envs.agent_utils import RailAgentStatus
 from flatland.envs.rail_env import RailEnv, RailEnvActions, fast_count_nonzero
 
 from reinforcement_learning.policy import Policy
+from utils.agent_action_config import map_rail_env_action
 from utils.shortest_distance_walker import ShortestDistanceWalker
 
 
@@ -98,9 +99,7 @@ class DeadLockAvoidanceAgent(Policy):
         act = RailEnvActions.STOP_MOVING
         if check is not None:
             act = check[3]
-        if self.action_size == 4:
-            act = max(act - 1, 0)
-        return act
+        return map_rail_env_action(act)
 
     def get_agent_can_move_value(self, handle):
         return self.agent_can_move_value.get(handle, np.inf)
-- 
GitLab