From 91c77d25638f169c7566b2694b819983ade62b70 Mon Sep 17 00:00:00 2001 From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch> Date: Fri, 18 Dec 2020 10:54:40 +0100 Subject: [PATCH] refactored and added new test (gym - CartPole) --- reinforcement_learning/dddqn_policy.py | 1 + .../multi_agent_training.py | 6 +- reinforcement_learning/multi_policy.py | 4 +- reinforcement_learning/ppo_agent.py | 78 ++++++++++++----- .../ppo_deadlockavoidance_agent.py | 5 +- reinforcement_learning/replay_buffer.py | 1 - reinforcement_learning/rl_agent_test.py | 84 +++++++++++++++++++ run.py | 10 +-- utils/agent_action_config.py | 31 ++++++- utils/dead_lock_avoidance_agent.py | 5 +- 10 files changed, 183 insertions(+), 42 deletions(-) create mode 100644 reinforcement_learning/rl_agent_test.py diff --git a/reinforcement_learning/dddqn_policy.py b/reinforcement_learning/dddqn_policy.py index ecd3e46..9864ca6 100644 --- a/reinforcement_learning/dddqn_policy.py +++ b/reinforcement_learning/dddqn_policy.py @@ -17,6 +17,7 @@ class DDDQNPolicy(Policy): """Dueling Double DQN policy""" def __init__(self, state_size, action_size, in_parameters, evaluation_mode=False): + print(">> DDDQNPolicy") super(Policy, self).__init__() self.ddqn_parameters = in_parameters diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py index 1092d84..b9a819a 100755 --- a/reinforcement_learning/multi_agent_training.py +++ b/reinforcement_learning/multi_agent_training.py @@ -20,7 +20,7 @@ from flatland.utils.rendertools import RenderTool from torch.utils.tensorboard import SummaryWriter from reinforcement_learning.dddqn_policy import DDDQNPolicy -from reinforcement_learning.ppo_agent import PPOAgent +from reinforcement_learning.ppo_agent import PPOPolicy from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent @@ -173,7 +173,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params): # Double Dueling DQN policy policy = DDDQNPolicy(state_size, get_action_size(), train_params) if True: - policy = PPOAgent(state_size, get_action_size()) + policy = PPOPolicy(state_size, get_action_size()) if False: policy = DeadLockAvoidanceAgent(train_env, get_action_size()) if False: @@ -283,7 +283,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params): next_obs, all_rewards, done, info = train_env.step(map_actions(action_dict)) # Reward shaping .Dead-lock .NotMoving .NotStarted - if True: + if False: agent_positions = get_agent_positions(train_env) for agent_handle in train_env.get_agent_handles(): agent = train_env.agents[agent_handle] diff --git a/reinforcement_learning/multi_policy.py b/reinforcement_learning/multi_policy.py index 0c2ae32..5ee8cb4 100644 --- a/reinforcement_learning/multi_policy.py +++ b/reinforcement_learning/multi_policy.py @@ -2,7 +2,7 @@ import numpy as np from flatland.envs.rail_env import RailEnv from reinforcement_learning.policy import Policy -from reinforcement_learning.ppo_agent import PPOAgent +from reinforcement_learning.ppo_agent import PPOPolicy from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent @@ -13,7 +13,7 @@ class MultiPolicy(Policy): self.memory = [] self.loss = 0 self.deadlock_avoidance_policy = DeadLockAvoidanceAgent(env, action_size, False) - self.ppo_policy = PPOAgent(state_size + action_size, 
action_size) + self.ppo_policy = PPOPolicy(state_size + action_size, action_size) def load(self, filename): self.ppo_policy.load(filename) diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py index f80bea8..43de9f7 100644 --- a/reinforcement_learning/ppo_agent.py +++ b/reinforcement_learning/ppo_agent.py @@ -1,6 +1,7 @@ import copy import os +import numpy as np import torch import torch.nn as nn import torch.optim as optim @@ -94,21 +95,21 @@ class ActorCriticModel(nn.Module): self.critic = self._load(self.critic, filename + ".critic") -class PPOAgent(Policy): +class PPOPolicy(Policy): def __init__(self, state_size, action_size): - super(PPOAgent, self).__init__() - + print(">> PPOPolicy") + super(PPOPolicy, self).__init__() # parameters - self.learning_rate = 1.0e-5 + self.learning_rate = 1.0e-3 self.gamma = 0.95 - self.surrogate_eps_clip = 0.1 - self.K_epoch = 50 - self.weight_loss = 0.5 + self.surrogate_eps_clip = 0.01 + self.K_epoch = 5 + self.weight_loss = 0.25 self.weight_entropy = 0.01 - self.buffer_size = 32_000 - self.batch_size = 512 - self.buffer_min_size = 8_000 + self.buffer_size = 2_000 + self.batch_size = 64 + self.buffer_min_size = 0 self.use_replay_buffer = True self.device = device @@ -117,7 +118,7 @@ class PPOAgent(Policy): self.loss = 0 self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device) self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate) - self.loss_function = nn.SmoothL1Loss() # nn.MSELoss() + self.loss_function = nn.MSELoss() # nn.SmoothL1Loss() def reset(self, env): pass @@ -139,6 +140,22 @@ class PPOAgent(Policy): transition = (state, action, reward, next_state, action_logprobs.item(), done) self.current_episode_memory.push_transition(handle, transition) + def _push_transitions_to_replay_buffer(self, + state_list, + action_list, + reward_list, + state_next_list, + done_list, + prob_a_list): + for idx in range(len(reward_list)): + state_i = state_list[idx] + action_i = action_list[idx] + reward_i = reward_list[idx] + state_next_i = state_next_list[idx] + done_i = done_list[idx] + prob_action_i = prob_a_list[idx] + self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i) + def _convert_transitions_to_torch_tensors(self, transitions_array): # build empty lists(arrays) state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], [] @@ -153,18 +170,23 @@ class PPOAgent(Policy): if done_i: discounted_reward = 0 done_list.insert(0, 1) - reward_i = 1 else: done_list.insert(0, 0) - reward_i = 0 discounted_reward = reward_i + self.gamma * discounted_reward reward_list.insert(0, discounted_reward) state_next_list.insert(0, state_next_i) prob_a_list.insert(0, prob_action_i) - if self.use_replay_buffer: - self.memory.add(state_i, action_i, discounted_reward, state_next_i, done_i) + # standard-normalize rewards + reward_list = np.array(reward_list) + reward_list = (reward_list - reward_list.mean()) / (reward_list.std() + 1.e-5) + + if self.use_replay_buffer: + self._push_transitions_to_replay_buffer(state_list, action_list, + reward_list, state_next_list, + done_list, prob_a_list) + # convert data to torch tensors states, actions, rewards, states_next, dones, prob_actions = \ @@ -175,11 +197,18 @@ class PPOAgent(Policy): torch.tensor(done_list, dtype=torch.float).to(self.device), \ torch.tensor(prob_a_list).to(self.device) - # standard-normalize rewards - # rewards = (rewards - rewards.mean()) / (rewards.std() + 
1.e-5) - return states, actions, rewards, states_next, dones, prob_actions + def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action): + if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size: + states, actions, rewards, states_next, dones, probs_action = self.memory.sample() + actions = torch.squeeze(actions) + rewards = torch.squeeze(rewards) + states_next = torch.squeeze(states_next) + dones = torch.squeeze(dones) + probs_action = torch.squeeze(probs_action) + return states, actions, rewards, states_next, dones, probs_action + def train_net(self): # All agents have to propagate their experiences made during past episode for handle in range(len(self.current_episode_memory)): @@ -189,11 +218,15 @@ class PPOAgent(Policy): # Convert the replay buffer to torch tensors (arrays) states, actions, rewards, states_next, dones, probs_action = \ self._convert_transitions_to_torch_tensors(agent_episode_history) + # Optimize policy for K epochs: for k_loop in range(int(self.K_epoch)): - if self.use_replay_buffer and k_loop > 0: - if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size: - states, actions, rewards, states_next, dones, probs_action = self.memory.sample() + + if self.use_replay_buffer: + states, actions, rewards, states_next, dones, probs_action = \ + self._get_transitions_from_replay_buffer( + states, actions, rewards, states_next, dones, probs_action + ) # Evaluating actions (actor) and values (critic) logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions) @@ -222,7 +255,6 @@ class PPOAgent(Policy): # Transfer the current loss to the agents loss (information) for debug purpose only self.loss = loss.mean().detach().cpu().numpy() - self.K_epoch = max(3, self.K_epoch - 0.01) # Reset all collect transition data self.current_episode_memory.reset() @@ -252,7 +284,7 @@ class PPOAgent(Policy): self.optimizer = self._load(self.optimizer, filename + ".optimizer") def clone(self): - policy = PPOAgent(self.state_size, self.action_size) + policy = PPOPolicy(self.state_size, self.action_size) policy.actor_critic_model = copy.deepcopy(self.actor_critic_model) policy.optimizer = copy.deepcopy(self.optimizer) return self diff --git a/reinforcement_learning/ppo_deadlockavoidance_agent.py b/reinforcement_learning/ppo_deadlockavoidance_agent.py index e344d8e..737634c 100644 --- a/reinforcement_learning/ppo_deadlockavoidance_agent.py +++ b/reinforcement_learning/ppo_deadlockavoidance_agent.py @@ -2,6 +2,7 @@ from flatland.envs.agent_utils import RailAgentStatus from flatland.envs.rail_env import RailEnv, RailEnvActions from reinforcement_learning.policy import Policy +from utils.agent_action_config import map_rail_env_action from utils.agent_can_choose_helper import AgentCanChooseHelper from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent @@ -36,9 +37,7 @@ class MultiDecisionAgent(Policy): return self.learning_agent.act(handle, state, eps) else: act = self.dead_lock_avoidance_agent.act(handle, state, -1.0) - if self.action_size == 4: - act = max(act - 1, 0) - return act + return map_rail_env_action(act) # Agent is still at target cell return RailEnvActions.DO_NOTHING diff --git a/reinforcement_learning/replay_buffer.py b/reinforcement_learning/replay_buffer.py index c288103..2ba147d 100644 --- a/reinforcement_learning/replay_buffer.py +++ b/reinforcement_learning/replay_buffer.py @@ -32,7 +32,6 @@ class ReplayBuffer: def sample(self): """Randomly sample a 
batch of experiences from memory.""" experiences = random.sample(self.memory, k=self.batch_size) - states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \ .float().to(self.device) actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \ diff --git a/reinforcement_learning/rl_agent_test.py b/reinforcement_learning/rl_agent_test.py new file mode 100644 index 0000000..d764b8c --- /dev/null +++ b/reinforcement_learning/rl_agent_test.py @@ -0,0 +1,84 @@ +from collections import deque +from collections import namedtuple + +import gym +import numpy as np + +from reinforcement_learning.dddqn_policy import DDDQNPolicy +from reinforcement_learning.ppo_agent import PPOPolicy + +dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate', + 'tau', 'gamma', 'buffer_min_size', 'use_gpu']) +dddqn_param = dddqn_param_nt(hidden_size=128, + buffer_size=1000, + batch_size=64, + update_every=10, + learning_rate=1.e-3, + tau=1.e-2, + gamma=0.95, + buffer_min_size=0, + use_gpu=False) + + +def cartpole(use_dddqn=False): + eps = 1.0 + eps_decay = 0.99 + min_eps = 0.01 + training_mode = True + + env = gym.make("CartPole-v1") + observation_space = env.observation_space.shape[0] + action_space = env.action_space.n + if not use_dddqn: + policy = PPOPolicy(observation_space, action_space) + else: + policy = DDDQNPolicy(observation_space, action_space, dddqn_param) + episode = 0 + checkpoint_interval = 20 + scores_window = deque(maxlen=checkpoint_interval) + while True: + episode += 1 + state = env.reset() + policy.reset(env) + handle = 0 + tot_reward = 0 + + policy.start_episode(train=training_mode) + while True: + env.render() + policy.start_step(train=training_mode) + action = policy.act(handle, state, eps) + state_next, reward, terminal, info = env.step(action) + policy.end_step(train=training_mode) + tot_reward += reward + # reward = reward if not terminal else -reward + reward = 0 if not terminal else -1 + policy.step(handle, state, action, reward, state_next, terminal) + state = np.copy(state_next) + if terminal: + break + + policy.end_episode(train=training_mode) + eps = max(min_eps, eps * eps_decay) + scores_window.append(tot_reward) + if episode % checkpoint_interval == 0: + print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(episode, + tot_reward, + np.mean( + scores_window), + eps, + len( + policy.memory))) + else: + print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(episode, + tot_reward, + np.mean( + scores_window), + eps, + len( + policy.memory)), + end=" ") + + +if __name__ == "__main__": + cartpole() diff --git a/run.py b/run.py index 8e0535b..a3e116c 100644 --- a/run.py +++ b/run.py @@ -30,7 +30,7 @@ from flatland.envs.predictions import ShortestPathPredictorForRailEnv from flatland.evaluators.client import FlatlandRemoteClient from flatland.evaluators.client import TimeoutException -from reinforcement_learning.ppo_agent import PPOAgent +from reinforcement_learning.ppo_agent import PPOPolicy from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent from utils.agent_action_config import get_action_size, map_actions from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent @@ -49,10 +49,10 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy # Print per-step logs VERBOSE = True USE_FAST_TREEOBS = True -USE_PPO_AGENT = False 
+USE_PPO_AGENT = True # Checkpoint to use (remember to push it!) -checkpoint = "./checkpoints/201215160226-12000.pth" # +checkpoint = "./checkpoints/201217163219-6500.pth" # # checkpoint = "./checkpoints/201215212134-12000.pth" # EPSILON = 0.0 @@ -60,7 +60,7 @@ EPSILON = 0.0 # Use last action cache USE_ACTION_CACHE = False USE_DEAD_LOCK_AVOIDANCE_AGENT = False # 21.54485505223213 -USE_MULTI_DECISION_AGENT = True +USE_MULTI_DECISION_AGENT = False # Observation parameters (must match training parameters!) observation_tree_depth = 2 @@ -105,7 +105,7 @@ action_size = get_action_size() if not USE_PPO_AGENT: trained_policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True) else: - trained_policy = PPOAgent(state_size, action_size) + trained_policy = PPOPolicy(state_size, action_size) trained_policy.load(checkpoint) ##################################################################### diff --git a/utils/agent_action_config.py b/utils/agent_action_config.py index 3a84875..4c1f83f 100644 --- a/utils/agent_action_config.py +++ b/utils/agent_action_config.py @@ -1,3 +1,6 @@ +from flatland.envs.rail_env import RailEnvActions + + def get_flatland_full_action_size(): # The action space of flatland is 5 discrete actions return 5 @@ -14,11 +17,35 @@ def map_actions(actions): return actions for key in actions: value = actions.get(key, 0) - actions.update({key: (value + 1)}) + actions.update({key: map_action(value)}) return actions def map_action(action): if get_action_size() == get_flatland_full_action_size(): return action - return action + 1 + + if action == 0: + return RailEnvActions.MOVE_LEFT + if action == 1: + return RailEnvActions.MOVE_FORWARD + if action == 2: + return RailEnvActions.MOVE_RIGHT + if action == 3: + return RailEnvActions.STOP_MOVING + + +def map_rail_env_action(action): + if get_action_size() == get_flatland_full_action_size(): + return action + + if action == RailEnvActions.MOVE_LEFT: + return 0 + elif action == RailEnvActions.MOVE_FORWARD: + return 1 + elif action == RailEnvActions.MOVE_RIGHT: + return 2 + elif action == RailEnvActions.STOP_MOVING: + return 3 + # action == RailEnvActions.DO_NOTHING: + return 3 diff --git a/utils/dead_lock_avoidance_agent.py b/utils/dead_lock_avoidance_agent.py index 87f7e28..cad3b74 100644 --- a/utils/dead_lock_avoidance_agent.py +++ b/utils/dead_lock_avoidance_agent.py @@ -7,6 +7,7 @@ from flatland.envs.agent_utils import RailAgentStatus from flatland.envs.rail_env import RailEnv, RailEnvActions, fast_count_nonzero from reinforcement_learning.policy import Policy +from utils.agent_action_config import map_rail_env_action from utils.shortest_distance_walker import ShortestDistanceWalker @@ -98,9 +99,7 @@ class DeadLockAvoidanceAgent(Policy): act = RailEnvActions.STOP_MOVING if check is not None: act = check[3] - if self.action_size == 4: - act = max(act - 1, 0) - return act + return map_rail_env_action(act) def get_agent_can_move_value(self, handle): return self.agent_can_move_value.get(handle, np.inf) -- GitLab
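
A minimal usage sketch for the new CartPole smoke test introduced in reinforcement_learning/rl_agent_test.py. It assumes gym and torch are installed and the repository root is on PYTHONPATH; the cartpole entry point and its use_dddqn flag come from the patch above, the rest is illustrative only:

    # Train the default PPOPolicy on CartPole-v1; the loop renders each step
    # and prints the rolling average reward every 20 episodes.
    from reinforcement_learning.rl_agent_test import cartpole

    cartpole()                  # PPOPolicy (default)
    # cartpole(use_dddqn=True)  # exercise the DDDQNPolicy variant instead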