diff --git a/reinforcement_learning/dddqn_policy.py b/reinforcement_learning/dddqn_policy.py
index 7a3525d903d323487507f991e6bcb099a436b35e..ecd3e463ba6dfe98c87ea7eb33d418d4f11b4b73 100644
--- a/reinforcement_learning/dddqn_policy.py
+++ b/reinforcement_learning/dddqn_policy.py
@@ -2,7 +2,6 @@ import copy
 import os
 import pickle
 import random
-from collections import namedtuple, deque, Iterable
 
 import numpy as np
 import torch
@@ -11,6 +10,7 @@ import torch.optim as optim
 
 from reinforcement_learning.model import DuelingQNetwork
 from reinforcement_learning.policy import Policy
+from reinforcement_learning.replay_buffer import ReplayBuffer
 
 
 class DDDQNPolicy(Policy):
@@ -90,7 +90,7 @@ class DDDQNPolicy(Policy):
 
     def _learn(self):
         experiences = self.memory.sample()
-        states, actions, rewards, next_states, dones = experiences
+        states, actions, rewards, next_states, dones, _ = experiences
 
         # Get expected Q values from local model
         q_expected = self.qnetwork_local(states).gather(1, actions)
@@ -161,55 +161,3 @@ class DDDQNPolicy(Policy):
         me.qnetwork_target = copy.deepcopy(self.qnetwork_local)
         me.qnetwork_target = copy.deepcopy(self.qnetwork_target)
         return me
-
-
-Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
-
-
-class ReplayBuffer:
-    """Fixed-size buffer to store experience tuples."""
-
-    def __init__(self, action_size, buffer_size, batch_size, device):
-        """Initialize a ReplayBuffer object.
-
-        Params
-        ======
-            action_size (int): dimension of each action
-            buffer_size (int): maximum size of buffer
-            batch_size (int): size of each training batch
-        """
-        self.action_size = action_size
-        self.memory = deque(maxlen=buffer_size)
-        self.batch_size = batch_size
-        self.device = device
-
-    def add(self, state, action, reward, next_state, done):
-        """Add a new experience to memory."""
-        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
-        self.memory.append(e)
-
-    def sample(self):
-        """Randomly sample a batch of experiences from memory."""
-        experiences = random.sample(self.memory, k=self.batch_size)
-
-        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
-            .float().to(self.device)
-        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
-            .long().to(self.device)
-        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
-            .float().to(self.device)
-        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
-            .float().to(self.device)
-        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
-            .float().to(self.device)
-
-        return states, actions, rewards, next_states, dones
-
-    def __len__(self):
-        """Return the current size of internal memory."""
-        return len(self.memory)
-
-    def __v_stack_impr(self, states):
-        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
-        np_states = np.reshape(np.array(states), (len(states), sub_dim))
-        return np_states
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 5cb6ba1d633e8997b255447521a2938270d6d173..c12750356efcccf7225bbe51e0c13fbfd44ac41b 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -283,28 +283,26 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             next_obs, all_rewards, done, info = train_env.step(map_actions(action_dict))
 
             # Reward shaping .Dead-lock .NotMoving .NotStarted
-            if False:
+            if True:
                 agent_positions = get_agent_positions(train_env)
                 for agent_handle in train_env.get_agent_handles():
                     agent = train_env.agents[agent_handle]
-                    act = action_dict.get(agent_handle, RailEnvActions.DO_NOTHING)
-                    act = map_action(act)
+                    act = map_action(action_dict.get(agent_handle, RailEnvActions.DO_NOTHING))
                     if agent.status == RailAgentStatus.ACTIVE:
-                        all_rewards[agent_handle] = 0.0
                         if done[agent_handle] == False:
                             if check_for_deadlock(agent_handle, train_env, agent_positions):
-                                all_rewards[agent_handle] = -5.0
+                                all_rewards[agent_handle] -= 5.0
                             else:
                                 pos = agent.position
                                 possible_transitions = train_env.rail.get_transitions(*pos, agent.direction)
                                 num_transitions = fast_count_nonzero(possible_transitions)
                                 if num_transitions < 2 and ((act != RailEnvActions.MOVE_FORWARD) or (act != RailEnvActions.STOP_MOVING)):
-                                    all_rewards[agent_handle] = -0.5
+                                    all_rewards[agent_handle] -= 0.5
                                 else:
-                                    all_rewards[agent_handle] = -0.01
+                                    all_rewards[agent_handle] -= 0.01
                         else:
-                            all_rewards[agent_handle] *= 10.0
+                            all_rewards[agent_handle] *= 9.0
                             all_rewards[agent_handle] += 1.0
 
             step_timer.end()
@@ -523,9 +521,9 @@ if __name__ == "__main__":
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=12000, type=int)
     parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=3,
                         type=int)
-    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=10, type=int)
+    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=1, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
     parser.add_argument("--eps_start", help="max exploration", default=0.1, type=float)
     parser.add_argument("--eps_end", help="min exploration", default=0.005, type=float)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index a4e74ec884fd99288e45353a56d52cf31a27555f..16206e9f6330e8cfe1b96ea2b797e4ea4f1ecdaf 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -8,6 +8,7 @@ from torch.distributions import Categorical
 
 # Hyperparameters
 from reinforcement_learning.policy import Policy
+from reinforcement_learning.replay_buffer import ReplayBuffer
 
 device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
 print("device:", device)
@@ -15,7 +16,7 @@ print("device:", device)
 
 
 # https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
-class DataBuffers:
+class EpisodeBuffers:
     def __init__(self):
         self.reset()
 
@@ -37,8 +38,9 @@ class DataBuffers:
 
 
 class ActorCriticModel(nn.Module):
-    def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128):
+    def __init__(self, state_size, action_size, device, hidsize1=128, hidsize2=128):
         super(ActorCriticModel, self).__init__()
+        self.device = device
         self.actor = nn.Sequential(
             nn.Linear(state_size, hidsize1),
             nn.Tanh(),
@@ -46,7 +48,7 @@ class ActorCriticModel(nn.Module):
             nn.Tanh(),
             nn.Linear(hidsize2, action_size),
             nn.Softmax(dim=-1)
-        ).to(device)
+        ).to(self.device)
 
         self.critic = nn.Sequential(
             nn.Linear(state_size, hidsize1),
@@ -54,7 +56,7 @@ class ActorCriticModel(nn.Module):
             nn.Linear(hidsize1, hidsize2),
             nn.Tanh(),
             nn.Linear(hidsize2, 1)
-        ).to(device)
+        ).to(self.device)
 
     def forward(self, x):
         raise NotImplementedError
@@ -81,7 +83,7 @@ class ActorCriticModel(nn.Module):
         if os.path.exists(filename):
             print(' >> ', filename)
             try:
-                obj.load_state_dict(torch.load(filename, map_location=device))
+                obj.load_state_dict(torch.load(filename, map_location=self.device))
             except:
                 print(" >> failed!")
         return obj
@@ -104,10 +106,16 @@ class PPOAgent(Policy):
         self.weight_loss = 0.5
         self.weight_entropy = 0.01
 
-        # objects
-        self.memory = DataBuffers()
+        self.buffer_size = 32_000
+        self.batch_size = 512
+        self.buffer_min_size = 8_000
+        self.use_replay_buffer = True
+        self.device = device
+
+        self.current_episode_memory = EpisodeBuffers()
+        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
         self.loss = 0
-        self.actor_critic_model = ActorCriticModel(state_size, action_size)
+        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
         self.loss_function = nn.SmoothL1Loss()  # nn.MSELoss()
 
@@ -116,20 +124,20 @@ class PPOAgent(Policy):
 
     def act(self, handle, state, eps=None):
         # sample a action to take
-        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
         dist = self.actor_critic_model.get_actor_dist(torch_state)
         action = dist.sample()
         return action.item()
 
     def step(self, handle, state, action, reward, next_state, done):
         # record transitions ([state] -> [action] -> [reward, next_state, done])
-        torch_action = torch.tensor(action, dtype=torch.float).to(device)
-        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
+        torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
         # evaluate actor
         dist = self.actor_critic_model.get_actor_dist(torch_state)
         action_logprobs = dist.log_prob(torch_action)
         transition = (state, action, reward, next_state, action_logprobs.item(), done)
-        self.memory.push_transition(handle, transition)
+        self.current_episode_memory.push_transition(handle, transition)
 
     def _convert_transitions_to_torch_tensors(self, transitions_array):
         # build empty lists(arrays)
@@ -155,14 +163,22 @@ class PPOAgent(Policy):
             state_next_list.insert(0, state_next_i)
             prob_a_list.insert(0, prob_action_i)
 
+            if self.use_replay_buffer:
+                self.memory.add(state_i, action_i, discounted_reward, state_next_i, done_i)
+
+        if self.use_replay_buffer:
+            if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
+                states, actions, rewards, next_states, dones, prob_actions = self.memory.sample()
+                return states, actions, rewards, next_states, dones, prob_actions
+
         # convert data to torch tensors
         states, actions, rewards, states_next, dones, prob_actions = \
-            torch.tensor(state_list, dtype=torch.float).to(device), \
-            torch.tensor(action_list).to(device), \
-            torch.tensor(reward_list, dtype=torch.float).to(device), \
-            torch.tensor(state_next_list, dtype=torch.float).to(device), \
-            torch.tensor(done_list, dtype=torch.float).to(device), \
-            torch.tensor(prob_a_list).to(device)
+            torch.tensor(state_list, dtype=torch.float).to(self.device), \
+            torch.tensor(action_list).to(self.device), \
+            torch.tensor(reward_list, dtype=torch.float).to(self.device), \
+            torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
+            torch.tensor(done_list, dtype=torch.float).to(self.device), \
+            torch.tensor(prob_a_list).to(self.device)
 
         # standard-normalize rewards
         # rewards = (rewards - rewards.mean()) / (rewards.std() + 1.e-5)
@@ -171,9 +187,9 @@ class PPOAgent(Policy):
 
     def train_net(self):
         # All agents have to propagate their experiences made during past episode
-        for handle in range(len(self.memory)):
+        for handle in range(len(self.current_episode_memory)):
             # Extract agent's episode history (list of all transitions)
-            agent_episode_history = self.memory.get_transitions(handle)
+            agent_episode_history = self.current_episode_memory.get_transitions(handle)
             if len(agent_episode_history) > 0:
                 # Convert the replay buffer to torch tensors (arrays)
                 states, actions, rewards, states_next, dones, probs_action = \
@@ -209,7 +225,7 @@ class PPOAgent(Policy):
             self.K_epoch = max(3, self.K_epoch - 0.01)
 
         # Reset all collect transition data
-        self.memory.reset()
+        self.current_episode_memory.reset()
 
     def end_episode(self, train):
         if train:
@@ -225,7 +241,7 @@ class PPOAgent(Policy):
         if os.path.exists(filename):
             print(' >> ', filename)
             try:
-                obj.load_state_dict(torch.load(filename, map_location=device))
+                obj.load_state_dict(torch.load(filename, map_location=self.device))
             except:
                 print(" >> failed!")
         return obj
diff --git a/reinforcement_learning/replay_buffer.py b/reinforcement_learning/replay_buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28810398cdc8243e5e72c700678b642a7bc568a
--- /dev/null
+++ b/reinforcement_learning/replay_buffer.py
@@ -0,0 +1,58 @@
+import random
+from collections import namedtuple, deque, Iterable
+
+import numpy as np
+import torch
+
+Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
+
+
+class ReplayBuffer:
+    """Fixed-size buffer to store experience tuples."""
+
+    def __init__(self, action_size, buffer_size, batch_size, device):
+        """Initialize a ReplayBuffer object.
+
+        Params
+        ======
+            action_size (int): dimension of each action
+            buffer_size (int): maximum size of buffer
+            batch_size (int): size of each training batch
+        """
+        self.action_size = action_size
+        self.memory = deque(maxlen=buffer_size)
+        self.batch_size = batch_size
+        self.device = device
+
+    def add(self, state, action, reward, next_state, done, action_prob=0.0):
+        """Add a new experience to memory."""
+        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
+        self.memory.append(e)
+
+    def sample(self):
+        """Randomly sample a batch of experiences from memory."""
+        experiences = random.sample(self.memory, k=self.batch_size)
+
+        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
+            .float().to(self.device)
+        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
+            .long().to(self.device)
+        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
+            .float().to(self.device)
+        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
+            .float().to(self.device)
+        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
+            .float().to(self.device)
+        action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
+            .float().to(self.device)
+
+        return states, actions, rewards, next_states, dones, action_probs
+
+    def __len__(self):
+        """Return the current size of internal memory."""
+        return len(self.memory)
+
+    def __v_stack_impr(self, states):
+        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
+        np_states = np.reshape(np.array(states), (len(states), sub_dim))
+        return np_states
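
For reference, a minimal usage sketch of the extracted ReplayBuffer as it is exercised by this patch: add() stores one transition (action_prob defaults to 0.0 for the DQN path), and sample() now returns six tensors, so DDDQNPolicy._learn() discards the trailing action_prob batch while PPOAgent keeps it. The state size, buffer/batch sizes and random transitions below are illustrative only and are not values taken from the patch; note also that the module's "from collections import Iterable" import assumes Python < 3.10.

    # Illustrative only -- not part of the patch itself.
    import numpy as np
    import torch

    from reinforcement_learning.replay_buffer import ReplayBuffer

    state_size, action_size = 4, 5
    memory = ReplayBuffer(action_size, buffer_size=1_000, batch_size=32, device=torch.device("cpu"))

    # Fill the buffer with dummy transitions; action_prob only matters for the PPO sampling path.
    for _ in range(100):
        memory.add(state=np.random.rand(state_size),
                   action=np.random.randint(action_size),
                   reward=-0.01,
                   next_state=np.random.rand(state_size),
                   done=False,
                   action_prob=0.2)

    # Six tensors come back; DQN-style callers simply ignore the last one.
    states, actions, rewards, next_states, dones, action_probs = memory.sample()
    print(states.shape, actions.shape, action_probs.shape)
    # torch.Size([32, 4]) torch.Size([32, 1]) torch.Size([32, 1])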