diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 94c6208ddde96a1bd9aea3728bb2a73f3f4ab667..a3a2b1a3c69921e15715115e7db50ca5239a67f1 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -22,7 +22,8 @@ from torch.utils.tensorboard import SummaryWriter
 from reinforcement_learning.dddqn_policy import DDDQNPolicy
 from reinforcement_learning.ppo_agent import PPOPolicy
 from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent
-from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action
+from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action, \
+    map_rail_env_action
 from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
 from utils.deadlock_check import get_agent_positions, check_for_deadlock
 
@@ -173,7 +174,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     # Double Dueling DQN policy
     policy = DDDQNPolicy(state_size, get_action_size(), train_params)
     if True:
-        policy = PPOPolicy(state_size, get_action_size())
+        policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=True, in_parameters=train_params)
     if False:
         policy = DeadLockAvoidanceAgent(train_env, get_action_size())
     if False:
@@ -517,9 +518,9 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=12000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                         type=int)
-    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                         type=int)
     parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=10, type=int)
     parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index 4072ecc190361f3a08aa150c80db17e66675017c..d467cfeb018811fdfc3b8cda0ed6a1c6519d376f 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -1,7 +1,6 @@
 import copy
 import os
 
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -11,10 +10,6 @@ from torch.distributions import Categorical
 from reinforcement_learning.policy import LearningPolicy
 from reinforcement_learning.replay_buffer import ReplayBuffer
 
-device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
-print("device:", device)
-
-
 # https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
 
 class EpisodeBuffers:
@@ -96,27 +91,46 @@ class ActorCriticModel(nn.Module):
 
 
 class PPOPolicy(LearningPolicy):
-    def __init__(self, state_size, action_size, use_replay_buffer=False):
+    def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
         print(">> PPOPolicy")
         super(PPOPolicy, self).__init__()
         # parameters
-        self.learning_rate = 1.0e-3
-        self.gamma = 0.95
+        self.ppo_parameters = in_parameters
+        if self.ppo_parameters is not None:
+            self.hidsize = self.ppo_parameters.hidden_size
+            self.buffer_size = self.ppo_parameters.buffer_size
+            self.batch_size = self.ppo_parameters.batch_size
+            self.learning_rate = self.ppo_parameters.learning_rate
+            self.gamma = self.ppo_parameters.gamma
+            # Device
+            if self.ppo_parameters.use_gpu and torch.cuda.is_available():
+                self.device = torch.device("cuda:0")
+                # print("🐇 Using GPU")
+            else:
+                self.device = torch.device("cpu")
+                # print("🐢 Using CPU")
+        else:
+            self.hidsize = 128
+            self.learning_rate = 1.0e-3
+            self.gamma = 0.95
+            self.buffer_size = 32_000
+            self.batch_size = 1024
+            self.device = torch.device("cpu")
+
         self.surrogate_eps_clip = 0.1
         self.K_epoch = 10
         self.weight_loss = 0.5
         self.weight_entropy = 0.01
-        self.buffer_size = 32_000
-        self.batch_size = 1024
         self.buffer_min_size = 0
         self.use_replay_buffer = use_replay_buffer
-        self.device = device
 
         self.current_episode_memory = EpisodeBuffers()
         self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
         self.loss = 0
-        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
+        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
+                                                   hidsize1=self.hidsize,
+                                                   hidsize2=self.hidsize)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
         self.loss_function = nn.MSELoss()  # nn.SmoothL1Loss()
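For reference, a minimal usage sketch of the updated constructor (not part of the patch): it assumes a parameter object exposing the attributes PPOPolicy.__init__ now reads (hidden_size, buffer_size, batch_size, learning_rate, gamma, use_gpu); in training this object is the argparse Namespace passed as train_params. The state_size and action_size values below are illustrative placeholders.

# Usage sketch (illustrative only): construct PPOPolicy the way the updated
# train_agent() now does, passing the training parameters through.
from types import SimpleNamespace

from reinforcement_learning.ppo_agent import PPOPolicy

# Stand-in for the argparse Namespace (train_params); attribute names match
# those read in PPOPolicy.__init__.
train_params = SimpleNamespace(
    hidden_size=128,       # -> self.hidsize (actor/critic hidden layer width)
    buffer_size=32_000,    # replay buffer capacity
    batch_size=1024,       # replay sampling batch size
    learning_rate=1.0e-3,
    gamma=0.95,
    use_gpu=False,         # True + available CUDA -> cuda:0, else cpu
)

# Placeholder sizes for illustration; in training they come from the
# observation builder and get_action_size().
policy = PPOPolicy(state_size=231, action_size=5,
                   use_replay_buffer=True, in_parameters=train_params)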