diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 730774bc4cdfc2ada170d1fadf5f0c5f9dc6b23e..68bcb64d7ad86e050b6bb15d01671b7458895208 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -560,7 +560,7 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=25000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=0,
                         type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0,
                         type=int)
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 77a0fb5c8c933b2966e45128191e34eeee187c3d..446012bbf8d8886d516ffe55982aa2d1afa9e158 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -10,14 +10,6 @@ from torch.distributions import Categorical
 # Hyperparameters
 from reinforcement_learning.policy import Policy
 
-LEARNING_RATE = 0.1e-4
-GAMMA = 0.98
-LAMBDA = 0.9
-SURROGATE_EPS_CLIP = 0.01
-K_EPOCH = 3
-WEIGHT_LOSS = 0.5
-WEIGHT_ENTROPY = 0.01
-
 device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
 print("device:", device)
 
@@ -101,10 +93,20 @@ class ActorCriticModel(nn.Module):
 class PPOAgent(Policy):
     def __init__(self, state_size, action_size):
         super(PPOAgent, self).__init__()
+
+        # parameters
+        self.learning_rate = 0.1e-3
+        self.gamma = 0.98
+        self.surrogate_eps_clip = 0.1
+        self.K_epoch = 3
+        self.weight_loss = 0.9
+        self.weight_entropy = 0.01
+
+        # objects
         self.memory = DataBuffers()
         self.loss = 0
         self.actor_critic_model = ActorCriticModel(state_size, action_size)
-        self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=LEARNING_RATE)
+        self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
         self.lossFunction = nn.MSELoss()
 
     def reset(self):
@@ -136,7 +138,7 @@ class PPOAgent(Policy):
                 discounted_reward = 0
                 done_list.insert(0, 1)
             else:
-                discounted_reward = reward_i + GAMMA * discounted_reward
+                discounted_reward = reward_i + self.gamma * discounted_reward
                 done_list.insert(0, 0)
             reward_list.insert(0, discounted_reward)
             state_next_list.insert(0, state_next_i)
@@ -165,7 +167,7 @@ class PPOAgent(Policy):
                     self._convert_transitions_to_torch_tensors(agent_episode_history)
 
                 # Optimize policy for K epochs:
-                for _ in range(K_EPOCH):
+                for _ in range(self.K_epoch):
                     # evaluating actions (actor) and values (critic)
                     logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
 
@@ -175,11 +177,11 @@ class PPOAgent(Policy):
                     # finding Surrogate Loss:
                     advantages = rewards - state_values.detach()
                     surr1 = ratios * advantages
-                    surr2 = torch.clamp(ratios, 1 - SURROGATE_EPS_CLIP, 1 + SURROGATE_EPS_CLIP) * advantages
+                    surr2 = torch.clamp(ratios, 1 - self.surrogate_eps_clip, 1 + self.surrogate_eps_clip) * advantages
                     loss = \
                         -torch.min(surr1, surr2) \
-                        + WEIGHT_LOSS * self.lossFunction(state_values, rewards) \
-                        - WEIGHT_ENTROPY * dist_entropy
+                        + self.weight_loss * self.lossFunction(state_values, rewards) \
+                        - self.weight_entropy * dist_entropy
 
                     # make a gradient step
                     self.optimizer.zero_grad()
diff --git a/reinforcement_learning/single_agent_training.py b/reinforcement_learning/single_agent_training.py
index a5ee6c5132652757cdd8fd8dab6992e08fdfd14b..bfcc88656c8b37a8c09e72b51701d0750cf7f238 100644
--- a/reinforcement_learning/single_agent_training.py
+++ b/reinforcement_learning/single_agent_training.py
@@ -103,9 +103,9 @@ def train_agent(n_episodes):
         'buffer_size': int(1e5),
         'batch_size': 32,
         'update_every': 8,
-        'LEARNING_RATE': 0.5e-4,
+        'learning_rate': 0.5e-4,
         'tau': 1e-3,
-        'GAMMA': 0.99,
+        'gamma': 0.99,
         'buffer_min_size': 0,
         'hidden_size': 256,
         'use_gpu': False