From 8cf48167d8029166695b97feb6237c287e8634c9 Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Thu, 3 Dec 2020 11:28:54 +0100
Subject: [PATCH] looks good

---
 .../multi_agent_training.py             |  4 +--
 reinforcement_learning/ppo/ppo_agent.py | 28 ++++++++++---------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 71e3efd..640db58 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -548,8 +548,8 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=1000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=0,
+    parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=5000, type=int)
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                         type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0,
                         type=int)
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 31d728e..d1064f3 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -10,9 +10,9 @@ from torch.distributions import Categorical
 # Hyperparameters
 from reinforcement_learning.policy import Policy
 
-LEARNING_RATE = 0.00001
+LEARNING_RATE = 0.1e-4
 GAMMA = 0.98
-LMBDA = 0.95
+LMBDA = 0.9
 EPS_CLIP = 0.1
 K_EPOCH = 3
 
@@ -76,7 +76,7 @@ class PPOAgent(Policy):
         transition = (state, action, reward, next_state, prob[action].item(), done)
         self.memory.push_transition(handle, transition)
 
-    def _convert_transitions_to_torch(self, transitions_array):
+    def _convert_transitions_to_torch_tensors(self, transitions_array):
         state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
         total_reward = 0
         for transition in transitions_array:
@@ -108,14 +108,14 @@ class PPOAgent(Policy):
             agent_episode_history = self.memory.get_transitions(handle)
             if len(agent_episode_history) > 0:
                 # convert the replay buffer to torch tensors (arrays)
-                state, action, reward, state_next, done, prob_action = \
-                    self._convert_transitions_to_torch(agent_episode_history)
+                states, actions, rewards, states_next, dones, probs_action = \
+                    self._convert_transitions_to_torch_tensors(agent_episode_history)
 
                 # run K_EPOCH optimisation steps
                 for i in range(K_EPOCH):
                     # temporal difference function / and prepare advantage function data
-                    estimated_target_value = reward + GAMMA * self.v(state_next) * (1.0 - done)
-                    difference_to_expected_value_deltas = estimated_target_value - self.v(state)
+                    estimated_target_value = rewards + GAMMA * self.v(states_next) * (1.0 - dones)
+                    difference_to_expected_value_deltas = estimated_target_value - self.v(states)
                     difference_to_expected_value_deltas = difference_to_expected_value_deltas.detach().numpy()
 
                     # build advantage function and convert it to torch tensor (array)
@@ -125,16 +125,18 @@ class PPOAgent(Policy):
                         advantage_value = LMBDA * advantage_value + difference_to_expected_value_t[0]
                         advantage_list.append([advantage_value])
                     advantage_list.reverse()
-                    advantage = torch.tensor(advantage_list, dtype=torch.float)
+                    advantages = torch.tensor(advantage_list, dtype=torch.float)
 
-                    pi_action = self.pi(state, softmax_dim=1).gather(1, action)
-                    ratio = torch.exp(torch.log(pi_action) - torch.log(prob_action))  # a/b == exp(log(a)-log(b))
+                    # estimate pi_action for all state
+                    pi_actions = self.pi(states, softmax_dim=1).gather(1, actions)
+                    # calculate the ratios
+                    ratios = torch.exp(torch.log(pi_actions) - torch.log(probs_action))
                     # Normal Policy Gradient objective
-                    surrogate_objective = ratio * advantage
+                    surrogate_objective = ratios * advantages
                     # clipped version of Normal Policy Gradient objective
-                    clipped_surrogate_objective = torch.clamp(ratio * advantage, 1 - EPS_CLIP, 1 + EPS_CLIP)
+                    clipped_surrogate_objective = torch.clamp(ratios * advantages, 1 - EPS_CLIP, 1 + EPS_CLIP)
                     # value function loss
-                    value_loss = F.mse_loss(self.v(state), estimated_target_value.detach())
+                    value_loss = F.mse_loss(self.v(states), estimated_target_value.detach())
 
                     # loss
                     loss = -torch.min(surrogate_objective, clipped_surrogate_objective) + value_loss
--
GitLab
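
Note: the ppo_agent.py hunks above compute a clipped-surrogate PPO loss. The short sketch below (placed after the signature line, so it is not part of the patch itself) shows that objective in isolation with placeholder tensors; names such as probs_old, probs_new, values and targets are assumptions standing in for the repository's pi and v networks. It clips the probability ratio as in standard PPO, whereas the patched code clamps ratios * advantages; treat it as an illustration, not as the project's implementation.

# Illustrative sketch only, not part of the patch: standalone clipped-surrogate
# PPO loss with placeholder tensors (probs_old, probs_new, advantages, values,
# targets are assumed example values, not taken from the repository).
import torch
import torch.nn.functional as F

EPS_CLIP = 0.1  # same clipping constant as in ppo_agent.py

probs_old = torch.tensor([[0.25], [0.30], [0.20], [0.25]])  # pi_old(a|s) from the rollout
probs_new = torch.tensor([[0.30], [0.28], [0.22], [0.20]])  # pi_new(a|s) under the current policy
advantages = torch.tensor([[0.5], [-0.2], [0.1], [0.3]])    # GAE-style advantage estimates
values = torch.tensor([[0.9], [1.1], [0.7], [0.8]])         # V(s) predictions
targets = torch.tensor([[1.0], [1.0], [0.6], [0.9]])        # bootstrapped value targets

# probability ratio r = pi_new / pi_old, computed in log space as in the patch
ratios = torch.exp(torch.log(probs_new) - torch.log(probs_old))

# unclipped surrogate and the standard PPO clipped surrogate (clamp the ratio,
# then multiply by the advantage); the patched code clamps ratios * advantages
surrogate = ratios * advantages
clipped_surrogate = torch.clamp(ratios, 1 - EPS_CLIP, 1 + EPS_CLIP) * advantages

# value-function loss plus policy loss, mirroring the structure of the hunk;
# .mean() reduces the per-sample objective to a scalar for backpropagation
value_loss = F.mse_loss(values, targets.detach())
loss = -torch.min(surrogate, clipped_surrogate).mean() + value_loss
print(loss.item())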