From 769f25ec37bcf2cd99e4ca461aace74a908ff154 Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Thu, 3 Dec 2020 16:19:33 +0100
Subject: [PATCH] small fix in object

---
 reinforcement_learning/multi_agent_training.py |  4 ++--
 reinforcement_learning/ppo/ppo_agent.py        | 14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 640db58..bc1dc7f 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -257,7 +257,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
         # Reset environment
         reset_timer.start()
         number_of_agents = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
-        train_env_params.n_agents = 1  # episode_idx % number_of_agents + 1
+        train_env_params.n_agents = episode_idx % number_of_agents + 1
 
         train_env = create_rail_env(train_env_params, tree_observation)
         obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
@@ -314,7 +314,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
            next_obs, all_rewards, done, info = train_env.step(action_dict)
 
            # Reward shaping .Dead-lock .NotMoving .NotStarted
-           if False:
+           if True:
                agent_positions = get_agent_positions(train_env)
                for agent_handle in train_env.get_agent_handles():
                    agent = train_env.agents[agent_handle]
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 3322c82..3240e6f 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -12,11 +12,11 @@ from reinforcement_learning.policy import Policy
 
 LEARNING_RATE = 0.1e-4
 GAMMA = 0.98
-LMBDA = 0.9
-EPS_CLIP = 0.1
+LAMBDA = 0.9
+SURROGATE_EPS_CLIP = 0.01
 K_EPOCH = 3
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cpu")#"cuda:0" if torch.cuda.is_available() else "cpu")
 print("device:", device)
 
 
@@ -215,7 +215,7 @@ class PPOAgent(Policy):
         advantage_list = []
         advantage_value = 0.0
         for difference_to_expected_value_t in difference_to_expected_value_deltas[::-1]:
-            advantage_value = LMBDA * advantage_value + difference_to_expected_value_t[0]
+            advantage_value = LAMBDA * advantage_value + difference_to_expected_value_t[0]
             advantage_list.append([advantage_value])
         advantage_list.reverse()
         advantages = torch.tensor(advantage_list, dtype=torch.float)
@@ -227,9 +227,11 @@ class PPOAgent(Policy):
         # Normal Policy Gradient objective
         surrogate_objective = ratios * advantages
         # clipped version of Normal Policy Gradient objective
-        clipped_surrogate_objective = torch.clamp(ratios * advantages, 1 - EPS_CLIP, 1 + EPS_CLIP)
+        clipped_surrogate_objective = torch.clamp(ratios * advantages,
+                                                  1 - SURROGATE_EPS_CLIP,
+                                                  1 + SURROGATE_EPS_CLIP)
         # create value loss function
-        value_loss = F.mse_loss(self.value_network(states),
+        value_loss = F.smooth_l1_loss(self.value_network(states),
                                 estimated_target_value.detach())
         # create final loss function
         loss = -torch.min(surrogate_objective, clipped_surrogate_objective) + value_loss
-- 
GitLab
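
Editor's note: the sketch below is a minimal, self-contained reconstruction of the loss computation the patched ppo_agent.py arrives at (LAMBDA-discounted advantage accumulation, clipped surrogate objective with SURROGATE_EPS_CLIP, smooth-L1 value loss). It is not the repository's code; the stand-in tensors (deltas, ratios, values, targets) are synthetic and purely illustrative.

    import torch
    import torch.nn.functional as F

    LAMBDA = 0.9
    SURROGATE_EPS_CLIP = 0.01

    # Hypothetical per-step inputs standing in for the agent's real tensors.
    deltas = [[0.2], [-0.1], [0.05]]            # TD residuals, oldest first
    ratios = torch.tensor([1.02, 0.97, 1.01])   # pi_new(a|s) / pi_old(a|s)
    values = torch.tensor([0.50, 0.40, 0.30])   # value_network(states)
    targets = torch.tensor([0.55, 0.38, 0.33])  # estimated_target_value

    # Accumulate advantages backwards through time, as in the patched loop.
    advantage_list, advantage_value = [], 0.0
    for delta_t in deltas[::-1]:
        advantage_value = LAMBDA * advantage_value + delta_t[0]
        advantage_list.append([advantage_value])
    advantage_list.reverse()
    advantages = torch.tensor(advantage_list, dtype=torch.float).squeeze(-1)

    # Surrogate objective and its clipped counterpart. Note: the patch clamps
    # the product ratios * advantages; the canonical PPO formulation clamps
    # the ratio alone, i.e. torch.clamp(ratios, 1 - eps, 1 + eps) * advantages.
    surrogate_objective = ratios * advantages
    clipped_surrogate_objective = torch.clamp(ratios * advantages,
                                              1 - SURROGATE_EPS_CLIP,
                                              1 + SURROGATE_EPS_CLIP)

    # Value loss switched from F.mse_loss to the more outlier-robust smooth L1.
    value_loss = F.smooth_l1_loss(values, targets.detach())

    loss = (-torch.min(surrogate_objective, clipped_surrogate_objective) + value_loss).mean()
    print("loss:", loss.item())

The remaining hunks only toggle behaviour: multi_agent_training.py re-enables the curriculum on the number of agents and switches the reward-shaping block from "if False:" to "if True:", and the device is pinned to CPU.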