From 729722e31996457ada1ce2a4cf9de515eded06bc Mon Sep 17 00:00:00 2001
From: "Egli Adrian (IT-SCI-API-PFI)" <adrian.egli@sbb.ch>
Date: Mon, 7 Dec 2020 10:41:48 +0100
Subject: [PATCH] very slow convergence, but it converges :-)

---
 .../multi_agent_training.py         |  4 +-
 reinforcement_learning/ppo_agent.py | 47 +++++++++++--------
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 098eb68..01add0f 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -336,7 +336,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
             elif agent.status == RailAgentStatus.READY_TO_DEPART:
                 all_rewards[agent_handle] -= 5.0
             else:
-                if True:
+                if False:
                     agent_positions = get_agent_positions(train_env)
                     for agent_handle in train_env.get_agent_handles():
                         agent = train_env.agents[agent_handle]
@@ -565,7 +565,7 @@ if __name__ == "__main__":
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0,
                         type=int)
     parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=5, type=int)
-    parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=200, type=int)
+    parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=2000, type=int)
     parser.add_argument("--eps_start", help="max exploration", default=1.0, type=float)
     parser.add_argument("--eps_end", help="min exploration", default=0.05, type=float)
     parser.add_argument("--eps_decay", help="exploration decay", default=0.9975, type=float)
diff --git a/reinforcement_learning/ppo_agent.py b/reinforcement_learning/ppo_agent.py
index f4382bd..2c4119f 100644
--- a/reinforcement_learning/ppo_agent.py
+++ b/reinforcement_learning/ppo_agent.py
@@ -1,16 +1,16 @@
 import copy
 import os
 
+import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 import torch.optim as optim
 from torch.distributions import Categorical
 
 # Hyperparameters
 from reinforcement_learning.policy import Policy
 
-device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu"
 print("device:", device)
 
 
@@ -43,7 +43,8 @@ class ActorCriticModel(nn.Module):
             nn.Tanh(),
             nn.Linear(hidsize1, hidsize2),
             nn.Tanh(),
-            nn.Linear(hidsize2, action_size)
+            nn.Linear(hidsize2, action_size),
+            nn.Softmax(dim=-1)
         )
 
         self.critic = nn.Sequential(
@@ -57,13 +58,13 @@ class ActorCriticModel(nn.Module):
     def forward(self, x):
         raise NotImplementedError
 
-    def act_prob(self, states, softmax_dim=0):
-        x = self.actor(states)
-        prob = F.softmax(x, dim=softmax_dim)
-        return prob
+    def get_actor_dist(self, state):
+        action_probs = self.actor(state)
+        dist = Categorical(action_probs)
+        return dist
 
     def evaluate(self, states, actions):
-        action_probs = self.act_prob(states)
+        action_probs = self.actor(states)
         dist = Categorical(action_probs)
         action_logprobs = dist.log_prob(actions)
         dist_entropy = dist.entropy()
@@ -95,11 +96,11 @@ class PPOAgent(Policy):
         super(PPOAgent, self).__init__()
 
         # parameters
-        self.learning_rate = 0.1e-3
-        self.gamma = 0.98
-        self.surrogate_eps_clip = 0.1
+        self.learning_rate = 0.1e-4
+        self.gamma = 0.99
+        self.surrogate_eps_clip = 0.2
         self.K_epoch = 3
-        self.weight_loss = 0.9
+        self.weight_loss = 0.5
         self.weight_entropy = 0.01
 
         # objects
@@ -107,20 +108,26 @@ class PPOAgent(Policy):
         self.loss = 0
         self.actor_critic_model = ActorCriticModel(state_size, action_size)
         self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
-        self.lossFunction = nn.MSELoss()
+        self.loss_function = nn.MSELoss()
 
     def reset(self):
         pass
 
     def act(self, state, eps=None):
         # sample a action to take
-        prob = self.actor_critic_model.act_prob(torch.from_numpy(state).float())
-        return Categorical(prob).sample().item()
+        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        dist = self.actor_critic_model.get_actor_dist(torch_state)
+        action = dist.sample()
+        return action.item()
 
     def step(self, handle, state, action, reward, next_state, done):
-        # record transitions ([state] -> [action] -> [reward, nextstate, done])
-        prob = self.actor_critic_model.act_prob(torch.from_numpy(state).float())
-        transition = (state, action, reward, next_state, prob[action].item(), done)
+        # record transitions ([state] -> [action] -> [reward, next_state, done])
+        torch_action = torch.tensor(action, dtype=torch.float).to(device)
+        torch_state = torch.tensor(state, dtype=torch.float).to(device)
+        # evaluate actor
+        dist = self.actor_critic_model.get_actor_dist(torch_state)
+        action_logprobs = dist.log_prob(torch_action)
+        transition = (state, action, reward, next_state, action_logprobs.item(), done)
         self.memory.push_transition(handle, transition)
 
     def _convert_transitions_to_torch_tensors(self, transitions_array):
@@ -177,10 +184,10 @@ class PPOAgent(Policy):
                 # finding Surrogate Loss:
                 advantages = rewards - state_values.detach()
                 surr1 = ratios * advantages
-                surr2 = torch.clamp(ratios, 1 - self.surrogate_eps_clip, 1 + self.surrogate_eps_clip) * advantages
+                surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
                 loss = \
                     -torch.min(surr1, surr2) \
-                    + self.weight_loss * self.lossFunction(state_values, rewards) \
+                    + self.weight_loss * self.loss_function(state_values, rewards) \
                     - self.weight_entropy * dist_entropy
 
                 # make a gradient step
--
GitLab
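
Note on the PPO update touched by the last hunk: the loss is the standard clipped-surrogate objective, a policy term clipped to [1 - eps, 1 + eps], plus an MSE value term weighted by weight_loss, minus an entropy bonus weighted by weight_entropy. The snippet below is a minimal standalone sketch of that objective using the coefficient values this patch sets (clip 0.2, value weight 0.5, entropy weight 0.01); the function and tensor names are illustrative only and are not part of the repository's API.

# Standalone sketch of the clipped PPO objective tuned by this patch.
# Names are hypothetical; coefficient values mirror PPOAgent in the patch.
import torch
import torch.nn.functional as F

SURROGATE_EPS_CLIP = 0.2  # self.surrogate_eps_clip in the patch
WEIGHT_LOSS = 0.5         # self.weight_loss (value-loss coefficient)
WEIGHT_ENTROPY = 0.01     # self.weight_entropy (entropy bonus coefficient)


def ppo_loss(new_logprobs, old_logprobs, state_values, returns, dist_entropy):
    # probability ratio pi_new(a|s) / pi_old(a|s), computed in log space
    ratios = torch.exp(new_logprobs - old_logprobs.detach())
    # advantages; detaching the critic keeps the policy term from updating the value head
    advantages = returns - state_values.detach()
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1. - SURROGATE_EPS_CLIP, 1. + SURROGATE_EPS_CLIP) * advantages
    # clipped surrogate + weighted value regression - weighted entropy bonus
    loss = -torch.min(surr1, surr2) \
           + WEIGHT_LOSS * F.mse_loss(state_values, returns) \
           - WEIGHT_ENTROPY * dist_entropy
    return loss.mean()


if __name__ == "__main__":
    n = 8  # toy batch of transitions
    new_lp, old_lp = torch.randn(n), torch.randn(n)
    values, returns, entropy = torch.randn(n), torch.randn(n), torch.rand(n)
    print(ppo_loss(new_lp, old_lp, values, returns, entropy))

In the patch itself these coefficients live on PPOAgent (self.surrogate_eps_clip, self.weight_loss, self.weight_entropy) and are exactly the values changed in the @@ -95,11 +96,11 @@ hunk.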