diff --git a/checkpoints/201106090621-3300.pth.local b/checkpoints/201106090621-3300.pth.local
new file mode 100644
index 0000000000000000000000000000000000000000..453e1cdb7f0166eb52de7e89a257b49be88e36f8
Binary files /dev/null and b/checkpoints/201106090621-3300.pth.local differ
diff --git a/checkpoints/201106090621-3300.pth.target b/checkpoints/201106090621-3300.pth.target
new file mode 100644
index 0000000000000000000000000000000000000000..f94422b5b2c56cf46dae8cbbba7189d558581fa4
Binary files /dev/null and b/checkpoints/201106090621-3300.pth.target differ
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 6f250a83fb830afef842037c312629b99cdb78c1..db4b1a805e4d3fa5a3c29bc58224ec88d2d4b2f1 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -18,6 +18,7 @@ from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
 from torch.utils.tensorboard import SummaryWriter
 
+from reinforcement_learning.dddqn_policy import DDDQNPolicy
 from reinforcement_learning.ppo.ppo_agent import PPOAgent
 
 base_dir = Path(__file__).resolve().parent.parent
@@ -172,8 +173,8 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     completion_window = deque(maxlen=checkpoint_interval)
 
     # Double Dueling DQN policy
-    # policy = DDDQNPolicy(state_size, action_size, train_params)
-    policy = PPOAgent(state_size, action_size, n_agents)
+    policy = DDDQNPolicy(state_size, action_size, train_params)
+    # policy = PPOAgent(state_size, action_size, n_agents)
     # Load existing policy
     if train_params.load_policy is not "":
         policy.load(train_params.load_policy)
@@ -480,7 +481,7 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=5400, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1, type=int)
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2, type=int)
     parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0, type=int)
     parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=5, type=int)
diff --git a/reinforcement_learning/ppo/ppo_agent.py b/reinforcement_learning/ppo/ppo_agent.py
index 663a05acb42fd5a919f4eff0c3c45146d9bbd471..a7431f85201def6f189ccdc6101a89428b598e47 100644
--- a/reinforcement_learning/ppo/ppo_agent.py
+++ b/reinforcement_learning/ppo/ppo_agent.py
@@ -1,5 +1,4 @@
 import os
-import random
 
 import numpy as np
 import torch
@@ -17,6 +16,7 @@ CLIP_FACTOR = .005
 UPDATE_EVERY = 30
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print("device:", device)
 
 
 class PPOAgent(Policy):
@@ -31,7 +31,6 @@ class PPOAgent(Policy):
         self.memory = ReplayBuffer(BUFFER_SIZE)
         self.t_step = 0
         self.loss = 0
-        self.num_agents = num_agents
 
     def reset(self):
         self.finished = [False] * len(self.episodes)
@@ -43,7 +42,8 @@ class PPOAgent(Policy):
         self.policy.eval()
         with torch.no_grad():
             output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
-            return Categorical(output).sample().item()
+            ret = Categorical(output).sample().item()
+            return ret
 
     # Record the results of the agent's action and update the model
     def step(self, handle, state, action, reward, next_state, done):
@@ -118,14 +118,14 @@ class PPOAgent(Policy):
         if os.path.exists(filename + ".policy"):
             print(' >> ', filename + ".policy")
             try:
-                self.policy.load_state_dict(torch.load(filename + ".policy"))
+                self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
             except:
                 print(" >> failed!")
                 pass
         if os.path.exists(filename + ".optimizer"):
             print(' >> ', filename + ".optimizer")
             try:
-                self.optimizer.load_state_dict(torch.load(filename + ".optimizer"))
+                self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
             except:
                 print(" >> failed!")
                 pass
diff --git a/run.py b/run.py
index ac6a3cb3589a93c43e44e68fe6faf9796d31235e..06405868c06c8463eae756a38f063571421a7b8b 100644
--- a/run.py
+++ b/run.py
@@ -6,6 +6,7 @@ from pathlib import Path
 import numpy as np
 from flatland.core.env_observation_builder import DummyObservationBuilder
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.rail_env import RailEnvActions
 from flatland.evaluators.client import FlatlandRemoteClient
 from flatland.evaluators.client import TimeoutException
 
@@ -25,10 +26,12 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 VERBOSE = True
 
 # Checkpoint to use (remember to push it!)
-checkpoint = "./checkpoints/201105173637-4700.pth" # 18.50097663335293 : Depth = 1
+checkpoint = "./checkpoints/201105222046-5400.pth" # 17.66104361971127 Depth 1
+checkpoint = "./checkpoints/201106073658-4300.pth" # 15.64082361736683 Depth 1
+checkpoint = "./checkpoints/201106090621-3300.pth" # 15.64082361736683 Depth 1
 
 # Use last action cache
-USE_ACTION_CACHE = True
+USE_ACTION_CACHE = False
 USE_DEAD_LOCK_AVOIDANCE_AGENT = False
 
 # Observation parameters (must match training parameters!)
@@ -50,6 +53,7 @@ action_size = 5
 
 # Creates the policy. No GPU on evaluation server.
 policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
+# policy = PPOAgent(state_size, action_size, 10)
 policy.load(checkpoint)
 
 #####################################################################
@@ -134,7 +138,10 @@ while True:
                 action = agent_last_action[agent]
                 nb_hit += 1
             else:
-                action = policy.act(observation[agent], eps=0.0)
+                action = policy.act(observation[agent], eps=0.01)
+
+                if observation[agent][26] == 1:
+                    action = RailEnvActions.STOP_MOVING
 
             action_dict[agent] = action
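Note on the map_location change in ppo_agent.py: checkpoints written during GPU training store CUDA tensors, and torch.load cannot restore them on the CPU-only evaluation server unless the tensors are remapped. A minimal sketch of that loading pattern, assuming the same device selection as the agent code; the checkpoint path below is a placeholder, not a file in this repository:

import torch

# Same device selection as in ppo_agent.py: fall back to CPU when CUDA is absent.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# map_location remaps tensors that were saved from a CUDA run onto `device`,
# so a GPU-trained checkpoint can be restored on a CPU-only host.
state_dict = torch.load("./checkpoints/example.pth.policy", map_location=device)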
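The run.py hunk also overrides the learned policy whenever feature 26 of the flattened observation vector is set, forcing RailEnvActions.STOP_MOVING. The sketch below isolates that pattern as a standalone helper (a hypothetical function, not part of run.py); treating index 26 as a "stop" flag is taken from the diff, and what that feature encodes in the normalized tree observation is an assumption here.

from flatland.envs.rail_env import RailEnvActions

def select_action(policy, obs_vector, eps=0.01, stop_flag_index=26):
    # Query the trained policy with a small exploration epsilon, as run.py now does.
    action = policy.act(obs_vector, eps=eps)
    # Hard override: when the flagged observation feature is set, stop the agent.
    # stop_flag_index mirrors the hard-coded 26 in run.py; its exact meaning is assumed.
    if obs_vector[stop_flag_index] == 1:
        action = RailEnvActions.STOP_MOVING
    return action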