diff --git a/scoring/utils/misc_utils.py b/scoring/utils/misc_utils.py
index dee5f47f7f8f09f253dfc3f8e3d48931df94efe7..6f10af999ff08f9f1559eb09cf69021a3086165d 100644
--- a/scoring/utils/misc_utils.py
+++ b/scoring/utils/misc_utils.py
@@ -2,7 +2,6 @@ import random
 import time
 
 import numpy as np
-
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
@@ -66,7 +65,7 @@ def run_test(parameters, agent, observation_builder=None, observation_wrapper=No
                   number_of_agents=1,
                   )
 
-    obs = env.reset()
+    obs, info = env.reset()
 
     if observation_wrapper is not None:
         for a in range(env.get_num_agents()):
@@ -181,7 +180,7 @@ def run_test_sequential(parameters, agent, test_nr=0, tree_depth=3):
                   number_of_agents=1,
                   )
 
-    obs = env.reset()
+    obs, info = env.reset()
     done = env.dones
     # Run episode
     trial_score = 0
diff --git a/sequential_agent/run_test.py b/sequential_agent/run_test.py
index a8c0bbec93282ecac1f65bb35870805cf9e9e298..92e814571a83be09f86ada389e18d34cca94402d 100644
--- a/sequential_agent/run_test.py
+++ b/sequential_agent/run_test.py
@@ -1,11 +1,11 @@
 import numpy as np
-
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import complex_rail_generator
 from flatland.envs.schedule_generators import complex_schedule_generator
 from flatland.utils.rendertools import RenderTool
+
 from sequential_agent.simple_order_agent import OrderedAgent
 
 np.random.seed(2)
@@ -49,7 +49,7 @@ action_dict = dict()
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     done = env.dones
     env_renderer.reset()
     frame_step = 0
diff --git a/torch_training/Getting_Started_Training.md b/torch_training/Getting_Started_Training.md
index 8610bfd15c2e5cad0d4ec19db883f4ceb9407963..cbf4a3cd294ed3e2018e4386b2b76bb3b4d7bb1b 100644
--- a/torch_training/Getting_Started_Training.md
+++ b/torch_training/Getting_Started_Training.md
@@ -150,7 +150,7 @@ We now use the normalized `agent_obs` for our training loop:
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
 
     if not Training:
         env_renderer.set_new_rail()
diff --git a/torch_training/Multi_Agent_Training_Intro.md b/torch_training/Multi_Agent_Training_Intro.md
index d4eefae068601b9ea568759b49f95b6985cafac7..69f89aa987e94b10fa2b18f9260779f06b6fc4bf 100644
--- a/torch_training/Multi_Agent_Training_Intro.md
+++ b/torch_training/Multi_Agent_Training_Intro.md
@@ -174,7 +174,7 @@ We now use the normalized `agent_obs` for our training loop:
     agent_next_obs = [None] * env.get_num_agents()
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
 
     # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
     # different times during an episode
diff --git a/torch_training/multi_agent_inference.py b/torch_training/multi_agent_inference.py
index 580886b1db73ba34d539e14968deea384b5b98be..b376623e7ecde3bbfcf01a5eeb11e8a76c132cc7 100644
--- a/torch_training/multi_agent_inference.py
+++ b/torch_training/multi_agent_inference.py
@@ -3,16 +3,15 @@ from collections import deque
 
 import numpy as np
 import torch
-from importlib_resources import path
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
-
-import torch_training.Nets
 from flatland.envs.rail_env import RailEnv
-from flatland.envs.rail_generators import rail_from_file, sparse_rail_generator
-from flatland.envs.schedule_generators import schedule_from_file, sparse_schedule_generator
-
+from flatland.envs.rail_generators import sparse_rail_generator
+from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
+from importlib_resources import path
+
+import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
 from utils.observation_utils import normalize_observation
 
@@ -97,7 +96,7 @@ frame_step = 0
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
 
     env_renderer.reset()
 
diff --git a/torch_training/multi_agent_training.py b/torch_training/multi_agent_training.py
index e8ed93f052b0073b790e59a3de2f900934e690dc..ec8ac964a2319f99a85cae87a08c27f9a731ff1f 100644
--- a/torch_training/multi_agent_training.py
+++ b/torch_training/multi_agent_training.py
@@ -162,7 +162,7 @@ def main(argv):
         agent_next_obs = [None] * env.get_num_agents()
 
         # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
 
         # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
         # different times during an episode
diff --git a/torch_training/multi_agent_two_time_step_training.py b/torch_training/multi_agent_two_time_step_training.py
index d02e4b221b30f58b89faa540ea0c505506a3c454..466ddf52dc7e937dab36e72d407aa5d09f8fd445 100644
--- a/torch_training/multi_agent_two_time_step_training.py
+++ b/torch_training/multi_agent_two_time_step_training.py
@@ -121,7 +121,7 @@ def main(argv):
         agent_next_obs = [None] * env.get_num_agents()
 
         # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)
 
         # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
         # different times during an episode
diff --git a/torch_training/render_agent_behavior.py b/torch_training/render_agent_behavior.py
index 2649a2367367e17e39328ca8c28cc9c2f1fc0172..969b7e92ea1f3a3eca961fa7400b8b0f36321d3e 100644
--- a/torch_training/render_agent_behavior.py
+++ b/torch_training/render_agent_behavior.py
@@ -3,15 +3,15 @@ from collections import deque
 
 import numpy as np
 import torch
-from importlib_resources import path
-
-import torch_training.Nets
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import sparse_rail_generator
 from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
+from importlib_resources import path
+
+import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
 from utils.observation_utils import normalize_observation
 
@@ -111,7 +111,7 @@ frame_step = 0
 for trials in range(1, n_trials + 1):
 
     # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     env_renderer.reset()
     # Build agent specific observations
    for a in range(env.get_num_agents()):
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 607206ec8f423c421d2e57397cce3a2a1900679e..4f82c523474f6880a8595fe8c9dc3e4565e4902b 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -133,13 +133,11 @@ def main(argv):
             # Action
             for a in range(env.get_num_agents()):
                 if info['action_required'][a]:
-                    register_action_state[a] = True
                     action = agent.act(agent_obs[a], eps=eps)
                     action_prob[action] += 1
                     if step == 0:
                         agent_action_buffer[a] = action
                 else:
-                    register_action_state[a] = False
                     action = 0
                 action_dict.update({a: action})
 
@@ -151,24 +149,21 @@ def main(argv):
                 # Penalize waiting in order to get agent to move
                 if env.agents[a].status == 0:
                     all_rewards[a] -= 1
+                agent_next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
 
                 cummulated_reward[a] += all_rewards[a]
 
             # Update replay buffer and train agent
             for a in range(env.get_num_agents()):
-                if done[a]:
-                    final_obs[a] = agent_obs_buffer[a]
-                    final_obs_next[a] = agent_next_obs[a].copy()
-                    final_action_dict.update({a: agent_action_buffer[a]})
-                if not done[a]:
-                    if agent_obs_buffer[a] is not None and register_action_state[a]:
-                        agent_delayed_next = agent_obs[a].copy()
-                        agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
-                                   agent_delayed_next, done[a])
-                        cummulated_reward[a] = 0.
-                    if register_action_state[a]:
-                        agent_obs_buffer[a] = agent_obs[a].copy()
-                        agent_action_buffer[a] = action_dict[a]
+                if (agent_obs_buffer[a] is not None and register_action_state[a] and env.agents[a].status != 3) or \
+                        env.agents[a].status == 2:
+                    agent_delayed_next = agent_obs[a].copy()
+                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
+                               agent_delayed_next, done[a])
+                    cummulated_reward[a] = 0.
+                if info['action_required'][a]:
+                    agent_obs_buffer[a] = agent_obs[a].copy()
+                    agent_action_buffer[a] = action_dict[a]
 
                 score += all_rewards[a] / env.get_num_agents()
 
@@ -176,8 +171,6 @@ def main(argv):
             agent_obs = agent_next_obs.copy()
             if done['__all__']:
                 env_done = 1
-                for a in range(env.get_num_agents()):
-                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
                 break
 
         # Epsilon decay
diff --git a/utils/misc_utils.py b/utils/misc_utils.py
index 4702c82e7ec8722924de9bce367f72b16bcda0fc..e4962ca1097b141c6397736480ed67a618789538 100644
--- a/utils/misc_utils.py
+++ b/utils/misc_utils.py
@@ -3,12 +3,12 @@ import time
 from collections import deque
 
 import numpy as np
-from line_profiler import LineProfiler
-
 from flatland.envs.observations import GlobalObsForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import complex_rail_generator
 from flatland.envs.schedule_generators import complex_schedule_generator
+from line_profiler import LineProfiler
+
 from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups
 
 
@@ -102,7 +102,7 @@ def run_test(parameters, agent, test_nr=0, tree_depth=3):
 
     # Reset the env
     lp_reset(True, True)
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
     for a in range(env.get_num_agents()):
         data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
         data = norm_obs_clip(data)