Commit 5befd0e4 authored by Erik Nygren

updated multi-agent training

parent 22f746ff
# Import packages for plotting and system
import getopt
import random
import sys
@@ -12,58 +11,44 @@ sys.path.append(str(base_dir))
import matplotlib.pyplot as plt
import numpy as np
import torch

from torch_training.dueling_double_dqn import Agent
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from utils.observation_utils import normalize_observation


def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)
    # Parameters for the Environment
    x_dim = 40
    y_dim = 40
    n_agents = 4

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
                       'malfunction_rate': 50,  # Rate of malfunction occurrence
                       'min_duration': 3,  # Minimal duration of malfunction
                       'max_duration': 20  # Max duration of malfunction
                       }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)
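    # The tree observation explores the rail graph outward from each agent's position, branching
    # (up to four children per node) at decision cells such as switches, down to max_depth levels.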

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
@@ -73,42 +58,43 @@ def main(argv):
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  stochastic_data=stochastic_data,  # Malfunction data generator
                  obs_builder_object=TreeObservation)
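    # With grid_mode=False the generator places the cities at random positions rather than on an
    # even grid; max_rails_between_cities and max_rails_in_city bound the number of parallel
    # tracks between and inside cities.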

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )

    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
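    # For tree_depth = 2 this sums to 1 + 4 + 16 = 21 nodes, so the flattened state vector passed
    # to the network has 21 * num_features_per_node entries.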

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998
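    # Assuming eps is multiplied by eps_decay once per episode (the decay step sits in the elided
    # code below), it reaches eps_end = 0.005 after roughly ln(0.005) / ln(0.998) ≈ 2650 episodes,
    # after which behaviour is almost entirely greedy.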

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
@@ -118,101 +104,60 @@ def main(argv):
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False

    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)
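    # Agent comes from torch_training.dueling_double_dqn: agent.act(obs, eps) returns an
    # epsilon-greedy action, agent.step(obs, action, reward, next_obs, done) hands one transition
    # to the learner, and the trained weights live in agent.qnetwork_local (saved below as checkpoints).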

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
            agent_obs_buffer[a] = agent_obs[a].copy()
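            # normalize_observation (utils.observation_utils) flattens the tree into a fixed-length
            # vector and rescales its distance features, presumably clipping them relative to
            # observation_radius, so the network sees comparable input ranges on any map size.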

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})
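                # An agent only requires a new action when it can actually choose one (typically on
                # entering a new cell or when ready to depart); otherwise action 0 (no-op) is sent
                # and update_values stays False, so the step is not stored as a transition below.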

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cummulated_reward[a] = 0.
                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
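                # agent_obs is refreshed from next_obs on every simulation step, while agent_obs_buffer
                # and agent_action_buffer only advance at decision points, so each stored transition
                # spans one decision of the agent rather than one environment tick.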

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
@@ -223,7 +168,7 @@
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))
@@ -231,23 +176,24 @@
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/avoider_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size
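            # Resetting action_prob to ones rather than zeros keeps np.sum(action_prob) positive,
            # so the normalised action statistics in the progress print-out never divide by zero.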

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
...