Commit 828db1d6 authored by Erik Nygren

simplified training example for participants

parent c167f87e

from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import random
import torch
from dueling_double_dqn import Agent
from flatland.envs.generators import complex_rail_generator
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.utils.rendertools import RenderTool
from utils.observation_utils import norm_obs_clip, split_tree

random.seed(1)
np.random.seed(1)
# Parameters for the Environment
x_dim = 10
y_dim = 10
n_agents = 1
n_goals = 5
min_dist = 5
# We are training an Agent using the Tree Observation with depth 2
observation_builder = TreeObsForRailEnv(max_depth=2)
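# The tree observation branches along the four possible directions at every
# node, so the number of nodes (and hence the network input) grows as 4**depth;
# depth 2 keeps the observation small, as the state_size computation below shows.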
# Load the Environment
env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                                                    max_dist=99999,
                                                    seed=0),
              obs_builder_object=observation_builder,
              number_of_agents=n_agents)
env.reset(True, True)
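# Note: the two positional flags ask reset to regenerate the rail and to
# re-place the agents; check RailEnv.reset in your flatland version for the
# exact signature, as it may differ across releases.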
# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", )
# Given the depth of the tree observation and the number of features per node we get the following state_size
features_per_node = 9
tree_depth = 2
nr_nodes = 0
for i in range(tree_depth + 1):
    nr_nodes += np.power(4, i)
state_size = features_per_node * nr_nodes
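# For tree_depth = 2 the loop sums 1 + 4 + 16 = 21 nodes, so state_size = 9 * 21 = 189.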
# The action space of flatland is 5 discrete actions
action_size = 5
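# At the time of writing these correspond to: do nothing, deviate left, move
# forward, deviate right, and stop; consult flatland's RailEnv docs for your version.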
# We set the number of episodes we would like to train on
n_trials = 6000
# And the max number of steps we want to take per episode
max_steps = int(3 * (env.height + env.width))
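# For the 10x10 environment above that is int(3 * (10 + 10)) = 60 steps per episode.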
# Define training parameters
eps = 1.
eps_end = 0.005
eps_decay = 0.998
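# With this schedule epsilon reaches its floor of 0.005 after roughly
# ln(0.005) / ln(0.998) ≈ 2650 episodes, so the agent keeps exploring for a
# little under half of the 6000 training episodes.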
# And some variables to keep track of the progress
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
# Now we load a Double dueling DQN agent
agent = Agent(state_size, action_size, "FC", 0)
Training = True
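# The "FC" string and the trailing 0 are passed through to the Agent class in
# dueling_double_dqn; presumably the network type (fully connected) and a seed,
# but the exact meaning is defined in that file.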
for trials in range(1, n_trials + 1):

    # Reset environment
    obs = env.reset(True, True)
    if not Training:
        env_renderer.set_new_rail()
    # Split the observation tree into its parts and normalize the observation using the utility functions.
    # Build agent specific local observation
    for a in range(env.get_num_agents()):
        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
                                                          num_features_per_node=features_per_node,
                                                          current_depth=0)
        rail_data = norm_obs_clip(rail_data)
        distance_data = norm_obs_clip(distance_data)
        agent_data = np.clip(agent_data, -1, 1)
        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
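    # Each observation is now a flat vector: normalized rail features, then
    # normalized distance features, then the clipped agent features.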
    # Reset score and done
    score = 0
    env_done = 0

    # Run episode
    for step in range(max_steps):
        # Only render when not training
        if not Training:
            env_renderer.renderEnv(show=True, show_observations=True)
        # Choose the actions
        for a in range(env.get_num_agents()):
            if not Training:
                eps = 0
            action = agent.act(agent_obs[a], eps=eps)
            action_dict.update({a: action})
            # Count the number of actions taken for statistics
            action_prob[action] += 1

        # Environment step
        next_obs, all_rewards, done, _ = env.step(action_dict)
        for a in range(env.get_num_agents()):
            rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
                                                              num_features_per_node=features_per_node,
                                                              current_depth=0)
            rail_data = norm_obs_clip(rail_data)
            distance_data = norm_obs_clip(distance_data)
            agent_data = np.clip(agent_data, -1, 1)
            agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))

        # Update replay buffer and train agent
        for a in range(env.get_num_agents()):
            # Remember and train agent
            if Training:
                agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])

            # Update the current score
            score += all_rewards[a] / env.get_num_agents()

        agent_obs = agent_next_obs.copy()
        if done['__all__']:
            env_done = 1
            break

    # Epsilon decay
    eps = max(eps_end, eps_decay * eps)  # decrease epsilon

    # Store the information about training progress
    done_window.append(env_done)
    scores_window.append(score / max_steps)  # save most recent score
    scores.append(np.mean(scores_window))
    dones_list.append(np.mean(done_window))
    print(
        '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
            env.get_num_agents(), x_dim, y_dim,
            trials,
            np.mean(scores_window),
            100 * np.mean(done_window),
            eps, action_prob / np.sum(action_prob)), end=" ")
    if trials % 100 == 0:
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)))
        torch.save(agent.qnetwork_local.state_dict(),
                   './Nets/navigator_checkpoint' + str(trials) + '.pth')
        action_prob = [1] * action_size
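# A saved policy can later be restored the same way an earlier version of this
# script did, e.g. (illustrative checkpoint path):
# agent.qnetwork_local.load_state_dict(torch.load('./Nets/navigator_checkpoint6000.pth'))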
# Render the trained agent
# Reset environment
obs = env.reset(True, True)
env_renderer.set_new_rail()
# Split the observation tree into its parts and normalize the observation using the utility functions.
# Build agent specific local observation
for a in range(env.get_num_agents()):
rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=features_per_node,
current_depth=0)
rail_data = norm_obs_clip(rail_data)
distance_data = norm_obs_clip(distance_data)
agent_data = np.clip(agent_data, -1, 1)
agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
# Reset score and done
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
    env_renderer.renderEnv(show=True, show_observations=False)

    # Choose the actions
    for a in range(env.get_num_agents()):
        eps = 0
        action = agent.act(agent_obs[a], eps=eps)
        action_dict.update({a: action})

    # Environment step
    next_obs, all_rewards, done, _ = env.step(action_dict)

    for a in range(env.get_num_agents()):
        rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
                                                          num_features_per_node=features_per_node,
                                                          current_depth=0)
        rail_data = norm_obs_clip(rail_data)
        distance_data = norm_obs_clip(distance_data)
        agent_data = np.clip(agent_data, -1, 1)
        agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))

    agent_obs = agent_next_obs.copy()
    if done['__all__']:
        break
# Plot overall training progress at the end
plt.plot(scores)
plt.show()