Commit b4cdfd75 authored by MasterScrat

Making the repo submittable, lot of cleanup

parent 151de552
{
"challenge_id": "neurips-2020-flatland-challenge",
"grader_id": "neurips-2020-flatland-challenge",
"debug": true,
"tags": ["RL"]
}
curl
git
vim
ssh
gcc
python-cairo-dev
name: flatland-rl
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- tk=8.6.8
- cairo=1.16.0
- cairocffi=1.1.0
- cairosvg=2.4.2
- cffi=1.12.3
- cssselect2=0.2.1
- defusedxml=0.6.0
- fontconfig=2.13.1
- freetype=2.10.0
- gettext=0.19.8.1
- glib=2.58.3
- icu=64.2
- jpeg=9c
- libiconv=1.15
- libpng=1.6.37
- libtiff=4.0.10
- libuuid=2.32.1
- libxcb=1.13
- libxml2=2.9.9
- lz4-c=1.8.3
- olefile=0.46
- pcre=8.41
- pillow=5.3.0
- pixman=0.38.0
- pthread-stubs=0.4
- pycairo=1.18.1
- pycparser=2.19
- tinycss2=1.0.2
- webencodings=0.5.1
- xorg-kbproto=1.0.7
- xorg-libice=1.0.10
- xorg-libsm=1.2.3
- xorg-libx11=1.6.8
- xorg-libxau=1.0.9
- xorg-libxdmcp=1.1.3
- xorg-libxext=1.3.4
- xorg-libxrender=0.9.10
- xorg-renderproto=0.11.1
- xorg-xextproto=7.3.0
- xorg-xproto=7.0.31
- zstd=1.4.0
- _libgcc_mutex=0.1
- ca-certificates=2019.5.15
- certifi=2019.6.16
- libedit=3.1.20181209
- libffi=3.2.1
- ncurses=6.1
- openssl=1.1.1c
- pip=19.1.1
- python=3.6.8
- readline=7.0
- setuptools=41.0.1
- sqlite=3.29.0
- wheel=0.33.4
- xz=5.2.4
- zlib=1.2.11
- pip:
- atomicwrites==1.3.0
- importlib-metadata==0.19
- importlib-resources==1.0.2
- attrs==19.1.0
- chardet==3.0.4
- click==7.0
- cloudpickle==1.2.2
- crowdai-api==0.1.21
- cycler==0.10.0
- filelock==3.0.12
- flatland-rl==2.2.1
- future==0.17.1
- gym==0.14.0
- idna==2.8
- kiwisolver==1.1.0
- lxml==4.4.0
- matplotlib==3.1.1
- more-itertools==7.2.0
- msgpack==0.6.1
- msgpack-numpy==0.4.4.3
- numpy==1.17.0
- packaging==19.0
- pandas==0.25.0
- pluggy==0.12.0
- py==1.8.0
- pyarrow==0.14.1
- pyglet==1.3.2
- pyparsing==2.4.1.1
- pytest==5.0.1
- pytest-runner==5.1
- python-dateutil==2.8.0
- python-gitlab==1.10.0
- pytz==2019.1
- recordtype==1.3
- redis==3.3.2
- requests==2.22.0
- scipy==1.3.1
- six==1.12.0
- svgutils==0.3.1
- timeout-decorator==0.4.1
- toml==0.10.0
- tox==3.13.2
- urllib3==1.25.3
- ushlex==0.99.1
- virtualenv==16.7.2
- wcwidth==0.1.7
- xarray==0.12.3
- zipp==0.5.2
import copy
import os
import pickle
import random
from collections import namedtuple, deque, Iterable
......@@ -123,6 +124,18 @@ class DDDQNPolicy(Policy):
if os.path.exists(filename + ".target"):
self.qnetwork_target.load_state_dict(torch.load(filename + ".target"))
def save_replay_buffer(self, filename):
memory = self.memory.memory
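# Cap the saved buffer at the 500 000 most recent experiences, matching the new int(5e5) --buffer_size default introduced in this commit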
with open(filename, 'wb') as f:
pickle.dump(list(memory)[-500000:], f)
def load_replay_buffer(self, filename):
with open(filename, 'rb') as f:
self.memory.memory = pickle.load(f)
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
......@@ -140,11 +153,10 @@ class ReplayBuffer:
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
def add(self, state, action, reward, next_state, done):
"""Add a new experience to memory."""
e = self.experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
self.memory.append(e)
def sample(self):
......
......@@ -28,13 +28,22 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
def check_if_all_blocked(env):
"""
Checks whether all the agents are blocked (full deadlock situation).
In that case it is pointless to keep running inference as no agent will be able to move.
FIXME still experimental!
:param env: current environment
:return:
"""
# First build a map of agents in each position
location_has_agent = {}
for agent in env.agents:
if agent.status in [RailAgentStatus.ACTIVE, RailAgentStatus.DONE] and agent.position:
location_has_agent[tuple(agent.position)] = 1
# Look for any agent that can still move
for handle in env.get_agent_handles():
agent = env.agents[handle]
if agent.status == RailAgentStatus.READY_TO_DEPART:
agent_virtual_position = agent.initial_position
......@@ -55,11 +64,12 @@ def check_if_all_blocked(env):
if new_position not in location_has_agent:
return False
# No agent can move at all: full deadlock!
return True
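# Hypothetical usage sketch (the names env, action_dict, max_steps and allow_skipping are
# assumed to come from eval_policy below): once check_if_all_blocked returns True, no further
# step can change the outcome, so the rollout can be cut short.
#
#     for step in range(max_steps - 1):
#         obs, all_rewards, done, info = env.step(action_dict)
#         if allow_skipping and check_if_all_blocked(env):
#             break  # full deadlock: no agent will ever move again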
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render, allow_skipping):
# evaluation is faster on CPU, except if you have huge networks
# Evaluation is faster on CPU (except if you use a really huge network)
parameters = {
'use_gpu': False
}
......@@ -77,11 +87,6 @@ def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size,
max_rails_between_cities = env_params.max_rails_between_cities
max_rails_in_city = env_params.max_rails_in_city
# Observation parameters
observation_tree_depth = env_params.observation_tree_depth
observation_radius = env_params.observation_radius
observation_max_path_depth = env_params.observation_max_path_depth
# Malfunction and speed profiles
# TODO pass these parameters properly from main!
malfunction_parameters = MalfunctionParameters(
......@@ -89,6 +94,8 @@ def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size,
min_duration=20, # Minimal duration
max_duration=50 # Max duration
)
# Only fast trains in Round 1
speed_profiles = {
1.: 1.0, # Fast passenger train
1. / 2.: 0.0, # Fast freight train
......@@ -96,14 +103,18 @@ def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size,
1. / 4.: 0.0 # Slow freight train
}
# Observation parameters
observation_tree_depth = env_params.observation_tree_depth
observation_radius = env_params.observation_radius
observation_max_path_depth = env_params.observation_max_path_depth
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Setup the environment
env = RailEnv(
width=x_dim,
height=y_dim,
width=x_dim, height=y_dim,
rail_generator=sparse_rail_generator(
max_num_cities=n_cities,
grid_mode=False,
......@@ -185,6 +196,9 @@ def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size,
show_predictions=False
)
if step % 100 == 0:
print("{}/{}".format(step, max_steps - 1))
for agent in env.get_agent_handles():
score += all_rewards[agent]
......@@ -247,36 +261,36 @@ def evaluate_agents(file, n_evaluation_episodes, use_gpu, render, allow_skipping
if total_nb_eval != n_evaluation_episodes:
print("(Rounding up from {} to fill all cores)".format(n_evaluation_episodes))
# env_params_dict = {
# # sample configuration
# "n_agents": 3,
# "x_dim": 35,
# "y_dim": 35,
# "n_cities": 4,
# "max_rails_between_cities": 2,
# "max_rails_in_city": 3,
#
# "seed": 42,
# "observation_tree_depth": 2,
# "observation_radius": 10,
# "observation_max_path_depth": 30
# }
env_params_dict = {
# environment
"n_agents": 15,
"x_dim": 60,
"y_dim": 60,
"n_cities": 7,
# sample configuration
"n_agents": 2,
"x_dim": 35,
"y_dim": 35,
"n_cities": 4,
"max_rails_between_cities": 2,
"max_rails_in_city": 4,
"max_rails_in_city": 3,
# observations
"seed": 42,
"observation_tree_depth": 2,
"observation_radius": 10,
"observation_max_path_depth": 30
}
# env_params_dict = {
# # environment
# "n_agents": 15,
# "x_dim": 60,
# "y_dim": 60,
# "n_cities": 7,
# "max_rails_between_cities": 2,
# "max_rails_in_city": 4,
#
# # observations
# "observation_tree_depth": 2,
# "observation_radius": 10,
# "observation_max_path_depth": 30
# }
env_params = Namespace(**env_params_dict)
print("Environment parameters:")
......@@ -300,9 +314,9 @@ def evaluate_agents(file, n_evaluation_episodes, use_gpu, render, allow_skipping
else:
with Pool() as p:
results = p.starmap(eval_policy,
[(env_params_dict, file, eval_per_thread, max_steps, action_size, state_size, seed * nb_threads, render, allow_skipping)
[(env_params_dict, file, 1, max_steps, action_size, state_size, seed * nb_threads, render, allow_skipping)
for seed in
range(nb_threads)])
range(total_nb_eval)])
scores = []
completions = []
......
......@@ -11,21 +11,25 @@ class DuelingQNetwork(nn.Module):
# value network
self.fc1_val = nn.Linear(state_size, hidsize1)
self.fc2_val = nn.Linear(hidsize1, hidsize2)
self.fc3_val = nn.Linear(hidsize2, 1)
self.fc3_val = nn.Linear(hidsize1, hidsize2)
self.fc4_val = nn.Linear(hidsize2, 1)
# advantage network
self.fc1_adv = nn.Linear(state_size, hidsize1)
self.fc2_adv = nn.Linear(hidsize1, hidsize2)
self.fc3_adv = nn.Linear(hidsize2, action_size)
self.fc3_adv = nn.Linear(hidsize1, hidsize2)
self.fc4_adv = nn.Linear(hidsize2, action_size)
def forward(self, x):
val = F.relu(self.fc1_val(x))
val = F.relu(self.fc2_val(val))
val = self.fc3_val(val)
#val = F.relu(self.fc3_val(val))
val = self.fc4_val(val)
# advantage calculation
adv = F.relu(self.fc1_adv(x))
adv = F.relu(self.fc2_adv(adv))
adv = self.fc3_adv(adv)
#adv = F.relu(self.fc3_adv(adv))
adv = self.fc4_adv(adv)
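# Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a));
# subtracting the mean advantage keeps the value and advantage streams identifiable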
return val + adv - adv.mean()
......@@ -5,6 +5,7 @@ from argparse import ArgumentParser, Namespace
from pathlib import Path
from pprint import pprint
import psutil
from flatland.utils.rendertools import RenderTool
from torch.utils.tensorboard import SummaryWriter
import numpy as np
......@@ -27,7 +28,6 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
try:
import wandb
wandb.init(sync_tensorboard=True)
except ImportError:
print("Install wandb to log to Weights & Biases")
......@@ -39,14 +39,6 @@ Documentation: https://flatland.aicrowd.com/getting-started/rl/multi-agent.html
Results: https://app.wandb.ai/masterscrat/flatland-examples-reinforcement_learning/reports/Flatland-Examples--VmlldzoxNDI2MTA
"""
SUPPRESS_OUTPUT = False
if SUPPRESS_OUTPUT:
# ugly hack to be able to run hyperparameter sweeps with w&b
# they currently have a bug which prevents runs that output emojis from running :(
def print(*args, **kwargs):
pass
def train_agent(env_params, train_params):
# Environment parameters
......@@ -71,22 +63,24 @@ def train_agent(env_params, train_params):
checkpoint_interval = train_params.checkpoint_interval
n_eval_episodes = train_params.n_evaluation_episodes
# TODO make command line parameters
replay_buffer_path = "replay_buffers/rb-100.pkl"
save_replay_buffer = True
# Set the seeds
random.seed(seed)
np.random.seed(seed)
# Break agents from time to time
# TODO should be passed in a clean way from main
malfunction_parameters = MalfunctionParameters(
malfunction_rate=1. / 10000, # Rate of malfunctions
min_duration=15, # Minimal duration
max_duration=50 # Max duration
)
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Fraction of trains with each speed
# Only consider fast trains in Round 1
speed_profiles = {
1.: 1.0, # Fast passenger train
1. / 2.: 0.0, # Fast freight train
......@@ -94,10 +88,13 @@ def train_agent(env_params, train_params):
1. / 4.: 0.0 # Slow freight train
}
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Setup the environment
env = RailEnv(
width=x_dim,
height=y_dim,
width=x_dim, height=y_dim,
rail_generator=sparse_rail_generator(
max_num_cities=n_cities,
grid_mode=False,
......@@ -119,9 +116,7 @@ def train_agent(env_params, train_params):
# Calculate the state size given the depth of the tree observation and the number of features
n_features_per_node = env.obs_builder.observation_dim
n_nodes = 0
for i in range(observation_tree_depth + 1):
n_nodes += np.power(4, i)
n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
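# e.g. with observation_tree_depth = 2: n_nodes = 4**0 + 4**1 + 4**2 = 21, so the state has 21 * n_features_per_node entries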
state_size = n_features_per_node * n_nodes
# The action space of flatland is 5 discrete actions
......@@ -134,10 +129,12 @@ def train_agent(env_params, train_params):
action_count = [0] * action_size
action_dict = dict()
agent_obs = [None] * env.get_num_agents()
agent_prev_obs = [None] * env.get_num_agents()
agent_prev_action = [2] * env.get_num_agents()
update_values = False
agent_obs = [None] * n_agents
agent_prev_obs = [None] * n_agents
agent_prev_action = [2] * n_agents
update_values = [False] * n_agents
# Smoothed values used as target for hyperparameter tuning
smoothed_normalized_score = -1.0
smoothed_eval_normalized_score = -1.0
smoothed_completion = 0.0
......@@ -146,6 +143,15 @@ def train_agent(env_params, train_params):
# Double Dueling DQN policy
policy = DDDQNPolicy(state_size, action_size, train_params)
# Loads existing replay buffer
# TODO take proper command line parameter
policy.load_replay_buffer(replay_buffer_path)
print("\n💾 Replay buffer status: {}/{} experiences".format(len(policy.memory.memory), train_params.buffer_size))
hdd = psutil.disk_usage('/')
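# psutil.disk_usage('/').free is reported in bytes; dividing by 2**30 converts it to GiB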
if save_replay_buffer and (hdd.free / (2**30)) < 500.0:
print("⚠️ Careful! Saving replay buffers will quickly consume a lot of disk space. You have {:.2f}gb left.".format(hdd.free / (2**30)))
# TensorBoard writer
writer = SummaryWriter()
writer.add_hparams(vars(train_params), {})
......@@ -158,11 +164,11 @@ def train_agent(env_params, train_params):
.format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))
for episode_idx in range(n_episodes + 1):
# Timers
step_timer = Timer()
reset_timer = Timer()
learn_timer = Timer()
preproc_timer = Timer()
inference_timer = Timer()
# Reset environment
reset_timer.start()
......@@ -176,7 +182,7 @@ def train_agent(env_params, train_params):
nb_steps = 0
actions_taken = []
# Build agent specific observations
# Build initial agent-specific observations
for agent in env.get_agent_handles():
if obs[agent]:
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
......@@ -184,23 +190,28 @@ def train_agent(env_params, train_params):
# Run episode
for step in range(max_steps - 1):
inference_timer.start()
for agent in env.get_agent_handles():
if info['action_required'][agent]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
update_values[agent] = True
action = policy.act(agent_obs[agent], eps=eps_start)
action_count[action] += 1
actions_taken.append(action)
else:
update_values = False
# An action is not required if the train hasn't joined the railway network,
# if it already reached its target, or if it is currently malfunctioning.
update_values[agent] = False
action = 0
action_dict.update({agent: action})
inference_timer.end()
# Environment step
step_timer.start()
next_obs, all_rewards, done, info = env.step(action_dict)
step_timer.end()
# Render an episode at some interval
if train_params.render and episode_idx % checkpoint_interval == 0:
env_renderer.render_env(
show=True,
......@@ -209,10 +220,10 @@ def train_agent(env_params, train_params):
show_predictions=False
)
for agent in range(env.get_num_agents()):
# Update replay buffer and train agent
# Only update the values when we are done or when an action was taken and thus relevant information is present
if update_values or done[agent]:
# Update replay buffer and train agent
for agent in env.get_agent_handles():
if update_values[agent] or done['__all__']:
# Only learn from timesteps where something happened
learn_timer.start()
policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
learn_timer.end()
......@@ -236,14 +247,13 @@ def train_agent(env_params, train_params):
# Epsilon decay
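# e.g. with the default eps_decay = 0.99, epsilon roughly halves every ~69 episodes (0.99**69 ≈ 0.5) until it reaches eps_end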
eps_start = max(eps_end, eps_decay * eps_start)
# Collection information about training
# Collect information about training
tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
completion = tasks_finished / max(1, env.get_num_agents())
normalized_score = score / (max_steps * env.get_num_agents())
action_probs = action_count / np.sum(action_count)
action_count = [1] * action_size
# Smoothed values for terminal display and for more stable hyper-parameter tuning
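# (an exponential moving average: each new episode only contributes 1% of its value, so single-episode noise is damped)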
smoothing = 0.99
smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)
......@@ -251,6 +261,8 @@ def train_agent(env_params, train_params):
# Print logs
if episode_idx % checkpoint_interval == 0:
torch.save(policy.qnetwork_local, './checkpoints/multi-' + str(episode_idx) + '.pth')
policy.save_replay_buffer('./replay_buffers/rb-' + str(episode_idx) + '.pkl')
if train_params.render:
env_renderer.close_window()
......@@ -260,7 +272,7 @@ def train_agent(env_params, train_params):
' Avg: {:.3f}'
'\t 💯 Done: {:.2f}%'
' Avg: {:.2f}%'
'\t 🎲 Epsilon: {:.2f} '
'\t 🎲 Epsilon: {:.3f} '
'\t 🔀 Action Probs: {}'.format(
episode_idx,
normalized_score,
......@@ -271,8 +283,8 @@ def train_agent(env_params, train_params):
format_action_prob(action_probs)
), end=" ")
# Evaluate policy
if episode_idx % train_params.checkpoint_interval == 0:
# Evaluate policy and log results at some interval
if episode_idx % checkpoint_interval == 0 and n_eval_episodes > 0:
scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
......@@ -330,6 +342,13 @@ def format_action_prob(action_probs):
def eval_policy(env, policy, n_eval_episodes, max_steps):
"""
:param env: environment to evaluate the policy in
:param policy: policy to evaluate
:param n_eval_episodes: number of evaluation episodes
:param max_steps: maximum number of steps per episode
:return:
"""
action_dict = dict()
scores = []
completions = []
......@@ -347,7 +366,6 @@ def eval_policy(env, policy, n_eval_episodes, max_steps):
for agent in env.get_agent_handles():
if obs[agent]:
# TODO pass parameters properly
# agent_obs[agent] = normalize_observation(obs[agent], tree_depth=2, observation_radius=10)
agent_obs[agent] = normalize_observation(obs[agent], tree_depth=2, observation_radius=10)
action = 0
......@@ -387,12 +405,12 @@ if __name__ == "__main__":
parser.add_argument("--eps_start", dest="eps_start", help="max exploration", default=1.0, type=float)
parser.add_argument("--eps_end", dest="eps_end", help="min exploration", default=0.01, type=float)
parser.add_argument("--eps_decay", dest="eps_decay", help="exploration decay", default=0.99, type=float)
parser.add_argument("--buffer_size", dest="buffer_size", help="replay buffer size", default=int(1e6), type=int)
parser.add_argument("--buffer_size", dest="buffer_size", help="replay buffer size", default=int(5e5), type=int)
parser.add_argument("--buffer_min_size", dest="buffer_min_size", help="min buffer size to start training", default=0, type=int)
parser.add_argument("--batch_size", dest="batch_size", help="minibatch size", default=32, type=int)
parser.add_argument("--gamma", dest="gamma", help="discount factor", default=0.99, type=float)
parser.add_argument("--tau", dest="tau", help="soft update of target parameters", default=1e-3, type=float)