Showing with 1872 additions and 213 deletions
File deleted
File deleted
File deleted
File added
name: flatland-rl
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- tk=8.6.8
- cairo=1.16.0
- cairocffi=1.1.0
- cairosvg=2.4.2
- cffi=1.12.3
- cssselect2=0.2.1
- defusedxml=0.6.0
- fontconfig=2.13.1
- freetype=2.10.0
- gettext=0.19.8.1
- glib=2.58.3
- icu=64.2
- jpeg=9c
- libiconv=1.15
- libpng=1.6.37
- libtiff=4.0.10
- libuuid=2.32.1
- libxcb=1.13
- libxml2=2.9.9
- lz4-c=1.8.3
- olefile=0.46
- pcre=8.41
- pillow=5.3.0
- pixman=0.38.0
- pthread-stubs=0.4
- pycairo=1.18.1
- pycparser=2.19
- tinycss2=1.0.2
- webencodings=0.5.1
- xorg-kbproto=1.0.7
- xorg-libice=1.0.10
- xorg-libsm=1.2.3
- xorg-libx11=1.6.8
- xorg-libxau=1.0.9
- xorg-libxdmcp=1.1.3
- xorg-libxext=1.3.4
- xorg-libxrender=0.9.10
- xorg-renderproto=0.11.1
- xorg-xextproto=7.3.0
- xorg-xproto=7.0.31
- zstd=1.4.0
- _libgcc_mutex=0.1
- ca-certificates=2019.5.15
- certifi=2019.6.16
- libedit=3.1.20181209
- libffi=3.2.1
- ncurses=6.1
- openssl=1.1.1c
- pip=19.1.1
- python=3.6.8
- readline=7.0
- setuptools=41.0.1
- sqlite=3.29.0
- wheel=0.33.4
- xz=5.2.4
- zlib=1.2.11
- pip:
- atomicwrites==1.3.0
- importlib-metadata==0.19
- importlib-resources==1.0.2
- attrs==19.1.0
- chardet==3.0.4
- click==7.0
- cloudpickle==1.2.2
- crowdai-api==0.1.21
- cycler==0.10.0
- filelock==3.0.12
- flatland-rl==2.2.1
- future==0.17.1
- gym==0.14.0
- idna==2.8
- kiwisolver==1.1.0
- lxml==4.4.0
- matplotlib==3.1.1
- more-itertools==7.2.0
- msgpack==0.6.1
- msgpack-numpy==0.4.4.3
- numpy==1.17.0
- packaging==19.0
- pandas==0.25.0
- pluggy==0.12.0
- py==1.8.0
- pyarrow==0.14.1
- pyglet==1.3.2
- pyparsing==2.4.1.1
- pytest==5.0.1
- pytest-runner==5.1
- python-dateutil==2.8.0
- python-gitlab==1.10.0
- pytz==2019.1
- torch==1.5.0
- recordtype==1.3
- redis==3.3.2
- requests==2.22.0
- scipy==1.3.1
- six==1.12.0
- svgutils==0.3.1
- timeout-decorator==0.4.1
- toml==0.10.0
- tox==3.13.2
- urllib3==1.25.3
- ushlex==0.99.1
- virtualenv==16.7.2
- wcwidth==0.1.7
- xarray==0.12.3
- zipp==0.5.2
name: flatland-rl
channels:
- pytorch
- conda-forge
- defaults
dependencies:
- psutil==5.7.2
- pytorch==1.6.0
- pip==20.2.3
- python==3.6.8
- pip:
- tensorboard==2.3.0
- tensorboardx==2.1
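# Usage sketch (assumption: the environment files above are stored as standard conda
# environment YAMLs in the repository; the diff does not show their actual file names):
#   conda env create -f environment.yml
#   conda activate flatland-rl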
#!/usr/bin/env python
import collections
from typing import Optional, List, Dict, Tuple
import numpy as np
from flatland.core.env import Environment
from flatland.core.env_observation_builder import ObservationBuilder
from flatland.core.env_prediction_builder import PredictionBuilder
from flatland.envs.agent_utils import RailAgentStatus, EnvAgent
class CustomObservationBuilder(ObservationBuilder):
"""
Template for building a custom observation builder for the RailEnv class
The observation in this case is composed of the following elements:
- a transition map array with dimensions (env.height, env.width),\
where the value at (y, x) is the 16-bit encoding of the transition map at that cell.
- the individual agent object (with position, direction, target information available)
"""
def __init__(self):
super(CustomObservationBuilder, self).__init__()
def set_env(self, env: Environment):
super().set_env(env)
# Note:
# Any instantiation that depends on parameters of the Env object should be
# done here, since the updated self.env instance is only available at this point.
self.rail_obs = np.zeros((self.env.height, self.env.width))
def reset(self):
"""
Called internally on every env.reset() call,
to reset any observation specific variables that are being used
"""
self.rail_obs[:] = 0
for _x in range(self.env.width):
for _y in range(self.env.height):
# Get the transition map value at location _x, _y
transition_value = self.env.rail.get_full_transitions(_y, _x)
self.rail_obs[_y, _x] = transition_value
def get(self, handle: int = 0):
"""
Returns the built observation for a single agent with handle : handle
In this particular case, we return
- the global transition_map of the RailEnv,
- a tuple containing the current agent's:
- state
- position
- direction
- initial_position
- target
"""
agent = self.env.agents[handle]
"""
Available information for each agent object :
- agent.status : [RailAgentStatus.READY_TO_DEPART, RailAgentStatus.ACTIVE, RailAgentStatus.DONE]
- agent.position : Current position of the agent
- agent.direction : Current direction of the agent
- agent.initial_position : Initial Position of the agent
- agent.target : Target position of the agent
"""
status = agent.status
position = agent.position
direction = agent.direction
initial_position = agent.initial_position
target = agent.target
"""
You can also optionally access the states of the rest of the agents by
using something similar to
for i in range(len(self.env.agents)):
other_agent: EnvAgent = self.env.agents[i]
# ignore other agents not in the grid any more
if other_agent.status == RailAgentStatus.DONE_REMOVED:
continue
## Gather other agent specific params
other_agent_status = other_agent.status
other_agent_position = other_agent.position
other_agent_direction = other_agent.direction
other_agent_initial_position = other_agent.initial_position
other_agent_target = other_agent.target
## Do something nice here if you wish
"""
return self.rail_obs, (status, position, direction, initial_position, target)
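# Minimal usage sketch (not part of the original file): it shows how this observation builder can
# be plugged into a RailEnv via `obs_builder_object`, mirroring the environment construction used
# by the evaluation script in this repository. The generator parameters below are illustrative
# assumptions, not values taken from the original code.
if __name__ == "__main__":
    from flatland.envs.rail_env import RailEnv
    from flatland.envs.rail_generators import sparse_rail_generator
    from flatland.envs.schedule_generators import sparse_schedule_generator

    demo_env = RailEnv(width=25, height=25,
                       rail_generator=sparse_rail_generator(max_num_cities=4),
                       schedule_generator=sparse_schedule_generator(),
                       number_of_agents=5,
                       obs_builder_object=CustomObservationBuilder())
    obs, info = demo_env.reset()
    # Each entry of obs is (rail_obs, (status, position, direction, initial_position, target))
    rail_obs, agent_info = obs[0]
    print(rail_obs.shape, agent_info)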
import copy
import os
import pickle
import random
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from reinforcement_learning.model import DuelingQNetwork
from reinforcement_learning.policy import Policy, LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
class DDDQNPolicy(LearningPolicy):
"""Dueling Double DQN policy"""
def __init__(self, state_size, action_size, in_parameters, evaluation_mode=False):
print(">> DDDQNPolicy")
super(DDDQNPolicy, self).__init__()
self.ddqn_parameters = in_parameters
self.evaluation_mode = evaluation_mode
self.state_size = state_size
self.action_size = action_size
self.double_dqn = True
self.hidsize = 128
if not evaluation_mode:
self.hidsize = self.ddqn_parameters.hidden_size
self.buffer_size = self.ddqn_parameters.buffer_size
self.batch_size = self.ddqn_parameters.batch_size
self.update_every = self.ddqn_parameters.update_every
self.learning_rate = self.ddqn_parameters.learning_rate
self.tau = self.ddqn_parameters.tau
self.gamma = self.ddqn_parameters.gamma
self.buffer_min_size = self.ddqn_parameters.buffer_min_size
# Device
if self.ddqn_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
# Q-Network
self.qnetwork_local = DuelingQNetwork(state_size,
action_size,
hidsize1=self.hidsize,
hidsize2=self.hidsize).to(self.device)
if not evaluation_mode:
self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.t_step = 0
self.loss = 0.0
else:
self.memory = ReplayBuffer(action_size, 1, 1, self.device)
self.loss = 0.0
def act(self, handle, state, eps=0.):
state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
self.qnetwork_local.eval()
with torch.no_grad():
action_values = self.qnetwork_local(state)
self.qnetwork_local.train()
# Epsilon-greedy action selection
if random.random() >= eps:
return np.argmax(action_values.cpu().data.numpy())
else:
return random.choice(np.arange(self.action_size))
def step(self, handle, state, action, reward, next_state, done):
assert not self.evaluation_mode, "Policy has been initialized for evaluation only."
# Save experience in replay memory
self.memory.add(state, action, reward, next_state, done)
# Learn every UPDATE_EVERY time steps.
self.t_step = (self.t_step + 1) % self.update_every
if self.t_step == 0:
# If enough samples are available in memory, get random subset and learn
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
self._learn()
def _learn(self):
experiences = self.memory.sample()
states, actions, rewards, next_states, dones, _ = experiences
# Get expected Q values from local model
q_expected = self.qnetwork_local(states).gather(1, actions)
if self.double_dqn:
# Double DQN
q_best_action = self.qnetwork_local(next_states).max(1)[1]
q_targets_next = self.qnetwork_target(next_states).gather(1, q_best_action.unsqueeze(-1))
else:
# DQN
q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)
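# Double DQN decouples action selection (argmax over the online/local network) from action
# evaluation (target network), which reduces the overestimation bias of plain DQN
# (van Hasselt et al., 2016, https://arxiv.org/abs/1509.06461).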
# Compute Q targets for current states
q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
# Compute loss
self.loss = F.mse_loss(q_expected, q_targets)
# Minimize the loss
self.optimizer.zero_grad()
self.loss.backward()
self.optimizer.step()
# Update target network
self._soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)
def _soft_update(self, local_model, target_model, tau):
# Soft update model parameters.
# θ_target = τ*θ_local + (1 - τ)*θ_target
for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def save(self, filename):
torch.save(self.qnetwork_local.state_dict(), filename + ".local")
torch.save(self.qnetwork_target.state_dict(), filename + ".target")
def load(self, filename):
try:
if os.path.exists(filename + ".local") and os.path.exists(filename + ".target"):
self.qnetwork_local.load_state_dict(torch.load(filename + ".local", map_location=self.device))
print("qnetwork_local loaded ('{}')".format(filename + ".local"))
if not self.evaluation_mode:
self.qnetwork_target.load_state_dict(torch.load(filename + ".target", map_location=self.device))
print("qnetwork_target loaded ('{}' )".format(filename + ".target"))
else:
print(">> Checkpoint not found, using untrained policy! ('{}', '{}')".format(filename + ".local",
filename + ".target"))
except Exception as exc:
print(exc)
print("Couldn't load policy from, using untrained policy! ('{}', '{}')".format(filename + ".local",
filename + ".target"))
def save_replay_buffer(self, filename):
memory = self.memory.memory
with open(filename, 'wb') as f:
pickle.dump(list(memory)[-500000:], f)
def load_replay_buffer(self, filename):
with open(filename, 'rb') as f:
self.memory.memory = pickle.load(f)
def test(self):
self.act(0, np.array([[0] * self.state_size]))
self._learn()
def clone(self):
me = DDDQNPolicy(self.state_size, self.action_size, self.ddqn_parameters, evaluation_mode=True)
me.qnetwork_local = copy.deepcopy(self.qnetwork_local)
me.qnetwork_target = copy.deepcopy(self.qnetwork_target)
return me
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.rail_env import RailEnv, RailEnvActions
from reinforcement_learning.policy import HybridPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.agent_action_config import map_rail_env_action
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
class DeadLockAvoidanceWithDecisionAgent(HybridPolicy):
def __init__(self, env: RailEnv, state_size, action_size, learning_agent):
print(">> DeadLockAvoidanceWithDecisionAgent")
super(DeadLockAvoidanceWithDecisionAgent, self).__init__()
self.env = env
self.state_size = state_size
self.action_size = action_size
self.learning_agent = learning_agent
self.dead_lock_avoidance_agent = DeadLockAvoidanceAgent(self.env, action_size, False)
self.policy_selector = PPOPolicy(state_size, 2)
self.memory = self.learning_agent.memory
self.loss = self.learning_agent.loss
def step(self, handle, state, action, reward, next_state, done):
select = self.policy_selector.act(handle, state, 0.0)
self.policy_selector.step(handle, state, select, reward, next_state, done)
self.dead_lock_avoidance_agent.step(handle, state, action, reward, next_state, done)
self.learning_agent.step(handle, state, action, reward, next_state, done)
self.loss = self.learning_agent.loss
def act(self, handle, state, eps=0.):
select = self.policy_selector.act(handle, state, eps)
if select == 0:
return self.learning_agent.act(handle, state, eps)
return self.dead_lock_avoidance_agent.act(handle, state, -1.0)
def save(self, filename):
self.dead_lock_avoidance_agent.save(filename)
self.learning_agent.save(filename)
self.policy_selector.save(filename + '.selector')
def load(self, filename):
self.dead_lock_avoidance_agent.load(filename)
self.learning_agent.load(filename)
self.policy_selector.load(filename + '.selector')
def start_step(self, train):
self.dead_lock_avoidance_agent.start_step(train)
self.learning_agent.start_step(train)
self.policy_selector.start_step(train)
def end_step(self, train):
self.dead_lock_avoidance_agent.end_step(train)
self.learning_agent.end_step(train)
self.policy_selector.end_step(train)
def start_episode(self, train):
self.dead_lock_avoidance_agent.start_episode(train)
self.learning_agent.start_episode(train)
self.policy_selector.start_episode(train)
def end_episode(self, train):
self.dead_lock_avoidance_agent.end_episode(train)
self.learning_agent.end_episode(train)
self.policy_selector.end_episode(train)
def load_replay_buffer(self, filename):
self.dead_lock_avoidance_agent.load_replay_buffer(filename)
self.learning_agent.load_replay_buffer(filename)
self.policy_selector.load_replay_buffer(filename + ".selector")
def test(self):
self.dead_lock_avoidance_agent.test()
self.learning_agent.test()
self.policy_selector.test()
def reset(self, env: RailEnv):
self.env = env
self.dead_lock_avoidance_agent.reset(env)
self.learning_agent.reset(env)
self.policy_selector.reset(env)
def clone(self):
return self
import math
import multiprocessing
import os
import sys
from argparse import ArgumentParser, Namespace
from multiprocessing import Pool
from pathlib import Path
from pprint import pprint
import numpy as np
import torch
from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from utils.deadlock_check import check_if_all_blocked
from utils.timer import Timer
from utils.observation_utils import normalize_observation
from reinforcement_learning.dddqn_policy import DDDQNPolicy
def eval_policy(env_params, checkpoint, n_eval_episodes, max_steps, action_size, state_size, seed, render,
allow_skipping, allow_caching):
# Evaluation is faster on CPU (except if you use a really huge policy)
parameters = {
'use_gpu': False
}
policy = DDDQNPolicy(state_size, action_size, Namespace(**parameters), evaluation_mode=True)
policy.qnetwork_local = torch.load(checkpoint)
env_params = Namespace(**env_params)
# Environment parameters
n_agents = env_params.n_agents
x_dim = env_params.x_dim
y_dim = env_params.y_dim
n_cities = env_params.n_cities
max_rails_between_cities = env_params.max_rails_between_cities
max_rails_in_city = env_params.max_rails_in_city
# Malfunction and speed profiles
# TODO pass these parameters properly from main!
malfunction_parameters = MalfunctionParameters(
malfunction_rate=1. / 2000, # Rate of malfunctions
min_duration=20, # Minimal duration
max_duration=50 # Max duration
)
# Only fast trains in Round 1
speed_profiles = {
1.: 1.0, # Fast passenger train
1. / 2.: 0.0, # Fast freight train
1. / 3.: 0.0, # Slow commuter train
1. / 4.: 0.0 # Slow freight train
}
# Observation parameters
observation_tree_depth = env_params.observation_tree_depth
observation_radius = env_params.observation_radius
observation_max_path_depth = env_params.observation_max_path_depth
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Setup the environment
env = RailEnv(
width=x_dim, height=y_dim,
rail_generator=sparse_rail_generator(
max_num_cities=n_cities,
grid_mode=False,
max_rails_between_cities=max_rails_between_cities,
max_rails_in_city=max_rails_in_city,
),
schedule_generator=sparse_schedule_generator(speed_profiles),
number_of_agents=n_agents,
malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
obs_builder_object=tree_observation
)
if render:
env_renderer = RenderTool(env, gl="PGL")
action_dict = dict()
scores = []
completions = []
nb_steps = []
inference_times = []
preproc_times = []
agent_times = []
step_times = []
for episode_idx in range(n_eval_episodes):
seed += 1
inference_timer = Timer()
preproc_timer = Timer()
agent_timer = Timer()
step_timer = Timer()
step_timer.start()
obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True, random_seed=seed)
step_timer.end()
agent_obs = [None] * env.get_num_agents()
score = 0.0
if render:
env_renderer.set_new_rail()
final_step = 0
skipped = 0
nb_hit = 0
agent_last_obs = {}
agent_last_action = {}
for step in range(max_steps - 1):
if allow_skipping and check_if_all_blocked(env):
# FIXME why -1? bug where all agents are "done" after max_steps!
skipped = max_steps - step - 1
final_step = max_steps - 2
n_unfinished_agents = sum(not done[idx] for idx in env.get_agent_handles())
score -= skipped * n_unfinished_agents
break
agent_timer.start()
for agent in env.get_agent_handles():
if obs[agent] and info['action_required'][agent]:
if agent in agent_last_obs and np.all(agent_last_obs[agent] == obs[agent]):
nb_hit += 1
action = agent_last_action[agent]
else:
preproc_timer.start()
norm_obs = normalize_observation(obs[agent], tree_depth=observation_tree_depth,
observation_radius=observation_radius)
preproc_timer.end()
inference_timer.start()
action = policy.act(agent, norm_obs, eps=0.0)
inference_timer.end()
action_dict.update({agent: action})
if allow_caching:
agent_last_obs[agent] = obs[agent]
agent_last_action[agent] = action
agent_timer.end()
step_timer.start()
obs, all_rewards, done, info = env.step(action_dict)
step_timer.end()
if render:
env_renderer.render_env(
show=True,
frames=False,
show_observations=False,
show_predictions=False
)
if step % 100 == 0:
print("{}/{}".format(step, max_steps - 1))
for agent in env.get_agent_handles():
score += all_rewards[agent]
final_step = step
if done['__all__']:
break
normalized_score = score / (max_steps * env.get_num_agents())
scores.append(normalized_score)
tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
completion = tasks_finished / max(1, env.get_num_agents())
completions.append(completion)
nb_steps.append(final_step)
inference_times.append(inference_timer.get())
preproc_times.append(preproc_timer.get())
agent_times.append(agent_timer.get())
step_times.append(step_timer.get())
skipped_text = ""
if skipped > 0:
skipped_text = "\t⚡ Skipped {}".format(skipped)
hit_text = ""
if nb_hit > 0:
hit_text = "\t⚡ Hit {} ({:.1f}%)".format(nb_hit, (100 * nb_hit) / (n_agents * final_step))
print(
"☑️ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} "
"\t🍭 Seed: {}"
"\t🚉 Env: {:.3f}s "
"\t🤖 Agent: {:.3f}s (per step: {:.3f}s) \t[preproc: {:.3f}s \tinfer: {:.3f}s]"
"{}{}".format(
normalized_score,
completion * 100.0,
final_step,
seed,
step_timer.get(),
agent_timer.get(),
agent_timer.get() / final_step,
preproc_timer.get(),
inference_timer.get(),
skipped_text,
hit_text
)
)
return scores, completions, nb_steps, agent_times, step_times
def evaluate_agents(file, n_evaluation_episodes, use_gpu, render, allow_skipping, allow_caching):
nb_threads = 1
eval_per_thread = n_evaluation_episodes
if not render:
nb_threads = multiprocessing.cpu_count()
eval_per_thread = max(1, math.ceil(n_evaluation_episodes / nb_threads))
total_nb_eval = eval_per_thread * nb_threads
print("Will evaluate policy {} over {} episodes on {} threads.".format(file, total_nb_eval, nb_threads))
if total_nb_eval != n_evaluation_episodes:
print("(Rounding up from {} to fill all cores)".format(n_evaluation_episodes))
# Observation parameters need to match the ones used during training!
# small_v0
small_v0_params = {
# sample configuration
"n_agents": 5,
"x_dim": 25,
"y_dim": 25,
"n_cities": 4,
"max_rails_between_cities": 2,
"max_rails_in_city": 3,
# observations
"observation_tree_depth": 2,
"observation_radius": 10,
"observation_max_path_depth": 20
}
# Test_0
test0_params = {
# sample configuration
"n_agents": 5,
"x_dim": 25,
"y_dim": 25,
"n_cities": 2,
"max_rails_between_cities": 2,
"max_rails_in_city": 3,
# observations
"observation_tree_depth": 2,
"observation_radius": 10,
"observation_max_path_depth": 20
}
# Test_1
test1_params = {
# environment
"n_agents": 10,
"x_dim": 30,
"y_dim": 30,
"n_cities": 2,
"max_rails_between_cities": 2,
"max_rails_in_city": 3,
# observations
"observation_tree_depth": 2,
"observation_radius": 10,
"observation_max_path_depth": 10
}
# Test_5
test5_params = {
# environment
"n_agents": 80,
"x_dim": 35,
"y_dim": 35,
"n_cities": 5,
"max_rails_between_cities": 2,
"max_rails_in_city": 4,
# observations
"observation_tree_depth": 2,
"observation_radius": 10,
"observation_max_path_depth": 20
}
params = small_v0_params
env_params = Namespace(**params)
print("Environment parameters:")
pprint(params)
# Calculate space dimensions and max steps
max_steps = int(4 * 2 * (env_params.x_dim + env_params.y_dim + (env_params.n_agents / env_params.n_cities)))
action_size = 5
tree_observation = TreeObsForRailEnv(max_depth=env_params.observation_tree_depth)
tree_depth = env_params.observation_tree_depth
num_features_per_node = tree_observation.observation_dim
n_nodes = sum([np.power(4, i) for i in range(tree_depth + 1)])
state_size = num_features_per_node * n_nodes
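# For the small_v0 parameters above this gives max_steps = int(8 * (25 + 25 + 5 / 4)) = 410 and,
# for a depth-2 tree, n_nodes = 1 + 4 + 16 = 21, so state_size = 21 * observation_dim
# (e.g. 231 if the tree observation exposes 11 features per node).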
results = []
if render:
results.append(
eval_policy(params, file, eval_per_thread, max_steps, action_size, state_size, 0, render, allow_skipping,
allow_caching))
else:
with Pool() as p:
results = p.starmap(eval_policy,
[(params, file, 1, max_steps, action_size, state_size, seed * nb_threads, render,
allow_skipping, allow_caching)
for seed in
range(total_nb_eval)])
scores = []
completions = []
nb_steps = []
times = []
step_times = []
for s, c, n, t, st in results:
scores.append(s)
completions.append(c)
nb_steps.append(n)
times.append(t)
step_times.append(st)
print("-" * 200)
print("✅ Score: {:.3f} \tDone: {:.1f}% \tNb steps: {:.3f} \tAgent total: {:.3f}s (per step: {:.3f}s)".format(
np.mean(scores),
np.mean(completions) * 100.0,
np.mean(nb_steps),
np.mean(times),
np.mean(times) / np.mean(nb_steps)
))
print("⏲️ Agent sum: {:.3f}s \tEnv sum: {:.3f}s \tTotal sum: {:.3f}s".format(
np.sum(times),
np.sum(step_times),
np.sum(times) + np.sum(step_times)
))
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-f", "--file", help="checkpoint to load", required=True, type=str)
parser.add_argument("-n", "--n_evaluation_episodes", help="number of evaluation episodes", default=25, type=int)
# TODO
# parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=0, type=int)
parser.add_argument("--use_gpu", dest="use_gpu", help="use GPU if available", action='store_true')
parser.add_argument("--render", help="render a single episode", action='store_true')
parser.add_argument("--allow_skipping", help="skips to the end of the episode if all agents are deadlocked",
action='store_true')
parser.add_argument("--allow_caching", help="caches the last observation-action pair", action='store_true')
args = parser.parse_args()
os.environ["OMP_NUM_THREADS"] = str(1)
evaluate_agents(file=args.file, n_evaluation_episodes=args.n_evaluation_episodes, use_gpu=args.use_gpu,
render=args.render,
allow_skipping=args.allow_skipping, allow_caching=args.allow_caching)
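# Example invocations (the script and checkpoint file names below are placeholders/assumptions,
# the diff does not show the actual paths):
#   python evaluate_agent.py -f checkpoints/my_policy.pth -n 25 --allow_skipping --allow_caching
#   python evaluate_agent.py -f checkpoints/my_policy.pth --render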
import torch.nn as nn
import torch.nn.functional as F
class DuelingQNetwork(nn.Module):
"""Dueling Q-network (https://arxiv.org/abs/1511.06581)"""
def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128):
super(DuelingQNetwork, self).__init__()
# value network
self.fc1_val = nn.Linear(state_size, hidsize1)
self.fc2_val = nn.Linear(hidsize1, hidsize2)
self.fc4_val = nn.Linear(hidsize2, 1)
# advantage network
self.fc1_adv = nn.Linear(state_size, hidsize1)
self.fc2_adv = nn.Linear(hidsize1, hidsize2)
self.fc4_adv = nn.Linear(hidsize2, action_size)
def forward(self, x):
val = F.relu(self.fc1_val(x))
val = F.relu(self.fc2_val(val))
val = self.fc4_val(val)
# advantage calculation
adv = F.relu(self.fc1_adv(x))
adv = F.relu(self.fc2_adv(adv))
adv = self.fc4_adv(adv)
return val + adv - adv.mean()
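# Minimal usage sketch (not part of the original file). The forward pass combines the two streams
# as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), the aggregation proposed in the dueling-network
# paper; the sizes below are illustrative assumptions, not values taken from this repository.
if __name__ == "__main__":
    import torch

    net = DuelingQNetwork(state_size=231, action_size=5)
    q_values = net(torch.rand(1, 231))          # shape: (1, 5)
    best_action = int(q_values.argmax(dim=1))   # greedy action for the random sample state
    print(q_values, best_action)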
from flatland.envs.rail_env import RailEnv
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.policy import LearningPolicy, DummyMemory
from reinforcement_learning.ppo_agent import PPOPolicy
class MultiDecisionAgent(LearningPolicy):
def __init__(self, state_size, action_size, in_parameters=None):
print(">> MultiDecisionAgent")
super(MultiDecisionAgent, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.in_parameters = in_parameters
self.memory = DummyMemory()
self.loss = 0
self.ppo_policy = PPOPolicy(state_size, action_size, use_replay_buffer=False, in_parameters=in_parameters)
self.dddqn_policy = DDDQNPolicy(state_size, action_size, in_parameters)
self.policy_selector = PPOPolicy(state_size, 2)
def step(self, handle, state, action, reward, next_state, done):
self.ppo_policy.step(handle, state, action, reward, next_state, done)
self.dddqn_policy.step(handle, state, action, reward, next_state, done)
select = self.policy_selector.act(handle, state, 0.0)
self.policy_selector.step(handle, state, select, reward, next_state, done)
def act(self, handle, state, eps=0.):
select = self.policy_selector.act(handle, state, eps)
if select == 0:
return self.dddqn_policy.act(handle, state, eps)
return self.ppo_policy.act(handle, state, eps)
def save(self, filename):
self.ppo_policy.save(filename)
self.dddqn_policy.save(filename)
self.policy_selector.save(filename)
def load(self, filename):
self.ppo_policy.load(filename)
self.dddqn_policy.load(filename)
self.policy_selector.load(filename)
def start_step(self, train):
self.ppo_policy.start_step(train)
self.dddqn_policy.start_step(train)
self.policy_selector.start_step(train)
def end_step(self, train):
self.ppo_policy.end_step(train)
self.dddqn_policy.end_step(train)
self.policy_selector.end_step(train)
def start_episode(self, train):
self.ppo_policy.start_episode(train)
self.dddqn_policy.start_episode(train)
self.policy_selector.start_episode(train)
def end_episode(self, train):
self.ppo_policy.end_episode(train)
self.dddqn_policy.end_episode(train)
self.policy_selector.end_episode(train)
def load_replay_buffer(self, filename):
self.ppo_policy.load_replay_buffer(filename)
self.dddqn_policy.load_replay_buffer(filename)
self.policy_selector.load_replay_buffer(filename)
def test(self):
self.ppo_policy.test()
self.dddqn_policy.test()
self.policy_selector.test()
def reset(self, env: RailEnv):
self.ppo_policy.reset(env)
self.dddqn_policy.reset(env)
self.policy_selector.reset(env)
def clone(self):
multi_decision_agent = MultiDecisionAgent(
self.state_size,
self.action_size,
self.in_parameters
)
multi_decision_agent.ppo_policy = self.ppo_policy.clone()
multi_decision_agent.dddqn_policy = self.dddqn_policy.clone()
multi_decision_agent.policy_selector = self.policy_selector.clone()
return multi_decision_agent
import numpy as np
from flatland.envs.rail_env import RailEnv
from reinforcement_learning.policy import Policy
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
class MultiPolicy(Policy):
def __init__(self, state_size, action_size, n_agents, env):
self.state_size = state_size
self.action_size = action_size
self.memory = []
self.loss = 0
self.deadlock_avoidance_policy = DeadLockAvoidanceAgent(env, action_size, False)
self.ppo_policy = PPOPolicy(state_size + action_size, action_size)
def load(self, filename):
self.ppo_policy.load(filename)
self.deadlock_avoidance_policy.load(filename)
def save(self, filename):
self.ppo_policy.save(filename)
self.deadlock_avoidance_policy.save(filename)
def step(self, handle, state, action, reward, next_state, done):
action_extra_state = self.deadlock_avoidance_policy.act(handle, state, 0.0)
action_extra_next_state = self.deadlock_avoidance_policy.act(handle, next_state, 0.0)
extended_state = np.copy(state)
for action_itr in np.arange(self.action_size):
extended_state = np.append(extended_state, [int(action_extra_state == action_itr)])
extended_next_state = np.copy(next_state)
for action_itr in np.arange(self.action_size):
extended_next_state = np.append(extended_next_state, [int(action_extra_next_state == action_itr)])
self.deadlock_avoidance_policy.step(handle, state, action, reward, next_state, done)
self.ppo_policy.step(handle, extended_state, action, reward, extended_next_state, done)
def act(self, handle, state, eps=0.):
action_extra_state = self.deadlock_avoidance_policy.act(handle, state, 0.0)
extended_state = np.copy(state)
for action_itr in np.arange(self.action_size):
extended_state = np.append(extended_state, [int(action_extra_state == action_itr)])
action_ppo = self.ppo_policy.act(handle, extended_state, eps)
self.loss = self.ppo_policy.loss
return action_ppo
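# Note on the state extension above: with action_size = 5 and a deadlock-avoidance action of 2,
# the appended one-hot block is [0, 0, 1, 0, 0], which is why the internal PPO policy is created
# with state_size + action_size inputs.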
def reset(self, env: RailEnv):
self.ppo_policy.reset(env)
self.deadlock_avoidance_policy.reset(env)
def test(self):
self.ppo_policy.test()
self.deadlock_avoidance_policy.test()
def start_step(self, train):
self.deadlock_avoidance_policy.start_step(train)
self.ppo_policy.start_step(train)
def end_step(self, train):
self.deadlock_avoidance_policy.end_step(train)
self.ppo_policy.end_step(train)
import sys
from pathlib import Path
import numpy as np
from reinforcement_learning.policy import Policy
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from utils.observation_utils import split_tree_into_feature_groups, min_gt
class OrderedPolicy(Policy):
def __init__(self):
self.action_size = 5
def act(self, handle, state, eps=0.):
_, distance, _ = split_tree_into_feature_groups(state, 1)
distance = distance[1:]
min_dist = min_gt(distance, 0)
min_direction = np.where(distance == min_dist)
if len(min_direction[0]) > 1:
return min_direction[0][-1] + 1
return min_direction[0] + 1
def step(self, handle, state, action, reward, next_state, done):
return
def save(self, filename):
return
def load(self, filename):
return
from flatland.envs.rail_env import RailEnv
class DummyMemory:
def __init__(self):
self.memory = []
def __len__(self):
return 0
class Policy:
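# Common interface shared by all policies in this repository: act/step implement action
# selection and learning, the start_/end_step and start_/end_episode hooks default to no-ops
# so subclasses only override what they need, and save/load/clone handle checkpointing.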
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, handle, state, eps=0.):
raise NotImplementedError
def save(self, filename):
raise NotImplementedError
def load(self, filename):
raise NotImplementedError
def start_step(self, train):
pass
def end_step(self, train):
pass
def start_episode(self, train):
pass
def end_episode(self, train):
pass
def load_replay_buffer(self, filename):
pass
def test(self):
pass
def reset(self, env: RailEnv):
pass
def clone(self):
return self
class HeuristicPolicy(Policy):
def __init__(self):
super(HeuristicPolicy, self).__init__()
class LearningPolicy(Policy):
def __init__(self):
super(LearningPolicy, self).__init__()
class HybridPolicy(Policy):
def __init__(self):
super(HybridPolicy, self).__init__()
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
def __init__(self):
self.reset()
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def reset(self):
self.memory = {}
def get_transitions(self, handle):
return self.memory.get(handle, [])
def push_transition(self, handle, transition):
transitions = self.get_transitions(handle)
transitions.append(transition)
self.memory.update({handle: transitions})
class ActorCriticModel(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(ActorCriticModel, self).__init__()
self.device = device
self.actor = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
self.critic = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, x):
raise NotImplementedError
def get_actor_dist(self, state):
action_probs = self.actor(state)
dist = Categorical(action_probs)
return dist
def evaluate(self, states, actions):
action_probs = self.actor(states)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(actions)
dist_entropy = dist.entropy()
state_value = self.critic(states)
return action_logprobs, torch.squeeze(state_value), dist_entropy
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.actor.state_dict(), filename + ".actor")
torch.save(self.critic.state_dict(), filename + ".value")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
return obj
def load(self, filename):
print("load model from file", filename)
self.actor = self._load(self.actor, filename + ".actor")
self.critic = self._load(self.critic, filename + ".value")
class PPOPolicy(LearningPolicy):
def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
print(">> PPOPolicy")
super(PPOPolicy, self).__init__()
# remember the sizes so that clone() can construct an equivalent policy
self.state_size = state_size
self.action_size = action_size
# parameters
self.ppo_parameters = in_parameters
if self.ppo_parameters is not None:
self.hidsize = self.ppo_parameters.hidden_size
self.buffer_size = self.ppo_parameters.buffer_size
self.batch_size = self.ppo_parameters.batch_size
self.learning_rate = self.ppo_parameters.learning_rate
self.gamma = self.ppo_parameters.gamma
# Device
if self.ppo_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 1.0e-3
self.gamma = 0.95
self.buffer_size = 32_000
self.batch_size = 1024
self.device = torch.device("cpu")
self.surrogate_eps_clip = 0.1
self.K_epoch = 10
self.weight_loss = 0.5
self.weight_entropy = 0.01
self.buffer_min_size = 0
self.use_replay_buffer = use_replay_buffer
self.current_episode_memory = EpisodeBuffers()
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.loss = 0
self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
self.loss_function = nn.MSELoss() # nn.SmoothL1Loss()
def reset(self, env):
pass
def act(self, handle, state, eps=None):
# sample an action from the actor's policy distribution
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor_critic_model.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist = self.actor_critic_model.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
def _push_transitions_to_replay_buffer(self,
state_list,
action_list,
reward_list,
state_next_list,
done_list,
prob_a_list):
for idx in range(len(reward_list)):
state_i = state_list[idx]
action_i = action_list[idx]
reward_i = reward_list[idx]
state_next_i = state_next_list[idx]
done_i = done_list[idx]
prob_action_i = prob_a_list[idx]
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
def _convert_transitions_to_torch_tensors(self, transitions_array):
# build empty lists (one per transition component)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
if done_i:
discounted_reward = 0
done_list.insert(0, 1)
else:
done_list.insert(0, 0)
discounted_reward = reward_i + self.gamma * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
if self.use_replay_buffer:
self._push_transitions_to_replay_buffer(state_list, action_list,
reward_list, state_next_list,
done_list, prob_a_list)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
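# Worked example of the reward-to-go computation above (illustrative numbers, gamma = 0.95):
# walking the episode [r=1, r=0, r=2 (done)] backwards yields discounted returns
# [1 + 0.95 * 1.9, 0 + 0.95 * 2, 2] = [2.805, 1.9, 2.0]; the done flag resets the accumulator so
# returns never leak across episode boundaries.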
def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
actions = torch.squeeze(actions)
rewards = torch.squeeze(rewards)
states_next = torch.squeeze(states_next)
dones = torch.squeeze(dones)
probs_action = torch.squeeze(probs_action)
return states, actions, rewards, states_next, dones, probs_action
def train_net(self):
# All agents have to propagate the experiences collected during the past episode
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
# Convert the replay buffer to torch tensors (arrays)
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history)
# Optimize policy for K epochs:
for k_loop in range(int(self.K_epoch)):
if self.use_replay_buffer:
states, actions, rewards, states_next, dones, probs_action = \
self._get_transitions_from_replay_buffer(
states, actions, rewards, states_next, dones, probs_action
)
# Evaluating actions (actor) and values (critic)
logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
# Finding the probability ratios (pi_theta / pi_theta_old):
ratios = torch.exp(logprobs - probs_action.detach())
# Finding the surrogate loss
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
# The loss combines the clipped surrogate objective with a value-function error term and an
# entropy bonus. The entropy term penalizes the policy for becoming (near-)deterministic too
# early, which would make the gradient very flat and therefore no longer useful for learning.
loss = \
-torch.min(surr1, surr2) \
+ self.weight_loss * self.loss_function(state_values, rewards) \
- self.weight_entropy * dist_entropy
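# This is the standard PPO clipped objective
#   L = -min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) + c1 * MSE(V(s_t), R_t) - c2 * H(pi),
# with probability ratio r_t, advantage A_t, eps = surrogate_eps_clip, c1 = weight_loss and
# c2 = weight_entropy.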
# Make a gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
# Store the current loss on the policy, for debugging purposes only
self.loss = loss.mean().detach().cpu().numpy()
# Reset all collected transition data
self.current_episode_memory.reset()
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.actor_critic_model.save(filename)
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.actor_critic_model.load(filename)
print("load optimizer from file", filename)
self.optimizer = self._load(self.optimizer, filename + ".optimizer")
def clone(self):
policy = PPOPolicy(self.state_size, self.action_size)
policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
policy.optimizer = copy.deepcopy(self.optimizer)
return policy