Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • hebe0663/neurips2020-flatland-starter-kit
  • flatland/neurips2020-flatland-starter-kit
  • manavsinghal157/marl-flatland
3 results
Show changes
Showing
with 828 additions and 126 deletions
import random
# FIX: `Iterable` must be imported from `collections.abc` — importing it from
# `collections` was deprecated since Python 3.3 and removed in Python 3.10.
from collections import namedtuple, deque
from collections.abc import Iterable

import numpy as np
import torch
# One stored transition; `action_prob` is kept for policies that need it (e.g. PPO).
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])


class ReplayBuffer:
    """Fixed-size FIFO buffer of experience tuples, sampled as device tensors."""

    def __init__(self, action_size, buffer_size, batch_size, device):
        """Create an empty replay buffer.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device

    def add(self, state, action, reward, next_state, done, action_prob=0.0):
        """Store one transition; states get a leading batch axis of 1."""
        record = Experience(np.expand_dims(state, 0), action, reward,
                            np.expand_dims(next_state, 0), done, action_prob)
        self.memory.append(record)

    def sample(self):
        """Draw a random mini-batch and return it as tensors on `self.device`."""
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [e for e in drawn if e is not None]

        def column(values):
            # Stack one field of the batch into a (batch, sub_dim) tensor.
            return torch.from_numpy(self.__v_stack_impr(values))

        states = column([e.state for e in batch]).float().to(self.device)
        actions = column([e.action for e in batch]).long().to(self.device)
        rewards = column([e.reward for e in batch]).float().to(self.device)
        next_states = column([e.next_state for e in batch]).float().to(self.device)
        # Booleans go through uint8 before the float cast, as torch expects.
        dones = torch.from_numpy(
            self.__v_stack_impr([e.done for e in batch]).astype(np.uint8)
        ).float().to(self.device)
        action_probs = column([e.action_prob for e in batch]).float().to(self.device)

        return states, actions, rewards, next_states, dones, action_probs

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)

    def __v_stack_impr(self, states):
        # Flatten a list of per-experience values into a (batch, sub_dim) array;
        # scalars (e.g. actions, rewards) become a (batch, 1) column.
        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
        return np.reshape(np.array(states), (len(states), sub_dim))
from collections import deque
from collections import namedtuple
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
# Field names of the hyper-parameter bundle consumed by DDDQNPolicy.
dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
                                            'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
# Default DDDQN settings for the CartPole sanity-check run below.
dddqn_param = dddqn_param_nt(hidden_size=128,
                             buffer_size=1000,
                             batch_size=64,
                             update_every=10,
                             learning_rate=1.e-3,
                             tau=1.e-2,
                             gamma=0.95,
                             buffer_min_size=0,
                             use_gpu=False)
def cartpole(use_dddqn=False):
    """Train a policy on gym's CartPole-v1 as a sanity check for the policies.

    Runs indefinitely (stop with Ctrl-C); logs per-episode reward and a
    100-episode moving average to TensorBoard.

    Params
    ======
        use_dddqn (bool): use DDDQNPolicy when True, otherwise PPOPolicy
    """
    eps = 1.0
    eps_decay = 0.99
    min_eps = 0.01
    training_mode = True

    env = gym.make("CartPole-v1")
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    if not use_dddqn:
        policy = PPOPolicy(observation_space, action_space, False)
    else:
        policy = DDDQNPolicy(observation_space, action_space, dddqn_param)

    episode = 0
    checkpoint_interval = 20
    scores_window = deque(maxlen=100)
    writer = SummaryWriter()

    while True:
        episode += 1
        state = env.reset()
        policy.reset(env)
        handle = 0  # single-agent setting: always handle 0
        tot_reward = 0

        policy.start_episode(train=training_mode)
        while True:
            # env.render()
            policy.start_step(train=training_mode)
            action = policy.act(handle, state, eps)
            state_next, reward, terminal, info = env.step(action)
            policy.end_step(train=training_mode)
            tot_reward += reward
            # reward = reward if not terminal else -reward
            # Reward shaping: 0 per step, -1 only on termination.
            reward = 0 if not terminal else -1
            policy.step(handle, state, action, reward, state_next, terminal)
            state = np.copy(state_next)
            if terminal:
                break
        policy.end_episode(train=training_mode)

        eps = max(min_eps, eps * eps_decay)
        scores_window.append(tot_reward)

        # FIX: the two branches previously duplicated the same 7-line print call,
        # differing only in `end`. Checkpoint episodes end the line; others
        # overwrite it in place via the leading '\r'.
        line_end = "\n" if episode % checkpoint_interval == 0 else " "
        print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
            episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=line_end)

        writer.add_scalar("CartPole/value", tot_reward, episode)
        writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
        writer.flush()
if __name__ == "__main__":
    # Train with the default PPO policy; pass use_dddqn=True for DDDQN instead.
    cartpole()
import sys
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
# Make the repository root importable so `reinforcement_learning` resolves.
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))

from reinforcement_learning.ordered_policy import OrderedPolicy

"""
This file shows how to move agents in a sequential way: it moves the trains one by one, following a shortest path strategy.
This is obviously very slow, but it's a good way to get familiar with the different Flatland components: RailEnv, TreeObsForRailEnv, etc...

multi_agent_training.py is a better starting point to train your own solution!
"""

# Seeded random environment dimensions and agent count (reproducible across runs).
np.random.seed(2)
x_dim = np.random.randint(8, 20)
y_dim = np.random.randint(8, 20)
n_agents = np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))

env = RailEnv(
    width=x_dim,
    height=y_dim,
    rail_generator=complex_rail_generator(
        nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
        max_dist=99999,
        seed=0
    ),
    schedule_generator=complex_schedule_generator(),
    obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
    number_of_agents=n_agents)
env.reset(True, True)

tree_depth = 1
# NOTE(review): `observation_helper` is built but never used below — likely leftover.
observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PGL", )
handle = env.get_agent_handles()
n_episodes = 10
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()

for trials in range(1, n_episodes + 1):
    # Reset environment
    obs, info = env.reset(True, True)
    done = env.dones
    env_renderer.reset()
    frame_step = 0

    # Run episode
    for step in range(max_steps):
        env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
        if record_images:
            env_renderer.gl.save_image("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
            frame_step += 1

        # Action: only the first not-yet-done agent acts via the policy; every
        # other agent gets action 4 (presumably STOP_MOVING — confirm against
        # flatland's RailEnvActions).
        acting_agent = 0
        for a in range(env.get_num_agents()):
            if done[a]:
                acting_agent += 1
            if a == acting_agent:
                action = policy.act(a, obs[a])
            else:
                action = 4
            action_dict.update({a: action})

        # Environment step
        obs, all_rewards, done, _ = env.step(action_dict)
        if done['__all__']:
            break
import sys
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
# Make the repository root importable so `reinforcement_learning` resolves.
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))

from reinforcement_learning.ordered_policy import OrderedPolicy

# Fixed environment size and agent count (randomized variants kept as comments).
np.random.seed(2)
x_dim = 20  # np.random.randint(8, 20)
y_dim = 20  # np.random.randint(8, 20)
n_agents = 10  # np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))

env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=complex_rail_generator(
                  nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                  max_dist=99999,
                  seed=0
              ),
              schedule_generator=complex_schedule_generator(),
              obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
              number_of_agents=n_agents)
env.reset(True, True)

tree_depth = 1
# NOTE(review): `observation_helper` is built but never used below — likely leftover.
observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PGL", )
handle = env.get_agent_handles()
n_episodes = 1
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()

for trials in range(1, n_episodes + 1):
    # Reset environment
    obs, info = env.reset(True, True)
    done = env.dones
    env_renderer.reset()
    frame_step = 0

    # Run episode
    for step in range(max_steps):
        env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
        if record_images:
            env_renderer.gl.save_image("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
            frame_step += 1

        # Action: only the first not-yet-done agent acts via the policy; every
        # other agent gets action 4 (presumably STOP_MOVING — confirm against
        # flatland's RailEnvActions).
        acting_agent = 0
        for a in range(env.get_num_agents()):
            if done[a]:
                acting_agent += 1
            if a == acting_agent:
                action = policy.act(a, obs[a])
            else:
                action = 4
            action_dict.update({a: action})

        # Environment step
        obs, all_rewards, done, _ = env.step(action_dict)
        if done['__all__']:
            break
import random
import sys
from argparse import ArgumentParser, Namespace
from collections import deque
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.dddqn_policy import DDDQNPolicy
import matplotlib.pyplot as plt
import numpy as np
import torch
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from utils.observation_utils import normalize_observation
from flatland.envs.observations import TreeObsForRailEnv
"""
This file shows how to train a single agent using a reinforcement learning approach.
Documentation: https://flatland.aicrowd.com/getting-started/rl/single-agent.html
This is a simple method used for demonstration purposes.
multi_agent_training.py is a better starting point to train your own solution!
"""
def train_agent(n_episodes):
    """Train a single agent with a Double Dueling DQN policy on a sparse rail env.

    Saves a checkpoint every 100 episodes and plots score/completion curves
    at the end.

    Params
    ======
        n_episodes (int): number of training episodes to run
    """
    # Environment parameters
    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2
    observation_radius = 10

    # Exploration parameters
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.997  # for 2500ts

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Observation builder
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            seed=seed,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city
        ),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=n_agents,
        obs_builder_object=tree_observation
    )
    env.reset(True, True)

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = sum(np.power(4, i) for i in range(observation_tree_depth + 1))
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_dict = dict()

    # And some variables to keep track of the progress
    scores_window = deque(maxlen=100)  # todo smooth when rendering instead
    completion_window = deque(maxlen=100)
    scores = []
    completion = []
    action_count = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    # FIX: track per agent whether an action was chosen this step. A single
    # shared flag would leak the last agent's value into every agent's replay
    # update (harmless with n_agents=1, wrong for multi-agent use).
    update_values = [False] * env.get_num_agents()

    # Training parameters
    training_parameters = {
        'buffer_size': int(1e5),
        'batch_size': 32,
        'update_every': 8,
        'learning_rate': 0.5e-4,
        'tau': 1e-3,
        'gamma': 0.99,
        'buffer_min_size': 0,
        'hidden_size': 256,
        'use_gpu': False
    }

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, Namespace(**training_parameters))

    # FIX: ensure the checkpoint directory exists before the first torch.save.
    Path('./checkpoints').mkdir(parents=True, exist_ok=True)

    for episode_idx in range(n_episodes):
        score = 0

        # Reset environment
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[agent] = True
                    action = policy.act(agent, agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                else:
                    update_values[agent] = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)

            # Update replay buffer and train agent
            for agent in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[agent] or done[agent]:
                    policy.step(agent,
                                agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent],
                                agent_obs[agent], done[agent])

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                if next_obs[agent]:
                    # FIX: use the shared `observation_radius` variable instead of a
                    # hard-coded 10, keeping it consistent with the initial normalization.
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)

                score += all_rewards[agent]

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = np.sum([int(done[idx]) for idx in env.get_agent_handles()])
        completion_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / (max_steps * env.get_num_agents()))
        completion.append((np.mean(completion_window)))
        scores.append(np.mean(scores_window))
        action_probs = action_count / np.sum(action_count)

        if episode_idx % 100 == 0:
            end = "\n"
            torch.save(policy.qnetwork_local, './checkpoints/single-' + str(episode_idx) + '.pth')
            # Reset counts to 1 (not 0) so the next probability estimate never divides by zero.
            action_count = [1] * action_size
        else:
            end = " "

        print(
            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(),
                x_dim, y_dim,
                episode_idx,
                np.mean(scores_window),
                100 * np.mean(completion_window),
                eps_start,
                action_probs
            ), end=end)

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
    plt.plot(completion)
    plt.show()
if __name__ == "__main__":
    # Command-line entry point: `-n / --n_episodes` selects the episode budget.
    cli = ArgumentParser()
    cli.add_argument("-n", "--n_episodes", dest="n_episodes",
                     help="number of episodes to run", type=int, default=500)
    arguments = cli.parse_args()
    train_agent(arguments.n_episodes)
This diff is collapsed.
#!/bin/bash
# Entry point: launch the evaluation/submission runner.
python ./run.py
#!/bin/bash
# NOTE: manually install submodules before running this script.
python ./run.py
from time import time
import numpy as np
from flatland.envs.rail_env import fast_isclose
def print_timing(label, start_time, end_time):
    """Print one timing line: right-aligned elapsed milliseconds, then the label."""
    elapsed_ms = 1000 * (end_time - start_time)
    print("{:>10.4f}ms".format(elapsed_ms) + "\t" + label)
def check_isclose(nbr=100000):
    """Benchmark flatland's fast_isclose against np.isclose over `nbr` calls each."""
    started = time()
    for x in range(nbr):
        fast_isclose(x, 0.0, rtol=1e-03)
    finished = time()
    print_timing("fast_isclose", start_time=started, end_time=finished)

    started = time()
    for x in range(nbr):
        np.isclose(x, 0.0, rtol=1e-03)
    finished = time()
    print_timing("np.isclose", start_time=started, end_time=finished)
if __name__ == "__main__":
    # Compare the runtime of flatland's fast_isclose with numpy's np.isclose.
    check_isclose()