Compare revisions

Changes are shown as if the source revision were being merged into the target revision.
Showing with 261 additions and 57 deletions

import random
from collections import namedtuple, deque
from collections.abc import Iterable  # Iterable moved out of collections in Python 3.10

import numpy as np
import torch

Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, device):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device

    def add(self, state, action, reward, next_state, done, action_prob=0.0):
        """Add a new experience to memory."""
        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
            .float().to(self.device)
        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
            .long().to(self.device)
        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
            .float().to(self.device)
        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
            .float().to(self.device)
        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
            .float().to(self.device)
        action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
            .float().to(self.device)

        return states, actions, rewards, next_states, dones, action_probs

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def __v_stack_impr(self, states):
        """Stack a list of entries into a 2-D numpy array of shape (batch, sub_dim)."""
        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
        np_states = np.reshape(np.array(states), (len(states), sub_dim))
        return np_states
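
A minimal usage sketch (editor's addition, not part of the diff): it relies only on the ReplayBuffer class above, and the state size and action count below are illustrative placeholders.

import numpy as np
import torch

# Assumed sizes for illustration only.
STATE_SIZE = 231
ACTION_SIZE = 5

buffer = ReplayBuffer(action_size=ACTION_SIZE, buffer_size=1000, batch_size=64, device=torch.device("cpu"))

state = np.zeros(STATE_SIZE, dtype=np.float32)
for _ in range(128):  # fill past batch_size so sample() has enough entries
    action = np.random.randint(ACTION_SIZE)
    next_state = np.random.rand(STATE_SIZE).astype(np.float32)
    buffer.add(state, action, 0.0, next_state, False)
    state = next_state

states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape, actions.shape)  # torch.Size([64, 231]) torch.Size([64, 1])
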
from collections import deque
from collections import namedtuple

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy

dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every',
                                            'learning_rate', 'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
                             buffer_size=1000,
                             batch_size=64,
                             update_every=10,
                             learning_rate=1.e-3,
                             tau=1.e-2,
                             gamma=0.95,
                             buffer_min_size=0,
                             use_gpu=False)


def cartpole(use_dddqn=False):
    eps = 1.0
    eps_decay = 0.99
    min_eps = 0.01
    training_mode = True

    env = gym.make("CartPole-v1")
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    if not use_dddqn:
        policy = PPOPolicy(observation_space, action_space, False)
    else:
        policy = DDDQNPolicy(observation_space, action_space, dddqn_param)

    episode = 0
    checkpoint_interval = 20
    scores_window = deque(maxlen=100)
    writer = SummaryWriter()

    while True:
        episode += 1
        state = env.reset()
        policy.reset(env)
        handle = 0
        tot_reward = 0

        policy.start_episode(train=training_mode)
        while True:
            # env.render()
            policy.start_step(train=training_mode)
            action = policy.act(handle, state, eps)
            state_next, reward, terminal, info = env.step(action)
            policy.end_step(train=training_mode)
            tot_reward += reward
            # reward = reward if not terminal else -reward
            reward = 0 if not terminal else -1
            policy.step(handle, state, action, reward, state_next, terminal)
            state = np.copy(state_next)
            if terminal:
                break
        policy.end_episode(train=training_mode)

        eps = max(min_eps, eps * eps_decay)
        scores_window.append(tot_reward)

        end = "\n" if episode % checkpoint_interval == 0 else " "
        print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
            episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=end)

        writer.add_scalar("CartPole/value", tot_reward, episode)
        writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
        writer.flush()


if __name__ == "__main__":
    cartpole()
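
The harness above drives the policy through a fixed set of lifecycle hooks. Below is a minimal stub of that interface (editor's sketch, not code from the repo; the real implementations are DDDQNPolicy and PPOPolicy), useful when plugging a new policy into this loop.

import random
from collections import deque


class RandomPolicyStub:
    """Illustrative stub of the interface cartpole() expects; acts uniformly at random."""

    def __init__(self, state_size, action_size):
        self.action_size = action_size
        self.memory = deque(maxlen=1000)  # cartpole() reports len(policy.memory)

    def reset(self, env):
        pass

    def start_episode(self, train):
        pass

    def start_step(self, train):
        pass

    def act(self, handle, state, eps=0.0):
        return random.randint(0, self.action_size - 1)

    def step(self, handle, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def end_step(self, train):
        pass

    def end_episode(self, train):
        pass
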
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
         if done[a]:
             acting_agent += 1
         if a == acting_agent:
-            action = policy.act(obs[a])
+            action = policy.act(a, obs[a])
         else:
             action = 4
         action_dict.update({a: action})
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -66,7 +66,7 @@ for trials in range(1, n_episodes + 1):
         if done[a]:
             acting_agent += 1
         if a == acting_agent:
-            action = policy.act(obs[a])
+            action = policy.act(a, obs[a])
         else:
             action = 4
         action_dict.update({a: action})
@@ -123,7 +123,8 @@ def train_agent(n_episodes):
         # Build agent specific observations
         for agent in env.get_agent_handles():
             if obs[agent]:
-                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
+                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
+                                                         observation_radius=observation_radius)
                 agent_prev_obs[agent] = agent_obs[agent].copy()

         # Run episode
@@ -132,7 +133,7 @@ def train_agent(n_episodes):
                 if info['action_required'][agent]:
                     # If an action is required, we want to store the obs at that step as well as the action
                     update_values = True
-                    action = policy.act(agent_obs[agent], eps=eps_start)
+                    action = policy.act(agent, agent_obs[agent], eps=eps_start)
                     action_count[action] += 1
                 else:
                     update_values = False
@@ -154,7 +155,8 @@ def train_agent(n_episodes):
                     agent_prev_action[agent] = action_dict[agent]

                 if next_obs[agent]:
-                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
+                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
+                                                             observation_radius=10)

                 score += all_rewards[agent]
@@ -179,15 +181,16 @@ def train_agent(n_episodes):
         else:
             end = " "

-        print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-            env.get_num_agents(),
-            x_dim, y_dim,
-            episode_idx,
-            np.mean(scores_window),
-            100 * np.mean(completion_window),
-            eps_start,
-            action_probs
-        ), end=end)
+        print(
+            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                env.get_num_agents(),
+                x_dim, y_dim,
+                episode_idx,
+                np.mean(scores_window),
+                100 * np.mean(completion_window),
+                eps_start,
+                action_probs
+            ), end=end)

     # Plot overall training progress at the end
     plt.plot(scores)
@@ -199,7 +202,8 @@ def train_agent(n_episodes):
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
+    parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
+                        type=int)
     args = parser.parse_args()

     train_agent(args.n_episodes)
'''
I did experiments in an early submission. Please note that the epsilon used at
evaluation time can have an effect on the outcome:

DDDQNPolicy experiments - EPSILON impact analysis
----------------------------------------------------------------------------------------
checkpoint = "./checkpoints/201124171810-7800.pth"  # Training on AGENTS=10 with Depth=2
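
For context: the eps/EPSILON value threaded through policy.act is the exploration rate of an epsilon-greedy rule, so a non-zero value at evaluation time injects random actions and can lower the score. A minimal sketch of that selection rule (illustrative only, not the repo's exact implementation):

import random

import numpy as np


def epsilon_greedy_act(q_values, eps):
    # With probability eps take a random action, otherwise the greedy (argmax) one.
    if random.random() < eps:
        return random.randint(0, len(q_values) - 1)
    return int(np.argmax(q_values))


# eps=0.01 makes roughly 1 in 100 decisions random; eps=0.0 is fully greedy.
print(epsilon_greedy_act([0.1, 0.5, 0.2, 0.05, 0.15], eps=0.01))
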
@@ -25,12 +27,17 @@ from pathlib import Path
import numpy as np
from flatland.core.env_observation_builder import DummyObservationBuilder
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.evaluators.client import FlatlandRemoteClient
from flatland.evaluators.client import TimeoutException
from reinforcement_learning.ppo_agent import PPOAgent
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.deadlockavoidance_with_decision_agent import DeadLockAvoidanceWithDecisionAgent
from reinforcement_learning.multi_decision_agent import MultiDecisionAgent
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.agent_action_config import get_action_size, map_actions, set_action_size_reduced, set_action_size_full
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
from utils.deadlock_check import check_if_all_blocked
from utils.fast_tree_obs import FastTreeObs
@@ -39,33 +46,71 @@ from utils.observation_utils import normalize_observation
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.dddqn_policy import DDDQNPolicy
####################################################
# EVALUATION PARAMETERS
set_action_size_full()
# Print per-step logs
VERBOSE = True
USE_FAST_TREEOBS = True
USE_PPO_AGENT = False
# Checkpoint to use (remember to push it!)
checkpoint = "./checkpoints/201124171810-7800.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201126150143-5200.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201126160144-2000.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
checkpoint = "./checkpoints/201207144650-20000.pth" # PPO: 14.45790721540786
checkpoint = "./checkpoints/201211063511-6300.pth" # DDDQN: 16.948349308440857
checkpoint = "./checkpoints/201211095604-12000.pth" # DDDQN: 17.3862941316504
checkpoint = "./checkpoints/201211164554-8900.pth" # DDDQN: 17.44397192482364
EPSILON = 0.01
if False:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116591 adrian_egli
    # graded 71.305 0.633 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/51
    # Fri, 22 Jan 2021 23:37:56
    set_action_size_reduced()
    load_policy = "DDDQN"
    checkpoint = "./checkpoints/210122120236-3000.pth"  # 17.011131341978228
    EPSILON = 0.0

if False:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116658 adrian_egli
    # graded 73.821 0.655 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/52
    # Sat, 23 Jan 2021 07:41:35
    set_action_size_reduced()
    load_policy = "PPO"
    checkpoint = "./checkpoints/210122235754-5000.pth"  # 16.00113400887389
    EPSILON = 0.0

if True:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116659 adrian_egli
    # graded 80.579 0.715 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/53
    # Sat, 23 Jan 2021 07:45:49
    set_action_size_reduced()
    load_policy = "DDDQN"
    checkpoint = "./checkpoints/210122165109-5000.pth"  # 17.993750197899438
    EPSILON = 0.0
if False:
    # -------------------------------------------------------------------------------------------------------
    # !! This is not an RL solution !!
    # -------------------------------------------------------------------------------------------------------
    # 116727 adrian_egli
    # graded 106.786 0.768 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/54
    # Sat, 23 Jan 2021 14:31:50
    set_action_size_reduced()
    load_policy = "DeadLockAvoidance"
    checkpoint = None
    EPSILON = 0.0
# Use last action cache
USE_ACTION_CACHE = False
USE_DEAD_LOCK_AVOIDANCE_AGENT = False # 21.54485505223213
# Observation parameters (must match training parameters!)
observation_tree_depth = 1
observation_tree_depth = 2
observation_radius = 10
observation_max_path_depth = 30
@@ -101,15 +146,6 @@ else:
     n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
     state_size = n_features_per_node * n_nodes

-action_size = 5
-
-# Creates the policy. No GPU on evaluation server.
-if not USE_PPO_AGENT:
-    policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
-else:
-    policy = PPOAgent(state_size, action_size)
-policy.load(checkpoint)

 #####################################################################
 # Main evaluation loop
 #####################################################################
@@ -143,6 +179,27 @@ while True:
     tree_observation.set_env(local_env)
     tree_observation.reset()

+    # Creates the policy. No GPU on evaluation server.
+    if load_policy == "DDDQN":
+        policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
+    elif load_policy == "PPO":
+        policy = PPOPolicy(state_size, get_action_size())
+    elif load_policy == "DeadLockAvoidance":
+        policy = DeadLockAvoidanceAgent(local_env, get_action_size(), enable_eps=False)
+    elif load_policy == "DeadLockAvoidanceWithDecision":
+        # inter_policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False, in_parameters=train_params)
+        inter_policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
+        policy = DeadLockAvoidanceWithDecisionAgent(local_env, state_size, get_action_size(), inter_policy)
+    elif load_policy == "MultiDecision":
+        policy = MultiDecisionAgent(state_size, get_action_size(), Namespace(**{'use_gpu': False}))
+    else:
+        policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False,
+                           in_parameters=Namespace(**{'use_gpu': False}))
+
+    policy.load(checkpoint)
+    policy.reset(local_env)

     observation = tree_observation.get_many(list(range(nb_agents)))

     print("Evaluation {}: {} agents in {}x{}".format(evaluation_number, nb_agents, local_env.width, local_env.height))
@@ -162,9 +219,6 @@ while True:
     agent_last_action = {}
     nb_hit = 0

-    if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-        policy = DeadLockAvoidanceAgent(local_env, action_size)
-
     policy.start_episode(train=False)
     while True:
         try:
@@ -179,14 +233,7 @@ while True:
            time_start = time.time()
            action_dict = {}
            policy.start_step(train=False)
-
-           if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-               observation = np.zeros((local_env.get_num_agents(), 2))
            for agent_handle in range(nb_agents):
-               if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-                   observation[agent_handle][0] = agent_handle
-                   observation[agent_handle][1] = steps
-
                if info['action_required'][agent_handle]:
                    if agent_handle in agent_last_obs and np.all(
                            agent_last_obs[agent_handle] == observation[agent_handle]):
@@ -198,7 +245,7 @@ while True:
                                                                       observation_tree_depth,
                                                                       observation_radius=observation_radius)

-                       action = policy.act(normalized_observation, eps=EPSILON)
+                       action = policy.act(agent_handle, normalized_observation, eps=EPSILON)

                       action_dict[agent_handle] = action
@@ -211,7 +258,7 @@ while True:
            time_taken_by_controller.append(agent_time)

            time_start = time.time()
-           _, all_rewards, done, info = remote_client.env_step(action_dict)
+           _, all_rewards, done, info = remote_client.env_step(map_actions(action_dict))
            step_time = time.time() - time_start
            time_taken_per_step.append(step_time)
@@ -228,7 +275,11 @@ while True:
            step_time = time.time() - time_start
            time_taken_per_step.append(step_time)

-           nb_agents_done = sum(done[idx] for idx in local_env.get_agent_handles())
+           nb_agents_done = 0
+           for i_agent, agent in enumerate(local_env.agents):
+               # manage the boolean flag to check if all agents are indeed done (or done_removed)
+               if agent.status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]:
+                   nb_agents_done += 1

            if VERBOSE or done['__all__']:
                print(
runs_bench/Screenshots/full.png (139 KiB)
runs_bench/Screenshots/reduced.png (178 KiB)