
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing changes with 663 additions and 268 deletions
from flatland.envs.rail_env import RailEnv
class DummyMemory:
def __init__(self):
self.memory = []
def __len__(self):
return 0
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, state, eps=0.):
def act(self, handle, state, eps=0.):
raise NotImplementedError
def save(self, filename):
......@@ -11,10 +22,16 @@ class Policy:
def load(self, filename):
raise NotImplementedError
def start_step(self):
def start_step(self, train):
pass
def end_step(self, train):
pass
def end_step(self):
def start_episode(self, train):
pass
def end_episode(self, train):
pass
def load_replay_buffer(self, filename):
......@@ -23,8 +40,23 @@ class Policy:
def test(self):
pass
def reset(self):
def reset(self, env: RailEnv):
pass
def clone(self):
return self
\ No newline at end of file
return self
class HeuristicPolicy(Policy):
def __init__(self):
super(HeuristicPolicy, self).__init__()
class LearningPolicy(Policy):
def __init__(self):
super(LearningPolicy, self).__init__()
class HybridPolicy(Policy):
def __init__(self):
super(HybridPolicy, self).__init__()
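# Added for illustration (not part of this diff): a minimal sketch of a custom Policy subclass.
# The action value 2 is assumed to correspond to RailEnvActions.MOVE_FORWARD.
class AlwaysForwardPolicy(HeuristicPolicy):
    def __init__(self):
        super(AlwaysForwardPolicy, self).__init__()

    def act(self, handle, state, eps=0.):
        return 2  # always drive forward

    def step(self, handle, state, action, reward, next_state, done):
        pass  # heuristic: nothing to learn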
import torch.nn as nn
import torch.nn.functional as F
class PolicyNetwork(nn.Module):
def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128, hidsize3=32):
super().__init__()
self.fc1 = nn.Linear(state_size, hidsize1)
self.fc2 = nn.Linear(hidsize1, hidsize2)
# self.fc3 = nn.Linear(hidsize2, hidsize3)
self.output = nn.Linear(hidsize2, action_size)
self.softmax = nn.Softmax(dim=1)
self.bn0 = nn.BatchNorm1d(state_size, affine=False)
def forward(self, inputs):
x = self.bn0(inputs.float())
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
return self.softmax(self.output(x))
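# Added for illustration (not part of this diff): a minimal sketch of how PolicyNetwork can be
# used to sample an action, mirroring what PPOAgent.act below does. The sizes are assumptions.
import torch
from torch.distributions.categorical import Categorical

state_size, action_size = 231, 5          # illustrative sizes only
net = PolicyNetwork(state_size, action_size)
net.eval()                                # BatchNorm1d needs eval mode for a single sample
with torch.no_grad():
    dummy_state = torch.randn(1, state_size)           # one observation, batch dimension of 1
    action_probs = net(dummy_state)                     # softmax over actions, shape (1, action_size)
    action = Categorical(action_probs).sample().item()
print("sampled action:", action)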
import os
import numpy as np
import torch
from torch.distributions.categorical import Categorical
from reinforcement_learning.policy import Policy
from reinforcement_learning.ppo.model import PolicyNetwork
from reinforcement_learning.ppo.replay_memory import Episode, ReplayBuffer
BUFFER_SIZE = 128_000
BATCH_SIZE = 8192
GAMMA = 0.95
LR = 0.5e-4
CLIP_FACTOR = .005
UPDATE_EVERY = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
class PPOAgent(Policy):
def __init__(self, state_size, action_size, num_agents):
self.action_size = action_size
self.state_size = state_size
self.num_agents = num_agents
self.policy = PolicyNetwork(state_size, action_size).to(device)
self.old_policy = PolicyNetwork(state_size, action_size).to(device)
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
self.episodes = [Episode() for _ in range(num_agents)]
self.memory = ReplayBuffer(BUFFER_SIZE)
self.t_step = 0
self.loss = 0
def reset(self):
self.finished = [False] * len(self.episodes)
self.tot_reward = [0] * self.num_agents
# Decide on an action to take in the environment
def act(self, state, eps=None):
self.policy.eval()
with torch.no_grad():
output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
ret = Categorical(output).sample().item()
return ret
# Record the results of the agent's action and update the model
def step(self, handle, state, action, reward, next_state, done):
if not self.finished[handle]:
# Push experience into Episode memory
self.tot_reward[handle] += reward
if done == 1:
reward = 1 # self.tot_reward[handle]
else:
reward = 0
self.episodes[handle].push(state, action, reward, next_state, done)
# When we finish the episode, discount rewards and push the experience into replay memory
if done:
self.episodes[handle].discount_rewards(GAMMA)
self.memory.push_episode(self.episodes[handle])
self.episodes[handle].reset()
self.finished[handle] = True
# Perform a gradient update every UPDATE_EVERY time steps
self.t_step = (self.t_step + 1) % UPDATE_EVERY
if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
self._learn(*self.memory.sample(BATCH_SIZE, device))
def _clip_gradient(self, model, clip):
for p in model.parameters():
p.grad.data.clamp_(-clip, clip)
return
# Note: the norm-based clipping below is unreachable because of the early return above.
"""Computes a gradient clipping coefficient based on gradient norm."""
totalnorm = 0
for p in model.parameters():
if p.grad is not None:
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = np.sqrt(totalnorm)
coeff = min(1, clip / (totalnorm + 1e-6))
for p in model.parameters():
if p.grad is not None:
p.grad.mul_(coeff)
def _learn(self, states, actions, rewards, next_state, done):
self.policy.train()
responsible_outputs = torch.gather(self.policy(states), 1, actions)
old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()
# rewards = rewards - rewards.mean()
ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()
self.loss = loss
# Compute loss and perform a gradient step
self.old_policy.load_state_dict(self.policy.state_dict())
self.optimizer.zero_grad()
loss.backward()
# self._clip_gradient(self.policy, 1.0)
self.optimizer.step()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.policy.state_dict(), filename + ".policy")
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def load(self, filename):
print("load policy from file", filename)
if os.path.exists(filename + ".policy"):
print(' >> ', filename + ".policy")
try:
self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
except Exception:
print(" >> failed!")
pass
if os.path.exists(filename + ".optimizer"):
print(' >> ', filename + ".optimizer")
try:
self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
except Exception:
print(" >> failed!")
pass
import torch
import random
import numpy as np
from collections import namedtuple, deque
from collections.abc import Iterable
Transition = namedtuple("Experience", ("state", "action", "reward", "next_state", "done"))
class Episode:
def __init__(self):
# use a per-instance list instead of a shared class-level attribute
self.memory = []
def reset(self):
self.memory = []
def push(self, *args):
self.memory.append(tuple(args))
def discount_rewards(self, gamma):
running_add = 0.
for i, (state, action, reward, *rest) in list(enumerate(self.memory))[::-1]:
running_add = running_add * gamma + reward
self.memory[i] = (state, action, running_add, *rest)
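# Added comment (not part of this diff): a small worked example of discount_rewards.
# With gamma = 0.95 and stored per-step rewards [0, 0, 1] (only the final step rewarded),
# iterating backwards gives running_add = 1.0, then 0.95 * 1.0 + 0 = 0.95, then
# 0.95 * 0.95 + 0 = 0.9025, so the stored rewards become [0.9025, 0.95, 1.0].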
class ReplayBuffer:
def __init__(self, buffer_size):
self.memory = deque(maxlen=buffer_size)
def push(self, state, action, reward, next_state, done):
self.memory.append(Transition(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done))
def push_episode(self, episode):
for step in episode.memory:
self.push(*step)
def sample(self, batch_size, device):
experiences = random.sample(self.memory, k=batch_size)
states = torch.from_numpy(self.stack([e.state for e in experiences])).float().to(device)
actions = torch.from_numpy(self.stack([e.action for e in experiences])).long().to(device)
rewards = torch.from_numpy(self.stack([e.reward for e in experiences])).float().to(device)
next_states = torch.from_numpy(self.stack([e.next_state for e in experiences])).float().to(device)
dones = torch.from_numpy(self.stack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
return states, actions, rewards, next_states, dones
def stack(self, states):
sub_dims = states[0].shape[1:] if isinstance(states[0], Iterable) else [1]
return np.reshape(np.array(states), (len(states), *sub_dims))
def __len__(self):
return len(self.memory)
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
def __init__(self):
self.reset()
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def reset(self):
self.memory = {}
def get_transitions(self, handle):
return self.memory.get(handle, [])
def push_transition(self, handle, transition):
transitions = self.get_transitions(handle)
transitions.append(transition)
self.memory.update({handle: transitions})
class ActorCriticModel(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(ActorCriticModel, self).__init__()
self.device = device
self.actor = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
self.critic = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, x):
raise NotImplementedError
def get_actor_dist(self, state):
action_probs = self.actor(state)
dist = Categorical(action_probs)
return dist
def evaluate(self, states, actions):
action_probs = self.actor(states)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(actions)
dist_entropy = dist.entropy()
state_value = self.critic(states)
return action_logprobs, torch.squeeze(state_value), dist_entropy
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.actor.state_dict(), filename + ".actor")
torch.save(self.critic.state_dict(), filename + ".value")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
return obj
def load(self, filename):
print("load model from file", filename)
self.actor = self._load(self.actor, filename + ".actor")
self.critic = self._load(self.critic, filename + ".value")
class PPOPolicy(LearningPolicy):
def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
print(">> PPOPolicy")
super(PPOPolicy, self).__init__()
# keep the sizes so that clone() can construct a new instance
self.state_size = state_size
self.action_size = action_size
# parameters
self.ppo_parameters = in_parameters
if self.ppo_parameters is not None:
self.hidsize = self.ppo_parameters.hidden_size
self.buffer_size = self.ppo_parameters.buffer_size
self.batch_size = self.ppo_parameters.batch_size
self.learning_rate = self.ppo_parameters.learning_rate
self.gamma = self.ppo_parameters.gamma
# Device
if self.ppo_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 1.0e-3
self.gamma = 0.95
self.buffer_size = 32_000
self.batch_size = 1024
self.device = torch.device("cpu")
self.surrogate_eps_clip = 0.1
self.K_epoch = 10
self.weight_loss = 0.5
self.weight_entropy = 0.01
self.buffer_min_size = 0
self.use_replay_buffer = use_replay_buffer
self.current_episode_memory = EpisodeBuffers()
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.loss = 0
self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
self.loss_function = nn.MSELoss() # nn.SmoothL1Loss()
def reset(self, env):
pass
def act(self, handle, state, eps=None):
# sample an action to take
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor_critic_model.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist = self.actor_critic_model.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
def _push_transitions_to_replay_buffer(self,
state_list,
action_list,
reward_list,
state_next_list,
done_list,
prob_a_list):
for idx in range(len(reward_list)):
state_i = state_list[idx]
action_i = action_list[idx]
reward_i = reward_list[idx]
state_next_i = state_next_list[idx]
done_i = done_list[idx]
prob_action_i = prob_a_list[idx]
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
def _convert_transitions_to_torch_tensors(self, transitions_array):
# build empty lists (arrays)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
if done_i:
discounted_reward = 0
done_list.insert(0, 1)
else:
done_list.insert(0, 0)
discounted_reward = reward_i + self.gamma * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
if self.use_replay_buffer:
self._push_transitions_to_replay_buffer(state_list, action_list,
reward_list, state_next_list,
done_list, prob_a_list)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
actions = torch.squeeze(actions)
rewards = torch.squeeze(rewards)
states_next = torch.squeeze(states_next)
dones = torch.squeeze(dones)
probs_action = torch.squeeze(probs_action)
return states, actions, rewards, states_next, dones, probs_action
def train_net(self):
# All agents have to propagate the experiences they collected during the past episode
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
# Convert the replay buffer to torch tensors (arrays)
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history)
# Optimize policy for K epochs:
for k_loop in range(int(self.K_epoch)):
if self.use_replay_buffer:
states, actions, rewards, states_next, dones, probs_action = \
self._get_transitions_from_replay_buffer(
states, actions, rewards, states_next, dones, probs_action
)
# Evaluating actions (actor) and values (critic)
logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
# Finding the probability ratios (pi_theta / pi_theta_old):
ratios = torch.exp(logprobs - probs_action.detach())
# Finding the surrogate loss
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
# The loss estimates the policy gradient. The entropy term penalizes the loss when the policy
# becomes (nearly) deterministic, because in that case the gradient gets very flat and is no
# longer useful for learning.
loss = \
-torch.min(surr1, surr2) \
+ self.weight_loss * self.loss_function(state_values, rewards) \
- self.weight_entropy * dist_entropy
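# Added comment (not part of this diff): this is the standard PPO clipped surrogate objective,
#   L = E[ -min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) + c_v * MSE(V(s_t), R_t) - c_e * H(pi(.|s_t)) ]
# with r_t = exp(log pi_new(a_t|s_t) - log pi_old(a_t|s_t)), A_t = R_t - V(s_t),
# eps = surrogate_eps_clip, c_v = weight_loss and c_e = weight_entropy.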
# Make a gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
# Store the current loss on the agent, for debugging purposes only
self.loss = loss.mean().detach().cpu().numpy()
# Reset all collected transition data
self.current_episode_memory.reset()
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.actor_critic_model.save(filename)
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.actor_critic_model.load(filename)
print("load optimizer from file", filename)
self.optimizer = self._load(self.optimizer, filename + ".optimizer")
def clone(self):
policy = PPOPolicy(self.state_size, self.action_size)
policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
policy.optimizer = copy.deepcopy(self.optimizer)
return policy
import random
from collections import namedtuple, deque
from collections.abc import Iterable
import numpy as np
import torch
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, device):
"""Initialize a ReplayBuffer object.
Params
======
action_size (int): dimension of each action
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
def add(self, state, action, reward, next_state, done, action_prob=0.0):
"""Add a new experience to memory."""
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
.float().to(self.device)
actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
.long().to(self.device)
rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
.float().to(self.device)
next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
.float().to(self.device)
dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
.float().to(self.device)
action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
.float().to(self.device)
return states, actions, rewards, next_states, dones, action_probs
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def __v_stack_impr(self, states):
sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
np_states = np.reshape(np.array(states), (len(states), sub_dim))
return np_states
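# Added for illustration (not part of this diff): a minimal usage sketch of this ReplayBuffer;
# all sizes below are assumptions.
import numpy as np
import torch

buffer = ReplayBuffer(action_size=5, buffer_size=1000, batch_size=4, device=torch.device("cpu"))
state_size = 8
for _ in range(16):
    s = np.random.rand(state_size)
    s_next = np.random.rand(state_size)
    buffer.add(s, action=1, reward=0.0, next_state=s_next, done=False, action_prob=0.2)
states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape)   # torch.Size([4, 8])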
from collections import deque
from collections import namedtuple
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
buffer_size=1000,
batch_size=64,
update_every=10,
learning_rate=1.e-3,
tau=1.e-2,
gamma=0.95,
buffer_min_size=0,
use_gpu=False)
def cartpole(use_dddqn=False):
eps = 1.0
eps_decay = 0.99
min_eps = 0.01
training_mode = True
env = gym.make("CartPole-v1")
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
if not use_dddqn:
policy = PPOPolicy(observation_space, action_space, False)
else:
policy = DDDQNPolicy(observation_space, action_space, dddqn_param)
episode = 0
checkpoint_interval = 20
scores_window = deque(maxlen=100)
writer = SummaryWriter()
while True:
episode += 1
state = env.reset()
policy.reset(env)
handle = 0
tot_reward = 0
policy.start_episode(train=training_mode)
while True:
# env.render()
policy.start_step(train=training_mode)
action = policy.act(handle, state, eps)
state_next, reward, terminal, info = env.step(action)
policy.end_step(train=training_mode)
tot_reward += reward
# reward = reward if not terminal else -reward
reward = 0 if not terminal else -1
policy.step(handle, state, action, reward, state_next, terminal)
state = np.copy(state_next)
if terminal:
break
policy.end_episode(train=training_mode)
eps = max(min_eps, eps * eps_decay)
scores_window.append(tot_reward)
if episode % checkpoint_interval == 0:
print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)))
else:
print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=" ")
writer.add_scalar("CartPole/value", tot_reward, episode)
writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
writer.flush()
if __name__ == "__main__":
cartpole()
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
......@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
......@@ -66,7 +66,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
......@@ -123,7 +123,8 @@ def train_agent(n_episodes):
# Build agent specific observations
for agent in env.get_agent_handles():
if obs[agent]:
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
observation_radius=observation_radius)
agent_prev_obs[agent] = agent_obs[agent].copy()
# Run episode
......@@ -132,7 +133,7 @@ def train_agent(n_episodes):
if info['action_required'][agent]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
action = policy.act(agent_obs[agent], eps=eps_start)
action = policy.act(agent, agent_obs[agent], eps=eps_start)
action_count[action] += 1
else:
update_values = False
......@@ -154,7 +155,8 @@ def train_agent(n_episodes):
agent_prev_action[agent] = action_dict[agent]
if next_obs[agent]:
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
observation_radius=10)
score += all_rewards[agent]
......@@ -179,15 +181,16 @@ def train_agent(n_episodes):
else:
end = " "
print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
print(
'\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
# Plot overall training progress at the end
plt.plot(scores)
......@@ -199,7 +202,8 @@ def train_agent(n_episodes):
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
type=int)
args = parser.parse_args()
train_agent(args.n_episodes)
'''
I did experiments in an early submission. Please note that the epsilon value can have an
effect on the evaluation outcome:
DDDQNPolicy experiments - EPSILON impact analysis
----------------------------------------------------------------------------------------
checkpoint = "./checkpoints/201124171810-7800.pth" # Training on AGENTS=10 with Depth=2
EPSILON = 0.000 # Sum Normalized Reward : 0.000000000000000 (primary score)
EPSILON = 0.002 # Sum Normalized Reward : 18.445875081269286 (primary score)
EPSILON = 0.005 # Sum Normalized Reward : 18.371733625865854 (primary score)
EPSILON = 0.010 # Sum Normalized Reward : 18.249244799876152 (primary score)
EPSILON = 0.020 # Sum Normalized Reward : 17.526987022691376 (primary score)
EPSILON = 0.030 # Sum Normalized Reward : 16.796885571003942 (primary score)
EPSILON = 0.040 # Sum Normalized Reward : 17.280787151431426 (primary score)
EPSILON = 0.050 # Sum Normalized Reward : 16.256945636647025 (primary score)
EPSILON = 0.100 # Sum Normalized Reward : 14.828347241759966 (primary score)
EPSILON = 0.200 # Sum Normalized Reward : 11.192330074898457 (primary score)
EPSILON = 0.300 # Sum Normalized Reward : 14.523067754608782 (primary score)
EPSILON = 0.400 # Sum Normalized Reward : 12.901508220410834 (primary score)
EPSILON = 0.500 # Sum Normalized Reward : 3.754660231871272 (primary score)
EPSILON = 1.000 # Sum Normalized Reward : 1.397180159192391 (primary score)
'''
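# Added comment (not part of this diff): EPSILON above is the exploration rate passed to
# policy.act(...). Conceptually it implements epsilon-greedy action selection, roughly:
#
#   if random.random() < eps:
#       action = random.randint(0, action_size - 1)   # explore
#   else:
#       action = best_action_from_policy(state)       # exploit the trained policy
#
# which is why larger EPSILON values degrade the evaluation score.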
import sys
import time
from argparse import Namespace
......@@ -5,34 +27,87 @@ from pathlib import Path
import numpy as np
from flatland.core.env_observation_builder import DummyObservationBuilder
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnvActions
from flatland.evaluators.client import FlatlandRemoteClient
from flatland.evaluators.client import TimeoutException
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.deadlockavoidance_with_decision_agent import DeadLockAvoidanceWithDecisionAgent
from reinforcement_learning.multi_decision_agent import MultiDecisionAgent
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.agent_action_config import get_action_size, map_actions, set_action_size_reduced, set_action_size_full
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
from utils.deadlock_check import check_if_all_blocked
from utils.fast_tree_obs import FastTreeObs
from utils.observation_utils import normalize_observation
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.dddqn_policy import DDDQNPolicy
####################################################
# EVALUATION PARAMETERS
set_action_size_full()
# Print per-step logs
VERBOSE = True
# Checkpoint to use (remember to push it!)
checkpoint = "./checkpoints/201112143850-5400.pth" # 21.220418678677177 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201113211844-6100.pth" # 19.690047767961005 DEPTH=2 AGENTS=20
USE_FAST_TREEOBS = True
if False:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116591 adrian_egli
# graded 71.305 0.633 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/51
# Fri, 22 Jan 2021 23:37:56
set_action_size_reduced()
load_policy = "DDDQN"
checkpoint = "./checkpoints/210122120236-3000.pth" # 17.011131341978228
EPSILON = 0.0
if False:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116658 adrian_egli
# graded 73.821 0.655 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/52
# Sat, 23 Jan 2021 07:41:35
set_action_size_reduced()
load_policy = "PPO"
checkpoint = "./checkpoints/210122235754-5000.pth" # 16.00113400887389
EPSILON = 0.0
if True:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116659 adrian_egli
# graded 80.579 0.715 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/53
# Sat, 23 Jan 2021 07:45:49
set_action_size_reduced()
load_policy = "DDDQN"
checkpoint = "./checkpoints/210122165109-5000.pth" # 17.993750197899438
EPSILON = 0.0
if False:
# -------------------------------------------------------------------------------------------------------
# !! This is not an RL solution !!!!
# -------------------------------------------------------------------------------------------------------
# 116727 adrian_egli
# graded 106.786 0.768 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/54
# Sat, 23 Jan 2021 14:31:50
set_action_size_reduced()
load_policy = "DeadLockAvoidance"
checkpoint = None
EPSILON = 0.0
# Use last action cache
USE_ACTION_CACHE = False
USE_DEAD_LOCK_AVOIDANCE_AGENT = False
# Observation parameters (must match training parameters!)
observation_tree_depth = 2
......@@ -45,16 +120,31 @@ remote_client = FlatlandRemoteClient()
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = FastTreeObs(max_depth=observation_tree_depth)
if USE_FAST_TREEOBS:
def check_is_observation_valid(observation):
return True
def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
return observation
# Calculates state and action sizes
state_size = tree_observation.observation_dim
action_size = 5
tree_observation = FastTreeObs(max_depth=observation_tree_depth)
state_size = tree_observation.observation_dim
else:
def check_is_observation_valid(observation):
return observation
# Creates the policy. No GPU on evaluation server.
policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
# policy = PPOAgent(state_size, action_size, 10)
policy.load(checkpoint)
def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
return normalize_observation(observation, tree_depth, observation_radius)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Calculate the state size given the depth of the tree observation and the number of features
n_features_per_node = tree_observation.observation_dim
n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
state_size = n_features_per_node * n_nodes
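# Added comment (not part of this diff): with observation_tree_depth = 2 this gives
# n_nodes = 4^0 + 4^1 + 4^2 = 1 + 4 + 16 = 21, so state_size = 21 * n_features_per_node
# (e.g. 21 * 11 = 231 if the tree observation exposes 11 features per node).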
#####################################################################
# Main evaluation loop
......@@ -89,6 +179,27 @@ while True:
tree_observation.set_env(local_env)
tree_observation.reset()
# Creates the policy. No GPU on evaluation server.
if load_policy == "DDDQN":
policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
elif load_policy == "PPO":
policy = PPOPolicy(state_size, get_action_size())
elif load_policy == "DeadLockAvoidance":
policy = DeadLockAvoidanceAgent(local_env, get_action_size(), enable_eps=False)
elif load_policy == "DeadLockAvoidanceWithDecision":
# inter_policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False, in_parameters=train_params)
inter_policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
policy = DeadLockAvoidanceWithDecisionAgent(local_env, state_size, get_action_size(), inter_policy)
elif load_policy == "MultiDecision":
policy = MultiDecisionAgent(state_size, get_action_size(), Namespace(**{'use_gpu': False}))
else:
policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False,
in_parameters=Namespace(**{'use_gpu': False}))
policy.load(checkpoint)
policy.reset(local_env)
observation = tree_observation.get_many(list(range(nb_agents)))
print("Evaluation {}: {} agents in {}x{}".format(evaluation_number, nb_agents, local_env.width, local_env.height))
......@@ -108,9 +219,7 @@ while True:
agent_last_action = {}
nb_hit = 0
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
policy = DeadLockAvoidanceAgent(local_env)
policy.start_episode(train=False)
while True:
try:
#####################################################################
......@@ -123,35 +232,33 @@ while True:
if not check_if_all_blocked(env=local_env):
time_start = time.time()
action_dict = {}
policy.start_step()
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
observation = np.zeros((local_env.get_num_agents(), 2))
for agent in range(nb_agents):
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
observation[agent][0] = agent
observation[agent][1] = steps
if info['action_required'][agent]:
if agent in agent_last_obs and np.all(agent_last_obs[agent] == observation[agent]):
policy.start_step(train=False)
for agent_handle in range(nb_agents):
if info['action_required'][agent_handle]:
if agent_handle in agent_last_obs and np.all(
agent_last_obs[agent_handle] == observation[agent_handle]):
# cache hit
action = agent_last_action[agent]
action = agent_last_action[agent_handle]
nb_hit += 1
else:
action = policy.act(observation[agent], eps=0.01)
normalized_observation = get_normalized_observation(observation[agent_handle],
observation_tree_depth,
observation_radius=observation_radius)
action_dict[agent] = action
action = policy.act(agent_handle, normalized_observation, eps=EPSILON)
if USE_ACTION_CACHE:
agent_last_obs[agent] = observation[agent]
agent_last_action[agent] = action
action_dict[agent_handle] = action
policy.end_step()
if USE_ACTION_CACHE:
agent_last_obs[agent_handle] = observation[agent_handle]
agent_last_action[agent_handle] = action
policy.end_step(train=False)
agent_time = time.time() - time_start
time_taken_by_controller.append(agent_time)
time_start = time.time()
_, all_rewards, done, info = remote_client.env_step(action_dict)
_, all_rewards, done, info = remote_client.env_step(map_actions(action_dict))
step_time = time.time() - time_start
time_taken_per_step.append(step_time)
......@@ -168,7 +275,11 @@ while True:
step_time = time.time() - time_start
time_taken_per_step.append(step_time)
nb_agents_done = sum(done[idx] for idx in local_env.get_agent_handles())
nb_agents_done = 0
for i_agent, agent in enumerate(local_env.agents):
# count the agents that are done (status DONE or DONE_REMOVED)
if (agent.status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]):
nb_agents_done += 1
if VERBOSE or done['__all__']:
print(
......@@ -197,6 +308,8 @@ while True:
print("Timeout! Will skip this episode and go to the next.", err)
break
policy.end_episode(train=False)
np_time_taken_by_controller = np.array(time_taken_by_controller)
np_time_taken_per_step = np.array(time_taken_per_step)
print("Mean/Std of Time taken by Controller : ", np_time_taken_by_controller.mean(),
......