Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 967 additions and 471 deletions
@@ -15,7 +15,7 @@ class OrderedPolicy(Policy):
def __init__(self):
self.action_size = 5
def act(self, state, eps=0.):
def act(self, handle, state, eps=0.):
_, distance, _ = split_tree_into_feature_groups(state, 1)
distance = distance[1:]
min_dist = min_gt(distance, 0)
......
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, state, eps=0.):
raise NotImplementedError
def save(self, filename):
raise NotImplementedError
def load(self, filename):
raise NotImplementedError
def start_step(self):
pass
def end_step(self):
pass
def load_replay_buffer(self, filename):
pass
def test(self):
pass
def reset(self):
pass
\ No newline at end of file
from flatland.envs.rail_env import RailEnv
class DummyMemory:
def __init__(self):
self.memory = []
def __len__(self):
return 0
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, handle, state, eps=0.):
raise NotImplementedError
def save(self, filename):
raise NotImplementedError
def load(self, filename):
raise NotImplementedError
def start_step(self, train):
pass
def end_step(self, train):
pass
def start_episode(self, train):
pass
def end_episode(self, train):
pass
def load_replay_buffer(self, filename):
pass
def test(self):
pass
def reset(self, env: RailEnv):
pass
def clone(self):
return self
class HeuristicPolicy(Policy):
def __init__(self):
        super(HeuristicPolicy, self).__init__()
class LearningPolicy(Policy):
def __init__(self):
        super(LearningPolicy, self).__init__()
class HybridPolicy(Policy):
def __init__(self):
        super(HybridPolicy, self).__init__()
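For orientation, a policy targeting the updated interface only needs to implement act/step/save/load; a minimal sketch (the RandomPolicy name and the action count are illustrative, not part of this diff):

import numpy as np

from reinforcement_learning.policy import LearningPolicy


class RandomPolicy(LearningPolicy):
    """Hypothetical example: uniformly random actions, no learning."""

    def __init__(self, action_size=5):
        super(RandomPolicy, self).__init__()
        self.action_size = action_size

    def act(self, handle, state, eps=0.):
        # New signature: the agent handle is passed alongside its observation.
        return np.random.choice(self.action_size)

    def step(self, handle, state, action, reward, next_state, done):
        pass  # nothing to learn

    def save(self, filename):
        pass

    def load(self, filename):
        pass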
import torch.nn as nn
import torch.nn.functional as F
class PolicyNetwork(nn.Module):
def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128, hidsize3=32):
super().__init__()
self.fc1 = nn.Linear(state_size, hidsize1)
self.fc2 = nn.Linear(hidsize1, hidsize2)
# self.fc3 = nn.Linear(hidsize2, hidsize3)
self.output = nn.Linear(hidsize2, action_size)
self.softmax = nn.Softmax(dim=1)
self.bn0 = nn.BatchNorm1d(state_size, affine=False)
def forward(self, inputs):
x = self.bn0(inputs.float())
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
return self.softmax(self.output(x))
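A quick smoke test of this network; the 11-dimensional state size is only an assumption, and eval() is needed so that BatchNorm1d accepts a single sample:

import torch

net = PolicyNetwork(state_size=11, action_size=5)
net.eval()  # BatchNorm1d needs eval mode (or a batch larger than 1)
with torch.no_grad():
    probs = net(torch.rand(1, 11))
print(probs.shape, probs.sum().item())  # torch.Size([1, 5]), ~1.0 (softmax output)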
import os
import numpy as np
import torch
from torch.distributions.categorical import Categorical
from reinforcement_learning.policy import Policy
from reinforcement_learning.ppo.model import PolicyNetwork
from reinforcement_learning.ppo.replay_memory import Episode, ReplayBuffer
BUFFER_SIZE = 128_000
BATCH_SIZE = 8192
GAMMA = 0.95
LR = 0.5e-4
CLIP_FACTOR = .005
UPDATE_EVERY = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
class PPOAgent(Policy):
def __init__(self, state_size, action_size, num_agents):
self.action_size = action_size
self.state_size = state_size
self.num_agents = num_agents
self.policy = PolicyNetwork(state_size, action_size).to(device)
self.old_policy = PolicyNetwork(state_size, action_size).to(device)
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
self.episodes = [Episode() for _ in range(num_agents)]
self.memory = ReplayBuffer(BUFFER_SIZE)
self.t_step = 0
self.loss = 0
def reset(self):
self.finished = [False] * len(self.episodes)
self.tot_reward = [0] * self.num_agents
# Decide on an action to take in the environment
def act(self, state, eps=None):
if eps is not None:
# Epsilon-greedy action selection
if np.random.random() < eps:
return np.random.choice(np.arange(self.action_size))
self.policy.eval()
with torch.no_grad():
output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
ret = Categorical(output).sample().item()
return ret
# Record the results of the agent's action and update the model
def step(self, handle, state, action, reward, next_state, done):
if not self.finished[handle]:
# Push experience into Episode memory
self.tot_reward[handle] += reward
if done == 1:
reward = 1 # self.tot_reward[handle]
else:
reward = 0
self.episodes[handle].push(state, action, reward, next_state, done)
# When we finish the episode, discount rewards and push the experience into replay memory
if done:
self.episodes[handle].discount_rewards(GAMMA)
self.memory.push_episode(self.episodes[handle])
self.episodes[handle].reset()
self.finished[handle] = True
# Perform a gradient update every UPDATE_EVERY time steps
self.t_step = (self.t_step + 1) % UPDATE_EVERY
if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
self._learn(*self.memory.sample(BATCH_SIZE, device))
def _clip_gradient(self, model, clip):
for p in model.parameters():
p.grad.data.clamp_(-clip, clip)
        return
        # NOTE: everything below this early return is unreachable; it is kept only as a
        # reference implementation of gradient-norm clipping.
        """Computes a gradient clipping coefficient based on gradient norm."""
totalnorm = 0
for p in model.parameters():
if p.grad is not None:
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = np.sqrt(totalnorm)
coeff = min(1, clip / (totalnorm + 1e-6))
for p in model.parameters():
if p.grad is not None:
p.grad.mul_(coeff)
def _learn(self, states, actions, rewards, next_state, done):
self.policy.train()
responsible_outputs = torch.gather(self.policy(states), 1, actions)
old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()
# rewards = rewards - rewards.mean()
ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()
self.loss = loss
# Compute loss and perform a gradient step
self.old_policy.load_state_dict(self.policy.state_dict())
self.optimizer.zero_grad()
loss.backward()
# self._clip_gradient(self.policy, 1.0)
self.optimizer.step()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.policy.state_dict(), filename + ".policy")
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def load(self, filename):
print("load policy from file", filename)
if os.path.exists(filename + ".policy"):
print(' >> ', filename + ".policy")
try:
self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
            except Exception:
print(" >> failed!")
pass
if os.path.exists(filename + ".optimizer"):
print(' >> ', filename + ".optimizer")
try:
self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
            except Exception:
print(" >> failed!")
pass
import torch
import random
import numpy as np
from collections import namedtuple, deque
from collections.abc import Iterable
Transition = namedtuple("Experience", ("state", "action", "reward", "next_state", "done"))
class Episode:
    def __init__(self):
        # Per-instance memory (a class-level list would be shared across all episodes)
        self.memory = []
def reset(self):
self.memory = []
def push(self, *args):
self.memory.append(tuple(args))
def discount_rewards(self, gamma):
running_add = 0.
for i, (state, action, reward, *rest) in list(enumerate(self.memory))[::-1]:
running_add = running_add * gamma + reward
self.memory[i] = (state, action, running_add, *rest)
class ReplayBuffer:
def __init__(self, buffer_size):
self.memory = deque(maxlen=buffer_size)
def push(self, state, action, reward, next_state, done):
self.memory.append(Transition(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done))
def push_episode(self, episode):
for step in episode.memory:
self.push(*step)
def sample(self, batch_size, device):
experiences = random.sample(self.memory, k=batch_size)
states = torch.from_numpy(self.stack([e.state for e in experiences])).float().to(device)
actions = torch.from_numpy(self.stack([e.action for e in experiences])).long().to(device)
rewards = torch.from_numpy(self.stack([e.reward for e in experiences])).float().to(device)
next_states = torch.from_numpy(self.stack([e.next_state for e in experiences])).float().to(device)
dones = torch.from_numpy(self.stack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
return states, actions, rewards, next_states, dones
def stack(self, states):
sub_dims = states[0].shape[1:] if isinstance(states[0], Iterable) else [1]
return np.reshape(np.array(states), (len(states), *sub_dims))
def __len__(self):
return len(self.memory)
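As a sanity check of the backward discounting above, a small worked example (toy rewards, not from the training code): with gamma = 0.95 and per-step rewards [0, 0, 1], the rewritten rewards are [0.9025, 0.95, 1.0].

episode = Episode()
episode.reset()  # start from an empty transition list
for reward in [0, 0, 1]:
    # state/next_state are irrelevant for the discounting itself
    episode.push(None, 0, reward, None, False)
episode.discount_rewards(0.95)
print([round(r, 4) for (_, _, r, *_rest) in episode.memory])  # [0.9025, 0.95, 1.0]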
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
def __init__(self):
self.reset()
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def reset(self):
self.memory = {}
def get_transitions(self, handle):
return self.memory.get(handle, [])
def push_transition(self, handle, transition):
transitions = self.get_transitions(handle)
transitions.append(transition)
self.memory.update({handle: transitions})
class ActorCriticModel(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(ActorCriticModel, self).__init__()
self.device = device
self.actor = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
self.critic = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, x):
raise NotImplementedError
def get_actor_dist(self, state):
action_probs = self.actor(state)
dist = Categorical(action_probs)
return dist
def evaluate(self, states, actions):
action_probs = self.actor(states)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(actions)
dist_entropy = dist.entropy()
state_value = self.critic(states)
return action_logprobs, torch.squeeze(state_value), dist_entropy
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.actor.state_dict(), filename + ".actor")
torch.save(self.critic.state_dict(), filename + ".value")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception:
print(" >> failed!")
return obj
def load(self, filename):
print("load model from file", filename)
self.actor = self._load(self.actor, filename + ".actor")
self.critic = self._load(self.critic, filename + ".value")
class PPOPolicy(LearningPolicy):
def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
print(">> PPOPolicy")
super(PPOPolicy, self).__init__()
# parameters
        # Keep the sizes so clone() can rebuild an identical policy
        self.state_size = state_size
        self.action_size = action_size
        self.ppo_parameters = in_parameters
if self.ppo_parameters is not None:
self.hidsize = self.ppo_parameters.hidden_size
self.buffer_size = self.ppo_parameters.buffer_size
self.batch_size = self.ppo_parameters.batch_size
self.learning_rate = self.ppo_parameters.learning_rate
self.gamma = self.ppo_parameters.gamma
# Device
if self.ppo_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 1.0e-3
self.gamma = 0.95
self.buffer_size = 32_000
self.batch_size = 1024
self.device = torch.device("cpu")
self.surrogate_eps_clip = 0.1
self.K_epoch = 10
self.weight_loss = 0.5
self.weight_entropy = 0.01
self.buffer_min_size = 0
self.use_replay_buffer = use_replay_buffer
self.current_episode_memory = EpisodeBuffers()
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.loss = 0
        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
self.loss_function = nn.MSELoss() # nn.SmoothL1Loss()
def reset(self, env):
pass
def act(self, handle, state, eps=None):
        # Sample an action from the current policy distribution
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor_critic_model.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist = self.actor_critic_model.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
def _push_transitions_to_replay_buffer(self,
state_list,
action_list,
reward_list,
state_next_list,
done_list,
prob_a_list):
for idx in range(len(reward_list)):
state_i = state_list[idx]
action_i = action_list[idx]
reward_i = reward_list[idx]
state_next_i = state_next_list[idx]
done_i = done_list[idx]
prob_action_i = prob_a_list[idx]
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
def _convert_transitions_to_torch_tensors(self, transitions_array):
        # build empty lists (arrays)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
if done_i:
discounted_reward = 0
done_list.insert(0, 1)
else:
done_list.insert(0, 0)
discounted_reward = reward_i + self.gamma * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
if self.use_replay_buffer:
self._push_transitions_to_replay_buffer(state_list, action_list,
reward_list, state_next_list,
done_list, prob_a_list)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
actions = torch.squeeze(actions)
rewards = torch.squeeze(rewards)
states_next = torch.squeeze(states_next)
dones = torch.squeeze(dones)
probs_action = torch.squeeze(probs_action)
return states, actions, rewards, states_next, dones, probs_action
def train_net(self):
        # Each agent propagates the experience it collected during the past episode
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
                # Convert the episode history into torch tensors
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history)
# Optimize policy for K epochs:
for k_loop in range(int(self.K_epoch)):
if self.use_replay_buffer:
states, actions, rewards, states_next, dones, probs_action = \
self._get_transitions_from_replay_buffer(
states, actions, rewards, states_next, dones, probs_action
)
# Evaluating actions (actor) and values (critic)
logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
# Finding the ratios (pi_thetas / pi_thetas_replayed):
ratios = torch.exp(logprobs - probs_action.detach())
                    # Find the surrogate loss
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
                    # The clipped surrogate term estimates the policy gradient; the entropy bonus
                    # discourages the policy from becoming deterministic too early, which would
                    # flatten the gradient and stall learning.
loss = \
-torch.min(surr1, surr2) \
+ self.weight_loss * self.loss_function(state_values, rewards) \
- self.weight_entropy * dist_entropy
# Make a gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
                    # Store the current loss on the policy for debugging/monitoring purposes only
self.loss = loss.mean().detach().cpu().numpy()
        # Reset all collected transition data
self.current_episode_memory.reset()
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.actor_critic_model.save(filename)
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception:
print(" >> failed!")
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.actor_critic_model.load(filename)
print("load optimizer from file", filename)
self.optimizer = self._load(self.optimizer, filename + ".optimizer")
def clone(self):
policy = PPOPolicy(self.state_size, self.action_size)
policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
policy.optimizer = copy.deepcopy(self.optimizer)
        return policy
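To make the clipped-surrogate comments in train_net concrete, a standalone numeric sketch (toy log-probabilities and advantages, unrelated to Flatland):

import torch

logprobs = torch.tensor([-0.9, -1.2, -0.4])       # log pi_theta(a|s) under the current policy
old_logprobs = torch.tensor([-1.0, -1.0, -1.0])   # log-probs recorded when the transitions were collected
advantages = torch.tensor([1.0, -0.5, 2.0])
eps_clip = 0.1

ratios = torch.exp(logprobs - old_logprobs)                           # pi_theta / pi_theta_old
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
policy_loss = -torch.min(surr1, surr2).mean()                         # policy part of the PPO loss
print(policy_loss.item())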
import random
from collections import namedtuple, deque
from collections.abc import Iterable
import numpy as np
import torch
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, device):
"""Initialize a ReplayBuffer object.
Params
======
action_size (int): dimension of each action
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
def add(self, state, action, reward, next_state, done, action_prob=0.0):
"""Add a new experience to memory."""
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
.float().to(self.device)
actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
.long().to(self.device)
rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
.float().to(self.device)
next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
.float().to(self.device)
dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
.float().to(self.device)
action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
.float().to(self.device)
return states, actions, rewards, next_states, dones, action_probs
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def __v_stack_impr(self, states):
sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
np_states = np.reshape(np.array(states), (len(states), sub_dim))
return np_states
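A minimal usage sketch of this buffer (the 11-dimensional observations are an assumption for illustration):

import numpy as np
import torch

buffer = ReplayBuffer(action_size=5, buffer_size=1000, batch_size=4, device=torch.device("cpu"))
for _ in range(10):
    state = np.random.rand(11).astype(np.float32)
    next_state = np.random.rand(11).astype(np.float32)
    buffer.add(state, action=2, reward=0.0, next_state=next_state, done=False, action_prob=0.2)

states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape, actions.shape)  # torch.Size([4, 11]) torch.Size([4, 1])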
from collections import deque
from collections import namedtuple
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
buffer_size=1000,
batch_size=64,
update_every=10,
learning_rate=1.e-3,
tau=1.e-2,
gamma=0.95,
buffer_min_size=0,
use_gpu=False)
def cartpole(use_dddqn=False):
eps = 1.0
eps_decay = 0.99
min_eps = 0.01
training_mode = True
env = gym.make("CartPole-v1")
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
if not use_dddqn:
policy = PPOPolicy(observation_space, action_space, False)
else:
policy = DDDQNPolicy(observation_space, action_space, dddqn_param)
episode = 0
checkpoint_interval = 20
scores_window = deque(maxlen=100)
writer = SummaryWriter()
while True:
episode += 1
state = env.reset()
policy.reset(env)
handle = 0
tot_reward = 0
policy.start_episode(train=training_mode)
while True:
# env.render()
policy.start_step(train=training_mode)
action = policy.act(handle, state, eps)
state_next, reward, terminal, info = env.step(action)
policy.end_step(train=training_mode)
tot_reward += reward
# reward = reward if not terminal else -reward
reward = 0 if not terminal else -1
policy.step(handle, state, action, reward, state_next, terminal)
state = np.copy(state_next)
if terminal:
break
policy.end_episode(train=training_mode)
eps = max(min_eps, eps * eps_decay)
scores_window.append(tot_reward)
        if episode % checkpoint_interval == 0:
            print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
                episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)))
        else:
            print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
                episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=" ")
writer.add_scalar("CartPole/value", tot_reward, episode)
writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
writer.flush()
if __name__ == "__main__":
cartpole()
import sys
from pathlib import Path

import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool

base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
import sys
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.ordered_policy import OrderedPolicy
np.random.seed(2)
x_dim = 20 # np.random.randint(8, 20)
y_dim = 20 # np.random.randint(8, 20)
n_agents = 10 # np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=complex_rail_generator(
nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=0
),
schedule_generator=complex_schedule_generator(),
obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
number_of_agents=n_agents)
env.reset(True, True)
tree_depth = 1
observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PGL", )
handle = env.get_agent_handles()
n_episodes = 1
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()
for trials in range(1, n_episodes + 1):
# Reset environment
obs, info = env.reset(True, True)
done = env.dones
env_renderer.reset()
frame_step = 0
# Run episode
for step in range(max_steps):
env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
if record_images:
env_renderer.gl.save_image("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
frame_step += 1
# Action
acting_agent = 0
for a in range(env.get_num_agents()):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
# Environment step
obs, all_rewards, done, _ = env.step(action_dict)
if done['__all__']:
break
@@ -123,7 +123,8 @@ def train_agent(n_episodes):
# Build agent specific observations
for agent in env.get_agent_handles():
if obs[agent]:
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
observation_radius=observation_radius)
agent_prev_obs[agent] = agent_obs[agent].copy()
# Run episode
@@ -132,7 +133,7 @@ def train_agent(n_episodes):
if info['action_required'][agent]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
action = policy.act(agent_obs[agent], eps=eps_start)
action = policy.act(agent, agent_obs[agent], eps=eps_start)
action_count[action] += 1
else:
update_values = False
@@ -154,7 +155,8 @@ def train_agent(n_episodes):
agent_prev_action[agent] = action_dict[agent]
if next_obs[agent]:
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
observation_radius=10)
score += all_rewards[agent]
@@ -179,15 +181,16 @@ def train_agent(n_episodes):
else:
end = " "
print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
print(
'\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
# Plot overall training progress at the end
plt.plot(scores)
@@ -199,7 +202,8 @@ def train_agent(n_episodes):
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
type=int)
args = parser.parse_args()
train_agent(args.n_episodes)
This diff is collapsed.
#!/bin/bash
# manually install submodules.
python ./run.py
from time import time
import numpy as np
from flatland.envs.rail_env import fast_isclose
def print_timing(label, start_time, end_time):
print("{:>10.4f}ms".format(1000 * (end_time - start_time)) + "\t" + label)
def check_isclose(nbr=100000):
s = time()
for x in range(nbr):
fast_isclose(x, 0.0, rtol=1e-03)
e = time()
print_timing("fast_isclose", start_time=s, end_time=e)
s = time()
for x in range(nbr):
np.isclose(x, 0.0, rtol=1e-03)
e = time()
print_timing("np.isclose", start_time=s, end_time=e)
if __name__ == "__main__":
check_isclose()