
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 1067 additions and 266 deletions
@@ -8,51 +8,41 @@ import torch
 import torch.nn.functional as F
 import torch.optim as optim

-from torch_training.model import QNetwork, QNetwork2
+from torch_training.model import QNetwork

 BUFFER_SIZE = int(1e5)  # replay buffer size
 BATCH_SIZE = 512  # minibatch size
 GAMMA = 0.99  # discount factor 0.99
 TAU = 1e-3  # for soft update of target parameters
-LR = 0.5e-4  # learning rate 5
+LR = 0.5e-4  # learning rate 0.5e-4 works
 UPDATE_EVERY = 10  # how often to update the network
-double_dqn = True  # If using double dqn algorithm
-input_channels = 5  # Number of Input channels

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-device = torch.device("cpu")
 print(device)


 class Agent:
     """Interacts with and learns from the environment."""

-    def __init__(self, state_size, action_size, net_type, seed, double_dqn=True, input_channels=5):
+    def __init__(self, state_size, action_size, double_dqn=True):
         """Initialize an Agent object.

         Params
         ======
             state_size (int): dimension of each state
             action_size (int): dimension of each action
-            seed (int): random seed
         """
         self.state_size = state_size
         self.action_size = action_size
-        self.seed = random.seed(seed)
-        self.version = net_type
         self.double_dqn = double_dqn
         # Q-Network
-        if self.version == "Conv":
-            self.qnetwork_local = QNetwork2(state_size, action_size, seed, input_channels).to(device)
-            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
-        else:
-            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
-            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
+        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
+        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)

         self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

         # Replay memory
-        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
+        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
         # Initialize time step (for updating every UPDATE_EVERY steps)
         self.t_step = 0

@@ -152,7 +142,7 @@ class Agent:
 class ReplayBuffer:
     """Fixed-size buffer to store experience tuples."""

-    def __init__(self, action_size, buffer_size, batch_size, seed):
+    def __init__(self, action_size, buffer_size, batch_size):
         """Initialize a ReplayBuffer object.

         Params
@@ -160,13 +150,11 @@ class ReplayBuffer:
             action_size (int): dimension of each action
             buffer_size (int): maximum size of buffer
             batch_size (int): size of each training batch
-            seed (int): random seed
         """
         self.action_size = action_size
         self.memory = deque(maxlen=buffer_size)
         self.batch_size = batch_size
         self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
-        self.seed = random.seed(seed)

     def add(self, state, action, reward, next_state, done):
         """Add a new experience to memory."""
@@ -188,7 +176,7 @@ class ReplayBuffer:
         dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
             .float().to(device)

-        return (states, actions, rewards, next_states, dones)
+        return states, actions, rewards, next_states, dones

     def __len__(self):
         """Return the current size of internal memory."""
...
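# Editor's note (sketch, not part of this diff): TAU above is the coefficient of the
# standard DQN soft target update; the learning step elided in the collapsed hunks
# presumably blends the two networks roughly like
#     for target_p, local_p in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
#         target_p.data.copy_(TAU * local_p.data + (1.0 - TAU) * target_p.data)
# The exact implementation is in the part of the file not shown here.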
@@ -3,7 +3,7 @@ import torch.nn.functional as F

 class QNetwork(nn.Module):
-    def __init__(self, state_size, action_size, seed, hidsize1=128, hidsize2=128):
+    def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128):
         super(QNetwork, self).__init__()

         self.fc1_val = nn.Linear(state_size, hidsize1)
@@ -24,38 +24,3 @@ class QNetwork(nn.Module):
         adv = F.relu(self.fc2_adv(adv))
         adv = self.fc3_adv(adv)
         return val + adv - adv.mean()
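# Editor's note: the return statement above is the usual dueling-DQN aggregation,
# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'), which keeps the value and advantage
# streams identifiable while leaving the greedy action unchanged.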
-class QNetwork2(nn.Module):
-    def __init__(self, state_size, action_size, seed, input_channels, hidsize1=128, hidsize2=64):
-        super(QNetwork2, self).__init__()
-        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=3, stride=1)
-        self.bn1 = nn.BatchNorm2d(16)
-        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=3)
-        self.bn2 = nn.BatchNorm2d(32)
-        self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=3)
-        self.bn3 = nn.BatchNorm2d(64)
-        self.fc1_val = nn.Linear(6400, hidsize1)
-        self.fc2_val = nn.Linear(hidsize1, hidsize2)
-        self.fc3_val = nn.Linear(hidsize2, 1)
-        self.fc1_adv = nn.Linear(6400, hidsize1)
-        self.fc2_adv = nn.Linear(hidsize1, hidsize2)
-        self.fc3_adv = nn.Linear(hidsize2, action_size)
-
-    def forward(self, x):
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-        # value function approximation
-        val = F.relu(self.fc1_val(x.view(x.size(0), -1)))
-        val = F.relu(self.fc2_val(val))
-        val = self.fc3_val(val)
-        # advantage calculation
-        adv = F.relu(self.fc1_adv(x.view(x.size(0), -1)))
-        adv = F.relu(self.fc2_adv(adv))
-        adv = self.fc3_adv(adv)
-        return val + adv - adv.mean()
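# Editor's sketch (derived from the signatures changed above, not code from the diff):
# after this refactor both the agent and its replay buffer are built without seed or
# net_type arguments, e.g.
#     agent = Agent(state_size, action_size)                       # wraps QNetwork(state_size, action_size)
#     memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)   # created internally by Agent.__init__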
import random
from collections import deque
import numpy as np
import torch
from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from importlib_resources import path
import torch_training.Nets
from torch_training.dueling_double_dqn import Agent
from utils.observation_utils import normalize_observation
random.seed(1)
np.random.seed(1)
"""
file_name = "./railway/complex_scene.pkl"
env = RailEnv(width=10,
height=20,
rail_generator=rail_from_file(file_name),
obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()))
x_dim = env.width
y_dim = env.height
"""
# Parameters for the Environment
x_dim = 25
y_dim = 25
n_agents = 10
# We are training an Agent using the Tree Observation with depth 2
observation_builder = TreeObsForRailEnv(max_depth=2)
# Use the malfunction generator to break agents from time to time
stochastic_data = MalfunctionParameters(malfunction_rate=1./10000, # Rate of malfunction occurrence
min_duration=15, # Minimal duration of malfunction
max_duration=50 # Max duration of malfunction
)
# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))
# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 0.25, # Fast passenger train
1. / 2.: 0.25, # Fast freight train
1. / 3.: 0.25, # Slow commuter train
1. / 4.: 0.25} # Slow freight train
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=sparse_rail_generator(max_num_cities=3,
# Number of cities in map (where train stations are)
seed=1, # Random seed
grid_mode=False,
max_rails_between_cities=2,
max_rails_in_city=2),
schedule_generator=sparse_schedule_generator(speed_ration_map),
number_of_agents=n_agents,
malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
obs_builder_object=TreeObservation)
env.reset(True, True)
observation_helper = TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PILSVG", )
num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
nr_nodes = 0
for i in range(tree_depth + 1):
nr_nodes += np.power(4, i)
state_size = num_features_per_node * nr_nodes
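# Editor's note: with tree_depth = 2 the loop above sums 4**0 + 4**1 + 4**2 = 21 nodes, so
# state_size = 21 * num_features_per_node (231 if the tree observation reports 11 features
# per node; the exact value comes from env.obs_builder.observation_dim).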
action_size = 5
# We set the number of episodes we would like to train on
if 'n_trials' not in locals():
n_trials = 60000
max_steps = int(4 * 2 * (20 + env.height + env.width))
eps = 1.
eps_end = 0.005
eps_decay = 0.9995
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
with path(torch_training.Nets, "navigator_checkpoint1200.pth") as file_in:
agent.qnetwork_local.load_state_dict(torch.load(file_in))
record_images = False
frame_step = 0
for trials in range(1, n_trials + 1):
# Reset environment
obs, info = env.reset(True, True)
env_renderer.reset()
# Build agent specific observations
for a in range(env.get_num_agents()):
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
# Reset score and done
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
# Action
for a in range(env.get_num_agents()):
if info['action_required'][a]:
action = agent.act(agent_obs[a], eps=0.)
else:
action = 0
action_prob[action] += 1
action_dict.update({a: action})
# Environment step
obs, all_rewards, done, _ = env.step(action_dict)
env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
# Build agent specific observations and normalize
for a in range(env.get_num_agents()):
if obs[a]:
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
if done['__all__']:
break
import getopt
import random
import sys
from collections import deque
# make sure the root path is in system path
from pathlib import Path
from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch_training.dueling_double_dqn import Agent
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from utils.observation_utils import normalize_observation
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.agent_utils import RailAgentStatus
def main(argv):
try:
opts, args = getopt.getopt(argv, "n:", ["n_trials="])
except getopt.GetoptError:
print('training_navigation.py -n <n_trials>')
sys.exit(2)
for opt, arg in opts:
if opt in ('-n', '--n_trials'):
n_trials = int(arg)
random.seed(1)
np.random.seed(1)
# Parameters for the Environment
x_dim = 35
y_dim = 35
n_agents = 10
# Use the malfunction generator to break agents from time to time
stochastic_data = MalfunctionParameters(malfunction_rate=1./10000, # Rate of malfunction occurrence
min_duration=15, # Minimal duration of malfunction
max_duration=50 # Max duration of malfunction
)
# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))
# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 0.25, # Fast passenger train
1. / 2.: 0.25, # Fast freight train
1. / 3.: 0.25, # Slow commuter train
1. / 4.: 0.25} # Slow freight train
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=sparse_rail_generator(max_num_cities=3,
# Number of cities in map (where train stations are)
seed=1, # Random seed
grid_mode=False,
max_rails_between_cities=2,
max_rails_in_city=3),
schedule_generator=sparse_schedule_generator(speed_ration_map),
number_of_agents=n_agents,
malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
obs_builder_object=TreeObservation)
# Reset env
env.reset(True,True)
# After training we want to render the results so we also load a renderer
env_renderer = RenderTool(env, gl="PILSVG", )
# Given the depth of the tree observation and the number of features per node we get the following state_size
num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
nr_nodes = 0
for i in range(tree_depth + 1):
nr_nodes += np.power(4, i)
state_size = num_features_per_node * nr_nodes
# The action space of flatland is 5 discrete actions
action_size = 5
# We set the number of episodes we would like to train on
if 'n_trials' not in locals():
n_trials = 15000
# And the max number of steps we want to take per episode
max_steps = int(4 * 2 * (20 + env.height + env.width))
# Define training parameters
eps = 1.
eps_end = 0.005
eps_decay = 0.998
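# Editor's note: with eps_end = 0.005 and eps_decay = 0.998, the per-episode multiplicative
# decay applied below reaches its floor after roughly ln(0.005)/ln(0.998) ~ 2,650 episodes.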
# And some variables to keep track of the progress
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent_obs_buffer = [None] * env.get_num_agents()
agent_action_buffer = [2] * env.get_num_agents()
cummulated_reward = np.zeros(env.get_num_agents())
update_values = [False] * env.get_num_agents()
# Now we load a Double dueling DQN agent
agent = Agent(state_size, action_size)
for trials in range(1, n_trials + 1):
# Reset environment
obs, info = env.reset(True, True)
env_renderer.reset()
# Build agent specific observations
for a in range(env.get_num_agents()):
if obs[a]:
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
agent_obs_buffer[a] = agent_obs[a].copy()
# Reset score and done
score = 0
env_done = 0
# Run episode
while True:
# Action
for a in range(env.get_num_agents()):
if info['action_required'][a]:
# If an action is required, we want to store the obs at that step as well as the action
update_values[a] = True
action = agent.act(agent_obs[a], eps=eps)
action_prob[action] += 1
else:
update_values[a] = False
action = 0
action_dict.update({a: action})
# Environment step
next_obs, all_rewards, done, info = env.step(action_dict)
# Update replay buffer and train agent
for a in range(env.get_num_agents()):
# Only update the values when we are done or when an action was taken and thus relevant information is present
if update_values[a] or done[a]:
agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
agent_obs[a], done[a])
cummulated_reward[a] = 0.
agent_obs_buffer[a] = agent_obs[a].copy()
agent_action_buffer[a] = action_dict[a]
if next_obs[a]:
agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
score += all_rewards[a] / env.get_num_agents()
# Copy observation
if done['__all__']:
env_done = 1
break
# Epsilon decay
eps = max(eps_end, eps_decay * eps) # decrease epsilon
# Collection information about training
tasks_finished = 0
for current_agent in env.agents:
if current_agent.status == RailAgentStatus.DONE_REMOVED:
tasks_finished += 1
done_window.append(tasks_finished / max(1, env.get_num_agents()))
scores_window.append(score / max_steps) # save most recent score
scores.append(np.mean(scores_window))
dones_list.append((np.mean(done_window)))
print(
'\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(), x_dim, y_dim,
trials,
np.mean(scores_window),
100 * np.mean(done_window),
eps, action_prob / np.sum(action_prob)), end=" ")
if trials % 100 == 0:
print(
'\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(), x_dim, y_dim,
trials,
np.mean(scores_window),
100 * np.mean(done_window),
eps, action_prob / np.sum(action_prob)))
torch.save(agent.qnetwork_local.state_dict(),
'./Nets/navigator_checkpoint' + str(trials) + '.pth')
action_prob = [1] * action_size
# Plot overall training progress at the end
plt.plot(scores)
plt.show()
if __name__ == '__main__':
main(sys.argv[1:])
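# Editor's note: this script parses -n/--n_trials via getopt, so a short smoke run can be
# started the same way the tox environment further down in this diff does (assuming this
# file is torch_training/multi_agent_training.py):
#     python torch_training/multi_agent_training.py --n_trials=10
# or, from Python, main(['-n', '10']).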
# Import packages for plotting and system
import getopt
import random
import sys
from collections import deque
import matplotlib.pyplot as plt
import numpy as np
import torch
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
# Import Flatland observations and predictors
from flatland.envs.schedule_generators import complex_schedule_generator
from importlib_resources import path
# Import Torch and utility functions to normalize observation
import torch_training.Nets
from torch_training.dueling_double_dqn import Agent
from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups
def main(argv):
try:
opts, args = getopt.getopt(argv, "n:", ["n_episodes="])
except getopt.GetoptError:
print('training_navigation.py -n <n_episodes>')
sys.exit(2)
for opt, arg in opts:
if opt in ('-n', '--n_episodes'):
n_episodes = int(arg)
# Initialize the random seeds
random.seed(1)
np.random.seed(1)
# Initialize a random map with a random number of agents
x_dim = np.random.randint(8, 20)
y_dim = np.random.randint(8, 20)
n_agents = np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))
tree_depth = 2
print("main2")
demo = False
# Get an observation builder and predictor
observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=0),
schedule_generator=complex_schedule_generator(),
obs_builder_object=observation_helper,
number_of_agents=n_agents)
env.reset(True, True)
handle = env.get_agent_handles()
features_per_node = env.obs_builder.observation_dim
nr_nodes = 0
for i in range(tree_depth + 1):
nr_nodes += np.power(4, i)
state_size = 2 * features_per_node * nr_nodes # We will use two time steps per observation --> 2x state_size
action_size = 5
# We set the number of episodes we would like to train on
if 'n_episodes' not in locals():
n_episodes = 60000
# Set max number of steps per episode as well as other training relevant parameter
max_steps = int(3 * (env.height + env.width))
eps = 1.
eps_end = 0.005
eps_decay = 0.9995
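# Editor's note: eps_decay = 0.9995 anneals much more slowly; epsilon only reaches
# eps_end = 0.005 after about ln(0.005)/ln(0.9995) ~ 10,600 episodes, a sizeable
# fraction of the 60,000-episode default above.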
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
time_obs = deque(maxlen=2)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
# Initialize the agent
agent = Agent(state_size, action_size)
# Here you can pre-load an agent
if False:
with path(torch_training.Nets, "avoid_checkpoint500.pth") as file_in:
agent.qnetwork_local.load_state_dict(torch.load(file_in))
# Do training over n_episodes
for episodes in range(1, n_episodes + 1):
"""
Training Curriculum: In order to get good generalization we change the number of agents
and the size of the levels every 50 episodes.
"""
if episodes % 50 == 0:
x_dim = np.random.randint(8, 20)
y_dim = np.random.randint(8, 20)
n_agents = np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=0),
schedule_generator=complex_schedule_generator(),
obs_builder_object=TreeObsForRailEnv(max_depth=3,
predictor=ShortestPathPredictorForRailEnv()),
number_of_agents=n_agents)
# Adjust the parameters according to the new env.
max_steps = int(3 * (env.height + env.width))
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
# Reset environment
obs, info = env.reset(True, True)
# Set up placeholders for the final observation of each agent. This is necessary because agents terminate at
# different times during an episode
final_obs = agent_obs.copy()
final_obs_next = agent_next_obs.copy()
# Build agent specific observations
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
# Accumulate two time steps of observation (Here just twice the first state)
for i in range(2):
time_obs.append(obs)
# Build the agent-specific double time step observation
for a in range(env.get_num_agents()):
agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
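# Editor's note: because two consecutive tree observations are concatenated here, each
# network input has length 2 * features_per_node * nr_nodes, which is exactly how
# state_size was computed above.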
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
# Action
for a in range(env.get_num_agents()):
if demo:
eps = 0
# action = agent.act(np.array(obs[a]), eps=eps)
action = agent.act(agent_obs[a], eps=eps)
action_prob[action] += 1
action_dict.update({a: action})
# Environment step
next_obs, all_rewards, done, _ = env.step(action_dict)
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree_into_feature_groups(next_obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
time_obs.append(next_obs)
# Update replay buffer and train agent
for a in range(env.get_num_agents()):
agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
if done[a]:
final_obs[a] = agent_obs[a].copy()
final_obs_next[a] = agent_next_obs[a].copy()
final_action_dict.update({a: action_dict[a]})
if not demo and not done[a]:
agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
score += all_rewards[a] / env.get_num_agents()
agent_obs = agent_next_obs.copy()
if done['__all__']:
env_done = 1
for a in range(env.get_num_agents()):
agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
break
# Epsilon decay
eps = max(eps_end, eps_decay * eps) # decrease epsilon
done_window.append(env_done)
scores_window.append(score / max_steps) # save most recent score
scores.append(np.mean(scores_window))
dones_list.append((np.mean(done_window)))
print(
'\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(), x_dim, y_dim,
episodes,
np.mean(scores_window),
100 * np.mean(done_window),
eps, action_prob / np.sum(action_prob)), end=" ")
if episodes % 100 == 0:
print(
'\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
episodes,
np.mean(scores_window),
100 * np.mean(done_window),
eps,
action_prob / np.sum(action_prob)))
torch.save(agent.qnetwork_local.state_dict(),
'./Nets/avoid_checkpoint' + str(episodes) + '.pth')
action_prob = [1] * action_size
plt.plot(scores)
plt.show()
if __name__ == '__main__':
main(sys.argv[1:])
No preview for this file type
File added
File added
File added
File added
File added
File added
File added
File added
File added
import random
from collections import deque
import numpy as np
import torch
from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from importlib_resources import path
import torch_training.Nets
from torch_training.dueling_double_dqn import Agent
from utils.observation_utils import normalize_observation
random.seed(1)
np.random.seed(1)
"""
file_name = "./railway/complex_scene.pkl"
env = RailEnv(width=10,
height=20,
rail_generator=rail_from_file(file_name),
obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()))
x_dim = env.width
y_dim = env.height
"""
# Parameters for the Environment
x_dim = 25
y_dim = 25
n_agents = 1
n_goals = 5
min_dist = 5
# We are training an Agent using the Tree Observation with depth 2
observation_builder = TreeObsForRailEnv(max_depth=2)
# Use the malfunction generator to break agents from time to time
stochastic_data = MalfunctionParameters(malfunction_rate=1./10000, # Rate of malfunction occurrence
min_duration=15, # Minimal duration of malfunction
max_duration=50 # Max duration of malfunction
)
# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2)
# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 1., # Fast passenger train
1. / 2.: 0.0, # Fast freight train
1. / 3.: 0.0, # Slow commuter train
1. / 4.: 0.0} # Slow freight train
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=sparse_rail_generator(max_num_cities=3,
# Number of cities in map (where train stations are)
seed=1, # Random seed
grid_mode=False,
max_rails_between_cities=2,
max_rails_in_city=4),
schedule_generator=sparse_schedule_generator(speed_ration_map),
number_of_agents=n_agents,
malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
obs_builder_object=TreeObservation)
env.reset(True,True)
env_renderer = RenderTool(env, gl="PILSVG", )
num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
nr_nodes = 0
for i in range(tree_depth + 1):
nr_nodes += np.power(4, i)
state_size = num_features_per_node * nr_nodes
action_size = 5
# We set the number of episodes we would like to train on
if 'n_trials' not in locals():
n_trials = 60000
max_steps = int(3 * (env.height + env.width))
eps = 1.
eps_end = 0.005
eps_decay = 0.9995
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
with path(torch_training.Nets, "navigator_checkpoint1000.pth") as file_in:
agent.qnetwork_local.load_state_dict(torch.load(file_in))
record_images = False
frame_step = 0
for trials in range(1, n_trials + 1):
# Reset environment
obs, info = env.reset(True, True)
env_renderer.reset()
# Build agent specific observations
for a in range(env.get_num_agents()):
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
# Reset score and done
score = 0
env_done = 0
# Run episode
for step in range(max_steps):
# Action
for a in range(env.get_num_agents()):
if info['action_required'][a]:
action = agent.act(agent_obs[a], eps=0.)
else:
action = 0
action_prob[action] += 1
action_dict.update({a: action})
# Environment step
obs, all_rewards, done, _ = env.step(action_dict)
env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
# Build agent specific observations and normalize
for a in range(env.get_num_agents()):
if obs[a]:
agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
if done['__all__']:
break
# ===== old version of the training script (removed in this revision) =====
import random
from collections import deque

import numpy as np
import torch

from dueling_double_dqn import Agent
from flatland.envs.generators import complex_rail_generator
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import DummyPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.utils.rendertools import RenderTool

random.seed(1)
np.random.seed(1)

# Example generate a rail given a manual specification,
# a map of tuples (cell_type, rotation)
transition_probability = [15,  # empty cell - Case 0
                          5,  # Case 1 - straight
                          5,  # Case 2 - simple switch
                          1,  # Case 3 - diamond crossing
                          1,  # Case 4 - single slip
                          1,  # Case 5 - double slip
                          1,  # Case 6 - symmetrical
                          0,  # Case 7 - dead end
                          1,  # Case 1b (8) - simple turn right
                          1,  # Case 1c (9) - simple turn left
                          1]  # Case 2b (10) - simple switch mirrored

# Example generate a random rail
"""
env = RailEnv(width=20,
              height=20,
              rail_generator=random_rail_generator(cell_type_relative_proportion=transition_probability),
              number_of_agents=1)

env = RailEnv(width=15,
              height=15,
              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=10, min_dist=10, max_dist=99999, seed=0),
              number_of_agents=1)

env = RailEnv(width=10,
              height=20)
env.load("./railway/complex_scene.pkl")
"""

env = RailEnv(width=8,
              height=8,
              rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=1, min_dist=4, max_dist=99999, seed=0),
              obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=DummyPredictorForRailEnv()),
              number_of_agents=3)

env.reset(True, True)

env_renderer = RenderTool(env, gl="PILSVG")
handle = env.get_agent_handles()

state_size = 168 * 2
action_size = 5
n_trials = 15000
max_steps = int(1.5 * (env.height + env.width))
eps = 1.
eps_end = 0.005
eps_decay = 0.9995
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
time_obs = deque(maxlen=2)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size, "FC", 0)
# agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
demo = False


def max_lt(seq, val):
    """
    Return greatest item in seq for which item < val applies.
    None is returned if seq was empty or all items in seq were >= val.
    """
    max = 0
    idx = len(seq) - 1
    while idx >= 0:
        if seq[idx] < val and seq[idx] >= 0 and seq[idx] > max:
            max = seq[idx]
        idx -= 1
    return max


def min_lt(seq, val):
    """
    Return smallest item in seq for which item > val applies.
    None is returned if seq was empty or all items in seq were >= val.
    """
    min = np.inf
    idx = len(seq) - 1
    while idx >= 0:
        if seq[idx] >= val and seq[idx] < min:
            min = seq[idx]
        idx -= 1
    return min


def norm_obs_clip(obs, clip_min=-1, clip_max=1):
    """
    This function returns the difference between min and max value of an observation
    :param obs: Observation that should be normalized
    :param clip_min: min value where observation will be clipped
    :param clip_max: max value where observation will be clipped
    :return: returns a normalized and clipped observation
    """
    max_obs = max(1, max_lt(obs, 1000))
    min_obs = min(max_obs, min_lt(obs, 0))
    if max_obs == min_obs:
        return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
    norm = np.abs(max_obs - min_obs)
    if norm == 0:
        norm = 1.
    return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)


for trials in range(1, n_trials + 1):

    # Reset environment
    obs = env.reset(True, True)
    if demo:
        env_renderer.set_new_rail()
    final_obs = obs.copy()
    final_obs_next = obs.copy()
    for a in range(env.get_num_agents()):
        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=8,
                                                                current_depth=0)
        data = norm_obs_clip(data)
        distance = norm_obs_clip(distance)
        agent_data = np.clip(agent_data, -1, 1)
        obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))

    for i in range(2):
        time_obs.append(obs)
    # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
    for a in range(env.get_num_agents()):
        agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

    score = 0
    env_done = 0
    # Run episode
    for step in range(max_steps):
        if demo:
            env_renderer.renderEnv(show=True, show_observations=False)
        # print(step)
        # Action
        for a in range(env.get_num_agents()):
            if demo:
                eps = 0
            # action = agent.act(np.array(obs[a]), eps=eps)
            action = agent.act(agent_obs[a], eps=eps)
            action_prob[action] += 1
            action_dict.update({a: action})
        # Environment step
        next_obs, all_rewards, done, _ = env.step(action_dict)
        for a in range(env.get_num_agents()):
            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=8,
                                                                    current_depth=0)
            data = norm_obs_clip(data)
            distance = norm_obs_clip(distance)
            agent_data = np.clip(agent_data, -1, 1)
            next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
        time_obs.append(next_obs)

        # Update replay buffer and train agent
        for a in range(env.get_num_agents()):
            agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
            if done[a]:
                final_obs[a] = agent_obs[a].copy()
                final_obs_next[a] = agent_next_obs[a].copy()
                final_action_dict.update({a: action_dict[a]})
            if not demo and not done[a]:
                agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
            score += all_rewards[a] / env.get_num_agents()

        agent_obs = agent_next_obs.copy()
        if done['__all__']:
            env_done = 1
            for a in range(env.get_num_agents()):
                agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
            break
    # Epsilon decay
    eps = max(eps_end, eps_decay * eps)  # decrease epsilon

    done_window.append(env_done)
    scores_window.append(score / max_steps)  # save most recent score
    scores.append(np.mean(scores_window))
    dones_list.append((np.mean(done_window)))

    print(
        '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
            env.get_num_agents(),
            trials,
            np.mean(scores_window),
            100 * np.mean(done_window),
            eps, action_prob / np.sum(action_prob)), end=" ")

    if trials % 100 == 0:
        print(
            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(),
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps,
                action_prob / np.sum(action_prob)))
        torch.save(agent.qnetwork_local.state_dict(),
                   './Nets/avoid_checkpoint' + str(trials) + '.pth')
        action_prob = [1] * action_size

# ===== new version of the training script (added in this revision) =====
import getopt
import random
import sys
from collections import deque
# make sure the root path is in system path
from pathlib import Path

from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters

base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch_training.dueling_double_dqn import Agent

from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import sparse_rail_generator
from flatland.envs.schedule_generators import sparse_schedule_generator
from flatland.utils.rendertools import RenderTool
from utils.observation_utils import normalize_observation
from flatland.envs.observations import TreeObsForRailEnv


def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 1

    # Use the malfunction generator to break agents from time to time
    stochastic_data = MalfunctionParameters(malfunction_rate=1./10000,  # Rate of malfunction occurrence
                                            min_duration=15,  # Minimal duration of malfunction
                                            max_duration=50  # Max duration of malfunction
                                            )

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2)

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.,  # Fast passenger train
                        1. / 2.: 1.0,  # Fast freight train
                        1. / 3.: 0.0,  # Slow commuter train
                        1. / 4.: 0.0}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  # Malfunction data generator
                  obs_builder_object=TreeObservation)
    # Reset env
    env.reset(True,True)
    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collection information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()


if __name__ == '__main__':
    main(sys.argv[1:])
@@ -15,13 +15,14 @@ setenv =
     PYTHONPATH = {toxinidir}
 passenv =
     DISPLAY
+    XAUTHORITY
 ; HTTP_PROXY+HTTPS_PROXY required behind corporate proxies
     HTTP_PROXY
     HTTPS_PROXY
 deps =
     -r{toxinidir}/requirements_torch_training.txt
 commands =
-    python torch_training/training_navigation.py
+    python torch_training/multi_agent_training.py --n_trials=10

 [flake8]
 max-line-length = 120
@@ -29,7 +30,12 @@ ignore = E121 E126 E123 E128 E133 E226 E241 E242 E704 W291 W293 W391 W503 W504 W

 [testenv:flake8]
 basepython = python
-passenv = DISPLAY
+passenv =
+    DISPLAY
+    XAUTHORITY
+; HTTP_PROXY+HTTPS_PROXY required behind corporate proxies
+    HTTP_PROXY
+    HTTPS_PROXY
 deps =
     -r{toxinidir}/requirements_torch_training.txt
 commands =
...
import random
import time
from collections import deque
import numpy as np
from flatland.envs.observations import GlobalObsForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from line_profiler import LineProfiler
from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='*'):
"""
Call in a loop to create terminal progress bar
@params:
iteration - Required : current iteration (Int)
total - Required : total iterations (Int)
prefix - Optional : prefix string (Str)
suffix - Optional : suffix string (Str)
decimals - Optional : positive number of decimals in percent complete (Int)
length - Optional : character length of bar (Int)
fill - Optional : bar fill character (Str)
"""
percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
filledLength = int(length * iteration // total)
bar = fill * filledLength + '_' * (length - filledLength)
print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end=" ")
# Print New Line on Complete
if iteration == total:
print('')
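# Editor's example (illustrative only): a 20-character bar at iteration 3 of 5,
# mirroring how run_test() below calls this helper.
#     printProgressBar(3, 5, prefix='Progress:', suffix='Complete', length=20)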
class RandomAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
def act(self, state, eps=0):
"""
:param state: input is the observation of the agent
:return: returns an action
"""
return np.random.choice(np.arange(self.action_size))
def step(self, memories):
"""
Step function to improve agent by adjusting policy given the observations
:param memories: SARS tuple to learn from (ignored by this random policy)
:return:
"""
return
def save(self, filename):
# Store the current policy
return
def load(self, filename):
# Load a policy
return
def run_test(parameters, agent, test_nr=0, tree_depth=3):
# Parameter initialization
lp = LineProfiler()
features_per_node = 9
start_time_scoring = time.time()
action_dict = dict()
nr_trials_per_test = 5
print('Running Test {} with (x_dim,y_dim) = ({},{}) and {} Agents.'.format(test_nr, parameters[0], parameters[1],
parameters[2]))
# Reset all measurements
time_obs = deque(maxlen=2)
test_scores = []
test_dones = []
# Reset environment
random.seed(parameters[3])
np.random.seed(parameters[3])
nr_paths = max(2, parameters[2] + int(0.5 * parameters[2]))
min_dist = int(min([parameters[0], parameters[1]]) * 0.75)
env = RailEnv(width=parameters[0],
height=parameters[1],
rail_generator=complex_rail_generator(nr_start_goal=nr_paths, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=parameters[3]),
schedule_generator=complex_schedule_generator(),
obs_builder_object=GlobalObsForRailEnv(),
number_of_agents=parameters[2])
max_steps = int(3 * (env.height + env.width))
lp_step = lp(env.step)
lp_reset = lp(env.reset)
agent_obs = [None] * env.get_num_agents()
printProgressBar(0, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
for trial in range(nr_trials_per_test):
# Reset the env
lp_reset(True, True)
obs, info = env.reset(True, True)
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
for i in range(2):
time_obs.append(obs)
for a in range(env.get_num_agents()):
agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
# Run episode
trial_score = 0
for step in range(max_steps):
for a in range(env.get_num_agents()):
action = agent.act(agent_obs[a], eps=0)
action_dict.update({a: action})
# Environment step
next_obs, all_rewards, done, _ = lp_step(action_dict)
for a in range(env.get_num_agents()):
data, distance, agent_data = split_tree_into_feature_groups(next_obs[a], tree_depth)
data = norm_obs_clip(data)
distance = norm_obs_clip(distance)
agent_data = np.clip(agent_data, -1, 1)
next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
time_obs.append(next_obs)
for a in range(env.get_num_agents()):
agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
trial_score += all_rewards[a] / env.get_num_agents()
if done['__all__']:
break
test_scores.append(trial_score / max_steps)
test_dones.append(done['__all__'])
printProgressBar(trial + 1, nr_trials_per_test, prefix='Progress:', suffix='Complete', length=20)
end_time_scoring = time.time()
tot_test_time = end_time_scoring - start_time_scoring
lp.print_stats()
return test_scores, test_dones, tot_test_time
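# Editor's example (hypothetical, not part of the original file): scoring the RandomAgent
# defined above with run_test(). The parameter tuple is (x_dim, y_dim, n_agents, seed),
# matching how parameters[0..3] are used inside run_test; the state size passed to
# RandomAgent is arbitrary since the random policy ignores it.
if __name__ == '__main__':
    demo_agent = RandomAgent(state_size=231, action_size=5)
    test_scores, test_dones, test_time = run_test((20, 20, 3, 1), demo_agent, test_nr=0, tree_depth=3)
    print('Mean score: {:.3f}, finished episodes: {}/{}, time: {:.1f}s'.format(
        np.mean(test_scores), sum(test_dones), len(test_dones), test_time))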