Compare revisions

c977124a · c977124a · c977124a · c977124a · c977124a · c977124a
--- a/tests/testdata/test_array_5.csv
+++ b/tests/testdata/test_array_5.csv
--- a/tests/testdata/test_array_6.csv
+++ b/tests/testdata/test_array_6.csv
--- a/tests/testdata/test_array_7.csv
+++ b/tests/testdata/test_array_7.csv
--- a/tests/testdata/test_array_8.csv
+++ b/tests/testdata/test_array_8.csv
--- a/tests/testdata/test_array_9.csv
+++ b/tests/testdata/test_array_9.csv
--- a/torch_training/Getting_Started_Training.md
+++ b/torch_training/Getting_Started_Training.md
@@ -59,7 +59,8 @@ For training purposes the tree is flattend into a single array.

 ## Training
 ### Setting up the environment
-Let us now train a simle double dueling DQN agent to navigate to its target on flatland. We start by importing flatland
+Before you get started with the training make sure that you have [pytorch](https://pytorch.org/get-started/locally/) installed.
+Let us now train a simple double dueling DQN agent to navigate to its target on flatland. We start by importing flatland

 ```
 from flatland.envs.generators import complex_rail_generator
@@ -105,12 +106,12 @@ We have no successfully set up the environment for training. To visualize it in
 env_renderer = RenderTool(env, gl="PILSVG", )
 ```

-###Setting up the agent
+### Setting up the agent

 To set up an appropriate agent we need the state and action space sizes. From the discussion above about the tree observation we end up with:

 [**Adrian**: I just wonder, why this is not done in a separate method in the observation: get_state_size, then we don't have to write down much more. And the user doesn't need to 
-understand anything about the oberservation. I suggest moving this into the obersvation, base ObservationBuilder declare it as an abstract method. ... ] 
+understand anything about the observation. I suggest moving this into the observation, base ObservationBuilder declare it as an abstract method. ... ] 

 ```
 # Given the depth of the tree observation and the number of features per node we get the following state_size
@@ -149,7 +150,7 @@ We now use the normalized `agent_obs` for our training loop:
 for trials in range(1, n_trials + 1):

    # Reset environment
-    obs = env.reset(True, True)
+    obs, info = env.reset(True, True)
    if not Training:
        env_renderer.set_new_rail()

@@ -217,7 +218,7 @@ for trials in range(1, n_trials + 1):
    eps = max(eps_end, eps_decay * eps)  # decrease epsilon
 ```

-Running the `navigation_training.py` file trains a simple agent to navigate to any random target within the railway network. After running you should see a learning curve similiar to this one:
+Running the `training_navigation.py` file trains a simple agent to navigate to any random target within the railway network. After running you should see a learning curve similar to this one:

 ![Learning_curve](https://i.imgur.com/yVGXpUy.png)


--- a/torch_training/Multi_Agent_Training_Intro.md
+++ b/torch_training/Multi_Agent_Training_Intro.md
@@ -174,7 +174,7 @@ We now use the normalized `agent_obs` for our training loop:
            agent_next_obs = [None] * env.get_num_agents()

        # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)

         # Setup placeholder for the final observation of a single agent. This is necessary because agents terminate at
        # different times during an episode
@@ -245,9 +245,8 @@ We now use the normalized `agent_obs` for our training loop:

 Running the `multi_agent_training.py` file trains a simple agent to navigate to any random target within the railway network. After running you should see a learning curve similar to this one:

-*Learning curve provided soon*
+![Learning_Curve](https://i.imgur.com/Po4j4yK.png)

 and the agent behavior should look like this:

-*Gif provided soon*
-
+![Conflict_Avoidance](https://i.imgur.com/AvBHKaD.gif)
--- a/torch_training/Nets/avoid_checkpoint15000.pth
+++ b/torch_training/Nets/avoid_checkpoint15000.pth
--- a/torch_training/Nets/avoid_checkpoint30000.pth
+++ b/torch_training/Nets/avoid_checkpoint30000.pth
--- a/torch_training/Nets/avoid_checkpoint60000.pth
+++ b/torch_training/Nets/avoid_checkpoint60000.pth
--- a/torch_training/dueling_double_dqn.py
+++ b/torch_training/dueling_double_dqn.py
@@ -8,51 +8,41 @@ import torch
 import torch.nn.functional as F
 import torch.optim as optim

-from torch_training.model import QNetwork, QNetwork2
+from torch_training.model import QNetwork

 BUFFER_SIZE = int(1e5)  # replay buffer size
 BATCH_SIZE = 512  # minibatch size
 GAMMA = 0.99  # discount factor 0.99
 TAU = 1e-3  # for soft update of target parameters
-LR = 0.5e-4  # learning rate 5
+LR = 0.5e-4  # learning rate 0.5e-4 works
 UPDATE_EVERY = 10  # how often to update the network
-double_dqn = True  # If using double dqn algorithm
-input_channels = 5  # Number of Input channels

 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-device = torch.device("cpu")
 print(device)


 class Agent:
    """Interacts with and learns from the environment."""

-    def __init__(self, state_size, action_size, net_type, seed, double_dqn=True, input_channels=5):
+    def __init__(self, state_size, action_size, double_dqn=True):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
-            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
-        self.seed = random.seed(seed)
-        self.version = net_type
        self.double_dqn = double_dqn
        # Q-Network
-        if self.version == "Conv":
-            self.qnetwork_local = QNetwork2(state_size, action_size, seed, input_channels).to(device)
-            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
-        else:
-            self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
-            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
+        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
+        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
-        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
+        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

@@ -152,7 +142,7 @@ class Agent:
 class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

-    def __init__(self, action_size, buffer_size, batch_size, seed):
+    def __init__(self, action_size, buffer_size, batch_size):
        """Initialize a ReplayBuffer object.

        Params
@@ -160,13 +150,11 @@ class ReplayBuffer:
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
-            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
-        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
@@ -188,7 +176,7 @@ class ReplayBuffer:
        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
            .float().to(device)

-        return (states, actions, rewards, next_states, dones)
+        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the current size of internal memory."""

--- a/torch_training/model.py
+++ b/torch_training/model.py
@@ -3,7 +3,7 @@ import torch.nn.functional as F


 class QNetwork(nn.Module):
-    def __init__(self, state_size, action_size, seed, hidsize1=128, hidsize2=128):
+    def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128):
        super(QNetwork, self).__init__()

        self.fc1_val = nn.Linear(state_size, hidsize1)
@@ -24,38 +24,3 @@ class QNetwork(nn.Module):
        adv = F.relu(self.fc2_adv(adv))
        adv = self.fc3_adv(adv)
        return val + adv - adv.mean()
-
-
-class QNetwork2(nn.Module):
-    def __init__(self, state_size, action_size, seed, input_channels, hidsize1=128, hidsize2=64):
-        super(QNetwork2, self).__init__()
-        self.conv1 = nn.Conv2d(input_channels, 16, kernel_size=3, stride=1)
-        self.bn1 = nn.BatchNorm2d(16)
-        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=3)
-        self.bn2 = nn.BatchNorm2d(32)
-        self.conv3 = nn.Conv2d(32, 64, kernel_size=5, stride=3)
-        self.bn3 = nn.BatchNorm2d(64)
-
-        self.fc1_val = nn.Linear(6400, hidsize1)
-        self.fc2_val = nn.Linear(hidsize1, hidsize2)
-        self.fc3_val = nn.Linear(hidsize2, 1)
-
-        self.fc1_adv = nn.Linear(6400, hidsize1)
-        self.fc2_adv = nn.Linear(hidsize1, hidsize2)
-        self.fc3_adv = nn.Linear(hidsize2, action_size)
-
-    def forward(self, x):
-        x = F.relu(self.conv1(x))
-        x = F.relu(self.conv2(x))
-        x = F.relu(self.conv3(x))
-
-        # value function approximation
-        val = F.relu(self.fc1_val(x.view(x.size(0), -1)))
-        val = F.relu(self.fc2_val(val))
-        val = self.fc3_val(val)
-
-        # advantage calculation
-        adv = F.relu(self.fc1_adv(x.view(x.size(0), -1)))
-        adv = F.relu(self.fc2_adv(adv))
-        adv = self.fc3_adv(adv)
-        return val + adv - adv.mean()
--- a/torch_training/multi_agent_inference.py
+++ b/torch_training/multi_agent_inference.py
@@ -3,16 +3,18 @@ from collections import deque

 import numpy as np
 import torch
-from flatland.envs.generators import complex_rail_generator
+from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import sparse_rail_generator
+from flatland.envs.schedule_generators import sparse_schedule_generator
 from flatland.utils.rendertools import RenderTool
 from importlib_resources import path

 import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
-from utils.observation_utils import norm_obs_clip, split_tree
+from utils.observation_utils import normalize_observation

 random.seed(1)
 np.random.seed(1)
@@ -26,35 +28,60 @@ x_dim = env.width
 y_dim = env.height
 """

-x_dim = np.random.randint(8, 20)
-y_dim = np.random.randint(8, 20)
-n_agents = np.random.randint(3, 8)
-n_goals = n_agents + np.random.randint(0, 3)
-min_dist = int(0.75 * min(x_dim, y_dim))
+# Parameters for the Environment
+x_dim = 25
+y_dim = 25
+n_agents = 10
+
+# We are training an Agent using the Tree Observation with depth 2
+observation_builder = TreeObsForRailEnv(max_depth=2)
+
+# Use the malfunction generator to break agents from time to time
+stochastic_data = MalfunctionParameters(malfunction_rate=1./10000,  # Rate of malfunction occurence
+                                        min_duration=15,  # Minimal duration of malfunction
+                                        max_duration=50  # Max duration of malfunction
+                                        )
+
+
+
+# Custom observation builder
+TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))
+
+# Different agent types (trains) with different speeds.
+speed_ration_map = {1.: 0.25,  # Fast passenger train
+                    1. / 2.: 0.25,  # Fast freight train
+                    1. / 3.: 0.25,  # Slow commuter train
+                    1. / 4.: 0.25}  # Slow freight train

 env = RailEnv(width=x_dim,
              height=y_dim,
-              rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                    max_dist=99999,
-                                                    seed=0),
-              obs_builder_object=TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv()),
-              number_of_agents=n_agents)
+              rail_generator=sparse_rail_generator(max_num_cities=3,
+                                                   # Number of cities in map (where train stations are)
+                                                   seed=1,  # Random seed
+                                                   grid_mode=False,
+                                                   max_rails_between_cities=2,
+                                                   max_rails_in_city=2),
+              schedule_generator=sparse_schedule_generator(speed_ration_map),
+              number_of_agents=n_agents,
+              malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
+              obs_builder_object=TreeObservation)
 env.reset(True, True)

-tree_depth = 3
-observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
+observation_helper = TreeObsForRailEnv(max_depth=3, predictor=ShortestPathPredictorForRailEnv())
 env_renderer = RenderTool(env, gl="PILSVG", )
-handle = env.get_agent_handles()
 num_features_per_node = env.obs_builder.observation_dim
+
+tree_depth = 2
 nr_nodes = 0
 for i in range(tree_depth + 1):
    nr_nodes += np.power(4, i)
 state_size = num_features_per_node * nr_nodes
 action_size = 5

-n_trials = 100
-observation_radius = 10
-max_steps = int(3 * (env.height + env.width))
+# We set the number of episodes we would like to train on
+if 'n_trials' not in locals():
+    n_trials = 60000
+max_steps = int(4 * 2 * (20 + env.height + env.width))
 eps = 1.
 eps_end = 0.005
 eps_decay = 0.9995
@@ -62,14 +89,13 @@ action_dict = dict()
 final_action_dict = dict()
 scores_window = deque(maxlen=100)
 done_window = deque(maxlen=100)
-time_obs = deque(maxlen=2)
 scores = []
 dones_list = []
 action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
-agent = Agent(state_size, action_size, "FC", 0)
-with path(torch_training.Nets, "avoid_checkpoint49700.pth") as file_in:
+agent = Agent(state_size, action_size)
+with path(torch_training.Nets, "navigator_checkpoint1200.pth") as file_in:
    agent.qnetwork_local.load_state_dict(torch.load(file_in))

 record_images = False
@@ -78,43 +104,36 @@ frame_step = 0
 for trials in range(1, n_trials + 1):

    # Reset environment
-    obs = env.reset(True, True)
-
-    env_renderer.set_new_rail()
-
+    obs, info = env.reset(True, True)
+    env_renderer.reset()
+    # Build agent specific observations
    for a in range(env.get_num_agents()):
-        data, distance, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=num_features_per_node,
-                                                current_depth=0)
-        data = norm_obs_clip(data, fixed_radius=observation_radius)
-        distance = norm_obs_clip(distance)
-        agent_data = np.clip(agent_data, -1, 1)
-        agent_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+        agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
+    # Reset score and done
+    score = 0
+    env_done = 0

    # Run episode
    for step in range(max_steps):
-        env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
-
-        if record_images:
-            env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
-            frame_step += 1

        # Action
        for a in range(env.get_num_agents()):
-            action = agent.act(agent_obs[a], eps=0)
-            action_dict.update({a: action})
+            if info['action_required'][a]:
+                action = agent.act(agent_obs[a], eps=0.)
+            else:
+                action = 0

+            action_prob[action] += 1
+            action_dict.update({a: action})
        # Environment step
-
-        next_obs, all_rewards, done, _ = env.step(action_dict)
+        obs, all_rewards, done, _ = env.step(action_dict)
+        env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
+        # Build agent specific observations and normalize
        for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                    num_features_per_node=num_features_per_node,
-                                                    current_depth=0)
-            data = norm_obs_clip(data, fixed_radius=observation_radius)
-            distance = norm_obs_clip(distance)
-            agent_data = np.clip(agent_data, -1, 1)
-            agent_next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
-        agent_obs = agent_next_obs.copy()
+            if obs[a]:
+                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
+
+
        if done['__all__']:
            break
+
--- a/torch_training/multi_agent_training.py
+++ b/torch_training/multi_agent_training.py
-# Import packages for plotting and system
 import getopt
 import random
 import sys
 from collections import deque
+# make sure the root path is in system path
+from pathlib import Path
+
+from flatland.envs.malfunction_generators import malfunction_from_params, MalfunctionParameters
+
+base_dir = Path(__file__).resolve().parent.parent
+sys.path.append(str(base_dir))

 import matplotlib.pyplot as plt
 import numpy as np
 import torch
-# Import Flatland/ Observations and Predictors
-from flatland.envs.generators import complex_rail_generator
-from flatland.envs.observations import TreeObsForRailEnv
-from flatland.envs.predictions import ShortestPathPredictorForRailEnv
-from flatland.envs.rail_env import RailEnv
-from importlib_resources import path
-
-# Import Torch and utility functions to normalize observation
-import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
-from utils.observation_utils import norm_obs_clip, split_tree

+from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import sparse_rail_generator
+from flatland.envs.schedule_generators import sparse_schedule_generator
+from flatland.utils.rendertools import RenderTool
+from utils.observation_utils import normalize_observation
+from flatland.envs.observations import TreeObsForRailEnv
+from flatland.envs.predictions import ShortestPathPredictorForRailEnv
+from flatland.envs.agent_utils import RailAgentStatus

 def main(argv):
    try:
-        opts, args = getopt.getopt(argv, "n:", ["n_episodes="])
+        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
-        print('training_navigation.py -n <n_episodes>')
+        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
-        if opt in ('-n', '--n_episodes'):
-            n_episodes = int(arg)
+        if opt in ('-n', '--n_trials'):
+            n_trials = int(arg)

-    ## Initialize the random
    random.seed(1)
    np.random.seed(1)

-    # Initialize a random map with a random number of agents
-    x_dim = np.random.randint(8, 20)
-    y_dim = np.random.randint(8, 20)
-    n_agents = np.random.randint(3, 8)
-    n_goals = n_agents + np.random.randint(0, 3)
-    min_dist = int(0.75 * min(x_dim, y_dim))
-    tree_depth = 3
-    print("main2")
-
-    """
-     Get an observation builder and predictor:
-     The predictor will always predict the shortest path from the current location of the agent.
-     This is used to warn for potential conflicts --> Should be enhanced to get better performance!
-    """
-    predictor = ShortestPathPredictorForRailEnv()
-    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)
+    # Parameters for the Environment
+    x_dim = 35
+    y_dim = 35
+    n_agents = 10
+
+
+    # Use the malfunction generator to break agents from time to time
+    stochastic_data = MalfunctionParameters(malfunction_rate=1./10000,  # Rate of malfunction occurence
+                                            min_duration=15,  # Minimal duration of malfunction
+                                            max_duration=50  # Max duration of malfunction
+                                            )
+
+
+    # Custom observation builder
+    TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))
+
+    # Different agent types (trains) with different speeds.
+    speed_ration_map = {1.: 0.25,  # Fast passenger train
+                        1. / 2.: 0.25,  # Fast freight train
+                        1. / 3.: 0.25,  # Slow commuter train
+                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
-                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                        max_dist=99999,
-                                                        seed=0),
-                  obs_builder_object=observation_helper,
-                  number_of_agents=n_agents)
-    env.reset(True, True)
-
-    handle = env.get_agent_handles()
+                  rail_generator=sparse_rail_generator(max_num_cities=3,
+                                                       # Number of cities in map (where train stations are)
+                                                       seed=1,  # Random seed
+                                                       grid_mode=False,
+                                                       max_rails_between_cities=2,
+                                                       max_rails_in_city=3),
+                  schedule_generator=sparse_schedule_generator(speed_ration_map),
+                  number_of_agents=n_agents,
+                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
+                  obs_builder_object=TreeObservation)
+
+    # Reset env
+    env.reset(True,True)
+    # After training we want to render the results so we also load a renderer
+    env_renderer = RenderTool(env, gl="PILSVG", )
+    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
+    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
+
+    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
-    if 'n_episodes' not in locals():
-        n_episodes = 60000
+    if 'n_trials' not in locals():
+        n_trials = 15000
+
+    # And the max number of steps we want to take per episode
+    max_steps = int(4 * 2 * (20 + env.height + env.width))

-    # Set max number of steps per episode as well as other training relevant parameter
-    max_steps = int(3 * (env.height + env.width))
+    # Define training parameters
    eps = 1.
    eps_end = 0.005
-    eps_decay = 0.9995
+    eps_decay = 0.998
+
+    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
@@ -86,106 +109,73 @@ def main(argv):
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
-    observation_radius = 10
-
-    # Initialize the agent
-    agent = Agent(state_size, action_size, "FC", 0)
-
-    # Here you can pre-load an agent
-    if False:
-        with path(torch_training.Nets, "avoid_checkpoint30000.pth") as file_in:
-            agent.qnetwork_local.load_state_dict(torch.load(file_in))
-
-    # Do training over n_episodes
-    for episodes in range(1, n_episodes + 1):
-        """
-        Training Curriculum: In order to get good generalization we change the number of agents
-        and the size of the levels every 50 episodes.
-        """
-        if episodes % 50 == 0:
-            x_dim = np.random.randint(8, 20)
-            y_dim = np.random.randint(8, 20)
-            n_agents = np.random.randint(3, 8)
-            n_goals = n_agents + np.random.randint(0, 3)
-            min_dist = int(0.75 * min(x_dim, y_dim))
-            env = RailEnv(width=x_dim,
-                          height=y_dim,
-                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                                max_dist=99999,
-                                                                seed=0),
-                          obs_builder_object=observation_helper,
-                          number_of_agents=n_agents)
-
-            # Adjust the parameters according to the new env.
-            max_steps = int(3 * (env.height + env.width))
-            agent_obs = [None] * env.get_num_agents()
-            agent_next_obs = [None] * env.get_num_agents()
+    agent_obs_buffer = [None] * env.get_num_agents()
+    agent_action_buffer = [2] * env.get_num_agents()
+    cummulated_reward = np.zeros(env.get_num_agents())
+    update_values = [False] * env.get_num_agents()
+    # Now we load a Double dueling DQN agent
+    agent = Agent(state_size, action_size)

-        # Reset environment
-        obs = env.reset(True, True)
-
-        # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
-        # different times during an episode
-        final_obs = agent_obs.copy()
-        final_obs_next = agent_next_obs.copy()
+    for trials in range(1, n_trials + 1):

+        # Reset environment
+        obs, info = env.reset(True, True)
+        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(obs[a]), num_features_per_node=num_features_per_node,
-                                                    current_depth=0)
-            data = norm_obs_clip(data, fixed_radius=observation_radius)
-            distance = norm_obs_clip(distance)
-            agent_data = np.clip(agent_data, -1, 1)
-            agent_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+            if obs[a]:
+                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
+                agent_obs_buffer[a] = agent_obs[a].copy()

+        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
-        for step in range(max_steps):
-
+        while True:
            # Action
            for a in range(env.get_num_agents()):
-                action = agent.act(agent_obs[a], eps=eps)
-                action_prob[action] += 1
+                if info['action_required'][a]:
+                    # If an action is required, we want to store the obs at that step as well as the action
+                    update_values[a] = True
+                    action = agent.act(agent_obs[a], eps=eps)
+                    action_prob[action] += 1
+                else:
+                    update_values[a] = False
+                    action = 0
                action_dict.update({a: action})

            # Environment step
-            next_obs, all_rewards, done, _ = env.step(action_dict)
-
-            # Build agent specific observations and normalize
-            for a in range(env.get_num_agents()):
-                data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                        num_features_per_node=num_features_per_node, current_depth=0)
-                data = norm_obs_clip(data, fixed_radius=observation_radius)
-                distance = norm_obs_clip(distance)
-                agent_data = np.clip(agent_data, -1, 1)
-                agent_next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
+            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
-                if done[a]:
-                    final_obs[a] = agent_obs[a].copy()
-                    final_obs_next[a] = agent_next_obs[a].copy()
-                    final_action_dict.update({a: action_dict[a]})
-                if not done[a]:
-                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+                # Only update the values when we are done or when an action was taken and thus relevant information is present
+                if update_values[a] or done[a]:
+                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
+                               agent_obs[a], done[a])
+                    cummulated_reward[a] = 0.
+
+                    agent_obs_buffer[a] = agent_obs[a].copy()
+                    agent_action_buffer[a] = action_dict[a]
+                if next_obs[a]:
+                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
+
                score += all_rewards[a] / env.get_num_agents()

            # Copy observation
-            agent_obs = agent_next_obs.copy()
-
            if done['__all__']:
                env_done = 1
-                for a in range(env.get_num_agents()):
-                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

         # Collect information about training
-        done_window.append(env_done)
+        tasks_finished = 0
+        for current_agent in env.agents:
+            if current_agent.status == RailAgentStatus.DONE_REMOVED:
+                tasks_finished += 1
+        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))
@@ -193,23 +183,24 @@ def main(argv):
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
-                episodes,
+                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

-        if episodes % 100 == 0:
+        if trials % 100 == 0:
            print(
-                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-                    env.get_num_agents(),
-                    episodes,
+                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                    env.get_num_agents(), x_dim, y_dim,
+                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
-                    eps,
-                    action_prob / np.sum(action_prob)))
+                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
-                       './Nets/avoid_checkpoint' + str(episodes) + '.pth')
+                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size
+
+    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()


--- a/torch_training/multi_agent_two_time_step_training.py
+++ b/torch_training/multi_agent_two_time_step_training.py
@@ -7,17 +7,18 @@ from collections import deque
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
-# Import Flatland/ Observations and Predictors
-from flatland.envs.generators import complex_rail_generator
 from flatland.envs.observations import TreeObsForRailEnv
 from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
+from flatland.envs.rail_generators import complex_rail_generator
+# Import Flatland/ Observations and Predictors
+from flatland.envs.schedule_generators import complex_schedule_generator
 from importlib_resources import path

 # Import Torch and utility functions to normalize observation
 import torch_training.Nets
 from torch_training.dueling_double_dqn import Agent
-from utils.observation_utils import norm_obs_clip, split_tree
+from utils.observation_utils import norm_obs_clip, split_tree_into_feature_groups


 def main(argv):
@@ -40,25 +41,25 @@ def main(argv):
    n_agents = np.random.randint(3, 8)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
-    tree_depth = 3
+    tree_depth = 2
    print("main2")
+    demo = False

    # Get an observation builder and predictor
-    predictor = ShortestPathPredictorForRailEnv()
-    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor())
+    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                                                        max_dist=99999,
                                                        seed=0),
+                  schedule_generator=complex_schedule_generator(),
                  obs_builder_object=observation_helper,
                  number_of_agents=n_agents)
    env.reset(True, True)

    handle = env.get_agent_handles()
    features_per_node = env.obs_builder.observation_dim
-    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
@@ -85,11 +86,11 @@ def main(argv):
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    # Initialize the agent
-    agent = Agent(state_size, action_size, "FC", 0)
+    agent = Agent(state_size, action_size)

    # Here you can pre-load an agent
    if False:
-        with path(torch_training.Nets, "avoid_checkpoint30000.pth") as file_in:
+        with path(torch_training.Nets, "avoid_checkpoint500.pth") as file_in:
            agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Do training over n_episodes
@@ -109,6 +110,7 @@ def main(argv):
                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                                                                max_dist=99999,
                                                                seed=0),
+                          schedule_generator=complex_schedule_generator(),
                          obs_builder_object=TreeObsForRailEnv(max_depth=3,
                                                               predictor=ShortestPathPredictorForRailEnv()),
                          number_of_agents=n_agents)
@@ -119,7 +121,7 @@ def main(argv):
            agent_next_obs = [None] * env.get_num_agents()

        # Reset environment
-        obs = env.reset(True, True)
+        obs, info = env.reset(True, True)

        # Set up a placeholder for the final observation of a single agent. This is necessary because agents terminate at
        # different times during an episode
@@ -128,8 +130,7 @@ def main(argv):

        # Build agent specific observations
        for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(obs[a]),
-                                                    current_depth=0)
+            data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
            data = norm_obs_clip(data)
            distance = norm_obs_clip(distance)
            agent_data = np.clip(agent_data, -1, 1)
@@ -160,8 +161,7 @@ def main(argv):

            next_obs, all_rewards, done, _ = env.step(action_dict)
            for a in range(env.get_num_agents()):
-                data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                        current_depth=0)
+                data, distance, agent_data = split_tree_into_feature_groups(next_obs[a], tree_depth)
                data = norm_obs_clip(data)
                distance = norm_obs_clip(distance)
                agent_data = np.clip(agent_data, -1, 1)

--- a/torch_training/railway/complex_scene.pkl
+++ b/torch_training/railway/complex_scene.pkl
--- a/torch_training/railway/hard_crossing.pkl
+++ b/torch_training/railway/hard_crossing.pkl
--- a/torch_training/railway/navigate_and_avoid.pkl
+++ b/torch_training/railway/navigate_and_avoid.pkl
--- a/torch_training/railway/simple_avoid.pkl
+++ b/torch_training/railway/simple_avoid.pkl
--- a/torch_training/railway/split_switch.pkl
+++ b/torch_training/railway/split_switch.pkl
No results found