From 5befd0e4066f0f6fe292eb24410022a714a101ff Mon Sep 17 00:00:00 2001
From: MLErik <baerenjesus@gmail.com>
Date: Mon, 7 Oct 2019 16:14:22 -0400
Subject: [PATCH] Update multi-agent training for the new sparse rail generator

---
 torch_training/multi_agent_training.py | 200 +++++++++----------------
 1 file changed, 73 insertions(+), 127 deletions(-)

diff --git a/torch_training/multi_agent_training.py b/torch_training/multi_agent_training.py
index ec8ac96..ed20ea6 100644
--- a/torch_training/multi_agent_training.py
+++ b/torch_training/multi_agent_training.py
@@ -1,4 +1,3 @@
-# Import packages for plotting and system
 import getopt
 import random
 import sys
@@ -12,58 +11,44 @@ sys.path.append(str(base_dir))
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
-from importlib_resources import path
+from torch_training.dueling_double_dqn import Agent
 
-# Import Torch and utility functions to normalize observation
-import torch_training.Nets
 from flatland.envs.observations import TreeObsForRailEnv
-from flatland.envs.predictions import ShortestPathPredictorForRailEnv
 from flatland.envs.rail_env import RailEnv
 from flatland.envs.rail_generators import sparse_rail_generator
-# Import Flatland/ Observations and Predictors
 from flatland.envs.schedule_generators import sparse_schedule_generator
-from torch_training.dueling_double_dqn import Agent
+from flatland.utils.rendertools import RenderTool
 from utils.observation_utils import normalize_observation
 
 
 def main(argv):
     try:
-        opts, args = getopt.getopt(argv, "n:", ["n_episodes="])
+        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
     except getopt.GetoptError:
-        print('training_navigation.py -n <n_episodes>')
+        print('training_navigation.py -n <n_trials>')
         sys.exit(2)
     for opt, arg in opts:
-        if opt in ('-n', '--n_episodes'):
-            n_episodes = int(arg)
+        if opt in ('-n', '--n_trials'):
+            n_trials = int(arg)
 
-    ## Initialize the random
     random.seed(1)
     np.random.seed(1)
 
-    # Initialize a random map with a random number of agents
-
-    """
-     Get an observation builder and predictor:
-     The predictor will always predict the shortest path from the current location of the agent.
-     This is used to warn for potential conflicts --> Should be enhanced to get better performance!
-    """
-
     # Parameters for the Environment
-    x_dim = 20
-    y_dim = 20
-    n_agents = 3
-    tree_depth = 2
+    x_dim = 40
+    y_dim = 40
+    n_agents = 4
 
     # Use the malfunction generator to break agents from time to time
-    stochastic_data = {'prop_malfunction': 0.1,  # Percentage of defective agents
-                       'malfunction_rate': 30,  # Rate of malfunction occurence
+    stochastic_data = {'prop_malfunction': 0.05,  # Percentage of defective agents
+                       'malfunction_rate': 50,  # Rate of malfunction occurrence
                        'min_duration': 3,  # Minimal duration of malfunction
                        'max_duration': 20  # Max duration of malfunction
                        }
 
     # Custom observation builder
-    predictor = ShortestPathPredictorForRailEnv()
-    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=predictor)
+    TreeObservation = TreeObsForRailEnv(max_depth=2)
 
     # Different agent types (trains) with different speeds.
     speed_ration_map = {1.: 0.25,  # Fast passenger train
@@ -73,42 +58,43 @@ def main(argv):
 
     env = RailEnv(width=x_dim,
                   height=y_dim,
-                  rail_generator=sparse_rail_generator(num_cities=5,
+                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                        # Number of cities in map (where train stations are)
-                                                       num_intersections=4,
-                                                       # Number of intersections (no start / target)
-                                                       num_trainstations=10,  # Number of possible start/targets on map
-                                                       min_node_dist=3,  # Minimal distance of nodes
-                                                       node_radius=2,  # Proximity of stations to city center
-                                                       num_neighb=3,
-                                                       # Number of connections to other cities/intersections
-                                                       seed=15,  # Random seed
-                                                       grid_mode=True,
-                                                       enhance_intersection=False
-                                                       ),
+                                                       seed=1,  # Random seed
+                                                       grid_mode=False,  # Place cities randomly rather than on an even grid
+                                                       max_rails_between_cities=2,  # Max number of tracks connecting two cities
+                                                       max_rails_in_city=3),  # Max number of parallel tracks within a city
                   schedule_generator=sparse_schedule_generator(speed_ration_map),
                   number_of_agents=n_agents,
                   stochastic_data=stochastic_data,  # Malfunction data generator
-                  obs_builder_object=observation_helper)
-    env.reset(True, True)
+                  obs_builder_object=TreeObservation)
 
-    handle = env.get_agent_handles()
+    # After training we want to render the results, so we also load a renderer
+    env_renderer = RenderTool(env, gl="PILSVG")
+    # Given the depth of the tree observation and the number of features per node we get the following state_size
     num_features_per_node = env.obs_builder.observation_dim
+    tree_depth = 2
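+    # The tree observation has up to 4 branches per node, so a depth-d tree contains sum(4**i) nodes for i = 0..d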
     nr_nodes = 0
     for i in range(tree_depth + 1):
         nr_nodes += np.power(4, i)
     state_size = num_features_per_node * nr_nodes
+
+    # The action space of flatland is 5 discrete actions
     action_size = 5
 
     # We set the number of episodes we would like to train on
-    if 'n_episodes' not in locals():
-        n_episodes = 60000
+    if 'n_trials' not in locals():
+        n_trials = 15000
+
+    # And the max number of steps we want to take per episode
+    max_steps = int(3 * (env.height + env.width))
 
-    # Set max number of steps per episode as well as other training relevant parameter
-    max_steps = int((env.height + env.width))
+    # Define training parameters
     eps = 1.
     eps_end = 0.005
-    eps_decay = 0.9995
+    eps_decay = 0.998
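+    # eps is reduced towards eps_end after every episode so the agent gradually shifts from exploration to exploitation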
+
+    # And some variables to keep track of the progress
     action_dict = dict()
     final_action_dict = dict()
     scores_window = deque(maxlen=100)
@@ -118,101 +104,60 @@ def main(argv):
     action_prob = [0] * action_size
     agent_obs = [None] * env.get_num_agents()
     agent_next_obs = [None] * env.get_num_agents()
-    observation_radius = 10
-
-    # Initialize the agent
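+    # Buffers hold each agent's last stored observation and action (2 = move forward) for the replay updates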
+    agent_obs_buffer = [None] * env.get_num_agents()
+    agent_action_buffer = [2] * env.get_num_agents()
+    cummulated_reward = np.zeros(env.get_num_agents())
+    update_values = False
+    # Now we load a Dueling Double DQN agent
     agent = Agent(state_size, action_size)
 
-    # Here you can pre-load an agent
-    if False:
-        with path(torch_training.Nets, "avoid_checkpoint500.pth") as file_in:
-            agent.qnetwork_local.load_state_dict(torch.load(file_in))
-
-    # Do training over n_episodes
-    for episodes in range(1, n_episodes + 1):
-        """
-        Training Curriculum: In order to get good generalization we change the number of agents
-        and the size of the levels every 50 episodes.
-        """
-        if episodes % 50 == 1:
-            env = RailEnv(width=x_dim,
-                          height=y_dim,
-                          rail_generator=sparse_rail_generator(num_cities=5,
-                                                               # Number of cities in map (where train stations are)
-                                                               num_intersections=4,
-                                                               # Number of intersections (no start / target)
-                                                               num_trainstations=10,
-                                                               # Number of possible start/targets on map
-                                                               min_node_dist=3,  # Minimal distance of nodes
-                                                               node_radius=2,  # Proximity of stations to city center
-                                                               num_neighb=3,
-                                                               # Number of connections to other cities/intersections
-                                                               seed=15,  # Random seed
-                                                               grid_mode=True,
-                                                               enhance_intersection=False
-                                                               ),
-                          schedule_generator=sparse_schedule_generator(speed_ration_map),
-                          number_of_agents=n_agents,
-                          stochastic_data=stochastic_data,  # Malfunction data generator
-                          obs_builder_object=observation_helper)
-
-            # Adjust the parameters according to the new env.
-            max_steps = int((env.height + env.width))
-            agent_obs = [None] * env.get_num_agents()
-            agent_next_obs = [None] * env.get_num_agents()
+    for trials in range(1, n_trials + 1):
 
         # Reset environment
         obs, info = env.reset(True, True)
-
-        # Setup placeholder for finals observation of a single agent. This is necessary because agents terminate at
-        # different times during an episode
-        final_obs = agent_obs.copy()
-        final_obs_next = agent_next_obs.copy()
-        register_action_state = np.zeros(env.get_num_agents(), dtype=bool)
-
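+        # Reset the renderer to the newly generated episode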
+        env_renderer.reset()
         # Build agent specific observations
         for a in range(env.get_num_agents()):
-            agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
+            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
+            agent_obs_buffer[a] = agent_obs[a].copy()
+
+        # Reset score and done
         score = 0
         env_done = 0
 
         # Run episode
         for step in range(max_steps):
-
             # Action
             for a in range(env.get_num_agents()):
-                if env.agents[a].speed_data['position_fraction'] == 0.:
-                    register_action_state[a] = True
+                if info['action_required'][a]:
+                    # If an action is required, we want to store the obs at that step as well as the action
+                    update_values = True
+                    action = agent.act(agent_obs[a], eps=eps)
+                    action_prob[action] += 1
                 else:
-                    register_action_state[a] = False
-                action = agent.act(agent_obs[a], eps=eps)
-                action_prob[action] += 1
+                    update_values = False
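+                    # No new action is required at this step (e.g. the agent is still moving through a cell), so send DO_NOTHING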
+                    action = 0
                 action_dict.update({a: action})
 
             # Environment step
-            next_obs, all_rewards, done, _ = env.step(action_dict)
-
-            # Build agent specific observations and normalize
-            for a in range(env.get_num_agents()):
-                agent_next_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
-
+            next_obs, all_rewards, done, info = env.step(action_dict)
             # Update replay buffer and train agent
             for a in range(env.get_num_agents()):
-                if done[a]:
-                    final_obs[a] = agent_obs[a].copy()
-                    final_obs_next[a] = agent_next_obs[a].copy()
-                    final_action_dict.update({a: action_dict[a]})
-                if not done[a] and register_action_state[a]:
-                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+                # Only update the agent when it is done or when an action was taken, since only then is relevant information present
+                if update_values or done[a]:
+                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
+                               agent_obs[a], done[a])
+                    cummulated_reward[a] = 0.
+
+                    agent_obs_buffer[a] = agent_obs[a].copy()
+                    agent_action_buffer[a] = action_dict[a]
+                agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)
+
                 score += all_rewards[a] / env.get_num_agents()
 
             # Copy observation
-            agent_obs = agent_next_obs.copy()
-
             if done['__all__']:
                 env_done = 1
-                for a in range(env.get_num_agents()):
-                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
                 break
 
         # Epsilon decay
@@ -223,7 +168,7 @@ def main(argv):
         for _idx in range(env.get_num_agents()):
             if done[_idx] == 1:
                 tasks_finished += 1
-        done_window.append(tasks_finished / env.get_num_agents())
+        done_window.append(tasks_finished / max(1, env.get_num_agents()))
         scores_window.append(score / max_steps)  # save most recent score
         scores.append(np.mean(scores_window))
         dones_list.append((np.mean(done_window)))
@@ -231,23 +176,24 @@ def main(argv):
         print(
             '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                 env.get_num_agents(), x_dim, y_dim,
-                episodes,
+                trials,
                 np.mean(scores_window),
                 100 * np.mean(done_window),
                 eps, action_prob / np.sum(action_prob)), end=" ")
 
-        if episodes % 100 == 0:
+        if trials % 100 == 0:
             print(
-                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-                    env.get_num_agents(),
-                    episodes,
+                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                    env.get_num_agents(), x_dim, y_dim,
+                    trials,
                     np.mean(scores_window),
                     100 * np.mean(done_window),
-                    eps,
-                    action_prob / np.sum(action_prob)))
+                    eps, action_prob / np.sum(action_prob)))
             torch.save(agent.qnetwork_local.state_dict(),
-                       './Nets/avoid_checkpoint' + str(episodes) + '.pth')
+                       './Nets/avoider_checkpoint' + str(trials) + '.pth')
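+            # Reset the action counters (ones rather than zeros keeps np.sum(action_prob) non-zero)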
             action_prob = [1] * action_size
+
+    # Plot overall training progress at the end
     plt.plot(scores)
     plt.show()
 
-- 
GitLab