Compare revisions

Changes are shown as if the source revision were being merged into the target revision.
Showing with 261 additions and 57 deletions

import random
from collections import namedtuple, deque
from collections.abc import Iterable  # Iterable moved out of collections in Python 3.10

import numpy as np
import torch

Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, device):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device

    def add(self, state, action, reward, next_state, done, action_prob=0.0):
        """Add a new experience to memory."""
        e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
            .float().to(self.device)
        actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
            .long().to(self.device)
        rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
            .float().to(self.device)
        next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
            .float().to(self.device)
        dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
            .float().to(self.device)
        action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
            .float().to(self.device)

        return states, actions, rewards, next_states, dones, action_probs

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def __v_stack_impr(self, states):
        """Stack a list of entries into a 2-D numpy array of shape (batch, sub_dim)."""
        sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
        np_states = np.reshape(np.array(states), (len(states), sub_dim))
        return np_states
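
A minimal usage sketch (editor's addition, not part of the diff): it relies only on the ReplayBuffer class above, and the state size and action count below are illustrative placeholders.

import numpy as np
import torch

# Assumed sizes for illustration only.
STATE_SIZE = 231
ACTION_SIZE = 5

buffer = ReplayBuffer(action_size=ACTION_SIZE, buffer_size=1000, batch_size=64, device=torch.device("cpu"))

state = np.zeros(STATE_SIZE, dtype=np.float32)
for _ in range(128):  # fill past batch_size so sample() has enough entries
    action = np.random.randint(ACTION_SIZE)
    next_state = np.random.rand(STATE_SIZE).astype(np.float32)
    buffer.add(state, action, 0.0, next_state, False)
    state = next_state

states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape, actions.shape)  # torch.Size([64, 231]) torch.Size([64, 1])
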
from collections import deque
from collections import namedtuple

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy

dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every',
                                            'learning_rate', 'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
                             buffer_size=1000,
                             batch_size=64,
                             update_every=10,
                             learning_rate=1.e-3,
                             tau=1.e-2,
                             gamma=0.95,
                             buffer_min_size=0,
                             use_gpu=False)


def cartpole(use_dddqn=False):
    eps = 1.0
    eps_decay = 0.99
    min_eps = 0.01
    training_mode = True

    env = gym.make("CartPole-v1")
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    if not use_dddqn:
        policy = PPOPolicy(observation_space, action_space, False)
    else:
        policy = DDDQNPolicy(observation_space, action_space, dddqn_param)

    episode = 0
    checkpoint_interval = 20
    scores_window = deque(maxlen=100)
    writer = SummaryWriter()

    while True:
        episode += 1
        state = env.reset()
        policy.reset(env)
        handle = 0
        tot_reward = 0

        policy.start_episode(train=training_mode)
        while True:
            # env.render()
            policy.start_step(train=training_mode)
            action = policy.act(handle, state, eps)
            state_next, reward, terminal, info = env.step(action)
            policy.end_step(train=training_mode)
            tot_reward += reward
            # reward = reward if not terminal else -reward
            reward = 0 if not terminal else -1
            policy.step(handle, state, action, reward, state_next, terminal)
            state = np.copy(state_next)
            if terminal:
                break
        policy.end_episode(train=training_mode)

        eps = max(min_eps, eps * eps_decay)
        scores_window.append(tot_reward)

        end = "\n" if episode % checkpoint_interval == 0 else " "
        print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
            episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=end)

        writer.add_scalar("CartPole/value", tot_reward, episode)
        writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
        writer.flush()


if __name__ == "__main__":
    cartpole()
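
The harness above drives the policy through a fixed set of lifecycle hooks. Below is a minimal stub of that interface (editor's sketch, not code from the repo; the real implementations are DDDQNPolicy and PPOPolicy), useful when plugging a new policy into this loop.

import random
from collections import deque


class RandomPolicyStub:
    """Illustrative stub of the interface cartpole() expects; acts uniformly at random."""

    def __init__(self, state_size, action_size):
        self.action_size = action_size
        self.memory = deque(maxlen=1000)  # cartpole() reports len(policy.memory)

    def reset(self, env):
        pass

    def start_episode(self, train):
        pass

    def start_step(self, train):
        pass

    def act(self, handle, state, eps=0.0):
        return random.randint(0, self.action_size - 1)

    def step(self, handle, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def end_step(self, train):
        pass

    def end_episode(self, train):
        pass
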
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
         if done[a]:
             acting_agent += 1
         if a == acting_agent:
-            action = policy.act(obs[a])
+            action = policy.act(a, obs[a])
         else:
             action = 4
         action_dict.update({a: action})
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -66,7 +66,7 @@ for trials in range(1, n_episodes + 1):
         if done[a]:
             acting_agent += 1
         if a == acting_agent:
-            action = policy.act(obs[a])
+            action = policy.act(a, obs[a])
         else:
             action = 4
         action_dict.update({a: action})
@@ -123,7 +123,8 @@ def train_agent(n_episodes):
         # Build agent specific observations
         for agent in env.get_agent_handles():
             if obs[agent]:
-                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
+                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
+                                                         observation_radius=observation_radius)
                 agent_prev_obs[agent] = agent_obs[agent].copy()

         # Run episode
@@ -132,7 +133,7 @@ def train_agent(n_episodes):
                 if info['action_required'][agent]:
                     # If an action is required, we want to store the obs at that step as well as the action
                     update_values = True
-                    action = policy.act(agent_obs[agent], eps=eps_start)
+                    action = policy.act(agent, agent_obs[agent], eps=eps_start)
                     action_count[action] += 1
                 else:
                     update_values = False
@@ -154,7 +155,8 @@ def train_agent(n_episodes):
                     agent_prev_action[agent] = action_dict[agent]

                 if next_obs[agent]:
-                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
+                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
+                                                             observation_radius=10)

                 score += all_rewards[agent]
@@ -179,15 +181,16 @@ def train_agent(n_episodes):
         else:
             end = " "

-        print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
-            env.get_num_agents(),
-            x_dim, y_dim,
-            episode_idx,
-            np.mean(scores_window),
-            100 * np.mean(completion_window),
-            eps_start,
-            action_probs
-        ), end=end)
+        print(
+            '\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+                env.get_num_agents(),
+                x_dim, y_dim,
+                episode_idx,
+                np.mean(scores_window),
+                100 * np.mean(completion_window),
+                eps_start,
+                action_probs
+            ), end=end)

     # Plot overall training progress at the end
     plt.plot(scores)
@@ -199,7 +202,8 @@ def train_agent(n_episodes):
 if __name__ == "__main__":
     parser = ArgumentParser()
-    parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
+    parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
+                        type=int)
     args = parser.parse_args()

     train_agent(args.n_episodes)
'''
I did experiments in an early submission. Please note that the epsilon used at
evaluation time can have an effect on the outcome:

DDDQNPolicy experiments - EPSILON impact analysis
----------------------------------------------------------------------------------------
checkpoint = "./checkpoints/201124171810-7800.pth"  # Training on AGENTS=10 with Depth=2
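
For context: the eps/EPSILON value threaded through policy.act is the exploration rate of an epsilon-greedy rule, so a non-zero value at evaluation time injects random actions and can lower the score. A minimal sketch of that selection rule (illustrative only, not the repo's exact implementation):

import random

import numpy as np


def epsilon_greedy_act(q_values, eps):
    # With probability eps take a random action, otherwise the greedy (argmax) one.
    if random.random() < eps:
        return random.randint(0, len(q_values) - 1)
    return int(np.argmax(q_values))


# eps=0.01 makes roughly 1 in 100 decisions random; eps=0.0 is fully greedy.
print(epsilon_greedy_act([0.1, 0.5, 0.2, 0.05, 0.15], eps=0.01))
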
@@ -25,12 +27,17 @@ from pathlib import Path
import numpy as np
from flatland.core.env_observation_builder import DummyObservationBuilder
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.evaluators.client import FlatlandRemoteClient
from flatland.evaluators.client import TimeoutException
from reinforcement_learning.ppo_agent import PPOAgent
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.deadlockavoidance_with_decision_agent import DeadLockAvoidanceWithDecisionAgent
from reinforcement_learning.multi_decision_agent import MultiDecisionAgent
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.agent_action_config import get_action_size, map_actions, set_action_size_reduced, set_action_size_full
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
from utils.deadlock_check import check_if_all_blocked
from utils.fast_tree_obs import FastTreeObs
@@ -39,33 +46,71 @@ from utils.observation_utils import normalize_observation
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.dddqn_policy import DDDQNPolicy
####################################################
# EVALUATION PARAMETERS
set_action_size_full()
# Print per-step logs
VERBOSE = True
USE_FAST_TREEOBS = True
USE_PPO_AGENT = False
# Checkpoint to use (remember to push it!)
checkpoint = "./checkpoints/201124171810-7800.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201126150143-5200.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201126160144-2000.pth" # DDDQN: 18.249244799876152 DEPTH=2 AGENTS=10
checkpoint = "./checkpoints/201207144650-20000.pth" # PPO: 14.45790721540786
checkpoint = "./checkpoints/201211063511-6300.pth" # DDDQN: 16.948349308440857
checkpoint = "./checkpoints/201211095604-12000.pth" # DDDQN: 17.3862941316504
checkpoint = "./checkpoints/201211164554-8900.pth" # DDDQN: 17.44397192482364
EPSILON = 0.01
if False:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116591 adrian_egli
    # graded 71.305 0.633 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/51
    # Fri, 22 Jan 2021 23:37:56
    set_action_size_reduced()
    load_policy = "DDDQN"
    checkpoint = "./checkpoints/210122120236-3000.pth"  # 17.011131341978228
    EPSILON = 0.0

if False:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116658 adrian_egli
    # graded 73.821 0.655 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/52
    # Sat, 23 Jan 2021 07:41:35
    set_action_size_reduced()
    load_policy = "PPO"
    checkpoint = "./checkpoints/210122235754-5000.pth"  # 16.00113400887389
    EPSILON = 0.0

if True:
    # -------------------------------------------------------------------------------------------------------
    # RL solution
    # -------------------------------------------------------------------------------------------------------
    # 116659 adrian_egli
    # graded 80.579 0.715 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/53
    # Sat, 23 Jan 2021 07:45:49
    set_action_size_reduced()
    load_policy = "DDDQN"
    checkpoint = "./checkpoints/210122165109-5000.pth"  # 17.993750197899438
    EPSILON = 0.0
if False:
    # -------------------------------------------------------------------------------------------------------
    # !! This is not an RL solution !!
    # -------------------------------------------------------------------------------------------------------
    # 116727 adrian_egli
    # graded 106.786 0.768 RL Successfully Graded ! More details about this submission can be found at:
    # http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/54
    # Sat, 23 Jan 2021 14:31:50
    set_action_size_reduced()
    load_policy = "DeadLockAvoidance"
    checkpoint = None
    EPSILON = 0.0
# Use last action cache
USE_ACTION_CACHE = False
USE_DEAD_LOCK_AVOIDANCE_AGENT = False # 21.54485505223213
# Observation parameters (must match training parameters!)
observation_tree_depth = 1
observation_tree_depth = 2
observation_radius = 10
observation_max_path_depth = 30
@@ -101,15 +146,6 @@ else:
     n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
     state_size = n_features_per_node * n_nodes

-action_size = 5
-
-# Creates the policy. No GPU on evaluation server.
-if not USE_PPO_AGENT:
-    policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
-else:
-    policy = PPOAgent(state_size, action_size)
-policy.load(checkpoint)

 #####################################################################
 # Main evaluation loop
 #####################################################################
@@ -143,6 +179,27 @@ while True:
     tree_observation.set_env(local_env)
     tree_observation.reset()

+    # Creates the policy. No GPU on evaluation server.
+    if load_policy == "DDDQN":
+        policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
+    elif load_policy == "PPO":
+        policy = PPOPolicy(state_size, get_action_size())
+    elif load_policy == "DeadLockAvoidance":
+        policy = DeadLockAvoidanceAgent(local_env, get_action_size(), enable_eps=False)
+    elif load_policy == "DeadLockAvoidanceWithDecision":
+        # inter_policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False, in_parameters=train_params)
+        inter_policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
+        policy = DeadLockAvoidanceWithDecisionAgent(local_env, state_size, get_action_size(), inter_policy)
+    elif load_policy == "MultiDecision":
+        policy = MultiDecisionAgent(state_size, get_action_size(), Namespace(**{'use_gpu': False}))
+    else:
+        policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False,
+                           in_parameters=Namespace(**{'use_gpu': False}))
+
+    policy.load(checkpoint)
+    policy.reset(local_env)

     observation = tree_observation.get_many(list(range(nb_agents)))

     print("Evaluation {}: {} agents in {}x{}".format(evaluation_number, nb_agents, local_env.width, local_env.height))
@@ -162,9 +219,6 @@ while True:
     agent_last_action = {}
     nb_hit = 0

-    if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-        policy = DeadLockAvoidanceAgent(local_env, action_size)
-
     policy.start_episode(train=False)
     while True:
         try:
@@ -179,14 +233,7 @@ while True:
            time_start = time.time()
            action_dict = {}
            policy.start_step(train=False)
-
-           if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-               observation = np.zeros((local_env.get_num_agents(), 2))
            for agent_handle in range(nb_agents):
-               if USE_DEAD_LOCK_AVOIDANCE_AGENT:
-                   observation[agent_handle][0] = agent_handle
-                   observation[agent_handle][1] = steps
-
                if info['action_required'][agent_handle]:
                    if agent_handle in agent_last_obs and np.all(
                            agent_last_obs[agent_handle] == observation[agent_handle]):
@@ -198,7 +245,7 @@ while True:
                                                                       observation_tree_depth,
                                                                       observation_radius=observation_radius)

-                       action = policy.act(normalized_observation, eps=EPSILON)
+                       action = policy.act(agent_handle, normalized_observation, eps=EPSILON)

                       action_dict[agent_handle] = action
@@ -211,7 +258,7 @@ while True:
            time_taken_by_controller.append(agent_time)

            time_start = time.time()
-           _, all_rewards, done, info = remote_client.env_step(action_dict)
+           _, all_rewards, done, info = remote_client.env_step(map_actions(action_dict))
            step_time = time.time() - time_start
            time_taken_per_step.append(step_time)
@@ -228,7 +275,11 @@ while True:
            step_time = time.time() - time_start
            time_taken_per_step.append(step_time)

-           nb_agents_done = sum(done[idx] for idx in local_env.get_agent_handles())
+           nb_agents_done = 0
+           for i_agent, agent in enumerate(local_env.agents):
+               # manage the boolean flag to check if all agents are indeed done (or done_removed)
+               if agent.status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]:
+                   nb_agents_done += 1

            if VERBOSE or done['__all__']:
                print(
runs_bench/Screenshots/full.png (139 KiB)
runs_bench/Screenshots/reduced.png (178 KiB)