Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing 967 additions and 471 deletions
@@ -15,7 +15,7 @@ class OrderedPolicy(Policy):
def __init__(self):
self.action_size = 5
def act(self, state, eps=0.):
def act(self, handle, state, eps=0.):
_, distance, _ = split_tree_into_feature_groups(state, 1)
distance = distance[1:]
min_dist = min_gt(distance, 0)
......
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, state, eps=0.):
raise NotImplementedError
def save(self, filename):
raise NotImplementedError
def load(self, filename):
raise NotImplementedError
def start_step(self):
pass
def end_step(self):
pass
def load_replay_buffer(self, filename):
pass
def test(self):
pass
def reset(self):
pass
\ No newline at end of file
from flatland.envs.rail_env import RailEnv
class DummyMemory:
def __init__(self):
self.memory = []
def __len__(self):
return 0
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, handle, state, eps=0.):
raise NotImplementedError
def save(self, filename):
raise NotImplementedError
def load(self, filename):
raise NotImplementedError
def start_step(self, train):
pass
def end_step(self, train):
pass
def start_episode(self, train):
pass
def end_episode(self, train):
pass
def load_replay_buffer(self, filename):
pass
def test(self):
pass
def reset(self, env: RailEnv):
pass
def clone(self):
return self
class HeuristicPolicy(Policy):
def __init__(self):
        super(HeuristicPolicy, self).__init__()
class LearningPolicy(Policy):
def __init__(self):
        super(LearningPolicy, self).__init__()
class HybridPolicy(Policy):
def __init__(self):
        super(HybridPolicy, self).__init__()
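For orientation, a policy targeting the updated interface only needs to implement act/step/save/load; a minimal sketch (the RandomPolicy name and the action count are illustrative, not part of this diff):

import numpy as np

from reinforcement_learning.policy import LearningPolicy


class RandomPolicy(LearningPolicy):
    """Hypothetical example: uniformly random actions, no learning."""

    def __init__(self, action_size=5):
        super(RandomPolicy, self).__init__()
        self.action_size = action_size

    def act(self, handle, state, eps=0.):
        # New signature: the agent handle is passed alongside its observation.
        return np.random.choice(self.action_size)

    def step(self, handle, state, action, reward, next_state, done):
        pass  # nothing to learn

    def save(self, filename):
        pass

    def load(self, filename):
        pass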
import torch.nn as nn
import torch.nn.functional as F
class PolicyNetwork(nn.Module):
def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128, hidsize3=32):
super().__init__()
self.fc1 = nn.Linear(state_size, hidsize1)
self.fc2 = nn.Linear(hidsize1, hidsize2)
# self.fc3 = nn.Linear(hidsize2, hidsize3)
self.output = nn.Linear(hidsize2, action_size)
self.softmax = nn.Softmax(dim=1)
self.bn0 = nn.BatchNorm1d(state_size, affine=False)
def forward(self, inputs):
x = self.bn0(inputs.float())
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
return self.softmax(self.output(x))
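A quick smoke test of this network; the 11-dimensional state size is only an assumption, and eval() is needed so that BatchNorm1d accepts a single sample:

import torch

net = PolicyNetwork(state_size=11, action_size=5)
net.eval()  # BatchNorm1d needs eval mode (or a batch larger than 1)
with torch.no_grad():
    probs = net(torch.rand(1, 11))
print(probs.shape, probs.sum().item())  # torch.Size([1, 5]), ~1.0 (softmax output)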
import os
import numpy as np
import torch
from torch.distributions.categorical import Categorical
from reinforcement_learning.policy import Policy
from reinforcement_learning.ppo.model import PolicyNetwork
from reinforcement_learning.ppo.replay_memory import Episode, ReplayBuffer
BUFFER_SIZE = 128_000
BATCH_SIZE = 8192
GAMMA = 0.95
LR = 0.5e-4
CLIP_FACTOR = .005
UPDATE_EVERY = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
class PPOAgent(Policy):
def __init__(self, state_size, action_size, num_agents):
self.action_size = action_size
self.state_size = state_size
self.num_agents = num_agents
self.policy = PolicyNetwork(state_size, action_size).to(device)
self.old_policy = PolicyNetwork(state_size, action_size).to(device)
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
self.episodes = [Episode() for _ in range(num_agents)]
self.memory = ReplayBuffer(BUFFER_SIZE)
self.t_step = 0
self.loss = 0
def reset(self):
self.finished = [False] * len(self.episodes)
self.tot_reward = [0] * self.num_agents
# Decide on an action to take in the environment
def act(self, state, eps=None):
if eps is not None:
# Epsilon-greedy action selection
if np.random.random() < eps:
return np.random.choice(np.arange(self.action_size))
self.policy.eval()
with torch.no_grad():
output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
ret = Categorical(output).sample().item()
return ret
# Record the results of the agent's action and update the model
def step(self, handle, state, action, reward, next_state, done):
if not self.finished[handle]:
# Push experience into Episode memory
self.tot_reward[handle] += reward
if done == 1:
reward = 1 # self.tot_reward[handle]
else:
reward = 0
self.episodes[handle].push(state, action, reward, next_state, done)
# When we finish the episode, discount rewards and push the experience into replay memory
if done:
self.episodes[handle].discount_rewards(GAMMA)
self.memory.push_episode(self.episodes[handle])
self.episodes[handle].reset()
self.finished[handle] = True
# Perform a gradient update every UPDATE_EVERY time steps
self.t_step = (self.t_step + 1) % UPDATE_EVERY
if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
self._learn(*self.memory.sample(BATCH_SIZE, device))
def _clip_gradient(self, model, clip):
for p in model.parameters():
p.grad.data.clamp_(-clip, clip)
        return
        # NOTE: everything below this early return is unreachable; it is kept only as a
        # reference implementation of gradient-norm clipping.
        """Computes a gradient clipping coefficient based on gradient norm."""
totalnorm = 0
for p in model.parameters():
if p.grad is not None:
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = np.sqrt(totalnorm)
coeff = min(1, clip / (totalnorm + 1e-6))
for p in model.parameters():
if p.grad is not None:
p.grad.mul_(coeff)
def _learn(self, states, actions, rewards, next_state, done):
self.policy.train()
responsible_outputs = torch.gather(self.policy(states), 1, actions)
old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()
# rewards = rewards - rewards.mean()
ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()
self.loss = loss
# Compute loss and perform a gradient step
self.old_policy.load_state_dict(self.policy.state_dict())
self.optimizer.zero_grad()
loss.backward()
# self._clip_gradient(self.policy, 1.0)
self.optimizer.step()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.policy.state_dict(), filename + ".policy")
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def load(self, filename):
print("load policy from file", filename)
if os.path.exists(filename + ".policy"):
print(' >> ', filename + ".policy")
try:
self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
            except Exception:
print(" >> failed!")
pass
if os.path.exists(filename + ".optimizer"):
print(' >> ', filename + ".optimizer")
try:
self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
            except Exception:
print(" >> failed!")
pass
import torch
import random
import numpy as np
from collections import namedtuple, deque
from collections.abc import Iterable
Transition = namedtuple("Experience", ("state", "action", "reward", "next_state", "done"))
class Episode:
    def __init__(self):
        # Per-instance memory (a class-level list would be shared across all episodes)
        self.memory = []
def reset(self):
self.memory = []
def push(self, *args):
self.memory.append(tuple(args))
def discount_rewards(self, gamma):
running_add = 0.
for i, (state, action, reward, *rest) in list(enumerate(self.memory))[::-1]:
running_add = running_add * gamma + reward
self.memory[i] = (state, action, running_add, *rest)
class ReplayBuffer:
def __init__(self, buffer_size):
self.memory = deque(maxlen=buffer_size)
def push(self, state, action, reward, next_state, done):
self.memory.append(Transition(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done))
def push_episode(self, episode):
for step in episode.memory:
self.push(*step)
def sample(self, batch_size, device):
experiences = random.sample(self.memory, k=batch_size)
states = torch.from_numpy(self.stack([e.state for e in experiences])).float().to(device)
actions = torch.from_numpy(self.stack([e.action for e in experiences])).long().to(device)
rewards = torch.from_numpy(self.stack([e.reward for e in experiences])).float().to(device)
next_states = torch.from_numpy(self.stack([e.next_state for e in experiences])).float().to(device)
dones = torch.from_numpy(self.stack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
return states, actions, rewards, next_states, dones
def stack(self, states):
sub_dims = states[0].shape[1:] if isinstance(states[0], Iterable) else [1]
return np.reshape(np.array(states), (len(states), *sub_dims))
def __len__(self):
return len(self.memory)
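As a sanity check of the backward discounting above, a small worked example (toy rewards, not from the training code): with gamma = 0.95 and per-step rewards [0, 0, 1], the rewritten rewards are [0.9025, 0.95, 1.0].

episode = Episode()
episode.reset()  # start from an empty transition list
for reward in [0, 0, 1]:
    # state/next_state are irrelevant for the discounting itself
    episode.push(None, 0, reward, None, False)
episode.discount_rewards(0.95)
print([round(r, 4) for (_, _, r, *_rest) in episode.memory])  # [0.9025, 0.95, 1.0]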
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
def __init__(self):
self.reset()
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def reset(self):
self.memory = {}
def get_transitions(self, handle):
return self.memory.get(handle, [])
def push_transition(self, handle, transition):
transitions = self.get_transitions(handle)
transitions.append(transition)
self.memory.update({handle: transitions})
class ActorCriticModel(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(ActorCriticModel, self).__init__()
self.device = device
self.actor = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
self.critic = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, x):
raise NotImplementedError
def get_actor_dist(self, state):
action_probs = self.actor(state)
dist = Categorical(action_probs)
return dist
def evaluate(self, states, actions):
action_probs = self.actor(states)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(actions)
dist_entropy = dist.entropy()
state_value = self.critic(states)
return action_logprobs, torch.squeeze(state_value), dist_entropy
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.actor.state_dict(), filename + ".actor")
torch.save(self.critic.state_dict(), filename + ".value")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception:
print(" >> failed!")
return obj
def load(self, filename):
print("load model from file", filename)
self.actor = self._load(self.actor, filename + ".actor")
self.critic = self._load(self.critic, filename + ".value")
class PPOPolicy(LearningPolicy):
def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
print(">> PPOPolicy")
super(PPOPolicy, self).__init__()
# parameters
        # Keep the sizes so clone() can rebuild an identical policy
        self.state_size = state_size
        self.action_size = action_size
        self.ppo_parameters = in_parameters
if self.ppo_parameters is not None:
self.hidsize = self.ppo_parameters.hidden_size
self.buffer_size = self.ppo_parameters.buffer_size
self.batch_size = self.ppo_parameters.batch_size
self.learning_rate = self.ppo_parameters.learning_rate
self.gamma = self.ppo_parameters.gamma
# Device
if self.ppo_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 1.0e-3
self.gamma = 0.95
self.buffer_size = 32_000
self.batch_size = 1024
self.device = torch.device("cpu")
self.surrogate_eps_clip = 0.1
self.K_epoch = 10
self.weight_loss = 0.5
self.weight_entropy = 0.01
self.buffer_min_size = 0
self.use_replay_buffer = use_replay_buffer
self.current_episode_memory = EpisodeBuffers()
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.loss = 0
        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
self.loss_function = nn.MSELoss() # nn.SmoothL1Loss()
def reset(self, env):
pass
def act(self, handle, state, eps=None):
        # Sample an action from the current policy distribution
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor_critic_model.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist = self.actor_critic_model.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
def _push_transitions_to_replay_buffer(self,
state_list,
action_list,
reward_list,
state_next_list,
done_list,
prob_a_list):
for idx in range(len(reward_list)):
state_i = state_list[idx]
action_i = action_list[idx]
reward_i = reward_list[idx]
state_next_i = state_next_list[idx]
done_i = done_list[idx]
prob_action_i = prob_a_list[idx]
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
def _convert_transitions_to_torch_tensors(self, transitions_array):
        # build empty lists (arrays)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
if done_i:
discounted_reward = 0
done_list.insert(0, 1)
else:
done_list.insert(0, 0)
discounted_reward = reward_i + self.gamma * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
if self.use_replay_buffer:
self._push_transitions_to_replay_buffer(state_list, action_list,
reward_list, state_next_list,
done_list, prob_a_list)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
actions = torch.squeeze(actions)
rewards = torch.squeeze(rewards)
states_next = torch.squeeze(states_next)
dones = torch.squeeze(dones)
probs_action = torch.squeeze(probs_action)
return states, actions, rewards, states_next, dones, probs_action
def train_net(self):
        # Each agent propagates the experience it collected during the past episode
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
                # Convert the episode history into torch tensors
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history)
# Optimize policy for K epochs:
for k_loop in range(int(self.K_epoch)):
if self.use_replay_buffer:
states, actions, rewards, states_next, dones, probs_action = \
self._get_transitions_from_replay_buffer(
states, actions, rewards, states_next, dones, probs_action
)
# Evaluating actions (actor) and values (critic)
logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
# Finding the ratios (pi_thetas / pi_thetas_replayed):
ratios = torch.exp(logprobs - probs_action.detach())
                    # Find the surrogate loss
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
                    # The clipped surrogate term estimates the policy gradient; the entropy bonus
                    # discourages the policy from becoming deterministic too early, which would
                    # flatten the gradient and stall learning.
loss = \
-torch.min(surr1, surr2) \
+ self.weight_loss * self.loss_function(state_values, rewards) \
- self.weight_entropy * dist_entropy
# Make a gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
                    # Store the current loss on the policy for debugging/monitoring purposes only
self.loss = loss.mean().detach().cpu().numpy()
        # Reset all collected transition data
self.current_episode_memory.reset()
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.actor_critic_model.save(filename)
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception:
print(" >> failed!")
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.actor_critic_model.load(filename)
print("load optimizer from file", filename)
self.optimizer = self._load(self.optimizer, filename + ".optimizer")
def clone(self):
policy = PPOPolicy(self.state_size, self.action_size)
policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
policy.optimizer = copy.deepcopy(self.optimizer)
        return policy
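To make the clipped-surrogate comments in train_net concrete, a standalone numeric sketch (toy log-probabilities and advantages, unrelated to Flatland):

import torch

logprobs = torch.tensor([-0.9, -1.2, -0.4])       # log pi_theta(a|s) under the current policy
old_logprobs = torch.tensor([-1.0, -1.0, -1.0])   # log-probs recorded when the transitions were collected
advantages = torch.tensor([1.0, -0.5, 2.0])
eps_clip = 0.1

ratios = torch.exp(logprobs - old_logprobs)                           # pi_theta / pi_theta_old
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1.0 - eps_clip, 1.0 + eps_clip) * advantages
policy_loss = -torch.min(surr1, surr2).mean()                         # policy part of the PPO loss
print(policy_loss.item())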
import random
from collections import namedtuple, deque
from collections.abc import Iterable
import numpy as np
import torch
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, device):
"""Initialize a ReplayBuffer object.
Params
======
action_size (int): dimension of each action
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
def add(self, state, action, reward, next_state, done, action_prob=0.0):
"""Add a new experience to memory."""
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
.float().to(self.device)
actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
.long().to(self.device)
rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
.float().to(self.device)
next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
.float().to(self.device)
dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
.float().to(self.device)
action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
.float().to(self.device)
return states, actions, rewards, next_states, dones, action_probs
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def __v_stack_impr(self, states):
sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
np_states = np.reshape(np.array(states), (len(states), sub_dim))
return np_states
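A minimal usage sketch of this buffer (the 11-dimensional observations are an assumption for illustration):

import numpy as np
import torch

buffer = ReplayBuffer(action_size=5, buffer_size=1000, batch_size=4, device=torch.device("cpu"))
for _ in range(10):
    state = np.random.rand(11).astype(np.float32)
    next_state = np.random.rand(11).astype(np.float32)
    buffer.add(state, action=2, reward=0.0, next_state=next_state, done=False, action_prob=0.2)

states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape, actions.shape)  # torch.Size([4, 11]) torch.Size([4, 1])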
from collections import deque
from collections import namedtuple
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
buffer_size=1000,
batch_size=64,
update_every=10,
learning_rate=1.e-3,
tau=1.e-2,
gamma=0.95,
buffer_min_size=0,
use_gpu=False)
def cartpole(use_dddqn=False):
eps = 1.0
eps_decay = 0.99
min_eps = 0.01
training_mode = True
env = gym.make("CartPole-v1")
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
if not use_dddqn:
policy = PPOPolicy(observation_space, action_space, False)
else:
policy = DDDQNPolicy(observation_space, action_space, dddqn_param)
episode = 0
checkpoint_interval = 20
scores_window = deque(maxlen=100)
writer = SummaryWriter()
while True:
episode += 1
state = env.reset()
policy.reset(env)
handle = 0
tot_reward = 0
policy.start_episode(train=training_mode)
while True:
# env.render()
policy.start_step(train=training_mode)
action = policy.act(handle, state, eps)
state_next, reward, terminal, info = env.step(action)
policy.end_step(train=training_mode)
tot_reward += reward
# reward = reward if not terminal else -reward
reward = 0 if not terminal else -1
policy.step(handle, state, action, reward, state_next, terminal)
state = np.copy(state_next)
if terminal:
break
policy.end_episode(train=training_mode)
eps = max(min_eps, eps * eps_decay)
scores_window.append(tot_reward)
        if episode % checkpoint_interval == 0:
            print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
                episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)))
        else:
            print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
                episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=" ")
writer.add_scalar("CartPole/value", tot_reward, episode)
writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
writer.flush()
if __name__ == "__main__":
cartpole()
import sys
from pathlib import Path

import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool

base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
import sys
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.ordered_policy import OrderedPolicy
np.random.seed(2)
x_dim = 20 # np.random.randint(8, 20)
y_dim = 20 # np.random.randint(8, 20)
n_agents = 10 # np.random.randint(3, 8)
n_goals = n_agents + np.random.randint(0, 3)
min_dist = int(0.75 * min(x_dim, y_dim))
env = RailEnv(width=x_dim,
height=y_dim,
rail_generator=complex_rail_generator(
nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
max_dist=99999,
seed=0
),
schedule_generator=complex_schedule_generator(),
obs_builder_object=TreeObsForRailEnv(max_depth=1, predictor=ShortestPathPredictorForRailEnv()),
number_of_agents=n_agents)
env.reset(True, True)
tree_depth = 1
observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())
env_renderer = RenderTool(env, gl="PGL", )
handle = env.get_agent_handles()
n_episodes = 1
max_steps = 100 * (env.height + env.width)
record_images = False
policy = OrderedPolicy()
action_dict = dict()
for trials in range(1, n_episodes + 1):
# Reset environment
obs, info = env.reset(True, True)
done = env.dones
env_renderer.reset()
frame_step = 0
# Run episode
for step in range(max_steps):
env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
if record_images:
env_renderer.gl.save_image("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
frame_step += 1
# Action
acting_agent = 0
for a in range(env.get_num_agents()):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
# Environment step
obs, all_rewards, done, _ = env.step(action_dict)
if done['__all__']:
break
@@ -123,7 +123,8 @@ def train_agent(n_episodes):
# Build agent specific observations
for agent in env.get_agent_handles():
if obs[agent]:
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
observation_radius=observation_radius)
agent_prev_obs[agent] = agent_obs[agent].copy()
# Run episode
@@ -132,7 +133,7 @@ def train_agent(n_episodes):
if info['action_required'][agent]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
action = policy.act(agent_obs[agent], eps=eps_start)
action = policy.act(agent, agent_obs[agent], eps=eps_start)
action_count[action] += 1
else:
update_values = False
@@ -154,7 +155,8 @@ def train_agent(n_episodes):
agent_prev_action[agent] = action_dict[agent]
if next_obs[agent]:
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
observation_radius=10)
score += all_rewards[agent]
@@ -179,15 +181,16 @@ def train_agent(n_episodes):
else:
end = " "
print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
print(
'\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
# Plot overall training progress at the end
plt.plot(scores)
@@ -199,7 +202,8 @@ def train_agent(n_episodes):
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
type=int)
args = parser.parse_args()
train_agent(args.n_episodes)
This diff is collapsed.
#!/bin/bash
# manually install submodules.
python ./run.py
from time import time
import numpy as np
from flatland.envs.rail_env import fast_isclose
def print_timing(label, start_time, end_time):
print("{:>10.4f}ms".format(1000 * (end_time - start_time)) + "\t" + label)
def check_isclose(nbr=100000):
s = time()
for x in range(nbr):
fast_isclose(x, 0.0, rtol=1e-03)
e = time()
print_timing("fast_isclose", start_time=s, end_time=e)
s = time()
for x in range(nbr):
np.isclose(x, 0.0, rtol=1e-03)
e = time()
print_timing("np.isclose", start_time=s, end_time=e)
if __name__ == "__main__":
check_isclose()