
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing changes with 663 additions and 268 deletions
from flatland.envs.rail_env import RailEnv
class DummyMemory:
def __init__(self):
self.memory = []
def __len__(self):
return 0
class Policy:
def step(self, handle, state, action, reward, next_state, done):
raise NotImplementedError
def act(self, state, eps=0.):
def act(self, handle, state, eps=0.):
raise NotImplementedError
def save(self, filename):
......@@ -11,10 +22,16 @@ class Policy:
def load(self, filename):
raise NotImplementedError
def start_step(self):
def start_step(self, train):
pass
def end_step(self, train):
pass
def end_step(self):
def start_episode(self, train):
pass
def end_episode(self, train):
pass
def load_replay_buffer(self, filename):
......@@ -23,8 +40,23 @@ class Policy:
def test(self):
pass
def reset(self):
def reset(self, env: RailEnv):
pass
def clone(self):
return self
\ No newline at end of file
return self
class HeuristicPolicy(Policy):
def __init__(self):
super(HeuristicPolicy, self).__init__()
class LearningPolicy(Policy):
def __init__(self):
super(LearningPolicy, self).__init__()
class HybridPolicy(Policy):
def __init__(self):
super(HybridPolicy, self).__init__()
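# Added for illustration (not part of this diff): a minimal sketch of a custom Policy subclass.
# The action value 2 is assumed to correspond to RailEnvActions.MOVE_FORWARD.
class AlwaysForwardPolicy(HeuristicPolicy):
    def __init__(self):
        super(AlwaysForwardPolicy, self).__init__()

    def act(self, handle, state, eps=0.):
        return 2  # always drive forward

    def step(self, handle, state, action, reward, next_state, done):
        pass  # heuristic: nothing to learn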
import torch.nn as nn
import torch.nn.functional as F
class PolicyNetwork(nn.Module):
def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128, hidsize3=32):
super().__init__()
self.fc1 = nn.Linear(state_size, hidsize1)
self.fc2 = nn.Linear(hidsize1, hidsize2)
# self.fc3 = nn.Linear(hidsize2, hidsize3)
self.output = nn.Linear(hidsize2, action_size)
self.softmax = nn.Softmax(dim=1)
self.bn0 = nn.BatchNorm1d(state_size, affine=False)
def forward(self, inputs):
x = self.bn0(inputs.float())
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# x = F.relu(self.fc3(x))
return self.softmax(self.output(x))
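# Added for illustration (not part of this diff): a minimal sketch of how PolicyNetwork can be
# used to sample an action, mirroring what PPOAgent.act below does. The sizes are assumptions.
import torch
from torch.distributions.categorical import Categorical

state_size, action_size = 231, 5          # illustrative sizes only
net = PolicyNetwork(state_size, action_size)
net.eval()                                # BatchNorm1d needs eval mode for a single sample
with torch.no_grad():
    dummy_state = torch.randn(1, state_size)           # one observation, batch dimension of 1
    action_probs = net(dummy_state)                     # softmax over actions, shape (1, action_size)
    action = Categorical(action_probs).sample().item()
print("sampled action:", action)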
import os
import numpy as np
import torch
from torch.distributions.categorical import Categorical
from reinforcement_learning.policy import Policy
from reinforcement_learning.ppo.model import PolicyNetwork
from reinforcement_learning.ppo.replay_memory import Episode, ReplayBuffer
BUFFER_SIZE = 128_000
BATCH_SIZE = 8192
GAMMA = 0.95
LR = 0.5e-4
CLIP_FACTOR = .005
UPDATE_EVERY = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
class PPOAgent(Policy):
def __init__(self, state_size, action_size, num_agents):
self.action_size = action_size
self.state_size = state_size
self.num_agents = num_agents
self.policy = PolicyNetwork(state_size, action_size).to(device)
self.old_policy = PolicyNetwork(state_size, action_size).to(device)
self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
self.episodes = [Episode() for _ in range(num_agents)]
self.memory = ReplayBuffer(BUFFER_SIZE)
self.t_step = 0
self.loss = 0
def reset(self):
self.finished = [False] * len(self.episodes)
self.tot_reward = [0] * self.num_agents
# Decide on an action to take in the environment
def act(self, state, eps=None):
self.policy.eval()
with torch.no_grad():
output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
ret = Categorical(output).sample().item()
return ret
# Record the results of the agent's action and update the model
def step(self, handle, state, action, reward, next_state, done):
if not self.finished[handle]:
# Push experience into Episode memory
self.tot_reward[handle] += reward
if done == 1:
reward = 1 # self.tot_reward[handle]
else:
reward = 0
self.episodes[handle].push(state, action, reward, next_state, done)
# When we finish the episode, discount rewards and push the experience into replay memory
if done:
self.episodes[handle].discount_rewards(GAMMA)
self.memory.push_episode(self.episodes[handle])
self.episodes[handle].reset()
self.finished[handle] = True
# Perform a gradient update every UPDATE_EVERY time steps
self.t_step = (self.t_step + 1) % UPDATE_EVERY
if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
self._learn(*self.memory.sample(BATCH_SIZE, device))
def _clip_gradient(self, model, clip):
for p in model.parameters():
p.grad.data.clamp_(-clip, clip)
return
# Note: the norm-based clipping below is unreachable because of the early return above.
"""Computes a gradient clipping coefficient based on gradient norm."""
totalnorm = 0
for p in model.parameters():
if p.grad is not None:
modulenorm = p.grad.data.norm()
totalnorm += modulenorm ** 2
totalnorm = np.sqrt(totalnorm)
coeff = min(1, clip / (totalnorm + 1e-6))
for p in model.parameters():
if p.grad is not None:
p.grad.mul_(coeff)
def _learn(self, states, actions, rewards, next_state, done):
self.policy.train()
responsible_outputs = torch.gather(self.policy(states), 1, actions)
old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()
# rewards = rewards - rewards.mean()
ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()
self.loss = loss
# Compute loss and perform a gradient step
self.old_policy.load_state_dict(self.policy.state_dict())
self.optimizer.zero_grad()
loss.backward()
# self._clip_gradient(self.policy, 1.0)
self.optimizer.step()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.policy.state_dict(), filename + ".policy")
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def load(self, filename):
print("load policy from file", filename)
if os.path.exists(filename + ".policy"):
print(' >> ', filename + ".policy")
try:
self.policy.load_state_dict(torch.load(filename + ".policy", map_location=device))
except Exception:
print(" >> failed!")
pass
if os.path.exists(filename + ".optimizer"):
print(' >> ', filename + ".optimizer")
try:
self.optimizer.load_state_dict(torch.load(filename + ".optimizer", map_location=device))
except Exception:
print(" >> failed!")
pass
import torch
import random
import numpy as np
from collections import namedtuple, deque
from collections.abc import Iterable
Transition = namedtuple("Experience", ("state", "action", "reward", "next_state", "done"))
class Episode:
def __init__(self):
# use a per-instance list instead of a shared class-level attribute
self.memory = []
def reset(self):
self.memory = []
def push(self, *args):
self.memory.append(tuple(args))
def discount_rewards(self, gamma):
running_add = 0.
for i, (state, action, reward, *rest) in list(enumerate(self.memory))[::-1]:
running_add = running_add * gamma + reward
self.memory[i] = (state, action, running_add, *rest)
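# Added comment (not part of this diff): a small worked example of discount_rewards.
# With gamma = 0.95 and stored per-step rewards [0, 0, 1] (only the final step rewarded),
# iterating backwards gives running_add = 1.0, then 0.95 * 1.0 + 0 = 0.95, then
# 0.95 * 0.95 + 0 = 0.9025, so the stored rewards become [0.9025, 0.95, 1.0].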
class ReplayBuffer:
def __init__(self, buffer_size):
self.memory = deque(maxlen=buffer_size)
def push(self, state, action, reward, next_state, done):
self.memory.append(Transition(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done))
def push_episode(self, episode):
for step in episode.memory:
self.push(*step)
def sample(self, batch_size, device):
experiences = random.sample(self.memory, k=batch_size)
states = torch.from_numpy(self.stack([e.state for e in experiences])).float().to(device)
actions = torch.from_numpy(self.stack([e.action for e in experiences])).long().to(device)
rewards = torch.from_numpy(self.stack([e.reward for e in experiences])).float().to(device)
next_states = torch.from_numpy(self.stack([e.next_state for e in experiences])).float().to(device)
dones = torch.from_numpy(self.stack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
return states, actions, rewards, next_states, dones
def stack(self, states):
sub_dims = states[0].shape[1:] if isinstance(states[0], Iterable) else [1]
return np.reshape(np.array(states), (len(states), *sub_dims))
def __len__(self):
return len(self.memory)
import copy
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
def __init__(self):
self.reset()
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def reset(self):
self.memory = {}
def get_transitions(self, handle):
return self.memory.get(handle, [])
def push_transition(self, handle, transition):
transitions = self.get_transitions(handle)
transitions.append(transition)
self.memory.update({handle: transitions})
class ActorCriticModel(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(ActorCriticModel, self).__init__()
self.device = device
self.actor = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
self.critic = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, x):
raise NotImplementedError
def get_actor_dist(self, state):
action_probs = self.actor(state)
dist = Categorical(action_probs)
return dist
def evaluate(self, states, actions):
action_probs = self.actor(states)
dist = Categorical(action_probs)
action_logprobs = dist.log_prob(actions)
dist_entropy = dist.entropy()
state_value = self.critic(states)
return action_logprobs, torch.squeeze(state_value), dist_entropy
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.actor.state_dict(), filename + ".actor")
torch.save(self.critic.state_dict(), filename + ".value")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
return obj
def load(self, filename):
print("load model from file", filename)
self.actor = self._load(self.actor, filename + ".actor")
self.critic = self._load(self.critic, filename + ".value")
class PPOPolicy(LearningPolicy):
def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
print(">> PPOPolicy")
super(PPOPolicy, self).__init__()
# keep the sizes so that clone() can construct a new instance
self.state_size = state_size
self.action_size = action_size
# parameters
self.ppo_parameters = in_parameters
if self.ppo_parameters is not None:
self.hidsize = self.ppo_parameters.hidden_size
self.buffer_size = self.ppo_parameters.buffer_size
self.batch_size = self.ppo_parameters.batch_size
self.learning_rate = self.ppo_parameters.learning_rate
self.gamma = self.ppo_parameters.gamma
# Device
if self.ppo_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 1.0e-3
self.gamma = 0.95
self.buffer_size = 32_000
self.batch_size = 1024
self.device = torch.device("cpu")
self.surrogate_eps_clip = 0.1
self.K_epoch = 10
self.weight_loss = 0.5
self.weight_entropy = 0.01
self.buffer_min_size = 0
self.use_replay_buffer = use_replay_buffer
self.current_episode_memory = EpisodeBuffers()
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.loss = 0
self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
self.loss_function = nn.MSELoss() # nn.SmoothL1Loss()
def reset(self, env):
pass
def act(self, handle, state, eps=None):
# sample an action to take
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor_critic_model.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist = self.actor_critic_model.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
def _push_transitions_to_replay_buffer(self,
state_list,
action_list,
reward_list,
state_next_list,
done_list,
prob_a_list):
for idx in range(len(reward_list)):
state_i = state_list[idx]
action_i = action_list[idx]
reward_i = reward_list[idx]
state_next_i = state_next_list[idx]
done_i = done_list[idx]
prob_action_i = prob_a_list[idx]
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i, prob_action_i)
def _convert_transitions_to_torch_tensors(self, transitions_array):
# build empty lists (arrays)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
if done_i:
discounted_reward = 0
done_list.insert(0, 1)
else:
done_list.insert(0, 0)
discounted_reward = reward_i + self.gamma * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
if self.use_replay_buffer:
self._push_transitions_to_replay_buffer(state_list, action_list,
reward_list, state_next_list,
done_list, prob_a_list)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
def _get_transitions_from_replay_buffer(self, states, actions, rewards, states_next, dones, probs_action):
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
states, actions, rewards, states_next, dones, probs_action = self.memory.sample()
actions = torch.squeeze(actions)
rewards = torch.squeeze(rewards)
states_next = torch.squeeze(states_next)
dones = torch.squeeze(dones)
probs_action = torch.squeeze(probs_action)
return states, actions, rewards, states_next, dones, probs_action
def train_net(self):
# All agents have to propagate the experiences they collected during the past episode
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
# Convert the replay buffer to torch tensors (arrays)
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history)
# Optimize policy for K epochs:
for k_loop in range(int(self.K_epoch)):
if self.use_replay_buffer:
states, actions, rewards, states_next, dones, probs_action = \
self._get_transitions_from_replay_buffer(
states, actions, rewards, states_next, dones, probs_action
)
# Evaluating actions (actor) and values (critic)
logprobs, state_values, dist_entropy = self.actor_critic_model.evaluate(states, actions)
# Finding the probability ratios (pi_theta / pi_theta_old):
ratios = torch.exp(logprobs - probs_action.detach())
# Finding the surrogate loss
advantages = rewards - state_values.detach()
surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1. - self.surrogate_eps_clip, 1. + self.surrogate_eps_clip) * advantages
# The loss estimates the policy gradient. The entropy term penalizes the loss when the policy
# becomes (nearly) deterministic, because in that case the gradient gets very flat and is no
# longer useful for learning.
loss = \
-torch.min(surr1, surr2) \
+ self.weight_loss * self.loss_function(state_values, rewards) \
- self.weight_entropy * dist_entropy
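# Added comment (not part of this diff): this is the standard PPO clipped surrogate objective,
#   L = E[ -min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) + c_v * MSE(V(s_t), R_t) - c_e * H(pi(.|s_t)) ]
# with r_t = exp(log pi_new(a_t|s_t) - log pi_old(a_t|s_t)), A_t = R_t - V(s_t),
# eps = surrogate_eps_clip, c_v = weight_loss and c_e = weight_entropy.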
# Make a gradient step
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
# Store the current loss on the agent, for debugging purposes only
self.loss = loss.mean().detach().cpu().numpy()
# Reset all collected transition data
self.current_episode_memory.reset()
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.actor_critic_model.save(filename)
torch.save(self.optimizer.state_dict(), filename + ".optimizer")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
except Exception:
print(" >> failed!")
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.actor_critic_model.load(filename)
print("load optimizer from file", filename)
self.optimizer = self._load(self.optimizer, filename + ".optimizer")
def clone(self):
policy = PPOPolicy(self.state_size, self.action_size)
policy.actor_critic_model = copy.deepcopy(self.actor_critic_model)
policy.optimizer = copy.deepcopy(self.optimizer)
return policy
import random
from collections import namedtuple, deque
from collections.abc import Iterable
import numpy as np
import torch
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done", "action_prob"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, device):
"""Initialize a ReplayBuffer object.
Params
======
action_size (int): dimension of each action
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
def add(self, state, action, reward, next_state, done, action_prob=0.0):
"""Add a new experience to memory."""
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done, action_prob)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
.float().to(self.device)
actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
.long().to(self.device)
rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
.float().to(self.device)
next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
.float().to(self.device)
dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
.float().to(self.device)
action_probs = torch.from_numpy(self.__v_stack_impr([e.action_prob for e in experiences if e is not None])) \
.float().to(self.device)
return states, actions, rewards, next_states, dones, action_probs
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def __v_stack_impr(self, states):
sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1
np_states = np.reshape(np.array(states), (len(states), sub_dim))
return np_states
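# Added for illustration (not part of this diff): a minimal usage sketch of this ReplayBuffer;
# all sizes below are assumptions.
import numpy as np
import torch

buffer = ReplayBuffer(action_size=5, buffer_size=1000, batch_size=4, device=torch.device("cpu"))
state_size = 8
for _ in range(16):
    s = np.random.rand(state_size)
    s_next = np.random.rand(state_size)
    buffer.add(s, action=1, reward=0.0, next_state=s_next, done=False, action_prob=0.2)
states, actions, rewards, next_states, dones, action_probs = buffer.sample()
print(states.shape)   # torch.Size([4, 8])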
from collections import deque
from collections import namedtuple
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
dddqn_param_nt = namedtuple('DDDQN_Param', ['hidden_size', 'buffer_size', 'batch_size', 'update_every', 'learning_rate',
'tau', 'gamma', 'buffer_min_size', 'use_gpu'])
dddqn_param = dddqn_param_nt(hidden_size=128,
buffer_size=1000,
batch_size=64,
update_every=10,
learning_rate=1.e-3,
tau=1.e-2,
gamma=0.95,
buffer_min_size=0,
use_gpu=False)
def cartpole(use_dddqn=False):
eps = 1.0
eps_decay = 0.99
min_eps = 0.01
training_mode = True
env = gym.make("CartPole-v1")
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
if not use_dddqn:
policy = PPOPolicy(observation_space, action_space, False)
else:
policy = DDDQNPolicy(observation_space, action_space, dddqn_param)
episode = 0
checkpoint_interval = 20
scores_window = deque(maxlen=100)
writer = SummaryWriter()
while True:
episode += 1
state = env.reset()
policy.reset(env)
handle = 0
tot_reward = 0
policy.start_episode(train=training_mode)
while True:
# env.render()
policy.start_step(train=training_mode)
action = policy.act(handle, state, eps)
state_next, reward, terminal, info = env.step(action)
policy.end_step(train=training_mode)
tot_reward += reward
# reward = reward if not terminal else -reward
reward = 0 if not terminal else -1
policy.step(handle, state, action, reward, state_next, terminal)
state = np.copy(state_next)
if terminal:
break
policy.end_episode(train=training_mode)
eps = max(min_eps, eps * eps_decay)
scores_window.append(tot_reward)
if episode % checkpoint_interval == 0:
print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)))
else:
print('\rEpisode: {:5}\treward: {:7.3f}\t avg: {:7.3f}\t eps: {:5.3f}\t replay buffer: {}'.format(
episode, tot_reward, np.mean(scores_window), eps, len(policy.memory)), end=" ")
writer.add_scalar("CartPole/value", tot_reward, episode)
writer.add_scalar("CartPole/smoothed_value", np.mean(scores_window), episode)
writer.flush()
if __name__ == "__main__":
cartpole()
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
......@@ -73,7 +73,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
import sys
import numpy as np
from pathlib import Path
import numpy as np
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import complex_rail_generator
from flatland.envs.schedule_generators import complex_schedule_generator
from flatland.utils.rendertools import RenderTool
from pathlib import Path
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
......@@ -66,7 +66,7 @@ for trials in range(1, n_episodes + 1):
if done[a]:
acting_agent += 1
if a == acting_agent:
action = policy.act(obs[a])
action = policy.act(a, obs[a])
else:
action = 4
action_dict.update({a: action})
......
......@@ -123,7 +123,8 @@ def train_agent(n_episodes):
# Build agent specific observations
for agent in env.get_agent_handles():
if obs[agent]:
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
observation_radius=observation_radius)
agent_prev_obs[agent] = agent_obs[agent].copy()
# Run episode
......@@ -132,7 +133,7 @@ def train_agent(n_episodes):
if info['action_required'][agent]:
# If an action is required, we want to store the obs at that step as well as the action
update_values = True
action = policy.act(agent_obs[agent], eps=eps_start)
action = policy.act(agent, agent_obs[agent], eps=eps_start)
action_count[action] += 1
else:
update_values = False
......@@ -154,7 +155,8 @@ def train_agent(n_episodes):
agent_prev_action[agent] = action_dict[agent]
if next_obs[agent]:
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=10)
agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
observation_radius=10)
score += all_rewards[agent]
......@@ -179,15 +181,16 @@ def train_agent(n_episodes):
else:
end = " "
print('\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
print(
'\rTraining {} agents on {}x{}\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
env.get_num_agents(),
x_dim, y_dim,
episode_idx,
np.mean(scores_window),
100 * np.mean(completion_window),
eps_start,
action_probs
), end=end)
# Plot overall training progress at the end
plt.plot(scores)
......@@ -199,7 +202,8 @@ def train_agent(n_episodes):
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500, type=int)
parser.add_argument("-n", "--n_episodes", dest="n_episodes", help="number of episodes to run", default=500,
type=int)
args = parser.parse_args()
train_agent(args.n_episodes)
'''
I did experiments in an early submission. Please note that the epsilon value can have an
effect on the evaluation outcome:
DDDQNPolicy experiments - EPSILON impact analysis
----------------------------------------------------------------------------------------
checkpoint = "./checkpoints/201124171810-7800.pth" # Training on AGENTS=10 with Depth=2
EPSILON = 0.000 # Sum Normalized Reward : 0.000000000000000 (primary score)
EPSILON = 0.002 # Sum Normalized Reward : 18.445875081269286 (primary score)
EPSILON = 0.005 # Sum Normalized Reward : 18.371733625865854 (primary score)
EPSILON = 0.010 # Sum Normalized Reward : 18.249244799876152 (primary score)
EPSILON = 0.020 # Sum Normalized Reward : 17.526987022691376 (primary score)
EPSILON = 0.030 # Sum Normalized Reward : 16.796885571003942 (primary score)
EPSILON = 0.040 # Sum Normalized Reward : 17.280787151431426 (primary score)
EPSILON = 0.050 # Sum Normalized Reward : 16.256945636647025 (primary score)
EPSILON = 0.100 # Sum Normalized Reward : 14.828347241759966 (primary score)
EPSILON = 0.200 # Sum Normalized Reward : 11.192330074898457 (primary score)
EPSILON = 0.300 # Sum Normalized Reward : 14.523067754608782 (primary score)
EPSILON = 0.400 # Sum Normalized Reward : 12.901508220410834 (primary score)
EPSILON = 0.500 # Sum Normalized Reward : 3.754660231871272 (primary score)
EPSILON = 1.000 # Sum Normalized Reward : 1.397180159192391 (primary score)
'''
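# Added comment (not part of this diff): EPSILON above is the exploration rate passed to
# policy.act(...). Conceptually it implements epsilon-greedy action selection, roughly:
#
#   if random.random() < eps:
#       action = random.randint(0, action_size - 1)   # explore
#   else:
#       action = best_action_from_policy(state)       # exploit the trained policy
#
# which is why larger EPSILON values degrade the evaluation score.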
import sys
import time
from argparse import Namespace
......@@ -5,34 +27,87 @@ from pathlib import Path
import numpy as np
from flatland.core.env_observation_builder import DummyObservationBuilder
from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv
from flatland.envs.rail_env import RailEnvActions
from flatland.evaluators.client import FlatlandRemoteClient
from flatland.evaluators.client import TimeoutException
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.deadlockavoidance_with_decision_agent import DeadLockAvoidanceWithDecisionAgent
from reinforcement_learning.multi_decision_agent import MultiDecisionAgent
from reinforcement_learning.ppo_agent import PPOPolicy
from utils.agent_action_config import get_action_size, map_actions, set_action_size_reduced, set_action_size_full
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
from utils.deadlock_check import check_if_all_blocked
from utils.fast_tree_obs import FastTreeObs
from utils.observation_utils import normalize_observation
base_dir = Path(__file__).resolve().parent.parent
sys.path.append(str(base_dir))
from reinforcement_learning.dddqn_policy import DDDQNPolicy
####################################################
# EVALUATION PARAMETERS
set_action_size_full()
# Print per-step logs
VERBOSE = True
# Checkpoint to use (remember to push it!)
checkpoint = "./checkpoints/201112143850-5400.pth" # 21.220418678677177 DEPTH=2 AGENTS=10
# checkpoint = "./checkpoints/201113211844-6100.pth" # 19.690047767961005 DEPTH=2 AGENTS=20
USE_FAST_TREEOBS = True
if False:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116591 adrian_egli
# graded 71.305 0.633 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/51
# Fri, 22 Jan 2021 23:37:56
set_action_size_reduced()
load_policy = "DDDQN"
checkpoint = "./checkpoints/210122120236-3000.pth" # 17.011131341978228
EPSILON = 0.0
if False:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116658 adrian_egli
# graded 73.821 0.655 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/52
# Sat, 23 Jan 2021 07:41:35
set_action_size_reduced()
load_policy = "PPO"
checkpoint = "./checkpoints/210122235754-5000.pth" # 16.00113400887389
EPSILON = 0.0
if True:
# -------------------------------------------------------------------------------------------------------
# RL solution
# -------------------------------------------------------------------------------------------------------
# 116659 adrian_egli
# graded 80.579 0.715 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/53
# Sat, 23 Jan 2021 07:45:49
set_action_size_reduced()
load_policy = "DDDQN"
checkpoint = "./checkpoints/210122165109-5000.pth" # 17.993750197899438
EPSILON = 0.0
if False:
# -------------------------------------------------------------------------------------------------------
# !! This is not an RL solution !!!!
# -------------------------------------------------------------------------------------------------------
# 116727 adrian_egli
# graded 106.786 0.768 RL Successfully Graded ! More details about this submission can be found at:
# http://gitlab.aicrowd.com/adrian_egli/neurips2020-flatland-starter-kit/issues/54
# Sat, 23 Jan 2021 14:31:50
set_action_size_reduced()
load_policy = "DeadLockAvoidance"
checkpoint = None
EPSILON = 0.0
# Use last action cache
USE_ACTION_CACHE = False
USE_DEAD_LOCK_AVOIDANCE_AGENT = False
# Observation parameters (must match training parameters!)
observation_tree_depth = 2
......@@ -45,16 +120,31 @@ remote_client = FlatlandRemoteClient()
# Observation builder
predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
tree_observation = FastTreeObs(max_depth=observation_tree_depth)
if USE_FAST_TREEOBS:
def check_is_observation_valid(observation):
return True
def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
return observation
# Calculates state and action sizes
state_size = tree_observation.observation_dim
action_size = 5
tree_observation = FastTreeObs(max_depth=observation_tree_depth)
state_size = tree_observation.observation_dim
else:
def check_is_observation_valid(observation):
return observation
# Creates the policy. No GPU on evaluation server.
policy = DDDQNPolicy(state_size, action_size, Namespace(**{'use_gpu': False}), evaluation_mode=True)
# policy = PPOAgent(state_size, action_size, 10)
policy.load(checkpoint)
def get_normalized_observation(observation, tree_depth: int, observation_radius=0):
return normalize_observation(observation, tree_depth, observation_radius)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)
# Calculate the state size given the depth of the tree observation and the number of features
n_features_per_node = tree_observation.observation_dim
n_nodes = sum([np.power(4, i) for i in range(observation_tree_depth + 1)])
state_size = n_features_per_node * n_nodes
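# Added comment (not part of this diff): with observation_tree_depth = 2 this gives
# n_nodes = 4^0 + 4^1 + 4^2 = 1 + 4 + 16 = 21, so state_size = 21 * n_features_per_node
# (e.g. 21 * 11 = 231 if the tree observation exposes 11 features per node).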
#####################################################################
# Main evaluation loop
......@@ -89,6 +179,27 @@ while True:
tree_observation.set_env(local_env)
tree_observation.reset()
# Creates the policy. No GPU on evaluation server.
if load_policy == "DDDQN":
policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
elif load_policy == "PPO":
policy = PPOPolicy(state_size, get_action_size())
elif load_policy == "DeadLockAvoidance":
policy = DeadLockAvoidanceAgent(local_env, get_action_size(), enable_eps=False)
elif load_policy == "DeadLockAvoidanceWithDecision":
# inter_policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False, in_parameters=train_params)
inter_policy = DDDQNPolicy(state_size, get_action_size(), Namespace(**{'use_gpu': False}), evaluation_mode=True)
policy = DeadLockAvoidanceWithDecisionAgent(local_env, state_size, get_action_size(), inter_policy)
elif load_policy == "MultiDecision":
policy = MultiDecisionAgent(state_size, get_action_size(), Namespace(**{'use_gpu': False}))
else:
policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=False,
in_parameters=Namespace(**{'use_gpu': False}))
policy.load(checkpoint)
policy.reset(local_env)
observation = tree_observation.get_many(list(range(nb_agents)))
print("Evaluation {}: {} agents in {}x{}".format(evaluation_number, nb_agents, local_env.width, local_env.height))
......@@ -108,9 +219,7 @@ while True:
agent_last_action = {}
nb_hit = 0
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
policy = DeadLockAvoidanceAgent(local_env)
policy.start_episode(train=False)
while True:
try:
#####################################################################
......@@ -123,35 +232,33 @@ while True:
if not check_if_all_blocked(env=local_env):
time_start = time.time()
action_dict = {}
policy.start_step()
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
observation = np.zeros((local_env.get_num_agents(), 2))
for agent in range(nb_agents):
if USE_DEAD_LOCK_AVOIDANCE_AGENT:
observation[agent][0] = agent
observation[agent][1] = steps
if info['action_required'][agent]:
if agent in agent_last_obs and np.all(agent_last_obs[agent] == observation[agent]):
policy.start_step(train=False)
for agent_handle in range(nb_agents):
if info['action_required'][agent_handle]:
if agent_handle in agent_last_obs and np.all(
agent_last_obs[agent_handle] == observation[agent_handle]):
# cache hit
action = agent_last_action[agent]
action = agent_last_action[agent_handle]
nb_hit += 1
else:
action = policy.act(observation[agent], eps=0.01)
normalized_observation = get_normalized_observation(observation[agent_handle],
observation_tree_depth,
observation_radius=observation_radius)
action_dict[agent] = action
action = policy.act(agent_handle, normalized_observation, eps=EPSILON)
if USE_ACTION_CACHE:
agent_last_obs[agent] = observation[agent]
agent_last_action[agent] = action
action_dict[agent_handle] = action
policy.end_step()
if USE_ACTION_CACHE:
agent_last_obs[agent_handle] = observation[agent_handle]
agent_last_action[agent_handle] = action
policy.end_step(train=False)
agent_time = time.time() - time_start
time_taken_by_controller.append(agent_time)
time_start = time.time()
_, all_rewards, done, info = remote_client.env_step(action_dict)
_, all_rewards, done, info = remote_client.env_step(map_actions(action_dict))
step_time = time.time() - time_start
time_taken_per_step.append(step_time)
......@@ -168,7 +275,11 @@ while True:
step_time = time.time() - time_start
time_taken_per_step.append(step_time)
nb_agents_done = sum(done[idx] for idx in local_env.get_agent_handles())
nb_agents_done = 0
for i_agent, agent in enumerate(local_env.agents):
# count the agents that are done (status DONE or DONE_REMOVED)
if (agent.status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]):
nb_agents_done += 1
if VERBOSE or done['__all__']:
print(
......@@ -197,6 +308,8 @@ while True:
print("Timeout! Will skip this episode and go to the next.", err)
break
policy.end_episode(train=False)
np_time_taken_by_controller = np.array(time_taken_by_controller)
np_time_taken_per_step = np.array(time_taken_per_step)
print("Mean/Std of Time taken by Controller : ", np_time_taken_by_controller.mean(),
......