Commit de66e9ba authored by Adrian Egli (parent 98d00d0b, tag: submission-v0.6)
# --- PPO agent ---
import pickle

import torch
from torch.distributions.categorical import Categorical

# from model import PolicyNetwork
# from replay_memory import Episode, ReplayBuffer
from src.ppo.model import PolicyNetwork
from src.ppo.replay_memory import Episode, ReplayBuffer

BUFFER_SIZE = 32_000
BATCH_SIZE = 4096
GAMMA = 0.98
LR = 0.5e-4
CLIP_FACTOR = .005
UPDATE_EVERY = 30

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)


class Agent:
    def __init__(self, state_size, action_size, num_agents):
        self.policy = PolicyNetwork(state_size, action_size).to(device)
        self.old_policy = PolicyNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
        self.episodes = [Episode() for _ in range(num_agents)]
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0
        # Initialize here as well so step() works even if reset() has not been called yet
        self.finished = [False] * num_agents

    def reset(self):
        self.finished = [False] * len(self.episodes)

    # Decide on an action to take in the environment
    def act(self, state, eps=None):
        self.policy.eval()
        with torch.no_grad():
            output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
            return Categorical(output).sample().item()

    # Record the results of the agent's action and update the model
    def step(self, handle, state, action, next_state, agent_done, episode_done, collision):
        if not self.finished[handle]:
            if agent_done:
                reward = 1
            elif collision:
                reward = -.5
            else:
                reward = 0

            # Push experience into Episode memory
            self.episodes[handle].push(state, action, reward, next_state, agent_done or episode_done)

            # When we finish the episode, discount rewards and push the experience into replay memory
            if agent_done or episode_done:
                self.episodes[handle].discount_rewards(GAMMA)
                self.memory.push_episode(self.episodes[handle])
                self.episodes[handle].reset()
                self.finished[handle] = True

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
            self.learn(*self.memory.sample(BATCH_SIZE, device))

    def learn(self, states, actions, rewards, next_state, done):
        self.policy.train()

        responsible_outputs = torch.gather(self.policy(states), 1, actions)
        old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()

        # rewards = rewards - rewards.mean()
        # Clipped-ratio surrogate loss, with the discounted returns used directly as the advantage
        ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
        clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
        loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()

        # Snapshot the current policy as the old policy, then perform a gradient step
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    # Checkpointing methods
    def save(self, path, *data):
        torch.save(self.policy.state_dict(), path / 'ppo/model_checkpoint.policy')
        torch.save(self.optimizer.state_dict(), path / 'ppo/model_checkpoint.optimizer')
        with open(path / 'ppo/model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            # Join paths the same way as save() so both methods accept a pathlib.Path
            print(path / 'ppo/model_checkpoint.policy')
            self.policy.load_state_dict(
                torch.load(path / 'ppo/model_checkpoint.policy', map_location=torch.device('cpu')))
            self.optimizer.load_state_dict(
                torch.load(path / 'ppo/model_checkpoint.optimizer', map_location=torch.device('cpu')))
            with open(path / 'ppo/model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except Exception:
            print("No checkpoint file was found")
            return defaults
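
# A minimal driver sketch (not part of the original commit) showing how the Agent API above is
# intended to be wired into an environment loop: act() per agent, step() per transition, reset()
# per episode. The observations, done flags and collision signal are placeholders standing in
# for whatever the real environment provides; STATE_SIZE / ACTION_SIZE / N_AGENTS are
# illustrative values only.
if __name__ == "__main__":
    import numpy as np

    STATE_SIZE, ACTION_SIZE, N_AGENTS = 231, 5, 3
    agent = Agent(STATE_SIZE, ACTION_SIZE, N_AGENTS)

    for _ in range(2):                              # a couple of dummy episodes
        agent.reset()
        obs = {h: np.random.rand(STATE_SIZE).astype(np.float32) for h in range(N_AGENTS)}
        for t in range(10):
            actions = {h: agent.act(obs[h]) for h in range(N_AGENTS)}
            # In a real setup next_obs, agent_done, episode_done and collision come from the env
            next_obs = {h: np.random.rand(STATE_SIZE).astype(np.float32) for h in range(N_AGENTS)}
            episode_done = t == 9
            for h in range(N_AGENTS):
                agent.step(h, obs[h], actions[h], next_obs[h],
                           agent_done=False, episode_done=episode_done, collision=False)
            obs = next_obs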
# --- src/ppo/model.py: policy network ---
import torch.nn as nn
import torch.nn.functional as F


class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidsize1=128, hidsize2=128, hidsize3=32):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidsize1)
        self.fc2 = nn.Linear(hidsize1, hidsize2)
        # self.fc3 = nn.Linear(hidsize2, hidsize3)
        self.output = nn.Linear(hidsize2, action_size)
        self.softmax = nn.Softmax(dim=1)
        self.bn0 = nn.BatchNorm1d(state_size, affine=False)

    def forward(self, inputs):
        x = self.bn0(inputs.float())
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # x = F.relu(self.fc3(x))
        return self.softmax(self.output(x))
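
# A quick shape sanity check (a sketch, not part of the original commit): the network maps a
# batch of observations to a per-action probability distribution. Because bn0 is BatchNorm1d,
# a single observation (batch size 1) only works in eval() mode, which is what Agent.act relies
# on. state_size=231 and action_size=5 are illustrative values only.
if __name__ == "__main__":
    import torch

    net = PolicyNetwork(state_size=231, action_size=5)
    net.eval()                                  # eval mode so BatchNorm accepts a single sample
    probs = net(torch.randn(1, 231))
    print(probs.shape, float(probs.sum()))      # torch.Size([1, 5]), probabilities sum to ~1.0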
# --- src/ppo/replay_memory.py: episode buffer and replay buffer ---
import torch
import random
import numpy as np
from collections import namedtuple, deque
# Iterable lives in collections.abc (importing it from collections fails on Python 3.10+)
from collections.abc import Iterable

Transition = namedtuple("Experience", ("state", "action", "reward", "next_state", "done"))


class Episode:
    def __init__(self):
        # Instance attribute so separate Episode objects do not share one list
        self.memory = []

    def reset(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(tuple(args))

    def discount_rewards(self, gamma):
        # Walk the episode backwards, replacing each immediate reward with its discounted return
        running_add = 0.
        for i, (state, action, reward, *rest) in list(enumerate(self.memory))[::-1]:
            running_add = running_add * gamma + reward
            self.memory[i] = (state, action, running_add, *rest)
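
# Worked example of the discounting above (illustration only, not part of the training code):
# with gamma = 0.98 and per-step rewards [0, 0, 1], the stored rewards become the discounted
# returns [0.98 * 0.98 * 1, 0.98 * 1, 1] = [0.9604, 0.98, 1.0], so every step is credited with
# the discounted value of the terminal reward that eventually follows it.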

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.memory = deque(maxlen=buffer_size)

    def push(self, state, action, reward, next_state, done):
        self.memory.append(Transition(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done))

    def push_episode(self, episode):
        for step in episode.memory:
            self.push(*step)

    def sample(self, batch_size, device):
        experiences = random.sample(self.memory, k=batch_size)

        states = torch.from_numpy(self.stack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(self.stack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(self.stack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(self.stack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(self.stack([e.done for e in experiences]).astype(np.uint8)).float().to(device)

        return states, actions, rewards, next_states, dones

    def stack(self, states):
        # Scalars (actions, rewards, dones) get a trailing dimension of 1; arrays keep their own shape
        sub_dims = states[0].shape[1:] if isinstance(states[0], Iterable) else [1]
        return np.reshape(np.array(states), (len(states), *sub_dims))

    def __len__(self):
        return len(self.memory)
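
# A minimal, self-contained sketch (not part of the original commit) of how Episode and
# ReplayBuffer compose: transitions are pushed per agent, discounted on episode end, and then
# sampled as batched tensors shaped (batch, feature_dim) or (batch, 1). The state size of 5 and
# the reward values are arbitrary illustration values.
if __name__ == "__main__":
    episode = Episode()
    state = np.zeros(5, dtype=np.float32)
    for t, (reward, done) in enumerate([(0, False), (0, False), (1, True)]):
        episode.push(state, t % 2, reward, state, done)
    episode.discount_rewards(0.98)

    buffer = ReplayBuffer(buffer_size=100)
    buffer.push_episode(episode)

    states, actions, rewards, next_states, dones = buffer.sample(3, torch.device("cpu"))
    print(states.shape, actions.shape, rewards.shape)  # torch.Size([3, 5]) torch.Size([3, 1]) torch.Size([3, 1])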