Commit 46c09e63 authored by adrian_egli2

Initial commit

parent 19e33cc0
 {
-    "challenge_id": "neurips-2020-flatland-challenge",
-    "grader_id": "neurips-2020-flatland-challenge",
+    "challenge_id": "flatland-3",
+    "grader_id": "flatland-3",
     "authors" : ["Adrian Egli"],
     "description" : "Deadlock avoidance agent test",
     "debug": false,
-    "tags": ["RL"]
+    "tags": ["Overall"]
 }
-name: flatland-rl
+name: flatland3-rl
 channels:
   - pytorch
   - conda-forge
@@ -10,4 +10,5 @@ dependencies:
   - python==3.6.8
   - pip:
     - tensorboard==2.3.0
-    - tensorboardx==2.1
\ No newline at end of file
+    - tensorboardx==2.1
+    - flatland-rl
\ No newline at end of file
import copy
import os
import torch
import torch.nn as nn
# Hyperparameters
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.ppo_agent import EpisodeBuffers
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
# Actor module
class A2CShared(nn.Module):
def __init__(self, state_size, action_size, device, hidsize1=512, hidsize2=256):
super(A2CShared, self).__init__()
self.device = device
self.model = nn.Sequential(
nn.Linear(state_size, hidsize1),
nn.Tanh(),
nn.Linear(hidsize1, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, hidsize2),
nn.Tanh()
).to(self.device)
def forward(self, X):
return self.model(X)
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.model.state_dict(), filename + ".a2c_shared")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception as exception:
                print(" >> failed!", exception)
return obj
def load(self, filename):
print("load model from file", filename)
self.model = self._load(self.model, filename + ".a2c_shared")
class A2CActor(nn.Module):
def __init__(self, state_size, action_size, device, shared_model, hidsize1=512, hidsize2=256):
super(A2CActor, self).__init__()
self.device = device
self.shared_model = shared_model
self.model = nn.Sequential(
nn.Linear(hidsize2, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, action_size),
nn.Softmax(dim=-1)
).to(self.device)
def forward(self, X):
return self.model(self.shared_model(X))
def get_actor_dist(self, state):
probs = self.forward(state)
dist = torch.distributions.Categorical(probs=probs)
return dist, probs
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.model.state_dict(), filename + ".a2c_actor")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception as exception:
                print(" >> failed!", exception)
return obj
def load(self, filename):
print("load model from file", filename)
self.model = self._load(self.model, filename + ".a2c_actor")
# Critic module
class A2CCritic(nn.Module):
def __init__(self, state_size, device, shared_model, hidsize1=512, hidsize2=256):
super(A2CCritic, self).__init__()
self.device = device
self.shared_model = shared_model
self.model = nn.Sequential(
nn.Linear(hidsize2, hidsize2),
nn.Tanh(),
nn.Linear(hidsize2, 1)
).to(self.device)
def forward(self, X):
return self.model(self.shared_model(X))
def save(self, filename):
# print("Saving model from checkpoint:", filename)
torch.save(self.model.state_dict(), filename + ".a2c_critic")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception as exception:
                print(" >> failed!", exception)
return obj
def load(self, filename):
print("load model from file", filename)
self.model = self._load(self.model, filename + ".a2c_critic")
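# ---------------------------------------------------------------------------
# Illustrative sketch: the three modules above compose as
#   actor(x)  = softmax head on shared(x)  -> probability distribution over actions
#   critic(x) = value head on shared(x)    -> scalar state value
# The helper below is hypothetical (it is never called); it only documents the
# expected shapes, assuming matching hidden sizes as used by A2CPolicy further down.
def _a2c_module_shape_check(state_size=20, action_size=5, hidsize=128):
    device = torch.device("cpu")
    shared = A2CShared(state_size, action_size, device, hidsize1=hidsize, hidsize2=hidsize)
    actor = A2CActor(state_size, action_size, device, shared, hidsize1=hidsize, hidsize2=hidsize)
    critic = A2CCritic(state_size, device, shared, hidsize1=hidsize, hidsize2=hidsize)
    x = torch.randn(4, state_size)
    probs = actor(x)    # (4, action_size), each row sums to ~1.0
    values = critic(x)  # (4, 1)
    assert probs.shape == (4, action_size) and values.shape == (4, 1)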
class A2CPolicy(LearningPolicy):
def __init__(self, state_size, action_size, in_parameters=None, clip_grad_norm=0.1):
print(">> A2CPolicy")
super(A2CPolicy, self).__init__()
# parameters
self.state_size = state_size
self.action_size = action_size
self.a2c_parameters = in_parameters
if self.a2c_parameters is not None:
self.hidsize = self.a2c_parameters.hidden_size
self.learning_rate = self.a2c_parameters.learning_rate
self.gamma = self.a2c_parameters.gamma
# Device
if self.a2c_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
self.device = torch.device("cpu")
# print("🐢 Using CPU")
else:
self.hidsize = 128
self.learning_rate = 0.5e-4
self.gamma = 0.99
self.device = torch.device("cpu")
self.current_episode_memory = EpisodeBuffers()
self.agent_done = {}
self.memory = [] # dummy parameter
self.loss_function = nn.MSELoss()
self.loss = 0
self.shared_model = A2CShared(state_size,
action_size,
self.device,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.actor = A2CActor(state_size,
action_size,
self.device,
shared_model=self.shared_model,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.critic = A2CCritic(state_size,
self.device,
shared_model=self.shared_model,
hidsize1=self.hidsize,
hidsize2=self.hidsize)
self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.learning_rate)
self.optimizer_critic = torch.optim.Adam(self.critic.parameters(), lr=self.learning_rate)
self.clip_grad_norm = clip_grad_norm
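    # Reward shaping below is a sparse terminal signal: the raw environment reward is
    # ignored and the agent receives 1.0 on the step it reports done, 0.0 otherwise.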
def shape_reward(self, handle, action, state, reward, done, deadlocked=None):
if done == 1:
return 1.0
return 0.0
def reset(self, env):
pass
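    # Acting is purely stochastic: an action is sampled from the categorical distribution
    # produced by the actor. The eps argument is kept for interface compatibility and is
    # not used for epsilon-greedy exploration here.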
def act(self, handle, state, eps=0.0):
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
dist, _ = self.actor.get_actor_dist(torch_state)
action = dist.sample()
return action.item()
def step(self, handle, state, action, reward, next_state, done):
if self.agent_done.get(handle, False):
            return  # TODO: remove if not using Flatland?
# record transitions ([state] -> [action] -> [reward, next_state, done])
torch_action = torch.tensor(action, dtype=torch.float).to(self.device)
torch_state = torch.tensor(state, dtype=torch.float).to(self.device)
# evaluate actor
dist, _ = self.actor.get_actor_dist(torch_state)
action_logprobs = dist.log_prob(torch_action)
transition = (state, action, reward, next_state, action_logprobs.item(), done)
self.current_episode_memory.push_transition(handle, transition)
if done:
self.agent_done.update({handle: done})
def _convert_transitions_to_torch_tensors(self, transitions_array, all_done):
        # build empty lists (arrays)
state_list, action_list, reward_list, state_next_list, prob_a_list, done_list = [], [], [], [], [], []
# set discounted_reward to zero
discounted_reward = 0
for transition in transitions_array[::-1]:
state_i, action_i, reward_i, state_next_i, prob_action_i, done_i = transition
state_list.insert(0, state_i)
action_list.insert(0, action_i)
done_list.insert(0, int(done_i))
mask_i = 1.0 - int(done_i)
discounted_reward = reward_i + self.gamma * mask_i * discounted_reward
reward_list.insert(0, discounted_reward)
state_next_list.insert(0, state_next_i)
prob_a_list.insert(0, prob_action_i)
# convert data to torch tensors
states, actions, rewards, states_next, dones, prob_actions = \
torch.tensor(state_list, dtype=torch.float).to(self.device), \
torch.tensor(action_list).to(self.device), \
torch.tensor(reward_list, dtype=torch.float).to(self.device), \
torch.tensor(state_next_list, dtype=torch.float).to(self.device), \
torch.tensor(done_list, dtype=torch.float).to(self.device), \
torch.tensor(prob_a_list).to(self.device)
return states, actions, rewards, states_next, dones, prob_actions
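    # Worked example for the return computation above (gamma = 0.99): for a 3-step
    # episode with rewards [0, 0, 1] and done only on the last step, iterating in
    # reverse gives
    #   G_2 = 1 + 0.99 * 0 * 0    = 1.0
    #   G_1 = 0 + 0.99 * 1 * 1.0  = 0.99
    #   G_0 = 0 + 0.99 * 1 * 0.99 = 0.9801
    # so reward_list becomes [0.9801, 0.99, 1.0]; the (1 - done) mask zeroes the
    # bootstrap term at episode boundaries.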
def train_net(self):
        # All agents have to propagate the experiences they made during the past episode
        all_done = True
        for handle in range(len(self.current_episode_memory)):
            all_done = all_done and self.agent_done.get(handle, False)
for handle in range(len(self.current_episode_memory)):
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
# Convert the replay buffer to torch tensors (arrays)
states, actions, rewards, states_next, dones, probs_action = \
self._convert_transitions_to_torch_tensors(agent_episode_history, all_done)
critic_loss = self.loss_function(rewards, torch.squeeze(self.critic(states)))
self.optimizer_critic.zero_grad()
critic_loss.mean().backward()
torch.nn.utils.clip_grad_norm_(self.critic.model.parameters(), self.clip_grad_norm)
self.optimizer_critic.step()
advantages = rewards - torch.squeeze(self.critic(states)).detach()
dist, _ = self.actor.get_actor_dist(states)
action_logprobs = dist.log_prob(actions)
self.optimizer_actor.zero_grad()
actor_loss = -(advantages.detach() * action_logprobs) - 0.01 * dist.entropy()
actor_loss.mean().backward()
torch.nn.utils.clip_grad_norm_(self.actor.model.parameters(), self.clip_grad_norm)
self.optimizer_actor.step()
                # Transfer the current loss to the agent's loss attribute (for debugging purposes only)
self.loss = actor_loss.mean().detach().cpu().numpy()
        # Reset all collected transition data
self.current_episode_memory.reset()
self.agent_done = {}
def end_episode(self, train):
if train:
self.train_net()
# Checkpointing methods
def save(self, filename):
# print("Saving model from checkpoint:", filename)
self.shared_model.save(filename)
self.actor.save(filename)
self.critic.save(filename)
torch.save(self.optimizer_actor.state_dict(), filename + ".a2c_optimizer_actor")
torch.save(self.optimizer_critic.state_dict(), filename + ".a2c_optimizer_critic")
def _load(self, obj, filename):
if os.path.exists(filename):
print(' >> ', filename)
try:
obj.load_state_dict(torch.load(filename, map_location=self.device))
            except Exception as exception:
                print(" >> failed!", exception)
else:
print(" >> file not found!")
return obj
def load(self, filename):
print("load policy from file", filename)
self.shared_model.load(filename)
self.actor.load(filename)
self.critic.load(filename)
print("load optimizer from file", filename)
self.optimizer_actor = self._load(self.optimizer_actor, filename + ".a2c_optimizer_actor")
self.optimizer_critic = self._load(self.optimizer_critic, filename + ".a2c_optimizer_critic")
def clone(self):
policy = A2CPolicy(self.state_size, self.action_size)
policy.shared_model = copy.deepcopy(self.shared_model)
policy.actor = copy.deepcopy(self.actor)
policy.critic = copy.deepcopy(self.critic)
policy.optimizer_actor = copy.deepcopy(self.optimizer_actor)
policy.optimizer_critic = copy.deepcopy(self.optimizer_critic)
        return policy
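
A minimal usage sketch for the policy above, assuming A2CPolicy is in scope (the import path depends on where this file sits in the repository); random observations and a fixed agent count are stand-ins for a real Flatland environment:

import numpy as np

state_size, action_size, n_agents, n_steps = 20, 5, 2, 30
policy = A2CPolicy(state_size, action_size)      # in_parameters=None -> CPU defaults

obs = {h: np.random.rand(state_size).astype(np.float32) for h in range(n_agents)}
for t in range(n_steps):
    actions = {h: policy.act(h, obs[h], eps=0.0) for h in range(n_agents)}
    for h in range(n_agents):
        next_obs = np.random.rand(state_size).astype(np.float32)  # placeholder for env.step()
        done = (t == n_steps - 1)
        reward = policy.shape_reward(h, actions[h], obs[h], 0.0, done)
        policy.step(h, obs[h], actions[h], reward, next_obs, done)
        obs[h] = next_obs
policy.end_episode(train=True)                   # triggers train_net() on the episode buffers

Calling end_episode(train=True) consumes the per-handle transitions collected by step() and resets the episode memory, so the same policy object can be reused for the next episode.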
@@ -2,7 +2,6 @@ import copy
import os
import pickle
import random
from collections import namedtuple, deque, Iterable
import numpy as np
import torch
@@ -10,32 +9,40 @@ import torch.nn.functional as F
import torch.optim as optim
from reinforcement_learning.model import DuelingQNetwork
from reinforcement_learning.policy import Policy
from reinforcement_learning.policy import Policy, LearningPolicy
from reinforcement_learning.ppo_agent import EpisodeBuffers
from reinforcement_learning.replay_buffer import ReplayBuffer
class DDDQNPolicy(Policy):
class DDDQNPolicy(LearningPolicy):
"""Dueling Double DQN policy"""
def __init__(self, state_size, action_size, parameters, evaluation_mode=False):
def __init__(self, state_size, action_size, in_parameters, evaluation_mode=False,
enable_delayed_transition_push_at_episode_end=False,
skip_unfinished_agent=0.0):
print(">> DDDQNPolicy")
super(Policy, self).__init__()
self.ddqn_parameters = in_parameters
self.evaluation_mode = evaluation_mode
self.state_size = state_size
self.action_size = action_size
self.double_dqn = True
self.hidsize = 1
self.hidsize = 128
if not evaluation_mode:
self.hidsize = parameters.hidden_size
self.buffer_size = parameters.buffer_size
self.batch_size = parameters.batch_size
self.update_every = parameters.update_every
self.learning_rate = parameters.learning_rate
self.tau = parameters.tau
self.gamma = parameters.gamma
self.buffer_min_size = parameters.buffer_min_size
# Device
if parameters.use_gpu and torch.cuda.is_available():
self.hidsize = self.ddqn_parameters.hidden_size
self.buffer_size = self.ddqn_parameters.buffer_size
self.batch_size = self.ddqn_parameters.batch_size
self.update_every = self.ddqn_parameters.update_every
self.learning_rate = self.ddqn_parameters.learning_rate
self.tau = self.ddqn_parameters.tau
self.gamma = self.ddqn_parameters.gamma
self.buffer_min_size = self.ddqn_parameters.buffer_min_size
# Device
if self.ddqn_parameters.use_gpu and torch.cuda.is_available():
self.device = torch.device("cuda:0")
# print("🐇 Using GPU")
else:
@@ -43,45 +50,93 @@ class DDDQNPolicy(Policy):
# print("🐢 Using CPU")
# Q-Network
self.qnetwork_local = DuelingQNetwork(state_size, action_size, hidsize1=self.hidsize, hidsize2=self.hidsize).to(self.device)
self.qnetwork_local = DuelingQNetwork(state_size,
action_size,
hidsize1=self.hidsize,
hidsize2=self.hidsize,
hidsize3=self.hidsize).to(self.device)
if not evaluation_mode:
self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)
self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
self.t_step = 0
self.loss = 0.0
else:
self.memory = ReplayBuffer(action_size, 1, 1, self.device)
self.loss = 0.0
def act(self, state, eps=0.):
self.enable_delayed_transition_push_at_episode_end = enable_delayed_transition_push_at_episode_end
self.skip_unfinished_agent = skip_unfinished_agent
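        # enable_delayed_transition_push_at_episode_end: buffer transitions per agent and only
        # push them into the replay memory once the episode ends (see end_episode below);
        # skip_unfinished_agent: probability of discarding the buffered episode of an agent
        # that never reached the done state.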
self.current_episode_memory = EpisodeBuffers()
self.agent_done = {}
def act(self, handle, state, eps=0.):
state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
self.qnetwork_local.eval()
with torch.no_grad():
action_values = self.qnetwork_local(state)
self.qnetwork_local.train()
# Epsilon-greedy action selection
if random.random() > eps:
if random.random() >= eps:
return np.argmax(action_values.cpu().data.numpy())
else:
return random.choice(np.arange(self.action_size))
def step(self, state, action, reward, next_state, done):
def step(self, handle, state, action, reward, next_state, done):
if self.agent_done.get(handle, False):
            return  # TODO: remove if not using Flatland?
assert not self.evaluation_mode, "Policy has been initialized for evaluation only."
# Save experience in replay memory
self.memory.add(state, action, reward, next_state, done)
# Save transition (episode)
if self.enable_delayed_transition_push_at_episode_end:
transition = (state, action, reward, next_state, 0, done)
self.current_episode_memory.push_transition(handle, transition)
else:
# Save experience in replay memory
self.memory.add(state, action, reward, next_state, done)
# Learn every UPDATE_EVERY time steps.
self.t_step = (self.t_step + 1) % self.update_every
if self.t_step == 0:
# If enough samples are available in memory, get random subset and learn
if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
self._learn()
self._learn()
if done:
self.agent_done.update({handle: done})
def end_episode(self, train):
if train:
if self.enable_delayed_transition_push_at_episode_end:
                # All agents have to propagate the experiences they made during the past episode
                all_done = True
                for handle in range(len(self.current_episode_memory)):
                    all_done = all_done and self.agent_done.get(handle, False)
for handle in range(len(self.current_episode_memory)):
if (not self.agent_done.get(handle, False)) and (np.random.random() < self.skip_unfinished_agent):
continue
# Extract agent's episode history (list of all transitions)
agent_episode_history = self.current_episode_memory.get_transitions(handle)
if len(agent_episode_history) > 0:
for transition in agent_episode_history:
state_i, action_i, reward_i, state_next_i, _, done_i = transition
# Save experience in replay memory
self.memory.add(state_i, action_i, reward_i, state_next_i, done_i)
# update / learn
self._learn()
                # Reset all collected transition data
self.current_episode_memory.reset()
self.agent_done = {}
def _learn(self):
if len(self.memory) <= self.buffer_min_size or len(self.memory) <= self.batch_size:
return
experiences = self.memory.sample()
states, actions, rewards, next_states, dones = experiences
states, actions, rewards, next_states, dones, _, _ = experiences
# Get expected Q values from local model
q_expected = self.qnetwork_local(states).gather(1, actions)
@@ -119,10 +174,20 @@
torch.save(self.qnetwork_target.state_dict(), filename + ".target")
def load(self, filename):
if os.path.exists(filename + ".local"):
self.qnetwork_local.load_state_dict(torch.load(filename + ".local"))
if os.path.exists(filename + ".target"):
self.qnetwork_target.load_state_dict(torch.load(filename + ".target"))
try:
if os.path.exists(filename + ".local") and os.path.exists(filename + ".target"):
self.qnetwork_local.load_state_dict(torch.load(filename + ".local", map_location=self.device))
print("qnetwork_local loaded ('{}')".format(filename + ".local"))
if not self.evaluation_mode:
self.qnetwork_target.load_state_dict(torch.load(filename + ".target", map_location=self.device))
print("qnetwork_target loaded ('{}' )".format(filename + ".target"))
else:
print(">> Checkpoint not found, using untrained policy! ('{}', '{}')".format(filename + ".local",
filename + ".target"))
except Exception as exc:
print(exc)
print("Couldn't load policy from, using untrained policy! ('{}', '{}')".format(filename + ".local",
filename + ".target"))
def save_replay_buffer(self, filename):
memory = self.memory.memory
@@ -134,57 +199,11 @@
self.memory.memory = pickle.load(f)
def test(self):
self.act(np.array([[0] * self.state_size]))
self.act(0, np.array([[0] * self.state_size]))
self._learn()
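
The body of _learn() is largely elided by the hunk above; since the class sets self.double_dqn = True, the target values are presumably computed with the usual Double DQN scheme. A sketch under that assumption (the helper name and shapes are illustrative, not this file's actual code):

import torch

def double_dqn_targets(q_local, q_target, rewards, next_states, dones, gamma=0.99):
    # Hypothetical helper: the local network selects the greedy next action and the
    # target network evaluates it ("double" Q-learning); terminal transitions get no
    # bootstrap term. rewards and dones are expected as column tensors of shape (batch, 1).
    with torch.no_grad():
        best_actions = q_local(next_states).argmax(dim=1, keepdim=True)
        q_next = q_target(next_states).gather(1, best_actions)
    return rewards + gamma * q_next * (1.0 - dones)

The q_expected values shown above would then be regressed onto these targets (for example with an MSE loss) before the target network is updated; the tau parameter in __init__ suggests a soft update. The Experience/ReplayBuffer definitions that follow appear to be the in-file buffer this commit replaces with the imported reinforcement_learning.replay_buffer module.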
Experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
class ReplayBuffer:
"""Fixed-size buffer to store experience tuples."""
def __init__(self, action_size, buffer_size, batch_size, device):
"""Initialize a ReplayBuffer object.
Params
======
action_size (int): dimension of each action
buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
"""
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
self.batch_size = batch_size
self.device = device
def add(self, state, action, reward, next_state, done):
"""Add a new experience to memory."""
e = Experience(np.expand_dims(state, 0), action, reward, np.expand_dims(next_state, 0), done)
self.memory.append(e)
def sample(self):
"""Randomly sample a batch of experiences from memory."""
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(self.__v_stack_impr([e.state for e in experiences if e is not None])) \
.float().to(self.device)
actions = torch.from_numpy(self.__v_stack_impr([e.action for e in experiences if e is not None])) \
.long().to(self.device)
rewards = torch.from_numpy(self.__v_stack_impr([e.reward for e in experiences if e is not None])) \
.float().to(self.device)
next_states = torch.from_numpy(self.__v_stack_impr([e.next_state for e in experiences if e is not None])) \
.float().to(self.device)
dones = torch.from_numpy(self.__v_stack_impr([e.done for e in experiences if e is not None]).astype(np.uint8)) \
.float().to(self.device)
return states, actions, rewards, next_states, dones
def __len__(self):
"""Return the current size of internal memory."""
return len(self.memory)
def __v_stack_impr(self, states):
sub_dim = len(states[0][0]) if isinstance(states[0], Iterable) else 1