Commit d1ac97bd authored by Dipam Chakraborty's avatar Dipam Chakraborty
Browse files

Merge branch 'eric/baselines' into 'master'


See merge request dipam/neurips-2021-nethack-challenge!1
parents eed51f36 1369276b
*.wav filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
submission filter=lfs diff=lfs merge=lfs -text
submission/* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
......@@ -11,20 +11,9 @@ class BatchedAgent:
self.num_envs = num_envs
self.num_actions = num_actions
def preprocess_observations(self, observations, rewards, dones, infos):
def batched_step(self, observations, rewards, dones, infos):
Add any preprocessing steps, for example reordering/stacking for torch/tf in your model
Take list of outputs of each environments and return a list of actions
raise NotImplementedError
def preprocess_actions(self, actions):
Add any postprocessing steps, for example converting to lists
def batched_step(self):
Return a list of actions
import torch
import numpy as np
from agents.batched_agent import BatchedAgent
from nethack_baselines.torchbeast.models import load_model
MODEL_DIR = "./models/torchbeast/example_run"
class TorchBeastAgent(BatchedAgent):
    """A BatchedAgent using the TorchBeast Model."""

    def __init__(self, num_envs, num_actions):
        super().__init__(num_envs, num_actions)
        self.model_dir = MODEL_DIR
        self.device = "cuda:0"
        self.model = load_model(MODEL_DIR, self.device)
        # One recurrent (LSTM) core-state tensor set for the whole batch,
        # kept on the model's device and carried between batched_step calls.
        self.core_state = [
            m.to(self.device) for m in self.model.initial_state(batch_size=num_envs)
        ]

    def batch_inputs(self, observations, dones):
        """
        Convert lists of observations and dones to tensors for TorchBeast.

        TorchBeast models:
            * take tensors in the form [T, B, ...]: B := batch, T := unroll (=1)
            * take "done" as a BOOLEAN observation
        """
        states = list(observations[0].keys())
        obs = {k: [] for k in states}

        # Unpack List[Dicts] -> Dict[Lists]
        for o in observations:
            for k, t in o.items():
                obs[k].append(t)

        # Convert to Tensor, Add Unroll Dim (=1), Move to GPU
        for k in states:
            obs[k] = torch.Tensor(np.stack(obs[k])[None, ...]).to(self.device)
        obs["done"] = torch.Tensor(np.array(dones)[None, ...]).bool().to(self.device)
        return obs, dones

    def batched_step(self, observations, rewards, dones, infos):
        """
        Perform a batched step on lists of environment outputs.

        TorchBeast models:
            * take the core (LSTM) state as input, and return it as output
            * return outputs as a dict of "action", "policy_logits", "baseline"
        """
        observations, dones = self.batch_inputs(observations, dones)
        with torch.no_grad():
            outputs, self.core_state = self.model(observations, self.core_state)
        # Drop the unroll dim (T=1): one action per environment, as numpy.
        return outputs["action"].cpu().numpy()[0]
\ No newline at end of file
......@@ -4,5 +4,7 @@
"authors": [
"external_dataset_used": false
"external_dataset_used": false,
"gpu": true
......@@ -32,6 +32,7 @@ def evaluate():
agent = Agent(num_envs, num_actions)
run_batched_rollout(batched_env, agent)
if __name__ == '__main__':
# TorchBeast NetHackChallenge Benchmark
This is a baseline model for the NetHack Challenge based on [TorchBeast]( - FAIR's implementation of IMPALA for PyTorch.
It comes with all the code you need to train, run and submit a model that is based on the results published in the original NLE paper.
This implementation runs with 2 GPUS (one for acting and one for learning), and runs many simultaneous environments with dynamic batching.
## Installation
To get this running all you need to do is follow the TorchBeast installation instructions, on the repo page, and then install the requirements.txt
A Dockerfile is also provided with installation of Torchbeast.
## Running The Baseline
Once installed, in this directory run:
To change parameters, edit `config.yaml`, or to override parameters from the command-line run:
`python polyhydra.py embedding_dim=16`
The training will save checkpoints to a new directory (`outputs`) and should the environments create any outputs, they will be saved to `nle_data` - (by default recordings of episodes are switched off to save space).
The default polybeast runs on 2 GPUs, one for the learner and one for the actors. However, with only one GPU you can still run polybeast - just override the `actor_device` argument:
`python polyhydra.py actor_device=cpu`
## Making a submission
Take the output directory of your trained model, add the `checkpoint.tar` and `config.yaml` to the git repo. Then change the `SUBMISSION` variable in `` in the base of this repository to point to that directory.
After that tag the submission, and push the branch and tag to AIcrowd's gitlab!
## Repo Structure
├── core/
├── models/ # <- Models HERE
├── util/
├── config.yaml # <- Flags HERE
├── # <- Training Env HERE
├── # <- Training Loop HERE
└── # <- main() HERE
The structure is simple, compartmentalising the environment setup, training loop and models in to different files. You can tweak any of these separately, and add parameters to the flags (which are passed around).
## About the Model
This model (`BaselineNet`) we provide is simple and all in `models/`.
* It encodes the dungeon into a fixed-size representation (`GlyphEncoder`)
* It encodes the topline message into a fixed-size representation (`MessageEncoder`)
* It encodes the bottom line statistics (eg armour class, health) into a fixed-size representation (`BLStatsEncoder`)
* It concatenates all these outputs into a fixed size, runs this through a fully connected layer, and into an LSTM.
* The outputs of the LSTM go through policy and baseline heads (since this is an actor-critic algorithm)
As you can see there is a lot of data to play with in this game, and plenty to try, both in modelling and in the learning algorithms used.
## Improvement Ideas
*Here are some ideas we haven't tried yet, but might be easy places to start. Happy tinkering!*
### Model Improvements (``)
* The model is currently not using the terminal observations (`tty_chars`, `tty_colors`, `tty_cursor`), so it has no idea about menus - could we make use of these somehow?
* The bottom-line stats are very informative, but very simply encoded in `BLStatsEncoder` - is there a better way to do this?
* The `GlyphEncoder` builds an embedding for the glyphs, and then takes a crop of these centered around the player icon coordinates (`@`). Should the crop be reusing the same embedding matrix?
* The current model constrains the vast action space to a smaller subset of actions. Is it too constrained? Or not constrained enough?
### Environment Improvements (``)
* Opening menus (such as when spellcasting) do not advance the in game timer. However, models can also get stuck in menus as you have to learn what buttons to press to close the menu. Can changing the penalty for not advancing the in-game timer improve the result?
* The NetHackChallenge assesses the score on random character assignments. Might it be easier to learn on just a few of these at the beginning of training?
### Algorithm/Optimisation Improvements (``)
* Can we add some intrinsic rewards to help our agents learn?
* Should we add penalties to disincentivise pathological behaviour we observe?
* Can we improve the model by using a different optimizer?
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
- hydra/job_logging: colorlog
- hydra/hydra_logging: colorlog
# - hydra/launcher: submitit_slurm
# # To Be Used With hydra submitit_slurm if you have SLURM cluster
# # pip install hydra-core hydra_colorlog
# # can set these on the commandline too, e.g. `hydra.launcher.partition=dev`
# hydra:
# launcher:
# timeout_min: 4300
# cpus_per_task: 20
# gpus_per_node: 2
# tasks_per_node: 1
# mem_gb: 20
# nodes: 1
# partition: dev
# comment: null
# max_num_timeout: 5 # will requeue on timeout or preemption
name: null # can use this to have multiple runs with same params, eg name=1,2,3,4,5
## WANDB settings
wandb: false # Enable wandb logging.
project: nethack_challenge # The wandb project name.
entity: user1 # The wandb user to log to.
group: group1 # The wandb group for the run.
# POLYBEAST ENV settings
mock: false # Use mock environment instead of NetHack.
single_ttyrec: true # Record ttyrec only for actor 0.
num_seeds: 0 # If larger than 0, samples fixed number of environment seeds to be used.
write_profiler_trace: false # Collect and write a profiler trace for chrome://tracing/.
fn_penalty_step: constant # Function to accumulate penalty.
penalty_time: 0.0 # Penalty per time step in the episode.
penalty_step: -0.01 # Penalty per step in the episode.
reward_lose: 0 # Reward for losing (dying before finding the staircase).
reward_win: 100 # Reward for winning (finding the staircase).
state_counter: none # Method for counting state visits. Default none.
character: 'mon-hum-neu-mal' # Specification of the NetHack character.
## typical characters we use
# 'mon-hum-neu-mal'
# 'val-dwa-law-fem'
# 'wiz-elf-cha-mal'
# 'tou-hum-neu-fem'
# '@' # random (used in Challenge assessment)
# RUN settings.
mode: train # Training or test mode.
env: challenge # Name of Gym environment to create.
# # env (task) names: challenge, staircase, pet,
# eat, gold, score, scout, oracle
# TRAINING settings.
num_actors: 256 # Number of actors.
total_steps: 1e9 # Total environment steps to train for. Will be cast to int.
batch_size: 32 # Learner batch size.
unroll_length: 80 # The unroll length (time dimension).
num_learner_threads: 1 # Number learner threads.
num_inference_threads: 1 # Number inference threads.
disable_cuda: false # Disable CUDA.
learner_device: cuda:1 # Set learner device.
actor_device: cuda:0 # Set actor device.
# OPTIMIZER settings. (RMS Prop)
learning_rate: 0.0002 # Learning rate.
grad_norm_clipping: 40 # Global gradient norm clip.
alpha: 0.99 # RMSProp smoothing constant.
momentum: 0 # RMSProp momentum.
epsilon: 0.000001 # RMSProp epsilon.
# LOSS settings.
entropy_cost: 0.001 # Entropy cost/multiplier.
baseline_cost: 0.5 # Baseline cost/multiplier.
discounting: 0.999 # Discounting factor.
normalize_reward: true # Normalizes reward by dividing by running stdev from mean.
# MODEL settings.
model: baseline # Name of model to build (see models/
use_lstm: true # Use LSTM in agent model.
hidden_dim: 256 # Size of hidden representations.
embedding_dim: 64 # Size of glyph embeddings.
layers: 5 # Number of ConvNet Layers for Glyph Model
crop_dim: 9 # Size of crop (c x c)
use_index_select: true # Whether to use index_select instead of embedding lookup (for speed reasons).
restrict_action_space: True # Use a restricted ACTION SPACE (only nethack.USEFUL_ACTIONS)
hidden_dim: 64 # Hidden dimension for message encoder.
embedding_dim: 32 # Embedding dimension for characters in message encoder.
# TEST settings.
load_dir: null # Path to load a model from for testing
# Copyright (c) Facebook, Inc. and its affiliates.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import csv
import datetime
import json
import logging
import os
import time
import weakref
def _save_metadata(path, metadata):
metadata["date_save"] ="%Y-%m-%d %H:%M:%S.%f")
with open(path, "w") as f:
json.dump(metadata, f, indent=4, sort_keys=True)
def gather_metadata():
metadata = dict("%Y-%m-%d %H:%M:%S.%f"),
# Git metadata.
import git
except ImportError:
"Couldn't import gitpython module; install it with `pip install gitpython`."
repo = git.Repo(search_parent_directories=True)
metadata["git"] = {
"commit": repo.commit().hexsha,
"is_dirty": repo.is_dirty(),
"path": repo.git_dir,
if not repo.head.is_detached:
metadata["git"]["branch"] =
except git.InvalidGitRepositoryError:
if "git" not in metadata:
logging.warning("Couldn't determine git data.")
# Slurm metadata.
if "SLURM_JOB_ID" in os.environ:
slurm_env_keys = [k for k in os.environ if k.startswith("SLURM")]
metadata["slurm"] = {}
for k in slurm_env_keys:
d_key = k.replace("SLURM_", "").replace("SLURMD_", "").lower()
metadata["slurm"][d_key] = os.environ[k]
return metadata
class FileWriter:
def __init__(self, xp_args=None, rootdir="~/palaas"):
if rootdir == "~/palaas":
# make unique id in case someone uses the default rootdir
xpid = "{proc}_{unixtime}".format(
proc=os.getpid(), unixtime=int(time.time())
rootdir = os.path.join(rootdir, xpid)
self.basepath = os.path.expandvars(os.path.expanduser(rootdir))
self._tick = 0
# metadata gathering
if xp_args is None:
xp_args = {}
self.metadata = gather_metadata()
# we need to copy the args, otherwise when we close the file writer
# (and rewrite the args) we might have non-serializable objects (or
# other nasty stuff).
self.metadata["args"] = copy.deepcopy(xp_args)
formatter = logging.Formatter("%(message)s")
self._logger = logging.getLogger("palaas/out")
# to stdout handler
shandle = logging.StreamHandler()
# to file handler
if not os.path.exists(self.basepath):"Creating log directory: %s", self.basepath)
os.makedirs(self.basepath, exist_ok=True)
else:"Found log directory: %s", self.basepath)
self.paths = dict(
)"Saving arguments to %s", self.paths["meta"])
if os.path.exists(self.paths["meta"]):
"Path to meta file already exists. " "Not overriding meta."
self.save_metadata()"Saving messages to %s", self.paths["msg"])
if os.path.exists(self.paths["msg"]):
"Path to message file already exists. " "New data will be appended."
fhandle = logging.FileHandler(self.paths["msg"])
self._logger.addHandler(fhandle)"Saving logs data to %s", self.paths["logs"])"Saving logs' fields to %s", self.paths["fields"])
self.fieldnames = ["_tick", "_time"]
if os.path.exists(self.paths["logs"]):
"Path to log file already exists. " "New data will be appended."
# Override default fieldnames.
with open(self.paths["fields"], "r") as csvfile:
reader = csv.reader(csvfile)
lines = list(reader)
if len(lines) > 0:
self.fieldnames = lines[-1]
# Override default tick: use the last tick from the logs file plus 1.
with open(self.paths["logs"], "r") as csvfile:
reader = csv.reader(csvfile)
lines = list(reader)
# Need at least two lines in order to read the last tick:
# the first is the csv header and the second is the first line
# of data.
if len(lines) > 1:
self._tick = int(lines[-1][0]) + 1
self._fieldfile = open(self.paths["fields"], "a")
self._fieldwriter = csv.writer(self._fieldfile)
self._logfile = open(self.paths["logs"], "a")
self._logwriter = csv.DictWriter(self._logfile, fieldnames=self.fieldnames)
# Auto-close (and save) on destruction.
weakref.finalize(self, _save_metadata, self.paths["meta"], self.metadata)
def log(self, to_log, tick=None, verbose=False):
if tick is not None:
raise NotImplementedError
to_log["_tick"] = self._tick
self._tick += 1
to_log["_time"] = time.time()
old_len = len(self.fieldnames)
for k in to_log:
if k not in self.fieldnames:
if old_len != len(self.fieldnames):
self._fieldfile.flush()"Updated log fields: %s", self.fieldnames)
if to_log["_tick"] == 0:
self._logfile.write("# %s\n" % ",".join(self.fieldnames))
if verbose:
"LOG | %s",
", ".join(["{}: {}".format(k, to_log[k]) for k in sorted(to_log)]),
def close(self, successful=True):
self.metadata["successful"] = successful
for f in [self._logfile, self._fieldfile]:
def save_metadata(self):
_save_metadata(self.paths["meta"], self.metadata)
# This file taken from
# cd66d00914d56c8ba2f0615d9cdeefcb169a8d70/
# and modified.
# Copyright 2018 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to compute V-trace off-policy actor critic targets.
For details and theory see:
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
See for the full paper.
import collections
import torch
import torch.nn.functional as F
VTraceFromLogitsReturns = collections.namedtuple(
VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages")
def action_log_probs(policy_logits, actions):
    """Per-timestep log-probabilities of `actions` under softmax `policy_logits`.

    Args:
        policy_logits: float tensor of shape [T, B, num_actions].
        actions: int64 tensor of shape [T, B] with the chosen action indices.

    Returns:
        Float tensor of shape [T, B] holding log pi(a_t | s_t).
    """
    return -F.nll_loss(
        # Flatten [T, B, A] -> [T*B, A] so nll_loss sees a 2-D batch.
        F.log_softmax(torch.flatten(policy_logits, 0, 1), dim=-1),
        torch.flatten(actions, 0, 1),
        reduction="none",  # keep one value per (t, b) instead of averaging
    ).view_as(actions)
def from_logits(
"""V-trace for softmax policies."""
target_action_log_probs = action_log_probs(target_policy_logits, actions)
behavior_action_log_probs = action_log_probs(behavior_policy_logits, actions)
log_rhos = target_action_log_probs - behavior_action_log_probs
vtrace_returns = from_importance_weights(