Commit d5739af8 authored by robert_kirk

Add rllib agent and rollout script

make repository a package

This will enable the rllib stuff to run correctly, remove some weird
path hacking, and is generally a better development strategy

We batch rollout evaluations, like in the standard rollout script.

Currently needs a Docker container with at least 2.5 GB of RAM, otherwise
it crashes.
parent 626aaad9
@@ -3,3 +3,5 @@
submission filter=lfs diff=lfs merge=lfs -text
submission/* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
outputs filter=lfs diff=lfs merge=lfs -text
outputs/* filter=lfs diff=lfs merge=lfs -text
@@ -132,3 +132,4 @@ dmypy.json
nle_data/
test_batched_env.py
outputs
@@ -47,3 +47,4 @@ COPY --chown=1001:1001 requirements.txt ${HOME_DIR}/requirements.txt
RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=1001:1001 . ${HOME_DIR}
RUN pip install . --no-cache-dir
import numpy as np
from nethack_baselines.rllib.util import load_agent
from nethack_baselines.torchbeast.models import load_model
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from agents.batched_agent import BatchedAgent
# By default, choose from impala, ppo, a2c, dqn.
ALGO_CLASS_NAME = "impala"
CHECKPOINT_LOCATION = ""
# e.g.
# CHECKPOINT_LOCATION = "outputs/2021-06-08/15-04-39/ray_results/IMPALA_2021-06-08_15-04-43/IMPALA_RLlibNLE-v0_79638_00000_0_2021-06-08_15-04-43/checkpoint_000001/checkpoint-1"
class RLlibAgent(BatchedAgent):
"""
A BatchedAgent using a policy loaded from an RLlib checkpoint
"""
def __init__(self, num_envs, num_actions):
super().__init__(num_envs, num_actions)
if CHECKPOINT_LOCATION == "":
raise ValueError(
"You need to specify a CHECKPOINT_LOCATION for your model, otherwise submission won't work"
)
agent = load_agent(CHECKPOINT_LOCATION, ALGO_CLASS_NAME)
self.policy = agent.get_policy(DEFAULT_POLICY_ID)
self.preprocessor = agent.workers.local_worker().preprocessors[DEFAULT_POLICY_ID]
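        # Build a batched recurrent state: take one initial state per env and
        # stack them along the batch dimension, giving one array per state tensor.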
self.state = [
np.stack(init_states)
for init_states in (zip(*[self.policy.get_initial_state() for _ in range(self.num_envs)]))
]
self.previous_actions = [0] * self.num_envs
def batch_inputs(self, observations):
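        # Run each observation through the policy's RLlib preprocessor so the
        # batch matches the observation format the policy was trained on.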
return [self.preprocessor.transform(observation) for observation in observations]
def batched_step(self, observations, rewards, dones, infos):
"""
Perform a batched step on lists of environment outputs.
RLlib policies:
* Take the observation, previous action and reward, and LSTM state as input
* return outputs as a tuple of actions, state, and action information
"""
observations = self.batch_inputs(observations)
        # For envs whose episode just ended, reset the recurrent state and the
        # previous action first, so their next action is computed from a fresh state.
        for ep_done_idx_l in np.argwhere(dones):
            ep_done_idx = ep_done_idx_l[0]
            new_init_state = self.policy.get_initial_state()
            for i, inner_state in enumerate(self.state):
                inner_state[ep_done_idx] = new_init_state[i]
            self.previous_actions[ep_done_idx] = 0
        actions, state, _ = self.policy.compute_actions(
            observations,
            prev_action_batch=self.previous_actions,
            prev_reward_batch=rewards,
            state_batches=self.state,
        )
        self.state = state
        self.previous_actions = actions
return actions
@@ -15,4 +15,4 @@ def addtimelimitwrapper_fn():
"""
env = create_env()
env = TimeLimit(env, max_episode_steps=10_000_000)
return env
\ No newline at end of file
return env
Placeholder
# RLlib NetHackChallenge Benchmark
This is a baseline model for the NetHack Challenge based on
[RLlib](https://github.com/ray-project/ray#rllib-quick-start).
It comes with all the code you need to train, run and submit a model, and you
can choose from a variety of algorithms implemented in RLlib.
We provide default configuration and hyperparameters for 4 algorithms:
* IMPALA
* DQN
* PPO
* A2C
You're not restricted to using these algorithms - others could be added with
minimal effort in `train.py` and `util/loading.py`.
This implementation runs many simultaneous environments with dynamic batching.
## Installation
To get this running, you'll want to create a virtual environment (probably with
conda)
```bash
conda create -n nle-competition python=3.8
conda activate nle-competition
```
Then you'll want to install the requirements at the root of this repository,
both from the `requirements.txt` and the `setup.py`:
```bash
pip install -r requirements.txt
pip install -e .
```
This will install the repository as a python package in editable mode, meaning
any changes you make to the code will be recognised.
## Running The Baseline
Once installed, from the root of the repository run:
```bash
python nethack_baselines/rllib/train.py
```
This will train the default algorithm (IMPALA) with default hyperparameters.
You can choose a different algorithm as follows:
```bash
python nethack_baselines/rllib/train.py algo=ppo
```
You can also control other hyperparameters on the command line:
```bash
python nethack_baselines/rllib/train.py algo=ppo num_sgd_iter=5 total_steps=1000000
```
An important configuration is the number of CPUs and GPUs that are available,
which can be set with `num_gpus` and `num_cpus` - the higher these numbers
(especially CPUs), the faster training will be.
This configuration can also be changed by adjusting `config.yaml`.
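For example, on a machine with 1 GPU and 16 CPU cores (adjust to your hardware), you could run:
```bash
python nethack_baselines/rllib/train.py num_gpus=1 num_cpus=16
```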
The output of training will be in a directory `outputs` at the root of the
repository, with each run having a date and time-based folder.
## Making a submission
Once training is complete, model checkpoints will be available in
`outputs/<date>/<time>/ray_results/...`. At the end of training, the script will
print out the file path that needs to be used to specify the agent. This
file path should be used in `agents/rllib_batched_agent.py` as
`CHECKPOINT_LOCATION`, and you should also set `ALGO_CLASS_NAME` to the
algorithm you used (impala, ppo, etc.). **If you don't change these values the
submission won't use your new model**.
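For example, reusing the illustrative checkpoint path from
`agents/rllib_batched_agent.py` (your own path will differ):
```python
# agents/rllib_batched_agent.py (example values, substitute your own run)
ALGO_CLASS_NAME = "impala"
CHECKPOINT_LOCATION = "outputs/2021-06-08/15-04-39/ray_results/IMPALA_2021-06-08_15-04-43/IMPALA_RLlibNLE-v0_79638_00000_0_2021-06-08_15-04-43/checkpoint_000001/checkpoint-1"
```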
Next, make sure to **add the model checkpoints to the git repository**, for example:
```bash
git add -f outputs/2021-06-08/15-04-39/ray_results/
```
Finally, commit all your changes (including the added checkpoint and the updated `agents/rllib_batched_agent.py`),
tag the submission and push the branch and tag to AIcrowd's GitLab.
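A minimal sketch of those last steps (the remote name and tag name here are placeholders; follow the challenge's submission instructions for the exact convention):
```bash
git commit -am "Add trained RLlib checkpoint"
git tag submission-rllib-v0.1
git push aicrowd master
git push aicrowd submission-rllib-v0.1
```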
## Repo Structure
```
nethack_baselines/rllib
├── models.py # <- Models HERE
├── util/
├── config.yaml # <- Flags HERE
├── train.py # <- Training Loop HERE
└── envs.py # <- Training Environment HERE
```
The structure is simple, compartmentalising the environment setup, training
loop and models into different files. You can tweak any of these separately,
and add parameters to the flags (which are passed around).
## About the Model
The model we provide (`BaseNet`) is simple, and lives entirely in `models.py`.
* It encodes the dungeon into a fixed-size representation (`GlyphEncoder`)
* It encodes the topline message into a fixed-size representation (`MessageEncoder`)
* It encodes the bottom-line statistics (e.g. armour class, health) into a fixed-size representation (`BLStatsEncoder`)
* It concatenates all these outputs into a single fixed-size vector and runs this through a fully connected layer
* If using an LSTM (which is controlled by RLlib), this output is then passed through an LSTM,
and then through fully connected layers for the various policy outputs (such as the value function and action distribution)
As you can see, there is a lot of data to play with in this game, and plenty to try, both in the modelling and in the learning algorithms used.
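As a rough illustration of the data flow described above (a minimal sketch only: the encoder names mirror `models.py`, but the constructor arguments, shapes and helper attributes here are simplifying assumptions, not the actual implementation):
```python
import torch
import torch.nn as nn


class BaseNetSketch(nn.Module):
    """Simplified sketch of the encode-concatenate-project pattern described above."""

    def __init__(self, glyph_encoder, message_encoder, blstats_encoder, hidden_dim=256):
        super().__init__()
        self.glyph_encoder = glyph_encoder      # dungeon glyphs -> fixed-size vector
        self.message_encoder = message_encoder  # top-line message -> fixed-size vector
        self.blstats_encoder = blstats_encoder  # bottom-line stats -> fixed-size vector
        concat_dim = glyph_encoder.out_dim + message_encoder.out_dim + blstats_encoder.out_dim
        self.fc = nn.Sequential(nn.Linear(concat_dim, hidden_dim), nn.ReLU())

    def forward(self, obs):
        parts = [
            self.glyph_encoder(obs["glyphs"], obs["blstats"]),  # crop is centered on the player
            self.message_encoder(obs["message"]),
            self.blstats_encoder(obs["blstats"]),
        ]
        core = self.fc(torch.cat(parts, dim=-1))
        # When use_lstm is enabled in config.yaml, RLlib wraps this output in an
        # LSTM and adds the policy and value heads on top of it.
        return core
```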
## Improvement Ideas
*Here are some ideas we haven't tried yet, but might be easy places to start. Happy tinkering!*
### Model Improvements (`models.py`)
* The model is currently not using the terminal observations (`tty_chars`, `tty_colors`, `tty_cursor`), so it has no idea about menus - could we make use of these somehow?
* The bottom-line stats are very informative, but very simply encoded in `BLStatsEncoder` - is there a better way to do this?
* The `GlyphEncoder` builds an embedding for the glyphs, and then takes a crop of these centered around the player icon coordinates (`@`). Should the crop reuse the same embedding matrix?
* The current model constrains the vast action space to a smaller subset of actions. Is it too constrained? Or not constrained enough?
### Environment Improvements (`envs.py`)
* Opening menus (such as when spellcasting) does not advance the in-game timer. However, models can also get stuck
in menus, as they have to learn which buttons to press to close the menu. Can changing the penalty for not advancing
the in-game timer improve the result?
* The NetHackChallenge assesses the score on random character assignments. Might it be easier to learn on just a few of these at the beginning of training?
### Algorithm/Optimisation Improvements (`train.py`)
* Which algorithm from RLlib works best? Which hyperparameters are the ones we expect to perform well?
## How to add an algorithm
If you wanted to use an algorithm from RLlib which we don't provide a default
configuration for, here's some pointers to what's necessary:
* Add the algorithm to `NAME_TO_TRAINER` in
`nethack_baselines/rllib/util/loading.py`, so that it can be loaded correctly.
* Add a configuration key to `config.yaml` with the algorithm's name (e.g.
`sac`), and under that key specify the configuration that's specific to that
algorithm (e.g. `initial_alpha: 0.5`), as in the sketch below.
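A rough sketch of the loading change, using SAC as a hypothetical example (copy the exact shape of the existing entries in `util/loading.py`; this assumes each entry pairs a module carrying `DEFAULT_CONFIG` with its Trainer class, matching how `train.py` unpacks them):
```python
# nethack_baselines/rllib/util/loading.py (illustrative)
from ray.rllib.agents import sac

NAME_TO_TRAINER = {
    # ... existing entries for impala, ppo, a2c, dqn ...
    "sac": (sac, sac.SACTrainer),
}
```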
Once that's done, you should be able to use the new algorithm by running
```bash
python nethack_baselines/rllib/train.py algo=sac
```
from .train import train
from .models import RLLibNLENetwork
from .envs import RLLibNLEEnv
__all__ = ["RLLibNLEEnv", "RLLibNLENetwork", "train"]
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
defaults:
- hydra/job_logging: colorlog
- hydra/hydra_logging: colorlog
name: null # can use this to have multiple runs with same params, eg name=1,2,3,4,5
seed: 123
checkpoint_freq: 0
## WandB settings
wandb: false # enable wandb logging
project: mtp_rllib # specifies the project name to log to
entity: nethack # the team to log to
group: default # defines a group name for the experiment
tags: 'rllib' # what tags for this run, comma-separated
# Env settings
fn_penalty_step: constant
penalty_time: 0.0
penalty_step: -0.001
reward_lose: 0
reward_win: 1
character: mon-hum-neu-mal
save_tty: False
## typical characters we use
# mon-hum-neu-mal
# val-dwa-law-fem
# wiz-elf-cha-mal
# tou-hum-neu-fem
env: challenge
## NLE tasks:
# staircase, score, pet, oracle, gold, eat, scout,
# Training settings.
num_gpus: 1
num_cpus: 8
num_actors: 256 # should be at least batch_size
total_steps: 1e9 # 1e9 used in paper
train_batch_size: 32 # 32 is standard, can use 128 with small model variants
unroll_length: 80 # 80 is standard
# Model settings.
model: baseline # random, baseline, rnd, ride
use_lstm: true
hidden_dim: 256 # use at least 128, 256 is stronger
embedding_dim: 64 # use at least 32, 64 is stronger
glyph_type: all_cat # full, group_id, color_char, all, all_cat* (all_cat best, full fastest)
equalize_input_dim: false # project inputs to same dim (*false unless doing dynamics)
equalize_factor: 2 # multiplies hdim by this when equalize is enabled (2 > 1)
layers: 5 # number of cnn layers for crop/glyph model
crop_model: cnn
crop_dim: 9 # size of crop
use_index_select: true # use index select instead of normal embedding lookup
msg:
model: lt_cnn # character model? none, lt_cnn*, cnn, gru, lstm
hidden_dim: 64 # recommend 256
embedding_dim: 32 # recommend 64
# Experimental settings.
state_counter: none # none, coordinates
# Generic Loss settings.
lr: 0.0002
gamma: 0.999 # probably a bit better at 0.999, esp with intrinsic reward
reward_clipping: none # use none with normalize_reward, else use tim
normalize_reward: true # true is reliable across tasks, but false & tim-clip is best on score
# Optimizer settings.
decay: 0.99 # 0.99 vs 0.9 vs 0.5 seems to make no difference
momentum: 0 # keep at 0
epsilon: 0.000001 # do not use 0.01, 1e-6 seems same as 1e-8
grad_clip: 40
# Algorithm-specific settings. These can override settings above (i.e. learning rate)
algo: impala # must match one of the keys below
impala:
entropy_coeff: 0.001 # 0.001 is better than 0.0001
vf_loss_coeff: 0.5
dqn:
lr: 0.000001
double_q: True
dueling: True
noisy: False
prioritized_replay: True
n_step: 5
buffer_size: 100000
target_network_update_freq: 50000
prioritized_replay_beta_annealing_timesteps: 1000000
learning_starts: 50000
model:
use_lstm: False
exploration_config:
epsilon_timesteps: 1000000
ppo:
lr: 0.00005
rollout_fragment_length: 128
train_batch_size: 128
sgd_minibatch_size: 32
num_sgd_iter: 2
entropy_coeff: 0.0001
vf_loss_coeff: 0.5
model:
vf_share_layers: True
a2c:
lr: 0.00005
rollout_fragment_length: 128
train_batch_size: 128
microbatch_size: null
entropy_coeff: 0.0001
vf_loss_coeff: 0.5
sample_async: False
import threading
from collections import OrderedDict
from typing import Tuple, Union
import gym
import nle # noqa: F401
import numpy as np
from nle.env import tasks
ENVS = dict(
# NLE tasks
staircase=tasks.NetHackStaircase,
score=tasks.NetHackScore,
pet=tasks.NetHackStaircasePet,
oracle=tasks.NetHackOracle,
gold=tasks.NetHackGold,
eat=tasks.NetHackEat,
scout=tasks.NetHackScout,
challenge=tasks.NetHackChallenge
)
def create_env(flags, env_id=0, lock=threading.Lock()):
# commenting out these options for now because they use too much disk space
# archivefile = "nethack.%i.%%(pid)i.%%(time)s.zip" % env_id
# if flags.single_ttyrec and env_id != 0:
# archivefile = None
# logdir = os.path.join(flags.savedir, "archives")
with lock:
env_class = ENVS[flags.env]
kwargs = dict(
savedir=flags.savedir,
archivefile=None,
character=flags.character,
max_episode_steps=flags.max_num_steps,
observation_keys=(
"glyphs",
"chars",
"colors",
"specials",
"blstats",
"message",
"tty_chars",
"tty_colors",
"tty_cursor",
"inv_glyphs",
"inv_strs",
"inv_letters",
"inv_oclasses",
# "screen_descriptions",
),
penalty_step=flags.penalty_step,
penalty_time=flags.penalty_time,
penalty_mode=flags.fn_penalty_step,
)
if flags.env in ("staircase", "pet", "oracle"):
kwargs.update(reward_win=flags.reward_win, reward_lose=flags.reward_lose)
elif env_id == 0: # print warning once
# Removed because it's too noisy:
# print("Ignoring flags.reward_win and flags.reward_lose")
pass
if flags.state_counter != "none":
kwargs.update(state_counter=flags.state_counter)
env = env_class(**kwargs)
if flags.seedspath is not None and len(flags.seedspath) > 0:
raise NotImplementedError("seedspath > 0 not implemented yet.")
return env
class RLLibNLEEnv(gym.Env):
def __init__(self, env_config: dict) -> None:
self.gym_env = create_env(env_config["flags"])
# We sort the observation keys so we can create the OrderedDict output
# in a consistent order
self._observation_keys = sorted(self.gym_env.observation_space.spaces.keys())
@property
def action_space(self) -> gym.Space:
return self.gym_env.action_space
@property
def observation_space(self) -> gym.Space:
return self.gym_env.observation_space
def reset(self) -> dict:
return self._process_obs(self.gym_env.reset())
def _process_obs(self, obs: dict) -> dict:
return OrderedDict({key: obs[key] for key in self._observation_keys})
def step(
self, action: Union[int, np.int64]
) -> Tuple[dict, Union[np.number, int], Union[np.bool_, bool], dict]:
obs, reward, done, info = self.gym_env.step(action)
return self._process_obs(obs), reward, done, info
def render(self):
return self.gym_env.render()
def close(self):
return self.gym_env.close()
import os
from collections.abc import Iterable
from numbers import Number
import hydra
import nethack_baselines.rllib.models # noqa: F401
import numpy as np
import ray
import ray.tune.integration.wandb
from nethack_baselines.rllib.envs import RLLibNLEEnv
from nethack_baselines.rllib.util.loading import NAME_TO_TRAINER
from omegaconf import DictConfig, OmegaConf
from ray import tune
from ray.rllib.models.catalog import MODEL_DEFAULTS
from ray.tune.integration.wandb import (_VALID_ITERABLE_TYPES, _VALID_TYPES,
WandbLoggerCallback)
from ray.tune.registry import register_env
from ray.tune.utils import merge_dicts
def get_full_config(cfg: DictConfig) -> DictConfig:
env_flags = OmegaConf.to_container(cfg)
max_num_steps = 1e6
if cfg.env in ("staircase", "pet"):
max_num_steps = 1000
env_flags["max_num_steps"] = int(max_num_steps)
env_flags["seedspath"] = ""
return OmegaConf.create(env_flags)
@hydra.main(config_name="config")
def train(cfg: DictConfig) -> None:
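    # One extra CPU is reserved for the driver/trainer process, on top of the rollout workers.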
ray.init(num_gpus=cfg.num_gpus, num_cpus=cfg.num_cpus + 1)
cfg = get_full_config(cfg)
register_env("RLlibNLE-v0", RLLibNLEEnv)
try:
algo, trainer = NAME_TO_TRAINER[cfg.algo]
except KeyError:
        raise ValueError("The algorithm you specified isn't currently supported: %s" % cfg.algo)
config = algo.DEFAULT_CONFIG.copy()
args_config = OmegaConf.to_container(cfg)
# Algo-specific config. Requires hydra config keys to match rllib exactly
algo_config = args_config.pop(cfg.algo)
# Remove unnecessary config keys
    for algo_name in NAME_TO_TRAINER.keys():
        if algo_name != cfg.algo:
            args_config.pop(algo_name, None)
# Merge config from hydra (will have some rogue keys but that's ok)
config = merge_dicts(config, args_config)
# Update configuration with parsed arguments in specific ways
config = merge_dicts(
config,
{
"framework": "torch",
"num_gpus": cfg.num_gpus,
"seed": cfg.seed,
"env": "RLlibNLE-v0",
"env_config": {
"flags": cfg,
"name": cfg.env,
},
"train_batch_size": cfg.train_batch_size,
"model": merge_dicts(
MODEL_DEFAULTS,
{
"custom_model": "rllib_nle_model",
"custom_model_config": {"flags": cfg, "algo": cfg.algo},
"use_lstm": cfg.use_lstm,
"lstm_use_prev_reward": True,
"lstm_use_prev_action": True,
"lstm_cell_size": cfg.hidden_dim,
},
),
"num_workers": cfg.num_cpus,
"num_envs_per_worker": int(cfg.num_actors / cfg.num_cpus),
"evaluation_interval": 100,
"evaluation_num_episodes": 50,
"evaluation_config": {"explore": False},
"rollout_fragment_length": cfg.unroll_length,
},
)
# Merge algo-specific config at top level
config = merge_dicts(config, algo_config)
# Ensure we can use the config we've specified above
trainer_class = trainer.with_updates(default_config=config)
callbacks = []
if cfg.wandb:
callbacks.append(
WandbLoggerCallback(
project=cfg.project,
api_key_file="~/.wandb_api_key",
entity=cfg.entity,
group=cfg.group,
tags=cfg.tags.split(","),
)
)
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Only log to wandb
# Hacky monkey-patching to allow for OmegaConf config
def _is_allowed_type(obj):
"""Return True if type is allowed for logging to wandb"""
if isinstance(obj, DictConfig):
return True
if isinstance(obj, np.ndarray) and obj.size == 1:
return isinstance(obj.item(), Number)
if isinstance(obj, Iterable) and len(obj) > 0:
return isinstance(obj[0], _VALID_ITERABLE_TYPES)