Commit ca30c538 authored by Dipam Chakraborty

save replay buffer to checkpoint

parent 6d45b551
@@ -70,7 +70,9 @@ class CustomTorchPolicy(TorchPolicy):
replay_size = self.config['retune_replay_size'],
num_retunes = self.config['num_retunes'])
self.exp_replay = np.zeros((self.retune_selector.replay_size, *observation_space.shape), dtype=np.uint8)
self.vtarg_replay = np.zeros((self.retune_selector.replay_size), dtype=np.float32)
self.save_success = 0
self.target_timesteps = 8_000_000
self.buffer_time = 20 # TODO: Could try to do a median or mean time step check instead
self.max_time = 7200
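
For scale, a quick size check explains both the uint8 dtype here and the 3.5 GB guard added in __getstate__ further down. This is a sketch only; the 64x64x3 observation shape is the usual Procgen frame size and is assumed, not read from this diff.

import numpy as np

# Assumed observation shape (standard Procgen frames); replay sizes taken from the config change below.
obs_shape = (64, 64, 3)
bytes_per_frame = int(np.prod(obs_shape))      # uint8 -> one byte per element
print(200_000 * bytes_per_frame / 1e9)         # ~2.46 GB for the old replay size
print(300_000 * bytes_per_frame / 1e9)         # ~3.69 GB for the new one, above the
                                               # 3.5 GB uncompressed limit used when saving
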
@@ -98,7 +100,6 @@ class CustomTorchPolicy(TorchPolicy):
>>> ev.learn_on_batch(samples)
Reference: https://github.com/ray-project/ray/blob/master/rllib/policy/policy.py#L279-L316
"""
## Config data values
nbatch = self.nbatch
nbatch_train = self.mem_limited_batch_size
@@ -184,7 +185,7 @@ class CustomTorchPolicy(TorchPolicy):
lrnow, cliprange, vfcliprange, max_grad_norm, ent_coef, vf_coef, *slices)
## Distill with aux head
should_retune = self.retune_selector.update(obs, returns)
should_retune = self.retune_selector.update(obs, returns, self.exp_replay, self.vtarg_replay)
if should_retune:
self.aux_train()
self.update_batch_time()
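
Note that the replay buffers now belong to the policy and are handed into RetuneSelector.update, which fills them in place; NumPy arrays are passed by reference, so the selector keeps no copy of its own. A minimal illustration with a hypothetical fill helper:

import numpy as np

def fill(buf, batch, start):
    # Hypothetical helper: writing through the argument mutates the caller's array in place.
    buf[start:start + len(batch)] = batch

exp_replay = np.zeros((8, 2), dtype=np.uint8)
fill(exp_replay, np.ones((4, 2), dtype=np.uint8), 0)
assert exp_replay[:4].all() and not exp_replay[4:].any()
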
@@ -244,21 +245,21 @@ class CustomTorchPolicy(TorchPolicy):
# Store current value function and policy logits
for start in range(0, replay_size, nbatch_train):
end = start + nbatch_train
replay_batch = self.retune_selector.exp_replay[start:end]
replay_batch = self.exp_replay[start:end]
_, replay_pi[start:end] = self.model.vf_pi(replay_batch,
ret_numpy=True, no_grad=True, to_torch=True)
optim_count = 0
# Tune vf and pi heads to older predictions with augmented observations
inds = np.arange(len(self.retune_selector.exp_replay))
inds = np.arange(len(self.exp_replay))
for ep in range(retune_epochs):
np.random.shuffle(inds)
for start in range(0, replay_size, aux_nbatch_train):
end = start + aux_nbatch_train
mbinds = inds[start:end]
optim_count += 1
slices = [self.retune_selector.exp_replay[mbinds],
self.to_tensor(self.retune_selector.vtarg_replay[mbinds]),
slices = [self.exp_replay[mbinds],
self.to_tensor(self.vtarg_replay[mbinds]),
self.to_tensor(replay_pi[mbinds])]
self.tune_policy(*slices)
@@ -289,7 +290,7 @@ class CustomTorchPolicy(TorchPolicy):
self.aux_optimizer.zero_grad()
def best_reward_model_select(self, samples):
self.timesteps_total += self.nbatch
self.timesteps_total += len(samples['dones'])
## Best reward model selection
eprews = [info['episode']['r'] for info in samples['infos'] if 'episode' in info]
@@ -341,8 +342,7 @@ class CustomTorchPolicy(TorchPolicy):
"best_weights": self.best_weights,
"reward_deque": self.reward_deque,
"batch_end_time": self.batch_end_time,
"num_retunes": self.retune_selector.num_retunes,
# "retune_selector": self.retune_selector,
"retune_selector": self.retune_selector,
"gamma": self.gamma,
"maxrewep_lenbuf": self.maxrewep_lenbuf,
"lr": self.lr,
@@ -359,8 +359,7 @@ class CustomTorchPolicy(TorchPolicy):
self.best_weights = custom_state_vars["best_weights"]
self.reward_deque = custom_state_vars["reward_deque"]
self.batch_end_time = custom_state_vars["batch_end_time"]
self.retune_selector.set_num_retunes(custom_state_vars["num_retunes"])
# self.retune_selector = custom_state_vars["num_retunes"]
self.retune_selector = custom_state_vars["retune_selector"]
self.gamma = self.adaptive_discount_tuner.gamma = custom_state_vars["gamma"]
self.maxrewep_lenbuf = custom_state_vars["maxrewep_lenbuf"]
self.lr = custom_state_vars["lr"]
@@ -377,24 +376,24 @@ class CustomTorchPolicy(TorchPolicy):
k: v.cpu().detach().numpy()
for k, v in self.model.state_dict().items()
}
weights["optimizer_state"] = {
k: v
for k, v in self.optimizer.state_dict().items()
}
weights["aux_optimizer_state"] = {
k: v
for k, v in self.aux_optimizer.state_dict().items()
}
weights["custom_state_vars"] = self.get_custom_state_vars()
# weights["optimizer_state"] = {
# k: v
# for k, v in self.optimizer.state_dict().items()
# }
# weights["aux_optimizer_state"] = {
# k: v
# for k, v in self.aux_optimizer.state_dict().items()
# }
# weights["custom_state_vars"] = self.get_custom_state_vars()
return weights
@override(TorchPolicy)
def set_weights(self, weights):
self.set_model_weights(weights["current_weights"])
self.set_optimizer_state(weights["optimizer_state"])
self.set_aux_optimizer_state(weights["aux_optimizer_state"])
self.set_custom_state_vars(weights["custom_state_vars"])
# self.set_optimizer_state(weights["optimizer_state"])
# self.set_aux_optimizer_state(weights["aux_optimizer_state"])
# self.set_custom_state_vars(weights["custom_state_vars"])
def set_aux_optimizer_state(self, aux_optimizer_state):
aux_optimizer_state = convert_to_torch_tensor(aux_optimizer_state, device=self.device)
......
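
With this change, get_weights/set_weights in the policy file above carry only the model weights, presumably to keep the per-iteration weight sync to rollout workers light; optimizer and custom state move to the trainer's __getstate__/__setstate__ below. The matching set_optimizer_state is not shown in this diff; a sketch of what it presumably looks like, mirroring the set_aux_optimizer_state shown above:

def set_optimizer_state(self, optimizer_state):
    # Assumed counterpart to set_aux_optimizer_state: move saved tensors to the
    # policy's device, then restore them into the running optimizer.
    optimizer_state = convert_to_torch_tensor(optimizer_state, device=self.device)
    self.optimizer.load_state_dict(optimizer_state)
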
import logging
import os
import time
from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils import add_mixins
from ray.rllib.utils.annotations import override, DeveloperAPI
from zlib import compress, decompress
import numpy as np
from sys import getsizeof
logger = logging.getLogger(__name__)
@DeveloperAPI
def build_trainer(name,
default_policy,
default_config=None,
validate_config=None,
get_initial_state=None,
get_policy_class=None,
before_init=None,
make_workers=None,
make_policy_optimizer=None,
after_init=None,
before_train_step=None,
after_optimizer_step=None,
after_train_result=None,
collect_metrics_fn=None,
before_evaluate_fn=None,
mixins=None,
execution_plan=None):
"""Helper function for defining a custom trainer.
Functions will be run in this order to initialize the trainer:
1. Config setup: validate_config, get_initial_state, get_policy
2. Worker setup: before_init, make_workers, make_policy_optimizer
3. Post setup: after_init
Arguments:
name (str): name of the trainer (e.g., "PPO")
default_policy (cls): the default Policy class to use
default_config (dict): The default config dict of the algorithm,
otherwise uses the Trainer default config.
validate_config (func): optional callback that checks a given config
for correctness. It may mutate the config as needed.
get_initial_state (func): optional function that returns the initial
state dict given the trainer instance as an argument. The state
dict must be serializable so that it can be checkpointed, and will
be available as the `trainer.state` variable.
get_policy_class (func): optional callback that takes a config and
returns the policy class to override the default with
before_init (func): optional function to run at the start of trainer
init that takes the trainer instance as argument
make_workers (func): override the method that creates rollout workers.
This takes in (trainer, env_creator, policy, config) as args.
make_policy_optimizer (func): optional function that returns a
PolicyOptimizer instance given (WorkerSet, config)
after_init (func): optional function to run at the end of trainer init
that takes the trainer instance as argument
before_train_step (func): optional callback to run before each train()
call. It takes the trainer instance as an argument.
after_optimizer_step (func): optional callback to run after each
step() call to the policy optimizer. It takes the trainer instance
and the policy gradient fetches as arguments.
after_train_result (func): optional callback to run at the end of each
train() call. It takes the trainer instance and result dict as
arguments, and may mutate the result dict as needed.
collect_metrics_fn (func): override the method used to collect metrics.
It takes the trainer instance as argument.
before_evaluate_fn (func): callback to run before evaluation. This
takes the trainer instance as argument.
mixins (list): list of any class mixins for the returned trainer class.
These mixins will be applied in order and will have higher
precedence than the Trainer class
execution_plan (func): Experimental distributed execution
API. This overrides `make_policy_optimizer`.
Returns:
a Trainer instance that uses the specified args.
"""
original_kwargs = locals().copy()
base = add_mixins(Trainer, mixins)
class trainer_cls(base):
_name = name
_default_config = default_config or COMMON_CONFIG
_policy = default_policy
def __init__(self, config=None, env=None, logger_creator=None):
Trainer.__init__(self, config, env, logger_creator)
def _init(self, config, env_creator):
if validate_config:
validate_config(config)
if get_initial_state:
self.state = get_initial_state(self)
else:
self.state = {}
if get_policy_class is None:
self._policy = default_policy
else:
self._policy = get_policy_class(config)
if before_init:
before_init(self)
use_exec_api = (execution_plan
and (self.config["use_exec_api"]
or "RLLIB_EXEC_API" in os.environ))
# Creating all workers (excluding evaluation workers).
if make_workers and not use_exec_api:
self.workers = make_workers(self, env_creator, self._policy,
config)
else:
self.workers = self._make_workers(env_creator, self._policy,
config,
self.config["num_workers"])
self.train_exec_impl = None
self.optimizer = None
self.execution_plan = execution_plan
if use_exec_api:
logger.warning(
"The experimental distributed execution API is enabled "
"for this algorithm. Disable this by setting "
"'use_exec_api': False.")
self.train_exec_impl = execution_plan(self.workers, config)
elif make_policy_optimizer:
self.optimizer = make_policy_optimizer(self.workers, config)
else:
optimizer_config = dict(
config["optimizer"],
**{"train_batch_size": config["train_batch_size"]})
self.optimizer = SyncSamplesOptimizer(self.workers,
**optimizer_config)
if after_init:
after_init(self)
@override(Trainer)
def _train(self):
if self.train_exec_impl:
return self._train_exec_impl()
if before_train_step:
before_train_step(self)
prev_steps = self.optimizer.num_steps_sampled
start = time.time()
optimizer_steps_this_iter = 0
while True:
fetches = self.optimizer.step()
optimizer_steps_this_iter += 1
if after_optimizer_step:
after_optimizer_step(self, fetches)
if (time.time() - start >= self.config["min_iter_time_s"]
and self.optimizer.num_steps_sampled - prev_steps >=
self.config["timesteps_per_iteration"]):
break
if collect_metrics_fn:
res = collect_metrics_fn(self)
else:
res = self.collect_metrics()
res.update(
optimizer_steps_this_iter=optimizer_steps_this_iter,
timesteps_this_iter=self.optimizer.num_steps_sampled -
prev_steps,
info=res.get("info", {}))
if after_train_result:
after_train_result(self, res)
return res
def _train_exec_impl(self):
if before_train_step:
logger.debug("Ignoring before_train_step callback")
res = next(self.train_exec_impl)
if after_train_result:
logger.debug("Ignoring after_train_result callback")
return res
@override(Trainer)
def _before_evaluate(self):
if before_evaluate_fn:
before_evaluate_fn(self)
def __getstate__(self):
state = Trainer.__getstate__(self)
state["trainer_state"] = self.state.copy()
policy = Trainer.get_policy(self)
state["vtarg_replay"] = policy.vtarg_replay
state["custom_state_vars"] = policy.get_custom_state_vars()
state["optimizer_state"] = {k: v for k, v in policy.optimizer.state_dict().items()}
state["aux_optimizer_state"] = {k: v for k, v in policy.aux_optimizer.state_dict().items()}
if getsizeof(policy.exp_replay) < 3_500_000_000:
state["replay_buffer"] = policy.exp_replay
policy.save_success = 1
else:
replay_compressed = compress(policy.exp_replay, level=9)
if getsizeof(replay_compressed) < 3_500_000_000:
state["replay_buffer"] = replay_compressed
policy.save_success = 2
# print("Compression Success", getsizeof(replay_compressed))
else:
policy.save_success = -1
# print("Compression Failed", getsizeof(replay_compressed))
if self.train_exec_impl:
state["train_exec_impl"] = (
self.train_exec_impl.shared_metrics.get().save())
return state
def __setstate__(self, state):
Trainer.__setstate__(self, state)
policy = Trainer.get_policy(self)
self.state = state["trainer_state"].copy()
policy.set_optimizer_state(state["optimizer_state"])
policy.set_aux_optimizer_state(state["aux_optimizer_state"])
policy.set_custom_state_vars(state["custom_state_vars"])
replay_buffer = state.get("replay_buffer", None)
if replay_buffer is not None:
if isinstance(replay_buffer, np.ndarray):
policy.exp_replay = replay_buffer
else:
policy.exp_replay = np.frombuffer(decompress(replay_buffer), dtype=np.uint8).reshape(policy.exp_replay.shape).copy()  # decompress returns bytes; rebuild a writable ndarray
policy.vtarg_replay = state["vtarg_replay"]
if self.train_exec_impl:
self.train_exec_impl.shared_metrics.get().restore(
state["train_exec_impl"])
def with_updates(**overrides):
"""Build a copy of this trainer with the specified overrides.
Arguments:
overrides (dict): use this to override any of the arguments
originally passed to build_trainer() for this policy.
"""
return build_trainer(**dict(original_kwargs, **overrides))
trainer_cls.with_updates = staticmethod(with_updates)
trainer_cls.__name__ = name
trainer_cls.__qualname__ = name
return trainer_cls
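
For reference, the trainer module below is presumably wired up with a call along these lines; the trainer name and DEFAULT_CONFIG here are illustrative, not taken from the diff:

CustomTorchPPOTrainer = build_trainer(
    name="CustomTorchPPOTrainer",      # illustrative name
    default_config=DEFAULT_CONFIG,     # assumed with_common_config(...) dict
    default_policy=CustomTorchPolicy)
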
@@ -2,7 +2,8 @@ import logging
from ray.rllib.agents import with_common_config
from .custom_torch_ppg import CustomTorchPolicy
from ray.rllib.agents.trainer_template import build_trainer
# from ray.rllib.agents.trainer_template import build_trainer
from .custom_trainer_template import build_trainer
logger = logging.getLogger(__name__)
......
@@ -94,8 +94,7 @@ class RetuneSelector:
def __init__(self, nbatch, ob_space, ac_space, skips = 800_000, replay_size = 200_000, num_retunes = 5):
self.skips = skips + (-skips) % nbatch
self.replay_size = replay_size + (-replay_size) % nbatch
self.exp_replay = np.empty((self.replay_size, *ob_space.shape), dtype=np.uint8)
self.vtarg_replay = np.empty((self.replay_size), dtype=np.float32)
self.batch_size = nbatch
self.batches_in_replay = self.replay_size // nbatch
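
The x + (-x) % n idiom rounds x up to the nearest multiple of n, so skips and replay_size always hold a whole number of batches. A quick check:

def round_up(x, n):
    # x + (-x) % n adds just enough to reach the next multiple of n (0 if already aligned)
    return x + (-x) % n

assert round_up(10, 4) == 12
assert round_up(12, 4) == 12
assert round_up(200_000, 2048) == 200_704   # e.g. replay_size with a hypothetical nbatch of 2048
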
@@ -107,7 +106,7 @@ class RetuneSelector:
self.replay_index = 0
self.buffer_full = False
def update(self, obs_batch, vtarg_batch):
def update(self, obs_batch, vtarg_batch, exp_replay, vtarg_replay):
if self.num_retunes == 0:
return False
@@ -117,8 +116,8 @@ class RetuneSelector:
start = self.replay_index * self.batch_size
end = start + self.batch_size
self.exp_replay[start:end] = obs_batch
self.vtarg_replay[start:end] = vtarg_batch
exp_replay[start:end] = obs_batch
vtarg_replay[start:end] = vtarg_batch
self.replay_index = (self.replay_index + 1) % self.batches_in_replay
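
Since replay_size is an exact number of batches, the write index can simply cycle through fixed slots. A tiny trace of the indexing, with sizes chosen only for illustration:

batch_size, batches_in_replay = 3, 4
replay_index = 0
slots = []
for _ in range(6):                                   # six incoming batches
    start = replay_index * batch_size
    slots.append((start, start + batch_size))
    replay_index = (replay_index + 1) % batches_in_replay
print(slots)   # [(0, 3), (3, 6), (6, 9), (9, 12), (0, 3), (3, 6)] -- wraps after 4 batches
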
......
@@ -141,6 +141,8 @@ class CustomCallbacks(DefaultCallbacks):
result['return_min'] = trainer_policy.config['env_config']['return_min']
result['return_blind'] = trainer_policy.config['env_config']['return_blind']
result['return_max'] = trainer_policy.config['env_config']['return_max']
result['buffer_save_success'] = trainer_policy.save_success
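
For anyone reading training results, buffer_save_success reports the save_success codes set in the policy and in __getstate__ above; the mapping below just summarizes those branches and is not an API of its own:

SAVE_SUCCESS_CODES = {            # hypothetical lookup, mirroring the checkpoint branches above
    0: "no checkpoint attempted yet",
    1: "replay buffer saved uncompressed",
    2: "replay buffer saved zlib-compressed",
    -1: "buffer too large even after compression; not saved",
}
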
......
@@ -46,8 +46,8 @@ procgen-ppo:
no_done_at_end: False
# Custom switches
retune_skips: 300000
retune_replay_size: 200000
retune_skips: 200000
retune_replay_size: 300000
num_retunes: 20
retune_epochs: 6
standardize_rewards: True
......