Dipam Chakraborty / neurips-2020-procgen-competition · Commits

Commit 1fb3e4c4
authored Oct 30, 2020 by Dipam Chakraborty

    ppg l2 3e-5

parent c4a8cd1a
Changes 11
algorithms/custom_ppg/custom_torch_ppg.py
@@ -58,11 +58,11 @@ class CustomTorchPolicy(TorchPolicy):
         aux_optim_params = list(network_params - value_params)
         ppo_optim_params = list(network_params - aux_params - value_params)
         if not self.config['single_optimizer']:
-            self.optimizer = torch.optim.Adam(ppo_optim_params, lr=self.config['lr'])
+            self.optimizer = torch.optim.Adam(ppo_optim_params, lr=self.config['lr'], weight_decay=self.config['l2_reg'])
         else:
-            self.optimizer = torch.optim.Adam(network_params, lr=self.config['lr'])
-        self.aux_optimizer = torch.optim.Adam(aux_optim_params, lr=self.config['aux_lr'])
-        self.value_optimizer = torch.optim.Adam(value_params, lr=self.config['value_lr'])
+            self.optimizer = torch.optim.Adam(network_params, lr=self.config['lr'], weight_decay=self.config['l2_reg'])
+        self.aux_optimizer = torch.optim.Adam(aux_optim_params, lr=self.config['aux_lr'], weight_decay=self.config['l2_reg'])
+        self.value_optimizer = torch.optim.Adam(value_params, lr=self.config['value_lr'], weight_decay=self.config['l2_reg'])
         self.max_reward = self.config['env_config']['return_max']
         self.rewnorm = RewardNormalizer(cliprew=self.max_reward)  ## TODO: Might need to go to custom state
         self.reward_deque = deque(maxlen=100)
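The hunk above routes a single new config value, 'l2_reg', into each Adam optimizer as PyTorch's weight_decay, i.e. an L2 penalty added to the gradients at every step. A minimal sketch of the effect, using a stand-in linear layer rather than the repo's networks; the 3e-5 value is only the one suggested by the commit title ("ppg l2 3e-5"), while the default added below is 0.0.

import torch

# Stand-in module; the real parameters come from the PPG policy/value networks.
model = torch.nn.Linear(4, 2)
config = {"lr": 5e-5, "l2_reg": 3e-5}  # 3e-5 as in the commit title
optimizer = torch.optim.Adam(model.parameters(),
                             lr=config["lr"],
                             weight_decay=config["l2_reg"])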
algorithms/custom_ppg/ppg.py
@@ -99,6 +99,7 @@ DEFAULT_CONFIG = with_common_config({
"max_time"
:
7200
,
"pi_phase_mixed_precision"
:
False
,
"aux_num_accumulates"
:
1
,
"l2_reg"
:
0.0
,
})
# __sphinx_doc_end__
# yapf: enable
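Because the new default is 0.0, the regularizer stays off unless an experiment config overrides it. A hedged sketch of such an override; the dict-merge mirrors how user settings are layered over the defaults, and the value is only the one the commit title suggests.

defaults = {"l2_reg": 0.0}      # as added above
overrides = {"l2_reg": 3e-5}    # value from the commit title "ppg l2 3e-5"
run_config = {**defaults, **overrides}
assert run_config["l2_reg"] == 3e-5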
algorithms/ppg_experimental/__init__.py
0 → 100644
#from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
#from ppo_tf_policy import PPOTFPolicy
#__all__ = [
# "DEFAULT_CONFIG",
# "PPOTFPolicy",
# "PPOTrainer",
#]
algorithms/ppg_experimental/custom_torch_ppg.py
0 → 100644
This diff is collapsed.
algorithms/ppg_experimental/custom_trainer_template.py
0 → 100644
import logging
import os
import time

from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils import add_mixins
from ray.rllib.utils.annotations import override, DeveloperAPI
import numpy as np
from zlib import compress, decompress
from sys import getsizeof

logger = logging.getLogger(__name__)
@DeveloperAPI
def build_trainer(name,
                  default_policy,
                  default_config=None,
                  validate_config=None,
                  get_initial_state=None,
                  get_policy_class=None,
                  before_init=None,
                  make_workers=None,
                  make_policy_optimizer=None,
                  after_init=None,
                  before_train_step=None,
                  after_optimizer_step=None,
                  after_train_result=None,
                  collect_metrics_fn=None,
                  before_evaluate_fn=None,
                  mixins=None,
                  execution_plan=None):
    """Helper function for defining a custom trainer.

    Functions will be run in this order to initialize the trainer:
        1. Config setup: validate_config, get_initial_state, get_policy
        2. Worker setup: before_init, make_workers, make_policy_optimizer
        3. Post setup: after_init

    Arguments:
        name (str): name of the trainer (e.g., "PPO")
        default_policy (cls): the default Policy class to use
        default_config (dict): The default config dict of the algorithm,
            otherwise uses the Trainer default config.
        validate_config (func): optional callback that checks a given config
            for correctness. It may mutate the config as needed.
        get_initial_state (func): optional function that returns the initial
            state dict given the trainer instance as an argument. The state
            dict must be serializable so that it can be checkpointed, and will
            be available as the `trainer.state` variable.
        get_policy_class (func): optional callback that takes a config and
            returns the policy class to override the default with
        before_init (func): optional function to run at the start of trainer
            init that takes the trainer instance as argument
        make_workers (func): override the method that creates rollout workers.
            This takes in (trainer, env_creator, policy, config) as args.
        make_policy_optimizer (func): optional function that returns a
            PolicyOptimizer instance given (WorkerSet, config)
        after_init (func): optional function to run at the end of trainer init
            that takes the trainer instance as argument
        before_train_step (func): optional callback to run before each train()
            call. It takes the trainer instance as an argument.
        after_optimizer_step (func): optional callback to run after each
            step() call to the policy optimizer. It takes the trainer instance
            and the policy gradient fetches as arguments.
        after_train_result (func): optional callback to run at the end of each
            train() call. It takes the trainer instance and result dict as
            arguments, and may mutate the result dict as needed.
        collect_metrics_fn (func): override the method used to collect metrics.
            It takes the trainer instance as argument.
        before_evaluate_fn (func): callback to run before evaluation. This
            takes the trainer instance as argument.
        mixins (list): list of any class mixins for the returned trainer class.
            These mixins will be applied in order and will have higher
            precedence than the Trainer class.
        execution_plan (func): Experimental distributed execution
            API. This overrides `make_policy_optimizer`.

    Returns:
        a Trainer instance that uses the specified args.
    """
    original_kwargs = locals().copy()
    base = add_mixins(Trainer, mixins)

    class trainer_cls(base):
        _name = name
        _default_config = default_config or COMMON_CONFIG
        _policy = default_policy

        def __init__(self, config=None, env=None, logger_creator=None):
            Trainer.__init__(self, config, env, logger_creator)

        def _init(self, config, env_creator):
            if validate_config:
                validate_config(config)

            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                self._policy = default_policy
            else:
                self._policy = get_policy_class(config)
            if before_init:
                before_init(self)
            use_exec_api = (execution_plan
                            and (self.config["use_exec_api"]
                                 or "RLLIB_EXEC_API" in os.environ))

            # Creating all workers (excluding evaluation workers).
            if make_workers and not use_exec_api:
                self.workers = make_workers(self, env_creator, self._policy,
                                            config)
            else:
                self.workers = self._make_workers(env_creator, self._policy,
                                                  config,
                                                  self.config["num_workers"])
            self.train_exec_impl = None
            self.optimizer = None
            self.execution_plan = execution_plan

            if use_exec_api:
                logger.warning(
                    "The experimental distributed execution API is enabled "
                    "for this algorithm. Disable this by setting "
                    "'use_exec_api': False.")
                self.train_exec_impl = execution_plan(self.workers, config)
            elif make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)

            policy = Trainer.get_policy(self)
            policy.init_training()

        @override(Trainer)
        def _train(self):
            if self.train_exec_impl:
                return self._train_exec_impl()

            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled

            start = time.time()
            optimizer_steps_this_iter = 0
            while True:
                fetches = self.optimizer.step()
                optimizer_steps_this_iter += 1
                if after_optimizer_step:
                    after_optimizer_step(self, fetches)
                if (time.time() - start >= self.config["min_iter_time_s"]
                        and self.optimizer.num_steps_sampled - prev_steps >=
                        self.config["timesteps_per_iteration"]):
                    break

            if collect_metrics_fn:
                res = collect_metrics_fn(self)
            else:
                res = self.collect_metrics()
            res.update(
                optimizer_steps_this_iter=optimizer_steps_this_iter,
                timesteps_this_iter=self.optimizer.num_steps_sampled -
                prev_steps,
                info=res.get("info", {}))

            if after_train_result:
                after_train_result(self, res)
            return res
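        # Note: the while-loop in _train() above keeps calling optimizer.step()
        # until both conditions hold: at least `min_iter_time_s` seconds have
        # elapsed and at least `timesteps_per_iteration` new timesteps have
        # been sampled since the iteration started.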
        def _train_exec_impl(self):
            if before_train_step:
                logger.debug("Ignoring before_train_step callback")

            res = next(self.train_exec_impl)

            if after_train_result:
                logger.debug("Ignoring after_train_result callback")
            return res

        @override(Trainer)
        def _before_evaluate(self):
            if before_evaluate_fn:
                before_evaluate_fn(self)

        def __getstate__(self):
            state = Trainer.__getstate__(self)
            state["trainer_state"] = self.state.copy()
            policy = Trainer.get_policy(self)
            try:
                state["custom_state_vars"] = policy.get_custom_state_vars()
                state["optimizer_state"] = {
                    k: v for k, v in policy.optimizer.state_dict().items()}
                state["aux_optimizer_state"] = {
                    k: v for k, v in policy.aux_optimizer.state_dict().items()}
                state["value_optimizer_state"] = {
                    k: v for k, v in policy.value_optimizer.state_dict().items()}
                state["amp_scaler_state"] = {
                    k: v for k, v in policy.amp_scaler.state_dict().items()}
            except:
                print("################# WARNING: SAVING STATE VARS AND OPTIMIZER FAILED ################")
            if self.train_exec_impl:
                state["train_exec_impl"] = (
                    self.train_exec_impl.shared_metrics.get().save())
            return state
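        # Note: on top of the base Trainer checkpoint, __getstate__() above also
        # saves the policy's custom state variables, the three optimizer
        # state_dicts (policy, aux, value) and the AMP grad-scaler state, so a
        # restored run can resume its PPG cycle; failures only print a warning.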
        def __setstate__(self, state):
            Trainer.__setstate__(self, state)
            policy = Trainer.get_policy(self)
            self.state = state["trainer_state"].copy()
            try:
                policy.set_optimizer_state(state["optimizer_state"],
                                           state["aux_optimizer_state"],
                                           state["value_optimizer_state"],
                                           state["amp_scaler_state"])
                policy.set_custom_state_vars(state["custom_state_vars"])
            except:
                print("################# WARNING: LOADING STATE VARS AND OPTIMIZER FAILED ################")
            if self.train_exec_impl:
                self.train_exec_impl.shared_metrics.get().restore(
                    state["train_exec_impl"])

    def with_updates(**overrides):
        """Build a copy of this trainer with the specified overrides.

        Arguments:
            overrides (dict): use this to override any of the arguments
                originally passed to build_trainer() for this policy.
        """
        return build_trainer(**dict(original_kwargs, **overrides))

    trainer_cls.with_updates = staticmethod(with_updates)
    trainer_cls.__name__ = name
    trainer_cls.__qualname__ = name
    return trainer_cls
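For reference, a hedged sketch of how a trainer class produced by build_trainer is typically used; it assumes the repo root is on PYTHONPATH, CustomTorchPolicy comes from the neighbouring custom_torch_ppg.py, and the env id in the commented lines is a placeholder rather than something this file defines.

from algorithms.ppg_experimental.custom_torch_ppg import CustomTorchPolicy
from algorithms.ppg_experimental.custom_trainer_template import build_trainer

# Build the trainer class, then derive a variant with one argument overridden
# via the with_updates static helper defined above.
SketchTrainer = build_trainer(name="PPGSketch", default_policy=CustomTorchPolicy)
SketchTrainerNoMixins = SketchTrainer.with_updates(mixins=[])

# trainer = SketchTrainer(config={"num_workers": 0}, env="procgen_env_wrapper")  # placeholder env id
# result = trainer.train()  # one pass through the _train() loop defined above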
algorithms/ppg_experimental/ppg.py
0 → 100644
import logging

from ray.rllib.agents import with_common_config
from .custom_torch_ppg import CustomTorchPolicy
# from ray.rllib.agents.trainer_template import build_trainer
from .custom_trainer_template import build_trainer

logger = logging.getLogger(__name__)
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE(lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "num_sgd_iter": 30,
    # Stepsize of SGD.
    "lr": 5e-5,
    # Learning rate schedule.
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers: True.
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    "_fake_gpus": False,
    # Use PyTorch as framework?
    "use_pytorch": True,

    # Custom switches
    "skips": 0,
    "n_pi": 32,
    "num_retunes": 100,
    "retune_epochs": 6,
    "standardize_rewards": True,
    "scale_reward": 1.0,
    "accumulate_train_batches": 1,
    "adaptive_gamma": False,
    "final_lr": 2e-4,
    "lr_schedule": 'None',
    "final_entropy_coeff": 0.002,
    "entropy_schedule": False,
    "max_minibatch_size": 2048,
    "updates_per_batch": 8,
    "aux_mbsize": 4,
    "augment_buffer": False,
    "reset_returns": True,
    "flattened_buffer": False,
    "augment_randint_num": 6,
    "aux_lr": 5e-4,
    "value_lr": 1e-3,
    "same_lr_everywhere": False,
    "aux_phase_mixed_precision": False,
    "single_optimizer": False,
    "max_time": 7200,
    "pi_phase_mixed_precision": False,
    "aux_num_accumulates": 1,
})
# __sphinx_doc_end__
# yapf: enable
PPGTrainer = build_trainer(
    name="PPGExperimentalAgent",
    default_config=DEFAULT_CONFIG,
    default_policy=CustomTorchPolicy)
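A minimal sketch of how PPGTrainer would typically be launched with Ray Tune; the env id, env_config values and stopping criterion are illustrative assumptions, not part of this commit (the registered Procgen env and the experiment YAMLs live elsewhere in the repository, and the policy reads env_config['return_max'] as seen in the first diff above).

import ray
from ray import tune

from algorithms.ppg_experimental.ppg import PPGTrainer  # assumes repo root on PYTHONPATH

ray.init()
tune.run(
    PPGTrainer,
    config={
        "env": "procgen_env_wrapper",        # placeholder env id
        "env_config": {"return_max": 10.0},  # placeholder; used by the policy for reward clipping
        "num_workers": 4,
        "num_gpus": 1,
    },
    stop={"timesteps_total": 8_000_000},     # illustrative budget
)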
algorithms/ppg_experimental/utils.py
0 → 100644
import numpy as np
from ray.rllib.utils import try_import_torch
from collections import deque
from skimage.util import view_as_windows
torch, nn = try_import_torch()
import torch.distributions as td
from functools import partial
import itertools

def calculate_gae_buffer(values_buffer, dones_buffer, rewards_buffer, last_values, gamma, lam):
    new_returns = np.empty_like(values_buffer)
    lastgaelam = 0
    nsegs, nsteps = values_buffer.shape[:2]
    for s in reversed(range(nsegs)):
        mb_values = values_buffer[s]
        mb_rewards = rewards_buffer[s]
        mb_dones = dones_buffer[s]
        mb_returns, _ = calculate_gae(mb_values, mb_dones, mb_rewards, last_values, gamma, lam)
        new_returns[s] = mb_returns
        last_values = mb_values[0]
    return new_returns

def calculate_gae(mb_values, mb_dones, mb_rewards, last_values, gamma, lam):
    lastgaelam = 0
    nsteps = mb_values.shape[0]
    mb_advs = np.empty_like(mb_values)
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextvalues = last_values
        else:
            nextvalues = mb_values[t + 1]
        nextnonterminal = 1.0 - mb_dones[t]
        delta = mb_rewards[t] + gamma * nextvalues * nextnonterminal - mb_values[t]
        mb_advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    mb_returns = mb_advs + mb_values
    return mb_returns, mb_advs
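# Note on calculate_gae(): this is the standard GAE(lambda) recursion,
#   delta_t = r_t + gamma * (1 - done_t) * V_{t+1} - V_t
#   A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
# and the returned mb_returns = A + V are used as value-function targets.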

def _make_categorical(x, ncat, shape):
    x = x.reshape((x.shape[0], shape, ncat))
    return td.Categorical(logits=x)

def dist_build(ac_space):
    return partial(_make_categorical, shape=1, ncat=ac_space.n)

def neglogp_actions(pi_logits, actions):
    return nn.functional.cross_entropy(pi_logits, actions, reduction='none')

def sample_actions(logits, device):
    u = torch.rand(logits.shape, dtype=logits.dtype).to(device)
    return torch.argmax(logits - torch.log(-torch.log(u)), dim=1)
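# Note on sample_actions(): this draws from the categorical distribution given
# by `logits` via the Gumbel-max trick, argmax(logits + g) with Gumbel noise
# g = -log(-log(u)), u ~ Uniform(0, 1).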

def pi_entropy(logits):
    a0 = logits - torch.max(logits, dim=1, keepdim=True)[0]
    ea0 = torch.exp(a0)
    z0 = torch.sum(ea0, dim=1, keepdim=True)
    p0 = ea0 / z0
    return torch.sum(p0 * (torch.log(z0) - a0), axis=1)

def roll(arr):
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])

def unroll(arr, targetshape):
    s = arr.shape
    return arr.reshape(*targetshape, *s[1:]).swapaxes(0, 1)

def safe_mean(xs):
    return -np.inf if len(xs) == 0 else np.mean(xs)

def pad_and_random_crop(imgs, out, pad):
    """
    Vectorized pad and random crop
    Assumes square images?
    args:
        imgs: shape (B,H,W,C)
        out: output size (e.g. 64)
    """
    # n: batch size.
    imgs = np.pad(imgs, [[0, 0], [pad, pad], [pad, pad], [0, 0]])
    n = imgs.shape[0]
    img_size = imgs.shape[1]  # e.g. 64
    crop_max = img_size - out
    w1 = np.random.randint(0, crop_max, n)
    h1 = np.random.randint(0, crop_max, n)
    # creates all sliding window
    # combinations of size (out)
    windows = view_as_windows(imgs, (1, out, out, 1))[..., 0, :, :, 0]
    # selects a random window
    # for each batch element
    cropped = windows[np.arange(n), w1, h1]
    cropped = cropped.transpose(0, 2, 3, 1)
    return cropped
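# Note on pad_and_random_crop(): view_as_windows enumerates every (out x out)
# crop position of the padded batch; indexing with (np.arange(n), w1, h1) picks
# one random position per image, giving shape (B, C, out, out), which the final
# transpose returns to channels-last (B, out, out, C).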

def random_cutout_color(imgs, min_cut, max_cut):
    n, h, w, c = imgs.shape
    w1 = np.random.randint(min_cut, max_cut, n)
    h1 = np.random.randint(min_cut, max_cut, n)

    cutouts = np.empty((n, h, w, c), dtype=imgs.dtype)
    rand_box = np.random.randint(0, 255, size=(n, c), dtype=imgs.dtype)
    for i, (img, w11, h11) in enumerate(zip(imgs, w1, h1)):
        cut_img = img.copy()
        # add random box
        cut_img[h11:h11 + h11, w11:w11 + w11, :] = rand_box[i]
        cutouts[i] = cut_img
    return cutouts

def linear_schedule(initial_val, final_val, current_steps, total_steps):
    frac = 1.0 - current_steps / total_steps
    return (initial_val - final_val) * frac + final_val

def horizon_to_gamma(horizon):
    return 1.0 - 1.0 / horizon

class AdaptiveDiscountTuner:
    def __init__(self, gamma, momentum=0.98, eplenmult=1):
        self.gamma = gamma
        self.momentum = momentum
        self.eplenmult = eplenmult

    def update(self, horizon):
        if horizon > 0:
            htarg = horizon * self.eplenmult
            gtarg = horizon_to_gamma(htarg)
            self.gamma = self.gamma * self.momentum + gtarg * (1 - self.momentum)
        return self.gamma
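# Note on AdaptiveDiscountTuner: update() nudges gamma toward the value implied
# by the measured episode horizon (e.g. horizon 1000 -> gtarg = 1 - 1/1000 = 0.999)
# using an exponential moving average controlled by `momentum`.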

def flatten01(arr):
    return arr.reshape(-1, *arr.shape[2:])

def flatten012(arr):
    return arr.reshape(-1, *arr.shape[3:])

class RetuneSelector:
    def __init__(self, nenvs, ob_space, ac_space, replay_shape,
                 skips=0, n_pi=32, num_retunes=5, flat_buffer=False):
        self.skips = skips
        self.n_pi = n_pi
        self.nenvs = nenvs
        self.exp_replay = np.empty((*replay_shape, *ob_space.shape), dtype=np.uint8)
        self.dones_replay = np.empty((*replay_shape,), dtype=np.bool)
        self.rewards_replay = np.empty((*replay_shape,), dtype=np.float32)
        self.replay_shape = replay_shape

        self.num_retunes = num_retunes
        self.ac_space = ac_space
        self.ob_space = ob_space

        self.cooldown_counter = skips
        self.replay_index = 0
        self.flat_buffer = flat_buffer

    def update(self, obs_batch, dones_batch, rewards_batch):
        if self.num_retunes == 0:
            return False

        if self.cooldown_counter > 0:
            self.cooldown_counter -= 1
            return False

        self.exp_replay[self.replay_index] = obs_batch
        self.dones_replay[self.replay_index] = dones_batch
        self.rewards_replay[self.replay_index] = rewards_batch

        self.replay_index = (self.replay_index + 1) % self.n_pi
        return self.replay_index == 0
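    # update() returns True only once n_pi batches have been stored and the
    # write index wraps back to 0, signalling that the replay buffer is full
    # and an auxiliary/retune phase should run.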

    def retune_done(self):
        self.cooldown_counter = self.skips
        self.num_retunes -= 1
        self.replay_index = 0

    def make_minibatches(self, presleep_pi, returns_buffer, num_rollouts):
        if not self.flat_buffer:
            env_segs = list(itertools.product(range(self.n_pi), range(self.nenvs)))
            np.random.shuffle(env_segs)
            env_segs = np.array(env_segs)
            for idx in range(0, len(env_segs), num_rollouts):
                esinds = env_segs[idx:idx + num_rollouts]
                mbatch = [flatten01(arr[esinds[:, 0], :, esinds[:, 1]])
                          for arr in (self.exp_replay, returns_buffer, presleep_pi)]
                yield mbatch
        else:
            nsteps = returns_buffer.shape[1]
            buffsize = self.n_pi * nsteps * self.nenvs
            inds = np.arange(buffsize)
            np.random.shuffle(inds)
            batchsize = num_rollouts * nsteps
            for start in range(0, buffsize, batchsize):
                end = start + batchsize