Commit 9b28f2ac authored by Dipam Chakraborty

rnorm index bugfix

parent aab9e125
......
@@ -208,13 +208,14 @@ class CustomTorchPolicy(TorchPolicy):
for g in self.optimizer.param_groups:
g['lr'] = lr
- # advs = returns - values
+ # Advantages are normalized with full size batch instead of memory limited batch
+ # advs = returns - values
# advs = (advs - torch.mean(advs)) / (torch.std(advs) + 1e-8)
vpred, pi_logits = self.model.vf_pi(obs, ret_numpy=False, no_grad=False, to_torch=False)
neglogpac = neglogp_actions(pi_logits, actions)
entropy = torch.mean(pi_entropy(pi_logits))
- vpredclipped = values + torch.clamp(vpred - values, -cliprange, cliprange)
+ vpredclipped = values + torch.clamp(vpred - values, -vfcliprange, vfcliprange)
vf_losses1 = torch.pow((vpred - returns), 2)
vf_losses2 = torch.pow((vpredclipped - returns), 2)
vf_loss = .5 * torch.mean(torch.max(vf_losses1, vf_losses2))
......
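Note: this hunk changes two things. Advantages are now normalized once over the full rollout batch instead of per memory-limited minibatch (hence the new comment), and the value clipping range comes from a separate vfcliprange argument rather than reusing the policy cliprange. A minimal sketch of the full-batch advantage normalization and the clipped value loss, mirroring the lines above; the function names are illustrative, not taken from the repository:

import torch

def normalize_advantages(returns, values, eps=1e-8):
    # Normalize over the whole batch in one pass, before any minibatch slicing.
    advs = returns - values
    return (advs - torch.mean(advs)) / (torch.std(advs) + eps)

def clipped_value_loss(vpred, values, returns, vfcliprange):
    # Keep the new value prediction within vfcliprange of the old value estimate,
    # then take the elementwise max of clipped and unclipped squared errors (PPO-style).
    vpredclipped = values + torch.clamp(vpred - values, -vfcliprange, vfcliprange)
    vf_losses1 = torch.pow(vpred - returns, 2)
    vf_losses2 = torch.pow(vpredclipped - returns, 2)
    return 0.5 * torch.mean(torch.max(vf_losses1, vf_losses2))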
......
@@ -139,13 +139,13 @@ class RewardNormalizer(object):
self.gamma = gamma
self.ret_rms = RunningMeanStd(shape=())
self.cliprew = cliprew
- self.ret = 0 # size updates after first pass
+ self.ret = 0. # size updates after first pass
def normalize(self, rews, news):
self.ret = self.ret * self.gamma + rews
self.ret_rms.update(self.ret)
rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
- self.ret[np.int32(news)] = 0.
+ self.ret[np.array(news, dtype=bool)] = 0.  ## Values must be True or False so this indexes as a boolean mask, not by position
return rews
class RunningMeanStd(object):
......
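Note: this is the "rnorm index bugfix" from the commit title. news holds per-environment done flags, and indexing with np.int32(news) selects positions 0 and 1 instead of acting as a boolean mask, so the wrong accumulated returns were being reset. A small self-contained illustration (the array values are made up for the example):

import numpy as np

ret = np.array([10., 20., 30., 40.])   # running discounted returns for 4 parallel envs
news = [0, 1, 0, 1]                    # done flags: envs 1 and 3 just finished

bad = ret.copy()
bad[np.int32(news)] = 0.               # integer indexing hits positions 0 and 1
# bad  -> [ 0.,  0., 30., 40.]

good = ret.copy()
good[np.array(news, dtype=bool)] = 0.  # boolean mask resets exactly the finished envs
# good -> [10.,  0., 30.,  0.]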
......
@@ -53,7 +53,7 @@ procgen-ppo:
standardize_rewards: True
adaptive_gamma: False
- final_lr: 3.0e-4
+ final_lr: 2.0e-4
lr_schedule: 'linear'
final_entropy_coeff: 0.002
entropy_schedule: False
......
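Note: with lr_schedule: 'linear' the learning rate is presumably annealed from its starting value down to final_lr over the course of training, so lowering final_lr from 3.0e-4 to 2.0e-4 only changes where the schedule ends up. A rough sketch of such a linear anneal; the starting learning rate and timestep budget below are placeholders, not values from this config:

def linear_lr(step, total_steps, initial_lr=5.0e-4, final_lr=2.0e-4):
    # Linearly interpolate from initial_lr at step 0 to final_lr at total_steps.
    frac = min(max(step / total_steps, 0.0), 1.0)
    return initial_lr + frac * (final_lr - initial_lr)

# e.g. halfway through an 8M-timestep run:
# linear_lr(4_000_000, 8_000_000) -> 3.5e-4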