Commit f473f516 authored by Dipam Chakraborty

ppg npi 32 softq eval 0.5

parent f2a86f1f
@@ -225,8 +225,6 @@ class CustomTorchPolicy(TorchPolicy):
loss.backward()
vf_loss.backward()
if apply_grad:
if self.config['grad_clip'] is not None:
nn.utils.clip_grad_norm_(self.model.parameters(), self.config['grad_clip'])
self.optimizer.step()
self.optimizer.zero_grad()
if not self.config['single_optimizer']:
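The first hunk touches the optimizer step where gradients from the policy and value losses accumulate and an optional gradient-norm clip is applied before stepping. A minimal standalone sketch of that accumulate-then-step pattern, with a placeholder model, loss, and batch (not the repo's):

```python
import torch
import torch.nn as nn

# Stand-in model and optimizer; grad_clip mirrors config['grad_clip'] in the diff.
model = nn.Linear(8, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
grad_clip = 0.5

def train_step(batch, apply_grad):
    loss = model(batch).pow(2).mean()  # placeholder loss
    loss.backward()                    # gradients accumulate across calls
    if apply_grad:
        if grad_clip is not None:
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        optimizer.zero_grad()

# Accumulate over 4 minibatches, stepping only on the last one.
for i in range(4):
    train_step(torch.randn(16, 8), apply_grad=(i == 3))
```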
@@ -365,6 +363,7 @@ class CustomTorchPolicy(TorchPolicy):
self.best_rew_tsteps = self.timesteps_total
if self.timesteps_total > self.target_timesteps or (self.time_elapsed + self.buffer_time) > self.max_time:
if self.timesteps_total > 1_000_000: # Adding this hack due to maze reward deque very high in beginning
if self.best_weights is not None:
self.set_model_weights(self.best_weights)
return True
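The second hunk touches the stopping check that restores the best-seen weights once the timestep or wall-clock budget is exceeded, skipping the restore when no best weights were recorded. An illustrative, self-contained version of that guard (attribute names mirror the diff; the toy policy object and its values are made up):

```python
from types import SimpleNamespace

def maybe_stop_and_restore(policy):
    # Stop once the timestep or wall-clock budget is exceeded; restore the best
    # weights only if some checkpoint was actually recorded.
    out_of_budget = (policy.timesteps_total > policy.target_timesteps
                     or (policy.time_elapsed + policy.buffer_time) > policy.max_time)
    if out_of_budget and policy.timesteps_total > 1_000_000:
        if policy.best_weights is not None:
            policy.set_model_weights(policy.best_weights)
        return True
    return False

# Toy policy object with made-up budget values.
policy = SimpleNamespace(
    timesteps_total=9_000_000, target_timesteps=8_000_000, time_elapsed=0.0,
    buffer_time=0.0, max_time=7200, best_weights=None,
    set_model_weights=lambda w: None,
)
print(maybe_stop_and_restore(policy))  # True, but no restore since best_weights is None
```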
@@ -45,10 +45,10 @@ procgen-ppo:
no_done_at_end: False
# Custom switches
skips: 2
n_pi: 16
num_retunes: 15
retune_epochs: 7
skips: 0
n_pi: 32
num_retunes: 8
retune_epochs: 6
standardize_rewards: True
aux_mbsize: 4
aux_num_accumulates: 2
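The first config hunk changes the PPG-style phase schedule the commit title refers to: n_pi policy iterations per auxiliary phase, with num_retunes and retune_epochs presumably bounding how often and how long the retune passes run. A rough sketch of how such a schedule plays out (this loop is an assumption for illustration, not the repo's training loop):

```python
# Hypothetical outline of a PPG-style schedule using the values from the diff:
# every n_pi policy iterations an auxiliary "retune" phase runs, capped at
# num_retunes phases of retune_epochs epochs each.
n_pi = 32
num_retunes = 8
retune_epochs = 6

retunes_done = 0
for iteration in range(1, 321):
    # ... policy-phase PPO update on fresh rollouts would go here ...
    if iteration % n_pi == 0 and retunes_done < num_retunes:
        for epoch in range(retune_epochs):
            pass  # auxiliary / value-retune epoch over stored observations
        retunes_done += 1
print(retunes_done)  # 8 with the settings above
```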
@@ -113,6 +113,11 @@ procgen-ppo:
exploration_config:
type: "StochasticSampling"
evaluation_config:
exploration_config:
type: SoftQ
temperature: 0.5
observation_filter: "NoFilter"
synchronize_filters: True
compress_observations: False
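The new evaluation_config overrides exploration with SoftQ at temperature 0.5, i.e. evaluation actions are sampled from a softmax over temperature-scaled action logits rather than via the training-time StochasticSampling. A rough standalone sketch of that sampling rule (not RLlib's actual implementation):

```python
import numpy as np

def softq_sample(logits, temperature=0.5, rng=np.random.default_rng()):
    # Softmax over temperature-scaled logits; lower temperature -> greedier sampling.
    z = logits / temperature
    z = z - z.max()                      # numerical stability
    probs = np.exp(z) / np.exp(z).sum()
    return rng.choice(len(logits), p=probs)

print(softq_sample(np.array([2.0, 1.0, 0.1])))
```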