
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (16)
Showing with 63 additions and 375 deletions
......@@ -2,7 +2,7 @@
"challenge_id": "aicrowd-neurips-2020-minerl-challenge",
"grader_id": "aicrowd-neurips-2020-minerl-challenge",
"authors": ["minerl_sqil_baseline"],
"tags": "IL",
"tags": "RL",
"description": "Test Model for MineRL Challenge",
"gpu": false
"gpu": true
}
name: minerl
channels:
- conda-forge
- defaults
dependencies:
- python=3.6
- pip
- pytorch
- py-opencv
- pip:
- crowdai_api
- minerl
- coloredlogs
- matplotlib
- pyro4
- pfrl
- scikit-learn
......@@ -106,6 +106,8 @@ def _batch_reset_recurrent_states_when_episodes_end(
def load_experiences_from_demonstrations(
expert_dataset, batch_size, reward=1):
if expert_dataset is None:
raise ValueError("Expert dataset must be provided.")
ret = []
for _ in range(batch_size):
ob, act, _, next_ob, done = expert_dataset.sample()
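The function above is where SQIL draws its demonstration batches: expert transitions are replayed with a constant reward (1 by default), while transitions collected by the agent keep reward 0. A minimal, hypothetical sketch of that sampling step (the names and experience-dict keys below are assumptions, not the baseline's exact API):

    import random

    def sample_demo_batch(demo_transitions, batch_size, reward=1):
        """demo_transitions: list of (obs, action, next_obs, done) tuples."""
        if not demo_transitions:
            raise ValueError("Expert dataset must be provided.")
        batch = []
        for _ in range(batch_size):
            obs, action, next_obs, done = random.choice(demo_transitions)
            batch.append([{
                'state': obs,
                'action': action,
                'reward': reward,               # constant reward for expert data
                'next_state': next_obs,
                'is_state_terminal': done,
            }])
        return batch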
......@@ -162,7 +164,6 @@ class RewardBasedSampler:
n_samples = [0 for _ in range(len(self.reward_boundaries) + 1)]
for frame in experiences:
n_samples[self._policy_index(frame[0]['state'])] += 1
print(n_samples)
ret = []
for rbuf, n_sample in zip(self.replay_buffers, n_samples):
samples = rbuf.sample(n_sample)
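RewardBasedSampler routes each demonstration frame to one of the per-option replay buffers according to its cumulative reward. A small sketch of that boundary lookup, under the assumption that the mapping is a sorted-boundary search (function name is hypothetical):

    import bisect

    def policy_index(cumulative_reward, reward_boundaries):
        # Number of boundaries not exceeding the reward = index of the
        # sub-policy / replay buffer to sample for.
        return bisect.bisect_right(sorted(reward_boundaries), cumulative_reward)

    assert policy_index(0, [16, 64]) == 0
    assert policy_index(32, [16, 64]) == 1
    assert policy_index(100, [16, 64]) == 2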
......@@ -235,8 +236,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
recurrent=False,
reward_boundaries=None, # specific to options
):
if expert_dataset is None:
raise ValueError("Expert dataset must be provided.")
self.expert_dataset = expert_dataset
self.model = q_function
......@@ -290,7 +289,7 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
self.reward_scale = reward_scale
self.experience_lambda = experience_lambda
if reward_boundaries is not None:
if reward_boundaries is not None and self.expert_dataset is not None:
self.reward_based_sampler = RewardBasedSampler(
self.expert_dataset,
reward_boundaries,
......@@ -327,21 +326,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
# cumulative_steps counts the overall steps during the training.
return self._cumulative_steps
def _setup_actor_learner_training(self, n_actors, actor_update_interval):
assert actor_update_interval > 0
self.actor_update_interval = actor_update_interval
self.update_counter = 0
# Make a copy on shared memory and share among actors and the poller
shared_model = copy.deepcopy(self.model).cpu()
shared_model.share_memory()
# Pipes are used for infrequent communication
learner_pipes, actor_pipes = list(zip(*[mp.Pipe() for _ in range(n_actors)]))
return (shared_model, learner_pipes, actor_pipes)
def sync_target_network(self):
"""Synchronize target network with current network."""
if self.target_model is None:
......@@ -663,158 +647,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
return False
return True
def _poll_pipe(self, actor_idx, pipe, replay_buffer_lock, exception_event):
if pipe.closed:
return
try:
while pipe.poll() and not exception_event.is_set():
cmd, data = pipe.recv()
if cmd == "get_statistics":
assert data is None
with replay_buffer_lock:
stats = self.get_statistics()
pipe.send(stats)
elif cmd == "load":
self.load(data)
pipe.send(None)
elif cmd == "save":
self.save(data)
pipe.send(None)
elif cmd == "transition":
with replay_buffer_lock:
if "env_id" not in data:
data["env_id"] = actor_idx
self.replay_buffer.append(**data)
self._cumulative_steps += 1
elif cmd == "stop_episode":
idx = actor_idx if data is None else data
with replay_buffer_lock:
self.replay_buffer.stop_current_episode(env_id=idx)
stats = self.get_statistics()
pipe.send(stats)
else:
raise RuntimeError("Unknown command from actor: {}".format(cmd))
except EOFError:
pipe.close()
except Exception:
self.logger.exception("Poller loop failed. Exiting")
exception_event.set()
def _learner_loop(
self,
shared_model,
pipes,
replay_buffer_lock,
stop_event,
exception_event,
n_updates=None,
):
try:
update_counter = 0
# To stop this loop, call stop_event.set()
while not stop_event.is_set():
# Update model if possible
if not self._can_start_replay():
continue
if n_updates is not None:
assert self.optim_t <= n_updates
if self.optim_t == n_updates:
stop_event.set()
break
if self.recurrent:
with replay_buffer_lock:
episodes = self.replay_buffer.sample_episodes(
self.minibatch_size, self.episodic_update_len
)
self.update_from_episodes(episodes)
else:
with replay_buffer_lock:
transitions = self.replay_buffer.sample(self.minibatch_size)
self.update(transitions)
# Update the shared model. This can be expensive if GPU is used
# since this is a DtoH copy, so it is updated only at regular
# intervals.
update_counter += 1
if update_counter % self.actor_update_interval == 0:
self.update_counter += 1
shared_model.load_state_dict(self.model.state_dict())
# To keep the ratio of target updates to model updates,
# here we calculate back the effective current timestep
# from update_interval and number of updates so far.
effective_timestep = self.optim_t * self.update_interval
# We can safely assign self.t since in the learner
# it isn't updated by any other method
self.t = effective_timestep
if effective_timestep % self.target_update_interval == 0:
self.sync_target_network()
except Exception:
self.logger.exception("Learner loop failed. Exiting")
exception_event.set()
def _poller_loop(
self, shared_model, pipes, replay_buffer_lock, stop_event, exception_event
):
# To stop this loop, call stop_event.set()
while not stop_event.is_set() and not exception_event.is_set():
time.sleep(1e-6)
# Poll actors for messages
for i, pipe in enumerate(pipes):
self._poll_pipe(i, pipe, replay_buffer_lock, exception_event)
def setup_actor_learner_training(
self, n_actors, n_updates=None, actor_update_interval=8
):
(shared_model, learner_pipes, actor_pipes) = self._setup_actor_learner_training(
n_actors, actor_update_interval
)
exception_event = mp.Event()
def make_actor(i):
return pfrl.agents.StateQFunctionActor(
pipe=actor_pipes[i],
model=shared_model,
explorer=self.explorer,
phi=self.phi,
batch_states=self.batch_states,
logger=self.logger,
recurrent=self.recurrent,
)
replay_buffer_lock = mp.Lock()
poller_stop_event = mp.Event()
poller = pfrl.utils.StoppableThread(
target=self._poller_loop,
kwargs=dict(
shared_model=shared_model,
pipes=learner_pipes,
replay_buffer_lock=replay_buffer_lock,
stop_event=poller_stop_event,
exception_event=exception_event,
),
stop_event=poller_stop_event,
)
learner_stop_event = mp.Event()
learner = pfrl.utils.StoppableThread(
target=self._learner_loop,
kwargs=dict(
shared_model=shared_model,
pipes=learner_pipes,
replay_buffer_lock=replay_buffer_lock,
stop_event=learner_stop_event,
n_updates=n_updates,
exception_event=exception_event,
),
stop_event=learner_stop_event,
)
return make_actor, learner, poller, exception_event
def stop_episode(self):
if self.recurrent:
self.test_recurrent_states = None
......
......@@ -14,47 +14,24 @@ class _KMeansCacheNotFound(FileNotFoundError):
pass
class BoundedLengthMemory:
def __init__(self, maxlen, random_state):
self.maxlen = maxlen
self.t = 0
self._rand = np.random.RandomState(random_state)
self.memory = []
def __call__(self):
return np.array(self.memory)
def append(self, action):
self.t += 1
if self.maxlen is None or len(self.memory) < self.maxlen:
self.memory.append(action)
else:
idx = self._rand.randint(self.t)
if idx < self.maxlen:
self.memory[idx] = action
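    # For reference: the BoundedLengthMemory removed above is a reservoir
    # sampler -- it keeps a uniform random subset of at most `maxlen` items
    # from a stream of unknown length. A standalone sketch (illustrative
    # names, not part of the baseline):
    import numpy as np

    def reservoir_sample(stream, maxlen, seed=0):
        rand = np.random.RandomState(seed)
        memory = []
        for t, item in enumerate(stream, start=1):
            if len(memory) < maxlen:
                memory.append(item)
            else:
                idx = rand.randint(t)    # uniform over [0, t)
                if idx < maxlen:
                    memory[idx] = item   # keep item with probability maxlen / t
        return memory

    print(len(reservoir_sample(range(10000), maxlen=100)))  # 100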
def cached_kmeans(cache_dir, env_id, n_clusters, random_state,
subtask_reward_max=None, maxlen_each=None,
only_vector_converter=False):
sample_by_trajectory=False, only_vector_converter=False):
if only_vector_converter and not sample_by_trajectory:
raise ValueError("The vector converter option must be selected with the ascending order option.")
if cache_dir is None: # ignore cache
logger.info('Load dataset & do kmeans')
kmeans = _do_kmeans(env_id=env_id, n_clusters=n_clusters,
random_state=random_state,
subtask_reward_max=subtask_reward_max,
sample_by_trajectory=sample_by_trajectory,
only_vector_converter=only_vector_converter)
else:
if subtask_reward_max is None:
name_subtask_reward_max = ''
else:
name_subtask_reward_max = '_{}'.format(subtask_reward_max)
if only_vector_converter:
filename = 'kmeans_vector_converter{}.joblib'.format(name_subtask_reward_max) # noqa
elif maxlen_each is not None:
filename = 'kmeans_balanced_{}{}.joblib'.format(
maxlen_each, name_subtask_reward_max)
filename = 'kmeans_vector_converter.joblib'
else:
filename = 'kmeans{}.joblib'.format(name_subtask_reward_max)
if sample_by_trajectory:
filename = 'kmeans_normal.joblib'
else:
filename = 'kmeans.joblib'
filepath = os.path.join(cache_dir, env_id, f'n_clusters_{n_clusters}', f'random_state_{random_state}', filename)
try:
kmeans = _load_kmeans_result_cache(filepath)
......@@ -63,18 +40,17 @@ def cached_kmeans(cache_dir, env_id, n_clusters, random_state,
logger.info('kmeans cache not found. Load dataset & do kmeans & save result as cache')
kmeans = _do_kmeans(env_id=env_id, n_clusters=n_clusters,
random_state=random_state,
subtask_reward_max=subtask_reward_max,
maxlen_each=maxlen_each,
sample_by_trajectory=sample_by_trajectory,
only_vector_converter=only_vector_converter)
_save_kmeans_result_cache(kmeans, filepath)
return kmeans
def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
maxlen_each, only_vector_converter):
def _do_kmeans(env_id, n_clusters, random_state, sample_by_trajectory,
only_vector_converter):
logger.debug(f'loading data...')
dat = minerl.data.make(env_id)
if subtask_reward_max is None and maxlen_each is None:
if not sample_by_trajectory:
act_vectors = []
for ob, act, _, next_ob, _ in tqdm.tqdm(dat.batch_iter(batch_size=16, seq_len=32, num_epochs=1, preload_buffer_size=32, seed=random_state)):
if only_vector_converter:
......@@ -85,8 +61,8 @@ def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
acts = np.concatenate(act_vectors).reshape(-1, 64)
else:
episode_names = dat.get_trajectory_names()
mem_normal = BoundedLengthMemory(maxlen=maxlen_each, random_state=random_state)
mem_vc = BoundedLengthMemory(maxlen=maxlen_each, random_state=random_state)
mem_normal = []
mem_vc = []
for episode_name in episode_names:
traj = dat.load_data(episode_name)
dn = False
......@@ -94,17 +70,15 @@ def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
while not dn:
ob, act, rw, next_ob, dn = next(traj)
current_reward_sum += rw
if subtask_reward_max is not None and current_reward_sum >= subtask_reward_max:
dn = True
if np.allclose(ob['vector'], next_ob['vector']):
# Ignore the case when the action does not change observation['vector'].
mem_normal.append(act['vector'])
else:
mem_vc.append(act['vector'])
if only_vector_converter:
acts = mem_vc().reshape(-1, 64)
acts = np.array(mem_vc).reshape(-1, 64)
else:
acts = np.concatenate((mem_normal(), mem_vc()), axis=0).reshape(-1, 64)
acts = np.concatenate((np.array(mem_normal), np.array(mem_vc)), axis=0).reshape(-1, 64)
logger.debug(f'loading data... done.')
logger.debug(f'executing kmeans...')
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(acts)
......
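The trajectory-wise branch above splits demonstration actions into two pools before clustering: actions that leave the 64-d observation 'vector' unchanged, and "vector converter" actions that do change it (e.g. crafting). An illustrative sketch of that split on random stand-in data (in the baseline the transitions come from the MineRL demonstrations):

    import numpy as np

    rng = np.random.RandomState(0)
    mem_normal, mem_vc = [], []
    for _ in range(1000):
        ob_vec = rng.rand(64)
        act_vec = rng.uniform(-1, 1, 64)
        # Half of the fake transitions leave the observation vector unchanged.
        next_ob_vec = ob_vec if rng.rand() < 0.5 else rng.rand(64)
        if np.allclose(ob_vec, next_ob_vec):
            mem_normal.append(act_vec)   # ordinary actions
        else:
            mem_vc.append(act_vec)       # "vector converter" actions

    acts = np.concatenate((np.array(mem_normal), np.array(mem_vc)),
                          axis=0).reshape(-1, 64)
    print(len(mem_normal), len(mem_vc), acts.shape)  # roughly 500 / 500, (1000, 64)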
......@@ -59,12 +59,9 @@ class KMeansActionConverter:
def __init__(self, kmeans):
self.kmeans = kmeans
keys = ['craft', 'nearbyCraft', 'nearbySmelt', 'equip', 'place', 'attack', 'forward', 'camera']
print(keys)
for center in kmeans.cluster_centers_:
action = {'vector': center}
dict = minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action(action)
# print(dict)
print([dict[k] for k in keys])
def __call__(self, action):
index = self.kmeans.predict(action['vector'])
......
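KMeansActionConverter maps each continuous 64-d action to its nearest cluster at training time and recovers the cluster center when acting. A minimal round-trip sketch on random data (cluster count and shapes are assumptions):

    import numpy as np
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=30, random_state=0).fit(
        np.random.uniform(-1, 1, size=(5000, 64)))

    action = {'vector': np.random.uniform(-1, 1, size=(1, 64))}
    index = kmeans.predict(action['vector'])[0]   # discrete action id
    recovered = kmeans.cluster_centers_[index]    # 64-d vector handed back to the env
    print(index, recovered.shape)                 # e.g. 7 (64,)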
......@@ -45,7 +45,7 @@ class PoVOnlyConverter:
def convert_space(self, space):
pov = space['pov']
return gym.spaces.Box(
low=-255, high=255,
low=0, high=255,
shape=pov.shape,
dtype=np.float32)
......@@ -57,14 +57,14 @@ class VectorCombineConverter:
scale = 1 / 255
pov, vector = observation['pov'], observation['vector']
num_elem = pov.shape[-3] * pov.shape[-2]
vector_channel = np.tile(vector, num_elem // vector.shape[-1]).reshape(*pov.shape[:-1], -1) # noqa
vector_channel = np.tile((vector + 1) / 2, num_elem // vector.shape[-1]).reshape(*pov.shape[:-1], -1) # noqa
return np.concatenate([pov, vector_channel / scale], axis=-1)
def convert_space(self, space):
pov, vector = space['pov'], space['vector']
num_new_channel = 1
return gym.spaces.Box(
low=-255, high=255,
low=0, high=255,
shape=(*pov.shape[:-1], pov.shape[-1] + num_new_channel),
dtype=np.float32)
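The change above rescales the 'vector' observation from [-1, 1] to [0, 1] before tiling it into an extra image channel, so after the final division by scale it lands in the same [0, 255] range as the PoV pixels (hence the new low=0 bound). A worked sketch with the MineRL default shapes (random values, purely illustrative):

    import numpy as np

    pov = np.random.randint(256, size=(64, 64, 3)).astype(np.float32)
    vector = np.random.uniform(-1, 1, size=(64,)).astype(np.float32)

    scale = 1 / 255
    num_elem = pov.shape[-3] * pov.shape[-2]                  # 64 * 64 = 4096
    vector_channel = np.tile((vector + 1) / 2,
                             num_elem // vector.shape[-1]).reshape(*pov.shape[:-1], -1)
    combined = np.concatenate([pov, vector_channel / scale], axis=-1)
    print(combined.shape)                                     # (64, 64, 4)
    print(combined[..., 3].min() >= 0, combined[..., 3].max() <= 255)  # True True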
......@@ -95,25 +95,3 @@ class ScaledFloatConverter:
low=(low * self.scale).astype(np.float32),
high=(high * self.scale).astype(np.float32),
shape=low.shape, dtype=np.float32)
if __name__ == '__main__':
"""Testing"""
input = {
'pov': np.arange(18 * 64 * 64).reshape(2, 3, 64, 64, 3).astype(np.uint8),
'vector': np.arange(6 * 64).reshape(2, 3, 64) / 64,
}
print(input['pov'])
gray_scaled = GrayScaleConverter()(input)
for i in range(2):
for j in range(3):
assert np.allclose(
gray_scaled['pov'][i, j],
np.expand_dims(
cv2.cvtColor(input['pov'][i, j], cv2.COLOR_RGB2GRAY),
-1))
combined = VectorCombineConverter()(gray_scaled)
axis_moved = MoveAxisConverter()(combined)
print(axis_moved.shape)
assert axis_moved.shape == (2, 3, 2, 64, 64)
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import * # NOQA
from future import standard_library
standard_library.install_aliases() # NOQA
from logging import getLogger
import numpy as np
from collections import deque, OrderedDict
......@@ -58,14 +50,12 @@ def _get_aggregated_action(obs, actions, next_obs, frameskip):
class DataPipelineWrapper:
def __init__(self, dataset, observation_converters,
action_converters, frameskip=1, framestack=1,
subtask_reward_max=None, append_reward_channel=False,
reward_scale=1./1024):
append_reward_channel=False, reward_scale=1./1024):
self.dataset = dataset
self.observation_converters = observation_converters
self.action_converters = action_converters
self.frameskip = frameskip
self.framestack = framestack
self.subtask_reward_max = subtask_reward_max
self.append_reward_channel = append_reward_channel
self.episode_names = self.dataset.get_trajectory_names()
self.current_episode_name = np.random.choice(self.episode_names)
......@@ -81,8 +71,6 @@ class DataPipelineWrapper:
while len(rewards[0]) < self.frameskip * self.framestack:
ob, ac, rw, nob, dn = next(self.batch_loader)
self.current_reward_sum += rw
if self.subtask_reward_max is not None and self.current_reward_sum >= self.subtask_reward_max:
dn = True
obs_pov[0].append(ob['pov'])
obs_vector[0].append(ob['vector'])
......
......@@ -23,13 +23,14 @@ def wrap_env(
randomize_action, eval_epsilon,
action_choices,
action_choices_vector_converter=None,
append_reward_channel=True):
append_reward_channel=False):
# wrap env: time limit...
if isinstance(env, gym.wrappers.TimeLimit):
logger.info('Detected `gym.wrappers.TimeLimit`! Unwrap it and re-wrap our own time limit.')
env = env.env
max_episode_steps = env.spec.max_episode_steps
env = ContinuingTimeLimit(env, max_episode_steps=max_episode_steps)
if not test:
if isinstance(env, gym.wrappers.TimeLimit):
logger.info('Detected `gym.wrappers.TimeLimit`! Unwrap it and re-wrap our own time limit.')
env = env.env
max_episode_steps = env.spec.max_episode_steps
env = ContinuingTimeLimit(env, max_episode_steps=max_episode_steps)
# wrap env: observation...
# NOTE: wrapping order matters!
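With this change the custom ContinuingTimeLimit is applied only during training; at evaluation time the environment keeps gym's default TimeLimit behaviour. A condensed sketch of that branch (function name is hypothetical; ContinuingTimeLimit is pfrl's wrapper, as imported elsewhere in this repository):

    import gym
    from pfrl.wrappers import ContinuingTimeLimit

    def maybe_continuing_time_limit(env, test):
        # Unwrap gym's TimeLimit and re-wrap with pfrl's continuing variant,
        # but only for training environments.
        if not test and isinstance(env, gym.wrappers.TimeLimit):
            max_episode_steps = env.spec.max_episode_steps
            env = ContinuingTimeLimit(env.env, max_episode_steps=max_episode_steps)
        return env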
......@@ -228,7 +229,7 @@ class PoVWithVectorWrapper(gym.ObservationWrapper):
def observation(self, observation):
pov = observation['pov']
vector_scaled = observation['vector'] / self._vector_scale
vector_scaled = (observation['vector'] + 1) / 2 / self._vector_scale # to (0, 255)
num_elem = pov.shape[-3] * pov.shape[-2]
vector_channel = np.tile(vector_scaled, num_elem // vector_scaled.shape[-1]).reshape(*pov.shape[:-1], -1) # noqa
return np.concatenate([pov, vector_channel], axis=-1)
......
import minerl
import copy
from logging import getLogger
from collections import OrderedDict, deque
import os
import gym
import numpy as np
import cv2
from pfrl.wrappers import ContinuingTimeLimit, RandomizeAction, Monitor
from pfrl.wrappers.atari_wrappers import ScaledFloatFrame, LazyFrames
cv2.ocl.setUseOpenCL(False)
logger = getLogger(__name__)
datasetA = minerl.data.make("MineRLObtainDiamondVectorObf-v0", num_workers=1)
datasetB = minerl.data.make("MineRLObtainDiamond-v0", num_workers=1)
traj_name="v3_villainous_black_eyed_peas_loch_ness_monster-2_3997-48203"
itrA = datasetA.load_data(traj_name)
itrB = datasetB.load_data(traj_name)
# itr = dataset.batch_iter(1, 1, -1)
class MockEnv(gym.Env):
def __init__(self, kmeans=None):
self.mock_obf = OrderedDict([
('pov', np.random.randint(256, size=(64, 64, 3), dtype=np.uint8)),
('vector', np.random.rand(64).astype(np.float32)),
])
self.observation_space = gym.spaces.Dict(
OrderedDict([
('pov', gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)),
('vector', gym.spaces.Box(low=-1, high=1, shape=(64,), dtype=np.float32)),
])
)
self.action_space = gym.spaces.Dict(
OrderedDict([
('vector', gym.spaces.Box(low=-1, high=1, shape=(64,), dtype=np.float32)),
])
)
self.count = 0
self.kmeans = kmeans
def reset(self):
self.count = 0
return self.mock_obf
def step(self, action):
"""
action = {'vector': np.random.rand(64) * 2 - 1}
for i in range(20000):
action = next(itrA)[1]
print("MineRLObtainDiamondVectorObf")
print(minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action(action))
if self.kmeans is not None:
nearest_idx = self.kmeans.predict([action['vector']])[0]
recov = self.kmeans.cluster_centers_[nearest_idx]
print("MineRLObtainDiamondVectorObf (nearest k-means)")
print(minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action({'vector': recov}))
action_b = next(itrB)[1]
print("MineRLObtainDiamond")
print(action_b)
while not (action_b['craft'] == 'none'):
pass
print("_____")
assert False
"""
self.count += 1
return self.mock_obf, np.random.rand(), self.count > 8000, {}
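Assuming the MockEnv class above is in scope, a quick smoke-test loop might look like this (random actions, purely illustrative; the episode ends once step() has been called 8001 times):

    env = MockEnv()
    obs = env.reset()
    done, steps = False, 0
    while not done:
        action = env.action_space.sample()         # random 64-d 'vector' action
        obs, reward, done, info = env.step(action)
        steps += 1
    print(steps)  # 8001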
......@@ -13,19 +13,13 @@ class _CacheNotFound(FileNotFoundError):
pass
def cached_reward_boundary(cache_dir, env_id, n_groups, random_state,
subtask_reward_max=None):
def cached_reward_boundary(cache_dir, env_id, n_groups, random_state):
if cache_dir is None: # ignore cache
logger.info('Load dataset & calculate boundaries')
boundaries = _calc_boundaries(env_id=env_id, n_groups=n_groups,
random_state=random_state,
subtask_reward_max=subtask_reward_max)
random_state=random_state)
else:
if subtask_reward_max is None:
name_subtask_reward_max = ''
else:
name_subtask_reward_max = '_{}'.format(subtask_reward_max)
filename = 'reward_boundaries{}.joblib'.format(name_subtask_reward_max)
filename = 'reward_boundaries.joblib'
filepath = os.path.join(cache_dir, env_id, f'n_groups_{n_groups}', f'random_state_{random_state}', filename)
try:
boundaries = _load_result_cache(filepath)
......@@ -33,15 +27,13 @@ def cached_reward_boundary(cache_dir, env_id, n_groups, random_state,
except _CacheNotFound:
logger.info('boundary cache not found. Load dataset & calculate boundaries & save result as cache')
boundaries = _calc_boundaries(env_id=env_id, n_groups=n_groups,
random_state=random_state,
subtask_reward_max=subtask_reward_max)
random_state=random_state)
_save_result_cache(boundaries, filepath)
print(boundaries)
logger.debug(boundaries)
return boundaries
def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
def _calc_boundaries(env_id, n_groups, random_state):
logger.debug(f'loading data...')
dat = minerl.data.make(env_id)
episode_names = dat.get_trajectory_names()
......@@ -65,7 +57,6 @@ def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
cnt += 1
distrib.append((prev, cnt))
print(distrib)
# Find boundaries
def separate(distrib, max_region_size):
region_size = None
......@@ -86,7 +77,6 @@ def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
cand_min = half + 1
else:
cand_max = half
print(cand_min)
return separate(distrib, cand_min)[1:]
......
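cached_reward_boundary follows the same cache pattern as cached_kmeans: try to load a joblib file, otherwise compute and persist the result. A generic, hypothetical sketch of that pattern (paths and payload are illustrative):

    import os
    import joblib

    def cached(filepath, compute):
        try:
            return joblib.load(filepath)
        except FileNotFoundError:
            result = compute()
            os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
            joblib.dump(result, filepath)
            return result

    boundaries = cached('./train/boundary_cache/example/reward_boundaries.joblib',
                        lambda: [16, 64, 256])
    print(boundaries)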
......@@ -114,8 +114,6 @@ def main(argv=None):
parser.add_argument('--dual-kmeans', action='store_true', default=False,
help='Use dual kmeans of different clustering criteria.')
parser.add_argument('--kmeans-n-clusters-vc', type=int, default=30, help='#clusters for Dual K-means of vector converters')
parser.add_argument('--balanced-maxlen', type=int, default=None,
help='Number of observations for each of normal and vector converters')
# SQIL specific settings
parser.add_argument('--max-episode-len', type=int, default=None, help='Manual maximum episode length.')
......@@ -123,10 +121,6 @@ def main(argv=None):
parser.add_argument('--experience-lambda', type=float, default=1, help='Weight coefficient of batches from experiences.')
parser.add_argument('--option-n-groups', type=int, default=1, help='Number of options to switch polices.')
# misc
parser.add_argument('--subtask-reward-max', type=int, default=None,
help='Maximum reward of each episode. MAY NOT BE OFFICIALLY ALLOWED TO USE.')
args = parser.parse_args(args=argv)
if args.remove_timestamp:
......@@ -163,28 +157,25 @@ def _main(args):
# K-Means
if args.dual_kmeans:
if args.balanced_maxlen is not None:
raise ValueError('balanced_maxlen is unavailable in DualKmeans.')
kmeans_normal = cached_kmeans(
cache_dir=os.environ.get('KMEANS_CACHE'),
env_id=args.env,
n_clusters=args.kmeans_n_clusters,
random_state=args.seed,
subtask_reward_max=args.subtask_reward_max,
sample_by_trajectory=True,
only_vector_converter=False)
kmeans_vector_converter = cached_kmeans(
cache_dir=os.environ.get('KMEANS_CACHE'),
env_id=args.env,
n_clusters=args.kmeans_n_clusters_vc,
random_state=args.seed,
subtask_reward_max=args.subtask_reward_max,
sample_by_trajectory=True,
only_vector_converter=True)
else:
kmeans = cached_kmeans(
cache_dir=os.environ.get('KMEANS_CACHE'),
env_id=args.env,
n_clusters=args.kmeans_n_clusters,
maxlen_each=args.balanced_maxlen,
random_state=args.seed)
if args.option_n_groups > 1:
boundaries = cached_reward_boundary(
......@@ -223,7 +214,6 @@ def _main(args):
orig_data,
observation_converters=observation_converters,
action_converters=action_converters,
subtask_reward_max=args.subtask_reward_max,
frameskip=args.frame_skip, framestack=args.frame_stack,
append_reward_channel=(args.option_n_groups > 1),
reward_scale=reward_channel_scale)
......
......@@ -23,13 +23,12 @@ import coloredlogs
coloredlogs.install(logging.DEBUG)
# Agent settings
GPU = -1
GPU = 0
ARCH = 'dueling'
KMEANS_N_CLUSTERS = 30
KMEANS_N_CLUSTERS_VC = 60
KMEANS_SEED = 0
SUBTASK_REWARD_MAX = 9999
OPTION_N_GROUPS = 10
FINAL_EPSILON = 0.01
FINAL_EXPLORATION_FRAMES = 10 ** 6
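FINAL_EPSILON and FINAL_EXPLORATION_FRAMES suggest a linearly decaying epsilon-greedy schedule; presumably they are wired into a pfrl explorer along the lines of the sketch below (the action sampler and cluster count are stand-ins, not taken from this diff):

    import numpy as np
    from pfrl.explorers import LinearDecayEpsilonGreedy

    N_ACTIONS = 30  # e.g. the number of k-means clusters

    explorer = LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.01,              # FINAL_EPSILON
        decay_steps=10 ** 6,           # FINAL_EXPLORATION_FRAMES
        random_action_func=lambda: np.random.randint(N_ACTIONS),
    )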
......@@ -188,7 +187,7 @@ class MineRLSQILBaselineAgent(MineRLAgentBase):
def load_agent(self):
boundaries = cached_reward_boundary(
cache_dir=os.environ.get('BOUNDARY_CACHE'),
cache_dir='./train/boundary_cache/',
env_id=MINERL_GYM_ENV,
n_groups=OPTION_N_GROUPS,
random_state=KMEANS_SEED)
......@@ -236,14 +235,14 @@ def main():
env_id=MINERL_GYM_ENV,
n_clusters=KMEANS_N_CLUSTERS,
random_state=KMEANS_SEED,
subtask_reward_max=SUBTASK_REWARD_MAX,
sample_by_trajectory=True,
only_vector_converter=False)
kmeans_vector_converter = cached_kmeans(
cache_dir='./train/kmeans_cache/',
env_id=MINERL_GYM_ENV,
n_clusters=KMEANS_N_CLUSTERS_VC,
random_state=KMEANS_SEED,
subtask_reward_max=SUBTASK_REWARD_MAX,
sample_by_trajectory=True,
only_vector_converter=True)
def wrapper(env):
......
......@@ -60,7 +60,7 @@ def main():
'--eval-interval', '2500',
'--eval-n-runs', '20',
'--remove-timestamp', # save to outdir/latest
'--dual-kmeans', '--subtask-reward-max', '9999', # currently, it is not used for bounding observation but for avoiding conflict of kmeans_cache implementation of other baselines.
'--dual-kmeans',
'--kmeans-n-clusters-vc', '60',
'--option-n-groups', '10'])
......
No preview for this file type (3 binary files changed)