
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (16)
Showing 63 additions and 375 deletions
@@ -2,7 +2,7 @@
   "challenge_id": "aicrowd-neurips-2020-minerl-challenge",
   "grader_id": "aicrowd-neurips-2020-minerl-challenge",
   "authors": ["minerl_sqil_baseline"],
-  "tags": "IL",
+  "tags": "RL",
   "description": "Test Model for MineRL Challenge",
-  "gpu": false
+  "gpu": true
 }
name: minerl
channels:
- conda-forge
- defaults
dependencies:
- python=3.6
- pip
- pytorch
- py-opencv
- pip:
- crowdai_api
- minerl
- coloredlogs
- matplotlib
- pyro4
- pfrl
- scikit-learn
@@ -106,6 +106,8 @@ def _batch_reset_recurrent_states_when_episodes_end(
 def load_experiences_from_demonstrations(
         expert_dataset, batch_size, reward=1):
+    if expert_dataset is None:
+        raise ValueError("Expert dataset must be provided.")
     ret = []
     for _ in range(batch_size):
         ob, act, _, next_ob, done = expert_dataset.sample()
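Reviewer note: the guard added above only validates the input; the sampling itself is unchanged. For orientation, the helper implements SQIL's demonstration half: it draws transitions from the expert dataset and attaches a constant reward (reward=1). A minimal sketch under that reading, where the transition-dict keys follow pfrl's replay-buffer conventions and are an assumption, not a quote of the file:

def load_experiences_from_demonstrations(expert_dataset, batch_size, reward=1):
    # Sketch: SQIL discards the demonstrations' recorded rewards and uses a
    # constant positive reward instead, so imitation becomes plain Q-learning.
    if expert_dataset is None:
        raise ValueError("Expert dataset must be provided.")
    ret = []
    for _ in range(batch_size):
        ob, act, _, next_ob, done = expert_dataset.sample()
        ret.append([dict(state=ob, action=act, reward=reward,
                         next_state=next_ob, is_state_terminal=done)])
    return ret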
@@ -162,7 +164,6 @@ class RewardBasedSampler:
        n_samples = [0 for _ in range(len(self.reward_boundaries) + 1)]
        for frame in experiences:
            n_samples[self._policy_index(frame[0]['state'])] += 1
-       print(n_samples)
        ret = []
        for rbuf, n_sample in zip(self.replay_buffers, n_samples):
            samples = rbuf.sample(n_sample)
@@ -235,8 +236,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
        recurrent=False,
        reward_boundaries=None,  # specific to options
    ):
-       if expert_dataset is None:
-           raise ValueError("Expert dataset must be provided.")
        self.expert_dataset = expert_dataset
        self.model = q_function
@@ -290,7 +289,7 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
        self.reward_scale = reward_scale
        self.experience_lambda = experience_lambda
-       if reward_boundaries is not None:
+       if reward_boundaries is not None and self.expert_dataset is not None:
            self.reward_based_sampler = RewardBasedSampler(
                self.expert_dataset,
                reward_boundaries,
@@ -327,21 +326,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
        # cumulative_steps counts the overall steps during the training.
        return self._cumulative_steps
-   def _setup_actor_learner_training(self, n_actors, actor_update_interval):
-       assert actor_update_interval > 0
-       self.actor_update_interval = actor_update_interval
-       self.update_counter = 0
-       # Make a copy on shared memory and share among actors and the poller
-       shared_model = copy.deepcopy(self.model).cpu()
-       shared_model.share_memory()
-       # Pipes are used for infrequent communication
-       learner_pipes, actor_pipes = list(zip(*[mp.Pipe() for _ in range(n_actors)]))
-       return (shared_model, learner_pipes, actor_pipes)
    def sync_target_network(self):
        """Synchronize target network with current network."""
        if self.target_model is None:
@@ -663,158 +647,6 @@ class SQIL(agent.AttributeSavingMixin, agent.BatchAgent):
            return False
        return True
def _poll_pipe(self, actor_idx, pipe, replay_buffer_lock, exception_event):
if pipe.closed:
return
try:
while pipe.poll() and not exception_event.is_set():
cmd, data = pipe.recv()
if cmd == "get_statistics":
assert data is None
with replay_buffer_lock:
stats = self.get_statistics()
pipe.send(stats)
elif cmd == "load":
self.load(data)
pipe.send(None)
elif cmd == "save":
self.save(data)
pipe.send(None)
elif cmd == "transition":
with replay_buffer_lock:
if "env_id" not in data:
data["env_id"] = actor_idx
self.replay_buffer.append(**data)
self._cumulative_steps += 1
elif cmd == "stop_episode":
idx = actor_idx if data is None else data
with replay_buffer_lock:
self.replay_buffer.stop_current_episode(env_id=idx)
stats = self.get_statistics()
pipe.send(stats)
else:
raise RuntimeError("Unknown command from actor: {}".format(cmd))
except EOFError:
pipe.close()
except Exception:
self.logger.exception("Poller loop failed. Exiting")
exception_event.set()
def _learner_loop(
self,
shared_model,
pipes,
replay_buffer_lock,
stop_event,
exception_event,
n_updates=None,
):
try:
update_counter = 0
# To stop this loop, call stop_event.set()
while not stop_event.is_set():
# Update model if possible
if not self._can_start_replay():
continue
if n_updates is not None:
assert self.optim_t <= n_updates
if self.optim_t == n_updates:
stop_event.set()
break
if self.recurrent:
with replay_buffer_lock:
episodes = self.replay_buffer.sample_episodes(
self.minibatch_size, self.episodic_update_len
)
self.update_from_episodes(episodes)
else:
with replay_buffer_lock:
transitions = self.replay_buffer.sample(self.minibatch_size)
self.update(transitions)
# Update the shared model. This can be expensive if GPU is used
# since this is a DtoH copy, so it is updated only at regular
# intervals.
update_counter += 1
if update_counter % self.actor_update_interval == 0:
self.update_counter += 1
shared_model.load_state_dict(self.model.state_dict())
# To keep the ratio of target updates to model updates,
# here we calculate back the effective current timestep
# from update_interval and number of updates so far.
effective_timestep = self.optim_t * self.update_interval
# We can safely assign self.t since in the learner
# it isn't updated by any other method
self.t = effective_timestep
if effective_timestep % self.target_update_interval == 0:
self.sync_target_network()
except Exception:
self.logger.exception("Learner loop failed. Exiting")
exception_event.set()
def _poller_loop(
self, shared_model, pipes, replay_buffer_lock, stop_event, exception_event
):
# To stop this loop, call stop_event.set()
while not stop_event.is_set() and not exception_event.is_set():
time.sleep(1e-6)
# Poll actors for messages
for i, pipe in enumerate(pipes):
self._poll_pipe(i, pipe, replay_buffer_lock, exception_event)
def setup_actor_learner_training(
self, n_actors, n_updates=None, actor_update_interval=8
):
(shared_model, learner_pipes, actor_pipes) = self._setup_actor_learner_training(
n_actors, actor_update_interval
)
exception_event = mp.Event()
def make_actor(i):
return pfrl.agents.StateQFunctionActor(
pipe=actor_pipes[i],
model=shared_model,
explorer=self.explorer,
phi=self.phi,
batch_states=self.batch_states,
logger=self.logger,
recurrent=self.recurrent,
)
replay_buffer_lock = mp.Lock()
poller_stop_event = mp.Event()
poller = pfrl.utils.StoppableThread(
target=self._poller_loop,
kwargs=dict(
shared_model=shared_model,
pipes=learner_pipes,
replay_buffer_lock=replay_buffer_lock,
stop_event=poller_stop_event,
exception_event=exception_event,
),
stop_event=poller_stop_event,
)
learner_stop_event = mp.Event()
learner = pfrl.utils.StoppableThread(
target=self._learner_loop,
kwargs=dict(
shared_model=shared_model,
pipes=learner_pipes,
replay_buffer_lock=replay_buffer_lock,
stop_event=learner_stop_event,
n_updates=n_updates,
exception_event=exception_event,
),
stop_event=learner_stop_event,
)
return make_actor, learner, poller, exception_event
    def stop_episode(self):
        if self.recurrent:
            self.test_recurrent_states = None
...
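Reviewer note on the removals above: with the pfrl-style actor-learner plumbing gone, SQIL runs as a single-process agent, and each update mixes a minibatch of the agent's own transitions with a minibatch of demonstrations. The sketch below is a rough rendering of that recipe, not the repository's exact update step; compute_loss and optimizer are hypothetical stand-ins, while the constant demonstration reward and the experience_lambda weighting follow the SQIL settings visible elsewhere in this diff.

def sqil_update(agent, replay_buffer, expert_dataset, minibatch_size):
    # Agent experience is relabeled with reward 0, demonstrations with reward 1.
    online = replay_buffer.sample(minibatch_size)
    for transition in online:
        transition[0]['reward'] = 0.0
    demo = load_experiences_from_demonstrations(
        expert_dataset, minibatch_size, reward=1)
    # experience_lambda scales the contribution of the online batch
    # ('Weight coefficient of batches from experiences').
    loss = (agent.experience_lambda * agent.compute_loss(online)
            + agent.compute_loss(demo))
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()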
@@ -14,47 +14,24 @@ class _KMeansCacheNotFound(FileNotFoundError):
    pass
-class BoundedLengthMemory:
-    def __init__(self, maxlen, random_state):
-        self.maxlen = maxlen
-        self.t = 0
-        self._rand = np.random.RandomState(random_state)
-        self.memory = []
-    def __call__(self):
-        return np.array(self.memory)
-    def append(self, action):
-        self.t += 1
-        if self.maxlen is None or len(self.memory) < self.maxlen:
-            self.memory.append(action)
-        else:
-            idx = self._rand.randint(self.t)
-            if idx < self.maxlen:
-                self.memory[idx] = action
 def cached_kmeans(cache_dir, env_id, n_clusters, random_state,
-                  subtask_reward_max=None, maxlen_each=None,
-                  only_vector_converter=False):
+                  sample_by_trajectory=False, only_vector_converter=False):
+    if only_vector_converter and not sample_by_trajectory:
+        raise ValueError("The vector converter option must be selected with the ascending order option.")
    if cache_dir is None:  # ignore cache
        logger.info('Load dataset & do kmeans')
        kmeans = _do_kmeans(env_id=env_id, n_clusters=n_clusters,
                            random_state=random_state,
-                           subtask_reward_max=subtask_reward_max,
+                           sample_by_trajectory=sample_by_trajectory,
                            only_vector_converter=only_vector_converter)
    else:
-       if subtask_reward_max is None:
-           name_subtask_reward_max = ''
-       else:
-           name_subtask_reward_max = '_{}'.format(subtask_reward_max)
        if only_vector_converter:
-           filename = 'kmeans_vector_converter{}.joblib'.format(name_subtask_reward_max)  # noqa
-       elif maxlen_each is not None:
-           filename = 'kmeans_balanced_{}{}.joblib'.format(
-               maxlen_each, name_subtask_reward_max)
+           filename = 'kmeans_vector_converter.joblib'
        else:
-           filename = 'kmeans{}.joblib'.format(name_subtask_reward_max)
+           if sample_by_trajectory:
+               filename = 'kmeans_normal.joblib'
+           else:
+               filename = 'kmeans.joblib'
        filepath = os.path.join(cache_dir, env_id, f'n_clusters_{n_clusters}', f'random_state_{random_state}', filename)
        try:
            kmeans = _load_kmeans_result_cache(filepath)
@@ -63,18 +40,17 @@ def cached_kmeans(cache_dir, env_id, n_clusters, random_state,
        logger.info('kmeans cache not found. Load dataset & do kmeans & save result as cache')
        kmeans = _do_kmeans(env_id=env_id, n_clusters=n_clusters,
                            random_state=random_state,
-                           subtask_reward_max=subtask_reward_max,
-                           maxlen_each=maxlen_each,
+                           sample_by_trajectory=sample_by_trajectory,
                            only_vector_converter=only_vector_converter)
        _save_kmeans_result_cache(kmeans, filepath)
    return kmeans
-def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
-               maxlen_each, only_vector_converter):
+def _do_kmeans(env_id, n_clusters, random_state, sample_by_trajectory,
+               only_vector_converter):
    logger.debug(f'loading data...')
    dat = minerl.data.make(env_id)
-   if subtask_reward_max is None and maxlen_each is None:
+   if not sample_by_trajectory:
        act_vectors = []
        for ob, act, _, next_ob, _ in tqdm.tqdm(dat.batch_iter(batch_size=16, seq_len=32, num_epochs=1, preload_buffer_size=32, seed=random_state)):
            if only_vector_converter:
@@ -85,8 +61,8 @@ def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
        acts = np.concatenate(act_vectors).reshape(-1, 64)
    else:
        episode_names = dat.get_trajectory_names()
-       mem_normal = BoundedLengthMemory(maxlen=maxlen_each, random_state=random_state)
-       mem_vc = BoundedLengthMemory(maxlen=maxlen_each, random_state=random_state)
+       mem_normal = []
+       mem_vc = []
        for episode_name in episode_names:
            traj = dat.load_data(episode_name)
            dn = False
@@ -94,17 +70,15 @@ def _do_kmeans(env_id, n_clusters, random_state, subtask_reward_max,
            while not dn:
                ob, act, rw, next_ob, dn = next(traj)
                current_reward_sum += rw
-               if subtask_reward_max is not None and current_reward_sum >= subtask_reward_max:
-                   dn = True
                if np.allclose(ob['vector'], next_ob['vector']):
                    # Ignore the case when the action does not change observation$vector.
                    mem_normal.append(act['vector'])
                else:
                    mem_vc.append(act['vector'])
    if only_vector_converter:
-       acts = mem_vc().reshape(-1, 64)
+       acts = np.array(mem_vc).reshape(-1, 64)
    else:
-       acts = np.concatenate((mem_normal(), mem_vc()), axis=0).reshape(-1, 64)
+       acts = np.concatenate((np.array(mem_normal), np.array(mem_vc)), axis=0).reshape(-1, 64)
    logger.debug(f'loading data... done.')
    logger.debug(f'executing keamns...')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(acts)
...
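Reviewer note: after this change the k-means cache filename depends only on the two boolean flags, and the directory layout is unchanged. A small restatement of the lookup (derived from the hunk above; kmeans_cache_path itself is not a function in the repository):

import os

def kmeans_cache_path(cache_dir, env_id, n_clusters, random_state,
                      sample_by_trajectory=False, only_vector_converter=False):
    if only_vector_converter:
        filename = 'kmeans_vector_converter.joblib'  # per-trajectory, vector-converter actions only
    elif sample_by_trajectory:
        filename = 'kmeans_normal.joblib'            # per-trajectory sampling, remaining actions
    else:
        filename = 'kmeans.joblib'                   # original batch_iter sampling
    return os.path.join(cache_dir, env_id, f'n_clusters_{n_clusters}',
                        f'random_state_{random_state}', filename)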
@@ -59,12 +59,9 @@ class KMeansActionConverter:
    def __init__(self, kmeans):
        self.kmeans = kmeans
        keys = ['craft', 'nearbyCraft', 'nearbySmelt', 'equip', 'place', 'attack', 'forward', 'camera']
-       print(keys)
        for center in kmeans.cluster_centers_:
            action = {'vector': center}
            dict = minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action(action)
-           # print(dict)
-           print([dict[k] for k in keys])
    def __call__(self, action):
        index = self.kmeans.predict(action['vector'])
...
@@ -45,7 +45,7 @@ class PoVOnlyConverter:
    def convert_space(self, space):
        pov = space['pov']
        return gym.spaces.Box(
-           low=-255, high=255,
+           low=0, high=255,
            shape=pov.shape,
            dtype=np.float32)
@@ -57,14 +57,14 @@ class VectorCombineConverter:
        scale = 1 / 255
        pov, vector = observation['pov'], observation['vector']
        num_elem = pov.shape[-3] * pov.shape[-2]
-       vector_channel = np.tile(vector, num_elem // vector.shape[-1]).reshape(*pov.shape[:-1], -1)  # noqa
+       vector_channel = np.tile((vector + 1) / 2, num_elem // vector.shape[-1]).reshape(*pov.shape[:-1], -1)  # noqa
        return np.concatenate([pov, vector_channel / scale], axis=-1)
    def convert_space(self, space):
        pov, vector = space['pov'], space['vector']
        num_new_channel = 1
        return gym.spaces.Box(
-           low=-255, high=255,
+           low=0, high=255,
            shape=(*pov.shape[:-1], pov.shape[-1] + num_new_channel),
            dtype=np.float32)
@@ -95,25 +95,3 @@ class ScaledFloatConverter:
            low=(low * self.scale).astype(np.float32),
            high=(high * self.scale).astype(np.float32),
            shape=low.shape, dtype=np.float32)
if __name__ == '__main__':
"""Testing"""
input = {
'pov': np.arange(18 * 64 * 64).reshape(2, 3, 64, 64, 3).astype(np.uint8),
'vector': np.arange(6 * 64).reshape(2, 3, 64) / 64,
}
print(input['pov'])
gray_scaled = GrayScaleConverter()(input)
for i in range(2):
for j in range(3):
assert np.allclose(
gray_scaled['pov'][i, j],
np.expand_dims(
cv2.cvtColor(input['pov'][i, j], cv2.COLOR_RGB2GRAY),
-1))
combined = VectorCombineConverter()(gray_scaled)
axis_moved = MoveAxisConverter()(combined)
print(axis_moved.shape)
assert axis_moved.shape == (2, 3, 2, 64, 64)
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-from __future__ import absolute_import
-from builtins import *  # NOQA
-from future import standard_library
-standard_library.install_aliases()  # NOQA
 from logging import getLogger
 import numpy as np
 from collections import deque, OrderedDict
@@ -58,14 +50,12 @@ def _get_aggregated_action(obs, actions, next_obs, frameskip):
 class DataPipelineWrapper:
    def __init__(self, dataset, observation_converters,
                 action_converters, frameskip=1, framestack=1,
-                subtask_reward_max=None, append_reward_channel=False,
-                reward_scale=1./1024):
+                append_reward_channel=False, reward_scale=1./1024):
        self.dataset = dataset
        self.observation_converters = observation_converters
        self.action_converters = action_converters
        self.frameskip = frameskip
        self.framestack = framestack
-       self.subtask_reward_max = subtask_reward_max
        self.append_reward_channel = append_reward_channel
        self.episode_names = self.dataset.get_trajectory_names()
        self.current_episode_name = np.random.choice(self.episode_names)
@@ -81,8 +71,6 @@ class DataPipelineWrapper:
        while len(rewards[0]) < self.frameskip * self.framestack:
            ob, ac, rw, nob, dn = next(self.batch_loader)
            self.current_reward_sum += rw
-           if self.subtask_reward_max is not None and self.current_reward_sum >= self.subtask_reward_max:
-               dn = True
            obs_pov[0].append(ob['pov'])
            obs_vector[0].append(ob['vector'])
...
@@ -23,13 +23,14 @@ def wrap_env(
        randomize_action, eval_epsilon,
        action_choices,
        action_choices_vector_converter=None,
-       append_reward_channel=True):
+       append_reward_channel=False):
    # wrap env: time limit...
-   if isinstance(env, gym.wrappers.TimeLimit):
-       logger.info('Detected `gym.wrappers.TimeLimit`! Unwrap it and re-wrap our own time limit.')
-       env = env.env
-       max_episode_steps = env.spec.max_episode_steps
-       env = ContinuingTimeLimit(env, max_episode_steps=max_episode_steps)
+   if not test:
+       if isinstance(env, gym.wrappers.TimeLimit):
+           logger.info('Detected `gym.wrappers.TimeLimit`! Unwrap it and re-wrap our own time limit.')
+           env = env.env
+           max_episode_steps = env.spec.max_episode_steps
+           env = ContinuingTimeLimit(env, max_episode_steps=max_episode_steps)
    # wrap env: observation...
    # NOTE: wrapping order matters!
@@ -228,7 +229,7 @@ class PoVWithVectorWrapper(gym.ObservationWrapper):
    def observation(self, observation):
        pov = observation['pov']
-       vector_scaled = observation['vector'] / self._vector_scale
+       vector_scaled = (observation['vector'] + 1) / 2 / self._vector_scale  # to (0, 255)
        num_elem = pov.shape[-3] * pov.shape[-2]
        vector_channel = np.tile(vector_scaled, num_elem // vector_scaled.shape[-1]).reshape(*pov.shape[:-1], -1)  # noqa
        return np.concatenate([pov, vector_channel], axis=-1)
...
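Reviewer note on the rescaling in VectorCombineConverter and PoVWithVectorWrapper above: the obfuscated vector observation lies in [-1, 1], so (vector + 1) / 2 maps it to [0, 1] and dividing by the 1/255 scale stretches it to [0, 255], which is why the observation-space bounds move from (-255, 255) to (0, 255). A quick numeric check (plain arithmetic, no repository code assumed):

import numpy as np

vector_scale = 1 / 255                      # same constant as in the wrappers
vector = np.array([-1.0, 0.0, 1.0])         # extreme and middle components of the obfuscated vector
vector_scaled = (vector + 1) / 2 / vector_scale
print(vector_scaled)                        # [  0.  127.5 255. ], i.e. within Box(low=0, high=255)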
import minerl
import copy
from logging import getLogger
from collections import OrderedDict, deque
import os
import gym
import numpy as np
import cv2
from pfrl.wrappers import ContinuingTimeLimit, RandomizeAction, Monitor
from pfrl.wrappers.atari_wrappers import ScaledFloatFrame, LazyFrames
cv2.ocl.setUseOpenCL(False)
logger = getLogger(__name__)
datasetA = minerl.data.make("MineRLObtainDiamondVectorObf-v0", num_workers=1)
datasetB = minerl.data.make("MineRLObtainDiamond-v0", num_workers=1)
traj_name="v3_villainous_black_eyed_peas_loch_ness_monster-2_3997-48203"
itrA = datasetA.load_data(traj_name)
itrB = datasetB.load_data(traj_name)
# itr = dataset.batch_iter(1, 1, -1)
class MockEnv(gym.Env):
def __init__(self, kmeans=None):
self.mock_obf = OrderedDict([
('pov', np.random.randint(256, size=(64, 64, 3), dtype=np.uint8)),
('vector', np.random.rand(64).astype(np.float32)),
])
self.observation_space = gym.spaces.Dict(
OrderedDict([
('pov', gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)),
('vector', gym.spaces.Box(low=-1, high=1, shape=(64,), dtype=np.float32)),
])
)
self.action_space = gym.spaces.Dict(
OrderedDict([
('vector', gym.spaces.Box(low=-1, high=1, shape=(64,), dtype=np.float32)),
])
)
self.count = 0
self.kmeans = kmeans
def reset(self):
self.count = 0
return self.mock_obf
def step(self, action):
"""
action = {'vector': np.random.rand(64) * 2 - 1}
for i in range(20000):
action = next(itrA)[1]
print("MineRLObtainDiamondVectorObf")
print(minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action(action))
if self.kmeans is not None:
nearest_idx = self.kmeans.predict([action['vector']])[0]
recov = self.kmeans.cluster_centers_[nearest_idx]
print("MineRLObtainDiamondVectorObf (nearest k-means)")
print(minerl.herobraine.envs.MINERL_OBTAIN_DIAMOND_OBF_V0.unwrap_action({'vector': recov}))
action_b = next(itrB)[1]
print("MineRLObtainDiamond")
print(action_b)
while not (action_b['craft'] == 'none'):
pass
print("_____")
assert False
"""
self.count += 1
return self.mock_obf, np.random.rand(), self.count > 8000, {}
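Reviewer note: MockEnv mimics the MineRL observation and action spaces with canned data, so the wrapper stack can be smoke-tested without launching Minecraft. A possible usage sketch (assumes only the class defined above):

# Exercise the mock environment directly; wrappers could be applied on top.
env = MockEnv()
obs = env.reset()
assert env.observation_space.spaces['pov'].contains(obs['pov'])
done, steps = False, 0
while not done and steps < 5:
    action = env.action_space.sample()      # random 64-dim obfuscated action
    obs, reward, done, _ = env.step(action)
    steps += 1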
@@ -13,19 +13,13 @@ class _CacheNotFound(FileNotFoundError):
    pass
-def cached_reward_boundary(cache_dir, env_id, n_groups, random_state,
-                           subtask_reward_max=None):
+def cached_reward_boundary(cache_dir, env_id, n_groups, random_state):
    if cache_dir is None:  # ignore cache
        logger.info('Load dataset & calculate boundaries')
        boundaries = _calc_boundaries(env_id=env_id, n_groups=n_groups,
-                                     random_state=random_state,
-                                     subtask_reward_max=subtask_reward_max)
+                                     random_state=random_state)
    else:
-       if subtask_reward_max is None:
-           name_subtask_reward_max = ''
-       else:
-           name_subtask_reward_max = '_{}'.format(subtask_reward_max)
-       filename = 'reward_boundaries{}.joblib'.format(name_subtask_reward_max)
+       filename = 'reward_boundaries.joblib'
        filepath = os.path.join(cache_dir, env_id, f'n_groups_{n_groups}', f'random_state_{random_state}', filename)
        try:
            boundaries = _load_result_cache(filepath)
@@ -33,15 +27,13 @@ def cached_reward_boundary(cache_dir, env_id, n_groups, random_state,
        except _CacheNotFound:
            logger.info('boundary cache not found. Load dataset & calculate boundaries & save result as cache')
            boundaries = _calc_boundaries(env_id=env_id, n_groups=n_groups,
-                                         random_state=random_state,
-                                         subtask_reward_max=subtask_reward_max)
+                                         random_state=random_state)
            _save_result_cache(boundaries, filepath)
-           print(boundaries)
    logger.debug(boundaries)
    return boundaries
-def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
+def _calc_boundaries(env_id, n_groups, random_state):
    logger.debug(f'loading data...')
    dat = minerl.data.make(env_id)
    episode_names = dat.get_trajectory_names()
@@ -65,7 +57,6 @@ def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
            cnt += 1
        distrib.append((prev, cnt))
-   print(distrib)
    # Find boundaries
    def separate(distrib, max_region_size):
        region_size = None
@@ -86,7 +77,6 @@ def _calc_boundaries(env_id, n_groups, random_state, subtask_reward_max):
            cand_min = half + 1
        else:
            cand_max = half
-   print(cand_min)
    return separate(distrib, cand_min)[1:]
...
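Reviewer note: only the subtask_reward_max plumbing and the debug prints change here; the boundary computation itself is untouched. As a rough reading of the visible pieces (an interpretation, not the repository's separate() function): distrib is a list of (cumulative_reward, count) pairs, separate() greedily opens a new region whenever the running count would exceed max_region_size, and the binary search over cand_min/cand_max finds the smallest region size that still yields at most n_groups regions. A toy illustration of that greedy split:

def greedy_split(distrib, max_region_size):
    # distrib: (reward_value, count) pairs sorted by reward_value.
    boundaries, running = [], 0
    for value, count in distrib:
        if running and running + count > max_region_size:
            boundaries.append(value)   # a new reward group starts at this value
            running = 0
        running += count
    return boundaries

print(greedy_split([(0, 50), (1, 30), (2, 10), (4, 10)], max_region_size=40))
# -> [1, 4]: three groups, rewards {0}, {1, 2} and {4}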
@@ -114,8 +114,6 @@ def main(argv=None):
    parser.add_argument('--dual-kmeans', action='store_true', default=False,
                        help='Use dual kmeans of different clustering criteria.')
    parser.add_argument('--kmeans-n-clusters-vc', type=int, default=30, help='#clusters for Dual K-means of vector converters')
-   parser.add_argument('--balanced-maxlen', type=int, default=None,
-                       help='Number of observations for each of normal and vector converters')
    # SQIL specific settings
    parser.add_argument('--max-episode-len', type=int, default=None, help='Manual maximum episode length.')
@@ -123,10 +121,6 @@ def main(argv=None):
    parser.add_argument('--experience-lambda', type=float, default=1, help='Weight coefficient of batches from experiences.')
    parser.add_argument('--option-n-groups', type=int, default=1, help='Number of options to switch polices.')
-   # misc
-   parser.add_argument('--subtask-reward-max', type=int, default=None,
-                       help='Maximum reward of each episode. MAY NOT BE OFFICIALLY ALLOWED TO USE.')
    args = parser.parse_args(args=argv)
    if args.remove_timestamp:
@@ -163,28 +157,25 @@ def _main(args):
    # K-Means
    if args.dual_kmeans:
-       if args.balanced_maxlen is not None:
-           raise ValueError('balanced_maxlen is unavailable in DualKmeans.')
        kmeans_normal = cached_kmeans(
            cache_dir=os.environ.get('KMEANS_CACHE'),
            env_id=args.env,
            n_clusters=args.kmeans_n_clusters,
            random_state=args.seed,
-           subtask_reward_max=args.subtask_reward_max,
+           sample_by_trajectory=True,
            only_vector_converter=False)
        kmeans_vector_converter = cached_kmeans(
            cache_dir=os.environ.get('KMEANS_CACHE'),
            env_id=args.env,
            n_clusters=args.kmeans_n_clusters_vc,
            random_state=args.seed,
-           subtask_reward_max=args.subtask_reward_max,
+           sample_by_trajectory=True,
            only_vector_converter=True)
    else:
        kmeans = cached_kmeans(
            cache_dir=os.environ.get('KMEANS_CACHE'),
            env_id=args.env,
            n_clusters=args.kmeans_n_clusters,
-           maxlen_each=args.balanced_maxlen,
            random_state=args.seed)
    if args.option_n_groups > 1:
        boundaries = cached_reward_boundary(
@@ -223,7 +214,6 @@ def _main(args):
        orig_data,
        observation_converters=observation_converters,
        action_converters=action_converters,
-       subtask_reward_max=args.subtask_reward_max,
        frameskip=args.frame_skip, framestack=args.frame_stack,
        append_reward_channel=(args.option_n_groups > 1),
        reward_scale=reward_channel_scale)
...
@@ -23,13 +23,12 @@ import coloredlogs
 coloredlogs.install(logging.DEBUG)
 # Agent settings
-GPU = -1
+GPU = 0
 ARCH = 'dueling'
 KMEANS_N_CLUSTERS = 30
 KMEANS_N_CLUSTERS_VC = 60
 KMEANS_SEED = 0
-SUBTASK_REWARD_MAX = 9999
 OPTION_N_GROUPS = 10
 FINAL_EPSILON = 0.01
 FINAL_EXPLORATION_FRAMES = 10 ** 6
@@ -188,7 +187,7 @@ class MineRLSQILBaselineAgent(MineRLAgentBase):
    def load_agent(self):
        boundaries = cached_reward_boundary(
-           cache_dir=os.environ.get('BOUNDARY_CACHE'),
+           cache_dir='./train/boundary_cache/',
            env_id=MINERL_GYM_ENV,
            n_groups=OPTION_N_GROUPS,
            random_state=KMEANS_SEED)
@@ -236,14 +235,14 @@ def main():
        env_id=MINERL_GYM_ENV,
        n_clusters=KMEANS_N_CLUSTERS,
        random_state=KMEANS_SEED,
-       subtask_reward_max=SUBTASK_REWARD_MAX,
+       sample_by_trajectory=True,
        only_vector_converter=False)
    kmeans_vector_converter = cached_kmeans(
        cache_dir='./train/kmeans_cache/',
        env_id=MINERL_GYM_ENV,
        n_clusters=KMEANS_N_CLUSTERS_VC,
        random_state=KMEANS_SEED,
-       subtask_reward_max=SUBTASK_REWARD_MAX,
+       sample_by_trajectory=True,
        only_vector_converter=True)
    def wrapper(env):
...
@@ -60,7 +60,7 @@ def main():
        '--eval-interval', '2500',
        '--eval-n-runs', '20',
        '--remove-timestamp',  # save to outdir/latest
-       '--dual-kmeans', '--subtask-reward-max', '9999',  # currently, it is not used for bounding observation but for avoiding conflict of kmeans_cache implementation of other baselines.
+       '--dual-kmeans',
        '--kmeans-n-clusters-vc', '60',
        '--option-n-groups', '10'])
...
No preview for this file type
No preview for this file type
No preview for this file type