"""Trains an R2D3 agent on the MineRL obfuscated ObtainDiamond environment.

Builds a demonstration generator from the MineRL expert dataset, discretizes the
continuous action space via k-means, and runs the Acme environment loop.
"""
import json
import select
import time
import logging
import functools
import os
import tqdm
from pathlib import Path

import aicrowd_helper
import gym
import minerl
from utility.parser import Parser
from typing import List, Any
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical  

from collections import OrderedDict
from sklearn.cluster import MiniBatchKMeans

import coloredlogs
coloredlogs.install(logging.INFO)
logger = logging.getLogger(__name__)

# Acme dependencies
import acme
import tree
from acme import specs
from acme import types
from acme.agents.tf import r2d3
from acme import wrappers
from acme.wrappers.minerl_wrapper import OVAR
from acme.wrappers import MineRLWrapper
from acme.tf import networks

import dm_env

from acme.utils import loggers
#logger = loggers.TerminalLogger(label='minerl', time_delta=10.)

# Number of discrete actions produced by the k-means action discretization.
NUMBER_OF_DISCRETE_ACTIONS = 25
# All evaluations are run on the MineRLObtainDiamond-v0 environment.
MINERL_GYM_ENV = os.getenv('MINERL_GYM_ENV', 'MineRLObtainDiamondVectorObf-v0')
# Your submission must be trained in under MINERL_TRAINING_MAX_STEPS steps.
MINERL_TRAINING_MAX_STEPS = int(os.getenv('MINERL_TRAINING_MAX_STEPS', 8000000))
# Your submission must launch fewer than MINERL_TRAINING_MAX_INSTANCES instances.
MINERL_TRAINING_MAX_INSTANCES = int(os.getenv('MINERL_TRAINING_MAX_INSTANCES', 5))
# Your submission must be trained within the allowed training time.
# Round 1: training timeout is 15 minutes.
# Round 2: training timeout is 4 days.
MINERL_TRAINING_TIMEOUT = int(os.getenv('MINERL_TRAINING_TIMEOUT_MINUTES', 4*24*60))
# The dataset is available in the data/ directory from the repository root.
MINERL_DATA_ROOT = os.getenv('MINERL_DATA_ROOT', '/data')

# Optional: You can view the best-effort status of your instances with the help of parser.py.
# It reports the current state, such as the number of steps completed and instances launched.
# Make sure you keep tabs on these numbers to avoid breaching any limits.
rel_path = os.path.dirname(__file__) # relative directory path
performance_dir = os.path.join(rel_path, "performance")
Path(performance_dir).mkdir(parents=True, exist_ok=True)

parser = Parser('performance/',
                allowed_environment=MINERL_GYM_ENV,
                maximum_instances=MINERL_TRAINING_MAX_INSTANCES,
                maximum_steps=MINERL_TRAINING_MAX_STEPS,
                raise_on_error=False,
                no_entry_poll_timeout=600,
                submission_timeout=MINERL_TRAINING_TIMEOUT*60,
                initial_poll_timeout=600)

def create_network(nb_actions: int = NUMBER_OF_DISCRETE_ACTIONS) -> networks.RNNCore:
    """Creates the policy network"""
    return networks.R2D2MineRLNetwork(nb_actions)


def make_environment(k_means_path: str,
                     num_actions: int = NUMBER_OF_DISCRETE_ACTIONS,
                     dat_loader: minerl.data.data_pipeline.DataPipeline = None, 
                     train: bool = True,
                     minerl_gym_env: str = MINERL_GYM_ENV) -> dm_env.Environment:
  """
  Wrap the environment in:
    1 - MineRLWrapper 
        - similar to OAR but add proprioceptive features
        - kMeans to map cont action space to a discrete one
    2 - SinglePrecisionWrapper
    3 - GymWrapper
  """

  env = gym.make(minerl_gym_env)
      
  return wrappers.wrap_all(env, [
      wrappers.GymWrapper,
      functools.partial(
          wrappers.MineRLWrapper,
          num_actions=num_actions,
          dat_loader=dat_loader,
          k_means_path=k_means_path,
          train=train
      ),
      wrappers.SinglePrecisionWrapper,
  ])
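
# Sketch of the k-means action discretization that MineRLWrapper is described to
# perform (an assumption for reference only; the real fitting happens inside the wrapper,
# and `continuous_action` stands in for a raw env action dict):
#   kmeans = MiniBatchKMeans(n_clusters=num_actions)
#   for _, act, _, _, _ in dat_loader.batch_iter(batch_size=32, seq_len=1, num_epochs=1):
#       kmeans.partial_fit(act['vector'].reshape(-1, 64))
#   discrete_action = kmeans.predict(continuous_action['vector'].reshape(1, 64))[0]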

def _nested_stack(sequence: List[Any]):
  """Stack nested elements in a sequence."""
  return tree.map_structure(lambda *x: np.stack(x), *sequence)

class DemonstrationRecorder:
  """Generates (TimeStep, action) tuples from replayed expert trajectories."""

  def __init__(self, environment: dm_env.Environment):
    self._demos = []
    self._environment = environment
    self.k_means = environment.k_means
    self.num_classes = self.k_means.n_clusters
    self._prev_action: types.NestedArray 
    self._prev_reward: types.NestedArray
    self._reset_episode()

  def map_action(self, action: types.NestedArray) -> types.NestedArray:
    # Map the continuous 64-d action vector to a discrete k-means cluster id for the agent.
    action = action['vector'].reshape(1, 64)
    action = self.k_means.predict(action)[0]
    return action

  def step(self, timestep: dm_env.TimeStep, action: np.ndarray):
    reward = np.array(timestep.reward or 0., np.float32)
    self._episode_reward += reward
    # This imitates the environment step to create data in the same format.
    new_timestep = self._augment_observation(timestep)
    discrete_action = self.map_action(action)
    self._prev_action = discrete_action
    self._prev_reward = reward
    return (new_timestep, discrete_action)

  def _augment_observation(self, timestep: dm_env.TimeStep) -> dm_env.TimeStep:
    """Packs pov, obs vector, previous action and previous reward into an OVAR observation."""
    ovar = OVAR(observation=timestep.observation['pov'].astype(np.float32),
                obs_vector=timestep.observation['vector'].astype(np.float32),
                action=self._prev_action,
                reward=self._prev_reward)
    return timestep._replace(observation=ovar)

  def record_episode(self):
    logger.info(f"episode length of {len(self._episode)}")
    self._demos.append(_nested_stack(self._episode))
    self._reset_episode()

  def _reset_episode(self):
    self._episode = []
    self._episode_reward = 0
    self._prev_action = tree.map_structure(
        lambda x: x.generate_value(), self._environment.action_spec())
    self._prev_reward = tree.map_structure(
        lambda x: x.generate_value(), self._environment.reward_spec())

  @property
  def episode_reward(self):
    return self._episode_reward
  
  def _change_shape(self, shape):
    """Makes the leading (time) dimension unspecified so episode lengths can vary."""
    shape = list(shape)
    shape[0] = None
    return tuple(shape)
  
  def _change_type(self, _type):
    """Casts float64 down to float32; leaves other dtypes unchanged."""
    if _type == np.dtype('float64'):
      return np.dtype('float32')
    else:
      return _type

  def make_tf_dataset(self):
    dtypes = tree.map_structure(lambda x: self._change_type(x.dtype), self._demos[0])
    shapes = tree.map_structure(lambda x: self._change_shape(x.shape), self._demos[0])
    logger.info({"types": dtypes})
    ds = tf.data.Dataset.from_generator(lambda: self._demos, dtypes, shapes)
    return ds.repeat().shuffle(len(self._demos))
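
# Usage sketch (as generate_demonstration does below): the recorder converts each raw
# dataset step into the (TimeStep, discrete action) form the agent consumes, e.g.
#   recorder = DemonstrationRecorder(environment)
#   new_ts, discrete_a = recorder.step(timestep, raw_action)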

def generate_demonstration(env: dm_env.Environment,
                           dat_loader: minerl.data.data_pipeline.DataPipeline,
                           nb_experts: int = 20):
  # Build demonstrations from the first `nb_experts` expert trajectories.
  recorder = DemonstrationRecorder(env)
  recorder._reset_episode()
  # replay trajectories
  trajectories = dat_loader.get_trajectory_names()
  t = 0
  for t, trajectory in enumerate(trajectories):
      if t < nb_experts:
        logger.info({str(t): trajectory})
        for i, (state, a, r, _, done, meta) in enumerate(dat_loader.load_data(trajectory, include_metadata=True)):
          if done:
            step_type = dm_env.StepType.LAST
          elif i == 0:
            step_type = dm_env.StepType.FIRST
          else:
            step_type = dm_env.StepType.MID
          ts = dm_env.TimeStep(observation=state, 
                               reward=r, 
                               step_type=step_type, 
                               discount=np.array(1., dtype=np.float32))
          yield recorder.step(ts, a)
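
# Usage sketch: the R2D3 agent consumes this generator lazily (see main() below), e.g.
#   gen = generate_demonstration(environment, data)
#   first_timestep, first_discrete_action = next(gen)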

def main():
    """
    This function will be called for training phase.
    """
    rel_path = os.path.dirname(__file__) # relative directory path
    model_dir = os.path.join(rel_path, "train")
    Path(model_dir).mkdir(parents=True, exist_ok=True)

    burn_in_length = 40
    trace_length = 40

    # Create data loader
    logger.info((MINERL_GYM_ENV, MINERL_DATA_ROOT))
    data = minerl.data.make(MINERL_GYM_ENV, 
                            data_dir=MINERL_DATA_ROOT, 
                            num_workers=1,
                            worker_batch_size=4)

    # Create env
    logger.info("creating environment")
    environment = make_environment(num_actions=NUMBER_OF_DISCRETE_ACTIONS,
                                   k_means_path=model_dir,
                                   train=True,
                                   dat_loader=data)
    spec = specs.make_environment_spec(environment)

    # # Create a logger for the agent and environment loop.
    # agent_logger = loggers.TerminalLogger(label='agent', time_delta=10.)
    # env_loop_logger = loggers.TerminalLogger(label='env_loop', time_delta=10.)

    # Build demonstrations
    logger.info("building the demonstration dataset")
    generator = generate_demonstration(environment, data)
    logger.info("demonstration dataset is built..")

    # Construct the online and target networks.
    network = create_network()
    target_network = create_network()

    logger.info(f"model directory: {model_dir}")
    # sequence_length = burn_in_length + trace_length
    agent = r2d3.R2D3(
        model_directory=model_dir,
        environment_spec=spec,
        network=network,
        target_network=target_network,
        demonstration_generator=generator,
        demonstration_ratio=0.1,
        batch_size=8,
        samples_per_insert=2,
        min_replay_size=1000,
        max_replay_size=10_000,
        burn_in_length=burn_in_length,
        trace_length=trace_length,
        replay_period=40, # per R2D3 paper.
        checkpoint=True,
        #logger=agent_logger
    )

    # Run the env loop
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_steps=MINERL_TRAINING_MAX_STEPS)  # pytype: disable=attribute-error

    # Save trained model to train/ directory
    # Training 100% Completed
    aicrowd_helper.register_progress(1)
    environment.close()


if __name__ == "__main__":
  main()