diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 040a2944e662cb2e4737784cf131643bbf68e91e..06c35b2d1116eae35e9a10b292f1657bd1bcc5d7 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -206,14 +206,6 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
     scores_window = deque(maxlen=checkpoint_interval)  # todo smooth when rendering instead
     completion_window = deque(maxlen=checkpoint_interval)
 
-    # IF USE_SINGLE_AGENT_TRAINING is set and the episode_idx <= MAX_SINGLE_TRAINING_ITERATION then
-    # the training gets done with single use. Each UPDATE_POLICY2_N_EPISODE the second policy get replaced
-    # with the policy (the one which get trained).
-    USE_SINGLE_AGENT_TRAINING = False
-    MAX_SINGLE_TRAINING_ITERATION = 100000
-    UPDATE_POLICY2_N_EPISODE = 200
-    USE_DEADLOCK_AVOIDANCE_AS_POLICY2 = False
-
     # Double Dueling DQN policy
     policy = DDDQNPolicy(state_size, action_size, train_params)
     if False:
@@ -263,9 +255,6 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
         preproc_timer = Timer()
         inference_timer = Timer()
 
-        if episode_idx > MAX_SINGLE_TRAINING_ITERATION:
-            USE_SINGLE_AGENT_TRAINING = False
-
         # Reset environment
         reset_timer.start()
         number_of_agents = int(min(n_agents, 1 + np.floor(episode_idx / 200)))
@@ -274,13 +263,6 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
         train_env = create_rail_env(train_env_params, tree_observation)
         obs, info = train_env.reset(regenerate_rail=True, regenerate_schedule=True)
         policy.reset()
-
-        if USE_DEADLOCK_AVOIDANCE_AS_POLICY2:
-            policy2 = DeadLockAvoidanceAgent(train_env, action_size)
-        else:
-            if episode_idx % UPDATE_POLICY2_N_EPISODE == 0:
-                policy2 = policy.clone()
-
         reset_timer.end()
 
         if train_params.render:
@@ -307,26 +289,14 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
         max_steps = train_env._max_episode_steps
 
         # Run episode
-        agent_to_learn = [0]
-        if train_env.get_num_agents() > 1:
-            agent_to_learn = np.unique(np.random.choice(train_env.get_num_agents(), train_env.get_num_agents()))
-            # agent_to_learn = np.arange(train_env.get_num_agents())
-
         for step in range(max_steps - 1):
             inference_timer.start()
             policy.start_step()
-            policy2.start_step()
             for agent_handle in train_env.get_agent_handles():
                 agent = train_env.agents[agent_handle]
                 if info['action_required'][agent_handle]:
                     update_values[agent_handle] = True
-                    if (agent_handle in agent_to_learn) or (not USE_SINGLE_AGENT_TRAINING):
-                        action = policy.act(agent_obs[agent_handle], eps=eps_start)
-                    else:
-                        if USE_DEADLOCK_AVOIDANCE_AS_POLICY2:
-                            action = policy2.act([agent_handle], eps=0.0)
-                        else:
-                            action = policy2.act(agent_obs[agent_handle], eps=0.0)
+                    action = policy.act(agent_obs[agent_handle], eps=eps_start)
 
                     action_count[action] += 1
                     actions_taken.append(action)
@@ -337,7 +307,6 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
                     action = 0
                 action_dict.update({agent_handle: action})
             policy.end_step()
-            policy2.end_step()
             inference_timer.end()
 
             # Environment step
@@ -383,13 +352,12 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
                 if update_values[agent_handle] or done['__all__']:
                     # Only learn from timesteps where somethings happened
                     learn_timer.start()
-                    if (agent_handle in agent_to_learn) or (not USE_SINGLE_AGENT_TRAINING):
-                        policy.step(agent_handle,
-                                    agent_prev_obs[agent_handle],
-                                    agent_prev_action[agent_handle],
-                                    all_rewards[agent_handle],
-                                    agent_obs[agent_handle],
-                                    done[agent_handle])
+                    policy.step(agent_handle,
+                                agent_prev_obs[agent_handle],
+                                agent_prev_action[agent_handle],
+                                all_rewards[agent_handle],
+                                agent_obs[agent_handle],
+                                done[agent_handle])
                     learn_timer.end()
 
                 agent_prev_obs[agent_handle] = agent_obs[agent_handle].copy()