diff --git a/checkpoints/201106234900-100.pth.local b/checkpoints/201106234900-100.pth.local
deleted file mode 100644
index 1b4b1cb2c430282098d599f76b71123fc84c9ba4..0000000000000000000000000000000000000000
Binary files a/checkpoints/201106234900-100.pth.local and /dev/null differ
diff --git a/checkpoints/201106234900-100.pth.target b/checkpoints/201106234900-100.pth.target
deleted file mode 100644
index 391a36c37d22af1fdc97de554f96d7ecdc0d4874..0000000000000000000000000000000000000000
Binary files a/checkpoints/201106234900-100.pth.target and /dev/null differ
diff --git a/checkpoints/201106234900-300.pth.local b/checkpoints/201106234900-300.pth.local
deleted file mode 100644
index 0da81f8a4d34395de91b3d9e516c2b7535685f46..0000000000000000000000000000000000000000
Binary files a/checkpoints/201106234900-300.pth.local and /dev/null differ
diff --git a/checkpoints/201106234900-300.pth.target b/checkpoints/201106234900-300.pth.target
deleted file mode 100644
index 8517ad36ccbddc6f4ef34e493120af6262e9f86a..0000000000000000000000000000000000000000
Binary files a/checkpoints/201106234900-300.pth.target and /dev/null differ
diff --git a/reinforcement_learning/multi_agent_training.py b/reinforcement_learning/multi_agent_training.py
index 74288e73bc182195e6b3f0778c4739ffbc487217..0f26a206d72de7bbc5393df477b500760a28eadf 100755
--- a/reinforcement_learning/multi_agent_training.py
+++ b/reinforcement_learning/multi_agent_training.py
@@ -264,7 +264,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
 
                if info['action_required'][agent]:
                    update_values[agent] = True
-                   if agent == agent_to_learn:
+                   if agent == agent_to_learn or True:
                        action = policy.act(agent_obs[agent], eps=eps_start)
                    else:
                        action = policy2.act([agent], eps=eps_start)
@@ -284,20 +284,21 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
 
            step_timer.start()
            next_obs, all_rewards, done, info = train_env.step(action_dict)
-            for agent in train_env.get_agent_handles():
-                act = action_dict.get(agent, RailEnvActions.DO_NOTHING)
-                if agent_obs[agent][26] == 1:
-                    if act == RailEnvActions.STOP_MOVING:
-                        all_rewards[agent] *= 0.01
-                else:
-                    if act == RailEnvActions.MOVE_LEFT:
-                        all_rewards[agent] *= 0.9
+            if False:
+                for agent in train_env.get_agent_handles():
+                    act = action_dict.get(agent, RailEnvActions.DO_NOTHING)
+                    if agent_obs[agent][26] == 1:
+                        if act == RailEnvActions.STOP_MOVING:
+                            all_rewards[agent] *= 0.01
                    else:
-                    if agent_obs[agent][7] == 0 and agent_obs[agent][8] == 0:
-                        if act == RailEnvActions.MOVE_FORWARD:
-                            all_rewards[agent] *= 0.01
-            if done[agent]:
-                all_rewards[agent] += 100.0
+                        if act == RailEnvActions.MOVE_LEFT:
+                            all_rewards[agent] *= 0.9
+                        else:
+                            if agent_obs[agent][7] == 0 and agent_obs[agent][8] == 0:
+                                if act == RailEnvActions.MOVE_FORWARD:
+                                    all_rewards[agent] *= 0.01
+                    if done[agent]:
+                        all_rewards[agent] += 100.0
 
            step_timer.end()
 
@@ -531,7 +532,7 @@ if __name__ == "__main__":
    parser.add_argument("--load_policy", help="policy filename (reference) to load", default="", type=str)
    parser.add_argument("--use_fast_tree_observation", help="use FastTreeObs instead of stock TreeObs",
                        action='store_true')
-    parser.add_argument("--max_depth", help="max depth", default=1, type=int)
+    parser.add_argument("--max_depth", help="max depth", default=2, type=int)
    training_params = parser.parse_args()
 
    env_params = [
diff --git a/run.py b/run.py
index a7e7e7bdda1ac152ad06a570b80e74a87dd34082..0b7108fe2fe930fae71f07b1784b96b721e97fd9 100644
--- a/run.py
+++ b/run.py
@@ -26,8 +26,7 @@ from reinforcement_learning.dddqn_policy import DDDQNPolicy
 VERBOSE = True
 
 # Checkpoint to use (remember to push it!)
-checkpoint = "./checkpoints/201106234244-400.pth"  # 15.64082361736683 Depth 1
-checkpoint = "./checkpoints/201106234900-300.pth"  # 15.64082361736683 Depth 1
+checkpoint = "./checkpoints/201106234900-5400.pth"  # 15.64082361736683 Depth 1
 
 # Use last action cache
 USE_ACTION_CACHE = False
diff --git a/utils/fast_tree_obs.py b/utils/fast_tree_obs.py
index 3b14b0f7be2379bc472a23122e982336a4421106..cd7f9c54f79e055215efad6dbecd8679e123eb52 100755
--- a/utils/fast_tree_obs.py
+++ b/utils/fast_tree_obs.py
@@ -168,7 +168,7 @@ class FastTreeObs(ObservationBuilder):
        if depth >= self.max_depth:
            return has_opp_agent, has_same_agent, has_switch, visited
 
-        # max_explore_steps = 100
+        # max_explore_steps = 100 -> just to ensure that the exploration ends
        cnt = 0
        while cnt < 100:
            cnt += 1
@@ -177,26 +177,41 @@ class FastTreeObs(ObservationBuilder):
            opp_a = self.env.agent_positions[new_position]
            if opp_a != -1 and opp_a != handle:
                if self.env.agents[opp_a].direction != new_direction:
-                    # opp agent found
+                    # opp agent found -> stop exploring. This would be a strong signal.
                    has_opp_agent = 1
                    return has_opp_agent, has_same_agent, has_switch, visited
                else:
+                    # same agent found
+                    # the agent can follow that agent, because it is still moving ahead and there shouldn't
+                    # be any deadlock or other issue -> the agent is just walking -> if the other agent had a deadlock,
+                    # that would have to be avoided by the other agents -> one edge case is when the other agent has its
+                    # target on this branch -> thus the agent should scan further to see whether there will be an
+                    # opposite agent walking on the same track
                    has_same_agent = 1
-                    return has_opp_agent, has_same_agent, has_switch, visited
+                    # do NOT stop exploring! return has_opp_agent, has_same_agent, has_switch, visited
 
-            # convert one-hot encoding to 0,1,2,3
-            agents_on_switch, \
-            agents_near_to_switch, \
-            agents_near_to_switch_all, \
-            agents_on_switch_all = \
+            # agents_on_switch == TRUE -> current cell is a switch where the agent can decide (branch) in exploration
+            # agents_near_to_switch == TRUE -> one cell before a switch where the agent can decide
+            #
+            agents_on_switch, agents_near_to_switch, _, _ = \
                self.check_agent_decision(new_position, new_direction)
+
            if agents_near_to_switch:
+                # The exploration was walking on a path where the agent cannot decide
+                # Best option would be MOVE_FORWARD -> skip exploring, just keep walking
                return has_opp_agent, has_same_agent, has_switch, visited
 
            possible_transitions = self.env.rail.get_transitions(*new_position, new_direction)
            if agents_on_switch:
                f = 0
-                for dir_loop in range(4):
+                orientation = new_direction
+                if fast_count_nonzero(possible_transitions) == 1:
+                    orientation = fast_argmax(possible_transitions)
+                for dir_loop, branch_direction in enumerate(
+                        [(orientation + dir_loop) % 4 for dir_loop in range(-1, 3)]):
+                    # branch the exploration path and aggregate the found information
+                    # --- OPEN RESEARCH QUESTION ---> is this good, or shall we use full detailed information as
+                    # we did in the TreeObservation (FLATLAND)?
                    if possible_transitions[dir_loop] == 1:
                        f += 1
                        hoa, hsa, hs, v = self._explore(handle,