From 23e98b594dcb2c54832413317de422f1a7d2628e Mon Sep 17 00:00:00 2001
From: Erik Nygren <erik.nygren@sbb.ch>
Date: Sun, 1 Sep 2019 13:58:39 -0400
Subject: [PATCH] only store observations in buffer when you are allowed to
 take an action!

---
 .../observation_builders/observations.py |  2 +-
 torch_training/render_agent_behavior.py  |  8 +--
 torch_training/training_navigation.py    | 58 +------------------
 utils/observation_utils.py               |  3 +-
 4 files changed, 8 insertions(+), 63 deletions(-)

diff --git a/torch_training/observation_builders/observations.py b/torch_training/observation_builders/observations.py
index 10bd1f0..70e5840 100644
--- a/torch_training/observation_builders/observations.py
+++ b/torch_training/observation_builders/observations.py
@@ -346,7 +346,7 @@ class TreeObsForRailEnv(ObservationBuilder):
         unusable_switch = np.inf
         other_agent_same_direction = 0
         other_agent_opposite_direction = 0
-        malfunctioning_agent = 0
+        malfunctioning_agent = 0.
         min_fractional_speed = 1.
         num_steps = 1
         while exploring:
diff --git a/torch_training/render_agent_behavior.py b/torch_training/render_agent_behavior.py
index fc0e067..651ec3e 100644
--- a/torch_training/render_agent_behavior.py
+++ b/torch_training/render_agent_behavior.py
@@ -103,7 +103,7 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-with path(torch_training.Nets, "navigator_checkpoint1200.pth") as file_in:
+with path(torch_training.Nets, "navigator_checkpoint500.pth") as file_in:
     agent.qnetwork_local.load_state_dict(torch.load(file_in))
 
 record_images = False
@@ -126,14 +126,12 @@ for trials in range(1, n_trials + 1):
 
         # Action
         for a in range(env.get_num_agents()):
-            action = agent.act(agent_obs[a], eps=0)
+            action = agent.act(agent_obs[a], eps=0.)
             action_prob[action] += 1
             action_dict.update({a: action})
-
         # Environment step
         obs, all_rewards, done, _ = env.step(action_dict)
-
-        env_renderer.render_env(show=True, show_predictions=False, show_observations=False)
+        env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
         # Build agent specific observations and normalize
         for a in range(env.get_num_agents()):
             agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index 25b8c14..2417746 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -71,11 +71,9 @@ def main(argv):
                   number_of_agents=n_agents,
                   stochastic_data=stochastic_data,  # Malfunction data generator
                   obs_builder_object=TreeObservation)
-
     env.reset(True, True)
     # After training we want to render the results so we also load a renderer
     env_renderer = RenderTool(env, gl="PILSVG", )
-
     # Given the depth of the tree observation and the number of features per node we get the following state_size
     num_features_per_node = env.obs_builder.observation_dim
     tree_depth = 2
@@ -104,7 +102,6 @@ def main(argv):
     final_action_dict = dict()
     scores_window = deque(maxlen=100)
     done_window = deque(maxlen=100)
-    time_obs = deque(maxlen=2)
     scores = []
     dones_list = []
     action_prob = [0] * action_size
@@ -114,8 +111,6 @@ def main(argv):
 
     # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size, "FC", 0)
-    Training = True
-
     for trials in range(1, n_trials + 1):
 
         # Reset environment
@@ -126,19 +121,17 @@ def main(argv):
 
         # Build agent specific observations
         for a in range(env.get_num_agents()):
-            agent_obs[a] = agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
+            agent_obs[a] = normalize_observation(obs[a], observation_radius=10)
 
         # Reset score and done
         score = 0
         env_done = 0
 
-
         # Run episode
         for step in range(max_steps):
-
             # Action
             for a in range(env.get_num_agents()):
-                if env.agents[a].speed_data['position_fraction'] == 0.:
+                if env.agents[a].speed_data['position_fraction'] < 0.001:
                     register_action_state[a] = True
                 else:
                     register_action_state[a] = False
@@ -166,7 +159,6 @@ def main(argv):
 
             # Copy observation
             agent_obs = agent_next_obs.copy()
-
             if done['__all__']:
                 env_done = 1
                 for a in range(env.get_num_agents()):
@@ -206,52 +198,6 @@ def main(argv):
                        './Nets/navigator_checkpoint' + str(trials) + '.pth')
             action_prob = [1] * action_size
 
-    # Render the trained agent
-
-    # Reset environment
-    obs = env.reset(True, True)
-    env_renderer.set_new_rail()
-
-    # Split the observation tree into its parts and normalize the observation using the utility functions.
-    # Build agent specific local observation
-    for a in range(env.get_num_agents()):
-        rail_data, distance_data, agent_data = split_tree(tree=np.array(obs[a]),
-                                                          num_features_per_node=num_features_per_node,
-                                                          current_depth=0)
-        rail_data = norm_obs_clip(rail_data)
-        distance_data = norm_obs_clip(distance_data)
-        agent_data = np.clip(agent_data, -1, 1)
-        agent_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-    # Reset score and done
-    score = 0
-    env_done = 0
-
-    # Run episode
-    for step in range(max_steps):
-        env_renderer.render_env(show=True, show_observations=False)
-
-        # Chose the actions
-        for a in range(env.get_num_agents()):
-            eps = 0
-            action = agent.act(agent_obs[a], eps=eps)
-            action_dict.update({a: action})
-
-        # Environment step
-        next_obs, all_rewards, done, _ = env.step(action_dict)
-
-        for a in range(env.get_num_agents()):
-            rail_data, distance_data, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                              num_features_per_node=num_features_per_node,
-                                                              current_depth=0)
-            rail_data = norm_obs_clip(rail_data)
-            distance_data = norm_obs_clip(distance_data)
-            agent_data = np.clip(agent_data, -1, 1)
-            agent_next_obs[a] = np.concatenate((np.concatenate((rail_data, distance_data)), agent_data))
-
-        agent_obs = agent_next_obs.copy()
-        if done['__all__']:
-            break
     # Plot overall training progress at the end
     plt.plot(scores)
     plt.show()
diff --git a/utils/observation_utils.py b/utils/observation_utils.py
index b4badeb..7352601 100644
--- a/utils/observation_utils.py
+++ b/utils/observation_utils.py
@@ -95,6 +95,7 @@ def split_tree(tree, num_features_per_node, current_depth=0):
         tree_data.extend(tmp_tree_data)
         distance_data.extend(tmp_distance_data)
         agent_data.extend(tmp_agent_data)
+
     return tree_data, distance_data, agent_data
 
 
@@ -103,6 +104,6 @@ def normalize_observation(observation, num_features_per_node=11, observation_rad
                                              current_depth=0)
     data = norm_obs_clip(data, fixed_radius=observation_radius)
     distance = norm_obs_clip(distance, normalize_to_range=True)
-    agent_data = np.clip(agent_data, -1, 20)
+    agent_data = np.clip(agent_data, -1, 1)
     normalized_obs = np.concatenate((np.concatenate((data, distance)), agent_data))
     return normalized_obs
-- 
GitLab
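
Note on the core change (appended for context, not part of the patch itself):
with fractional agent speeds in Flatland, an agent can only choose a new
action when it sits on a cell boundary, i.e. when
speed_data['position_fraction'] is (near) zero. The commit therefore queries
the policy and stores a transition in the replay buffer only on those steps,
and replaces the exact "== 0." test with a small tolerance to sidestep
floating-point comparison. Below is a minimal sketch of that gating idea;
"env", "agent", and "normalize_observation" are stand-ins for the objects used
in training_navigation.py, not the exact API.

    def run_episode(env, agent, agent_obs, max_steps, eps):
        """Act and learn only on steps where the env registers a new action."""
        for step in range(max_steps):
            action_dict, registered = {}, {}
            for a in range(env.get_num_agents()):
                # A new action is only accepted on a cell boundary; the small
                # tolerance guards against floating-point drift in the fraction.
                registered[a] = env.agents[a].speed_data['position_fraction'] < 0.001
                action_dict[a] = agent.act(agent_obs[a], eps=eps) if registered[a] else 0

            next_obs, all_rewards, done, _ = env.step(action_dict)

            for a in range(env.get_num_agents()):
                next_state = normalize_observation(next_obs[a], observation_radius=10)
                if registered[a]:
                    # Store the transition only when the chosen action was
                    # actually registered; otherwise the buffer would pair
                    # states with decisions that had no effect on the env.
                    agent.step(agent_obs[a], action_dict[a], all_rewards[a],
                               next_state, done[a])
                agent_obs[a] = next_state
            if done['__all__']:
                break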
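
The observation_utils.py change tightens the clipping of the agent-state
features from [-1, 20] to [-1, 1], so all channels of the concatenated
observation (tree data, distances, agent data) reach the Q-network on a
comparable scale. A quick illustration with made-up feature values:

    import numpy as np

    agent_data = np.array([0.5, 3.0, -4.0, 20.0])  # raw agent-state features
    print(np.clip(agent_data, -1, 1))              # [ 0.5  1.  -1.   1. ]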