From 4bee50ba29d73d5f2067e60e49b8b4245b321a1a Mon Sep 17 00:00:00 2001
From: Erik Nygren <erik.nygren@sbb.ch>
Date: Thu, 13 Jun 2019 18:38:02 +0200
Subject: [PATCH] fixed normalizing function

---
 torch_training/training_navigation.py | 40 +++++++++++++++------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/torch_training/training_navigation.py b/torch_training/training_navigation.py
index f16a2c4..9659386 100644
--- a/torch_training/training_navigation.py
+++ b/torch_training/training_navigation.py
@@ -46,19 +46,21 @@ env = RailEnv(width=10,
 env.load("./railway/complex_scene.pkl")
 """
 
-env = RailEnv(width=20,
-              height=20,
-              rail_generator=complex_rail_generator(nr_start_goal=10, nr_extra=1, min_dist=8, max_dist=99999, seed=0),
+env = RailEnv(width=8,
+              height=8,
+              rail_generator=complex_rail_generator(nr_start_goal=5, nr_extra=1, min_dist=4, max_dist=99999, seed=0),
               obs_builder_object=TreeObsForRailEnv(max_depth=2, predictor=DummyPredictorForRailEnv()),
-              number_of_agents=10)
+              number_of_agents=3)
+
 env.reset(True, True)
 
 env_renderer = RenderTool(env, gl="PILSVG")
 handle = env.get_agent_handles()
 
-state_size = 147 * 2
+state_size = 168 * 2
 action_size = 5
 n_trials = 15000
+max_steps = int(1.5 * (env.height + env.width))
 eps = 1.
 eps_end = 0.005
 eps_decay = 0.9995
@@ -73,9 +75,9 @@ action_prob = [0] * action_size
 agent_obs = [None] * env.get_num_agents()
 agent_next_obs = [None] * env.get_num_agents()
 agent = Agent(state_size, action_size, "FC", 0)
-agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
+# agent.qnetwork_local.load_state_dict(torch.load('./Nets/avoid_checkpoint15000.pth'))
 
-demo = True
+demo = False
 
 def max_lt(seq, val):
     """
@@ -99,7 +101,7 @@ def min_lt(seq, val):
     min = np.inf
     idx = len(seq) - 1
     while idx >= 0:
-        if seq[idx] > val and seq[idx] < min:
+        if seq[idx] >= val and seq[idx] < min:
             min = seq[idx]
         idx -= 1
     return min
@@ -114,7 +116,8 @@ def norm_obs_clip(obs, clip_min=-1, clip_max=1):
     :return: returnes normalized and clipped observatoin
     """
     max_obs = max(1, max_lt(obs, 1000))
-    min_obs = max(0, min_lt(obs, 0))
+    min_obs = min(max_obs, min_lt(obs, 0))
+
     if max_obs == min_obs:
         return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
     norm = np.abs(max_obs - min_obs)
@@ -131,13 +134,14 @@ for trials in range(1, n_trials + 1):
         env_renderer.set_new_rail()
     final_obs = obs.copy()
     final_obs_next = obs.copy()
-
     for a in range(env.get_num_agents()):
-        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=7, current_depth=0)
+        data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(obs[a]), num_features_per_node=8,
+                                                                current_depth=0)
         data = norm_obs_clip(data)
         distance = norm_obs_clip(distance)
         agent_data = np.clip(agent_data, -1, 1)
         obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+
     for i in range(2):
         time_obs.append(obs)
     # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
@@ -147,7 +151,7 @@ for trials in range(1, n_trials + 1):
     score = 0
     env_done = 0
     # Run episode
-    for step in range(env.height * env.width):
+    for step in range(max_steps):
         if demo:
             env_renderer.renderEnv(show=True, show_observations=False)
         # print(step)
@@ -163,13 +167,12 @@ for trials in range(1, n_trials + 1):
         next_obs, all_rewards, done, _ = env.step(action_dict)
 
         for a in range(env.get_num_agents()):
-            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=7,
+            data, distance, agent_data = env.obs_builder.split_tree(tree=np.array(next_obs[a]), num_features_per_node=8,
                                                                     current_depth=0)
             data = norm_obs_clip(data)
             distance = norm_obs_clip(distance)
             agent_data = np.clip(agent_data, -1, 1)
             next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-
         time_obs.append(next_obs)
 
         # Update replay buffer and train agent
@@ -181,7 +184,7 @@ for trials in range(1, n_trials + 1):
                 final_action_dict.update({a: action_dict[a]})
             if not demo and not done[a]:
                 agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-            score += all_rewards[a]
+            score += all_rewards[a] / env.get_num_agents()
 
         agent_obs = agent_next_obs.copy()
         if done['__all__']:
@@ -193,11 +196,12 @@ for trials in range(1, n_trials + 1):
 
     eps = max(eps_end, eps_decay * eps)  # decrease epsilon
     done_window.append(env_done)
-    scores_window.append(score)  # save most recent score
+    scores_window.append(score / max_steps)  # save most recent score
     scores.append(np.mean(scores_window))
     dones_list.append((np.mean(done_window)))
 
-    print('\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+    print(
+        '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
         env.get_num_agents(),
         trials,
         np.mean(scores_window),
@@ -206,7 +210,7 @@ for trials in range(1, n_trials + 1):
 
     if trials % 100 == 0:
         print(
-            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.0f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
+            '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                 env.get_num_agents(),
                 trials,
                 np.mean(scores_window),
--
GitLab
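
Note: the heart of this commit is the normalization fix in norm_obs_clip, so the standalone sketch below reproduces that helper and the two lookups it relies on as they read after the patch, for trying the behavior outside the training script. The body of max_lt and the final return of norm_obs_clip are not visible in the hunks above and are reconstructed as assumptions; the locals max_val/min_val are cosmetic renames, and the example values at the end are purely illustrative.

import numpy as np


def max_lt(seq, val):
    # Largest entry of seq that is >= 0 and strictly smaller than val
    # (body not shown in the hunks above; reconstructed as an assumption).
    max_val = 0
    idx = len(seq) - 1
    while idx >= 0:
        if val > seq[idx] >= 0 and seq[idx] > max_val:
            max_val = seq[idx]
        idx -= 1
    return max_val


def min_lt(seq, val):
    # Smallest entry of seq that is >= val, as patched above
    # (">=" instead of ">", so exact zeros are no longer skipped).
    min_val = np.inf
    idx = len(seq) - 1
    while idx >= 0:
        if val <= seq[idx] < min_val:
            min_val = seq[idx]
        idx -= 1
    return min_val


def norm_obs_clip(obs, clip_min=-1, clip_max=1):
    # Scale the observation by its observed value range, ignoring sentinel
    # values above 1000, then clip the result to [clip_min, clip_max].
    max_obs = max(1, max_lt(obs, 1000))
    # Patched line: min_obs is now the smallest observed value >= 0, capped at
    # max_obs, instead of being forced up with max(0, ...).
    min_obs = min(max_obs, min_lt(obs, 0))
    if max_obs == min_obs:
        return np.clip(np.array(obs) / max_obs, clip_min, clip_max)
    norm = np.abs(max_obs - min_obs)
    # Final return lies outside the hunk; assumed to rescale by the range.
    return np.clip((np.array(obs) - min_obs) / norm, clip_min, clip_max)


# Example: tree-observation distances with an "unreachable" sentinel (np.inf).
raw = [3.0, 7.0, np.inf, 0.0]
print(norm_obs_clip(raw))  # approx. [0.43, 1.0, 1.0, 0.0]; inf is clipped to clip_max

With the old max(0, min_lt(obs, 0)) and the strict ">" in min_lt, an observation with no strictly positive entries made min_obs become np.inf and wiped out the normalized vector; the patched version keeps min_obs within [0, max_obs] so the rescaling stays finite.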