Commit 2cf1b9d1 authored by u214892

#42 run baselines in ci

parent ccf03494
@@ -92,108 +92,108 @@ def main(argv):
     print("Going to run training for {} trials...".format(n_trials))
     for trials in range(1, n_trials + 1):
-        if trials % 50 == 0 and not demo:
-            x_dim = np.random.randint(8, 20)
-            y_dim = np.random.randint(8, 20)
-            n_agents = np.random.randint(3, 8)
-            n_goals = n_agents + np.random.randint(0, 3)
-            min_dist = int(0.75 * min(x_dim, y_dim))
-            env = RailEnv(width=x_dim,
-                          height=y_dim,
-                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
-                                                                max_dist=99999,
-                                                                seed=0),
-                          obs_builder_object=TreeObsForRailEnv(max_depth=3,
-                                                               predictor=ShortestPathPredictorForRailEnv()),
-                          number_of_agents=n_agents)
-            env.reset(True, True)
-            max_steps = int(3 * (env.height + env.width))
-            agent_obs = [None] * env.get_num_agents()
-            agent_next_obs = [None] * env.get_num_agents()
-        # Reset environment
-        if file_load:
-            obs = env.reset(False, False)
-        else:
-            obs = env.reset(True, True)
-        if demo:
-            env_renderer.set_new_rail()
-        obs_original = obs.copy()
-        final_obs = obs.copy()
-        final_obs_next = obs.copy()
-        for a in range(env.get_num_agents()):
-            data, distance, agent_data = split_tree(tree=np.array(obs[a]),
-                                                    current_depth=0)
-            data = norm_obs_clip(data)
-            distance = norm_obs_clip(distance)
-            agent_data = np.clip(agent_data, -1, 1)
-            obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-            agent_data = env.agents[a]
-            speed = 1  # np.random.randint(1,5)
-            agent_data.speed_data['speed'] = 1. / speed
-
-        for i in range(2):
-            time_obs.append(obs)
-        # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
-        for a in range(env.get_num_agents()):
-            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
-
-        score = 0
-        env_done = 0
-        # Run episode
-        for step in range(max_steps):
-            if demo:
-                env_renderer.renderEnv(show=True, show_observations=False)
-                # observation_helper.util_print_obs_subtree(obs_original[0])
-                if record_images:
-                    env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
-                    frame_step += 1
-            # print(step)
-            # Action
-            for a in range(env.get_num_agents()):
-                if demo:
-                    eps = 0
-                # action = agent.act(np.array(obs[a]), eps=eps)
-                action = agent.act(agent_obs[a], eps=eps)
-                action_prob[action] += 1
-                action_dict.update({a: action})
-            # Environment step
-
-            next_obs, all_rewards, done, _ = env.step(action_dict)
-            # print(all_rewards,action)
-            obs_original = next_obs.copy()
-            for a in range(env.get_num_agents()):
-                data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
-                                                        current_depth=0)
-                data = norm_obs_clip(data)
-                distance = norm_obs_clip(distance)
-                agent_data = np.clip(agent_data, -1, 1)
-                next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
-            time_obs.append(next_obs)
-
-            # Update replay buffer and train agent
-            for a in range(env.get_num_agents()):
-                agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
-                if done[a]:
-                    final_obs[a] = agent_obs[a].copy()
-                    final_obs_next[a] = agent_next_obs[a].copy()
-                    final_action_dict.update({a: action_dict[a]})
-                if not demo and not done[a]:
-                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
-                score += all_rewards[a] / env.get_num_agents()
-
-            agent_obs = agent_next_obs.copy()
-            if done['__all__']:
-                env_done = 1
-                for a in range(env.get_num_agents()):
-                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
-                break
-        # Epsilon decay
-        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
-
-        done_window.append(env_done)
-        scores_window.append(score / max_steps)  # save most recent score
-        scores.append(np.mean(scores_window))
-        dones_list.append((np.mean(done_window)))
+# if trials % 50 == 0 and not demo:
+# x_dim = np.random.randint(8, 20)
+# y_dim = np.random.randint(8, 20)
+# n_agents = np.random.randint(3, 8)
+# n_goals = n_agents + np.random.randint(0, 3)
+# min_dist = int(0.75 * min(x_dim, y_dim))
+# env = RailEnv(width=x_dim,
+# height=y_dim,
+# rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
+# max_dist=99999,
+# seed=0),
+# obs_builder_object=TreeObsForRailEnv(max_depth=3,
+# predictor=ShortestPathPredictorForRailEnv()),
+# number_of_agents=n_agents)
+# env.reset(True, True)
+# max_steps = int(3 * (env.height + env.width))
+# agent_obs = [None] * env.get_num_agents()
+# agent_next_obs = [None] * env.get_num_agents()
+# # Reset environment
+# if file_load:
+# obs = env.reset(False, False)
+# else:
+# obs = env.reset(True, True)
+# if demo:
+# env_renderer.set_new_rail()
+# obs_original = obs.copy()
+# final_obs = obs.copy()
+# final_obs_next = obs.copy()
+# for a in range(env.get_num_agents()):
+# data, distance, agent_data = split_tree(tree=np.array(obs[a]),
+# current_depth=0)
+# data = norm_obs_clip(data)
+# distance = norm_obs_clip(distance)
+# agent_data = np.clip(agent_data, -1, 1)
+# obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+# agent_data = env.agents[a]
+# speed = 1 # np.random.randint(1,5)
+# agent_data.speed_data['speed'] = 1. / speed
+#
+# for i in range(2):
+# time_obs.append(obs)
+# # env.obs_builder.util_print_obs_subtree(tree=obs[0], num_elements_per_node=5)
+# for a in range(env.get_num_agents()):
+# agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+#
+# score = 0
+# env_done = 0
+# # Run episode
+# for step in range(max_steps):
+# if demo:
+# env_renderer.renderEnv(show=True, show_observations=False)
+# # observation_helper.util_print_obs_subtree(obs_original[0])
+# if record_images:
+# env_renderer.gl.saveImage("./Images/flatland_frame_{:04d}.bmp".format(frame_step))
+# frame_step += 1
+# # print(step)
+# # Action
+# for a in range(env.get_num_agents()):
+# if demo:
+# eps = 0
+# # action = agent.act(np.array(obs[a]), eps=eps)
+# action = agent.act(agent_obs[a], eps=eps)
+# action_prob[action] += 1
+# action_dict.update({a: action})
+# # Environment step
+#
+# next_obs, all_rewards, done, _ = env.step(action_dict)
+# # print(all_rewards,action)
+# obs_original = next_obs.copy()
+# for a in range(env.get_num_agents()):
+# data, distance, agent_data = split_tree(tree=np.array(next_obs[a]),
+# current_depth=0)
+# data = norm_obs_clip(data)
+# distance = norm_obs_clip(distance)
+# agent_data = np.clip(agent_data, -1, 1)
+# next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
+# time_obs.append(next_obs)
+#
+# # Update replay buffer and train agent
+# for a in range(env.get_num_agents()):
+# agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
+# if done[a]:
+# final_obs[a] = agent_obs[a].copy()
+# final_obs_next[a] = agent_next_obs[a].copy()
+# final_action_dict.update({a: action_dict[a]})
+# if not demo and not done[a]:
+# agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
+# score += all_rewards[a] / env.get_num_agents()
+#
+# agent_obs = agent_next_obs.copy()
+# if done['__all__']:
+# env_done = 1
+# for a in range(env.get_num_agents()):
+# agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
+# break
+# # Epsilon decay
+# eps = max(eps_end, eps_decay * eps) # decrease epsilon
+#
+# done_window.append(env_done)
+# scores_window.append(score / max_steps) # save most recent score
+# scores.append(np.mean(scores_window))
+# dones_list.append((np.mean(done_window)))
         print(
             '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
...
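For reviewers unfamiliar with the block being commented out above: it stacks the two most recent preprocessed tree observations per agent before calling agent.act. Below is a minimal, self-contained Python sketch of that pattern. It is not part of this commit; norm_clip, preprocess and fake_tree_obs are hypothetical stand-ins for the repo's norm_obs_clip/split_tree helpers and the Flatland environment, not its actual API.

from collections import deque

import numpy as np


def norm_clip(values, clip_min=-1.0, clip_max=1.0):
    # Rough stand-in for norm_obs_clip: scale by the largest finite entry and clip.
    finite = values[np.isfinite(values)]
    max_obs = max(1.0, float(np.max(finite))) if finite.size else 1.0
    return np.clip(values / max_obs, clip_min, clip_max)


def preprocess(raw_obs):
    # Mirrors the data / distance / agent_data split-and-concatenate step above.
    data, distance, agent_data = raw_obs
    return np.concatenate((norm_clip(data), norm_clip(distance), np.clip(agent_data, -1, 1)))


n_agents = 3
time_obs = deque(maxlen=2)  # keeps only the two most recent frames


def fake_tree_obs():
    # Hypothetical stand-in for env.reset()/env.step() observations:
    # one (data, distance, agent_data) tuple of float arrays per agent.
    return {a: (np.random.rand(50), np.random.rand(10), np.random.rand(5)) for a in range(n_agents)}


# At reset the same frame is appended twice, as in the "for i in range(2)" block.
first = {a: preprocess(o) for a, o in fake_tree_obs().items()}
for _ in range(2):
    time_obs.append(first)

# Each step: preprocess the new frame, append it, and feed the agent the
# concatenation of the previous and current frames.
new = {a: preprocess(o) for a, o in fake_tree_obs().items()}
time_obs.append(new)
agent_obs = {a: np.concatenate((time_obs[0][a], time_obs[1][a])) for a in range(n_agents)}
print(agent_obs[0].shape)  # (130,) with the toy sizes above: 2 * (50 + 10 + 5)

With the toy sizes assumed here each stacked observation has length 130; the real sizes depend on the tree depth used by TreeObsForRailEnv.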