# saving_experiences.py
import getopt
import os
import sys
import time

import numpy as np

import pandas as pd
from collections import deque

import gc

from flatland.envs.rail_env import RailEnv
from flatland.utils.misc import str2bool
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv

from flatland.envs.malfunction_generators import malfunction_from_file
from flatland.envs.rail_generators import rail_from_file
from flatland.envs.schedule_generators import schedule_from_file

from flatland.envs.agent_utils import RailAgentStatus

from utils.observation_utils import normalize_observation  # noqa

# from gen_envs import *
import json
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter
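# SampleBatchBuilder accumulates per-step transitions and JsonWriter dumps them as
# RLlib-compatible JSON sample batches, which can later be used as offline experience
# data (e.g. for imitation learning).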

imitate = True
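# If True, actions are replayed from the recorded expert action files below;
# otherwise random actions are sampled for the agents.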


# Legacy note for reproducing the correct expert actions:
# change the line below in the malfunction_from_file method of
# flatland/envs/malfunction_generators.py:
# mean_malfunction_rate = 1 / oMPD.malfunction_rate

def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./")

    # Setting these two flags to True slows data generation considerably;
    # note that this also overrides the --sleep-for-animation command-line option.
    visuals = False
    sleep_for_animation = False

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30
    tree_depth = 2
    trial_start = 100
    n_trials = 999
    start = 0

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO',
               'REWARD', 'NORMALIZED_REWARD',
               'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):

        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/test-envs-small/Test_0/Level_{trials}.mpk"

        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

        obs_builder_object = TreeObsForRailEnv(max_depth=tree_depth,
                                               predictor=ShortestPathPredictorForRailEnv(
                                                   max_depth))
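        # Tree observation of depth `tree_depth`, with a shortest-path predictor that
        # looks up to `max_depth` steps ahead.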

        env = RailEnv(width=1, height=1,
                      rail_generator=rail_from_file(env_file),
                      schedule_generator=schedule_from_file(env_file),
                      malfunction_generator_and_process_data=malfunction_from_file(
                          env_file),
                      obs_builder_object=obs_builder_object)
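        # The rail layout, schedule and malfunction parameters are all loaded from the
        # stored level file; the width/height passed above are placeholders that are
        # overridden when the file is read.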

        obs, info = env.reset(
            regenerate_rail=True,
            regenerate_schedule=True,
            activate_agents=False,
            random_seed=1001
        )

        with open(file, "r") as files:
            expert_actions = json.load(files)
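        # expert_actions is a list indexed by step; each entry maps an agent id
        # (as a string) to the expert action for that agent.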

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(
                expert_actions[step].values())
        else:
            # Fallback when not imitating: sample a random action per agent
            # (alternatives: a fixed [0] * n_agents, or non-uniform probabilities via p).
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

        max_steps = int(4 * 2 * (20 + env.height + env.width))
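        # Upper bound on the episode length, scaled with the grid dimensions.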

        action_size = 5  # the Flatland rail environment has 5 discrete actions

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(
                show=True, frames=True, show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(
                    obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset per-episode score and buffers
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = np.zeros(n_agents)
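        # Roll the episode forward, replaying the expert action (when imitate is True)
        # whenever an agent is required to act.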
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True

                else:
                    update_values[a] = False
                    action = 0

                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):

                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(
                        next_obs[a], tree_depth, observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
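                    # Record this transition in RLlib SampleBatch format; prev_actions /
                    # prev_rewards hold the agent's previous action and reward.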
                    start += 1

                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # put the true action probability
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / env.get_num_agents()

            if visuals:
                env_renderer.render_env(
                    show=True, frames=True, show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training progress
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
                        trials, env.get_num_agents(), x_dim, y_dim,
                        step,
                        score,
                        score / (max_steps + n_agents),
                        100 * np.mean(tasks_finished / max(
                            1, env.get_num_agents()))), end=" ")

        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[n_agents, x_dim, y_dim,
                 trials,
                 np.mean(reward_window),
                 np.mean(scores_window),
                 100 * np.mean(done_window),
                 step, action_prob / np.sum(action_prob)]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
                trials, env.get_num_agents(), x_dim, y_dim,
                step,
                np.mean(reward_window),
                np.mean(scores_window),
                100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()

        gc.collect()


if __name__ == '__main__':
    if 'argv' in globals():
        main(argv)
    else:
        main(sys.argv[1:])
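
# Note: the JSON files written by this script are RLlib offline sample batches. As a
# rough usage sketch (the path is illustrative), they could be consumed for offline /
# imitation training by pointing the trainer's offline-data input at this directory,
# e.g. config["input"] = "./".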