import getopt
import os
import sys
import time
import gc
from collections import deque

import numpy as np
import pandas as pd

from flatland.envs.rail_env import RailEnv
from flatland.utils.misc import str2bool
from flatland.envs.observations import TreeObsForRailEnv
from flatland.envs.predictions import ShortestPathPredictorForRailEnv

from flatland.envs.malfunction_generators import malfunction_from_file
from flatland.envs.rail_generators import rail_from_file
from flatland.envs.schedule_generators import schedule_from_file

from flatland.envs.agent_utils import RailAgentStatus

from utils.observation_utils import normalize_observation  # noqa

# from gen_envs import *
import json
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

imitate = True  # replay recorded expert actions instead of sampling random ones


# Legacy note for reproducing the correct expert actions:
# change the line below in malfunction_from_file() in
# flatland/envs/malfunction_generators.py:
#     mean_malfunction_rate = 1/oMPD.malfunction_rate

def main(args):
    try:
        opts, args = getopt.getopt(args, "", ["sleep-for-animation="])
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        sys.exit(2)
    sleep_for_animation = True
    for o, a in opts:
        if o == "--sleep-for-animation":
            sleep_for_animation = str2bool(a)
        else:
            assert False, "unhandled option"

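    # RLlib offline-data plumbing: SampleBatchBuilder accumulates per-step
    # transitions and JsonWriter writes finished episodes as JSON sample
    # batches into the current directory.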
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter("./")

    # Setting these two parameters to True can slow down training
    visuals = False
    sleep_for_animation = False  # overrides the --sleep-for-animation flag parsed above

    if visuals:
        from flatland.utils.rendertools import RenderTool

    max_depth = 30  # lookahead depth for the shortest-path predictor
    tree_depth = 2  # depth of the tree observation
    trial_start = 100
    n_trials = 999
    start = 0

    columns = ['Agents', 'X_DIM', 'Y_DIM', 'TRIAL_NO',
               'REWARD', 'NORMALIZED_REWARD',
               'DONE_RATIO', 'STEPS', 'ACTION_PROB']
    df_all_results = pd.DataFrame(columns=columns)

    for trials in range(trial_start, n_trials + 1):

        env_file = f"envs-100-999/envs/Level_{trials}.pkl"
        # env_file = f"../env_configs/test-envs-small/Test_0/Level_{trials}.mpk"

        # file = f"../env_configs/actions-small/Test_0/Level_{trials}.mpk"
        file = f"envs-100-999/actions/envs/Level_{trials}.json"

        if not os.path.isfile(env_file) or not os.path.isfile(file):
            print("Missing file!", env_file, file)
            continue

        step = 0

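        # Tree observation builder with a shortest-path predictor; the raw tree
        # observations are later flattened via normalize_observation().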
        obs_builder_object = TreeObsForRailEnv(max_depth=tree_depth,
                                               predictor=ShortestPathPredictorForRailEnv(
                                                   max_depth))

        env = RailEnv(width=1, height=1,  # actual dimensions are loaded from env_file
                      rail_generator=rail_from_file(env_file),
                      schedule_generator=schedule_from_file(env_file),
                      malfunction_generator_and_process_data=malfunction_from_file(
                          env_file),
                      obs_builder_object=obs_builder_object)

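        # Reset regenerates rail, schedule and malfunctions from the loaded
        # files; the fixed seed keeps the episode reproducible.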
        obs, info = env.reset(
            regenerate_rail=True,
            regenerate_schedule=True,
            activate_agents=False,
            random_seed=1001
        )

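        # Expert actions are stored as a list indexed by step, each entry being
        # a dict mapping agent id (as a string) to the action taken.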
        with open(file, "r") as files:
            expert_actions = json.load(files)

        n_agents = env.get_num_agents()
        x_dim, y_dim = env.width, env.height

        agent_obs = [None] * n_agents
        agent_obs_buffer = [None] * n_agents
        done = dict()
        done["__all__"] = False

        if imitate:
            agent_action_buffer = list(
                expert_actions[step].values())
        else:
            # fall back to random actions (optionally weighted via numpy's `p` argument)
            agent_action_buffer = np.random.choice(5, n_agents, replace=True)
        update_values = [False] * n_agents

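        # Episode step budget following the usual Flatland limit, scaled by the
        # environment size.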
        max_steps = int(4 * 2 * (20 + env.height + env.width))

        action_size = 5  # Flatland's discrete action space has 5 actions

        # And some variables to keep track of the progress
        action_dict = dict()
        scores_window = deque(maxlen=100)
        reward_window = deque(maxlen=100)
        done_window = deque(maxlen=100)
        action_prob = [0] * action_size

        # agent = Agent(state_size, action_size)

        if visuals:
            env_renderer = RenderTool(env, gl="PILSVG")
            env_renderer.render_env(
                show=True, frames=True, show_observations=True)

        for a in range(n_agents):
            if obs[a]:
                agent_obs[a] = normalize_observation(
                    obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset the episode score and the previous-action buffer
        score = 0
        agent_action_buffer = np.zeros(n_agents)
        # prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = np.zeros(n_agents)
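        # Main episode loop: replay the expert action for every agent that must
        # act (default to 0 / DO_NOTHING otherwise) and record each transition.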
        for step in range(max_steps):
            for a in range(n_agents):
                if info['action_required'][a]:
                    if imitate:
                        if step < len(expert_actions):
                            action = expert_actions[step][str(a)]
                        else:
                            action = 0
                    else:
                        action = 0

                    action_prob[action] += 1
                    update_values[a] = True

                else:
                    update_values[a] = False
                    action = 0

                action_dict.update({a: action})

            next_obs, all_rewards, done, info = env.step(action_dict)

            for a in range(n_agents):

                if next_obs[a] is not None:
                    agent_obs[a] = normalize_observation(
                        next_obs[a], tree_depth, observation_radius=10)

                # Only update the values when we are done or when an action
                # was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    start += 1

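                    # Note: every agent is logged under agent_index=0, so the
                    # multi-agent transitions end up in a single flat stream.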
                    batch_builder.add_values(
                        t=step,
                        eps_id=trials,
                        agent_index=0,
                        obs=agent_obs_buffer[a],
                        actions=action_dict[a],
                        action_prob=1.0,  # placeholder; use the true behaviour-policy probability if known
                        rewards=all_rewards[a],
                        prev_actions=agent_action_buffer[a],
                        prev_rewards=prev_reward[a],
                        dones=done[a],
                        infos=info['action_required'][a],
                        new_obs=agent_obs[a])

                agent_obs_buffer[a] = agent_obs[a].copy()
                agent_action_buffer[a] = action_dict[a]
                prev_reward[a] = all_rewards[a]

                score += all_rewards[a]  # / env.get_num_agents()

            if visuals:
                env_renderer.render_env(
                    show=True, frames=True, show_observations=True)
                if sleep_for_animation:
                    time.sleep(0.5)

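            # Flush the episode to disk as one RLlib sample batch once all
            # agents are done (or the step budget is exhausted).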
            if done["__all__"] or step > max_steps:
                writer.write(batch_builder.build_and_reset())
                break

            # Collect information about training progress
            if step % 100 == 0:
                tasks_finished = 0
                for current_agent in env.agents:
                    if current_agent.status == RailAgentStatus.DONE_REMOVED:
                        tasks_finished += 1
                print(
                    '\rTrial No {} Training {} Agents on ({},{}).\t Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
                        trials, env.get_num_agents(), x_dim, y_dim,
                        step,
                        score,
                        score / (max_steps + n_agents),
                        100 * np.mean(tasks_finished / max(
                            1, env.get_num_agents()))), end=" ")

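        # End-of-trial bookkeeping: completion rate and (normalized) episode
        # scores over a sliding window of recent trials.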
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        reward_window.append(score)
        scores_window.append(score / (max_steps + n_agents))

        data = [[n_agents, x_dim, y_dim,
                 trials,
                 np.mean(reward_window),
                 np.mean(scores_window),
                 100 * np.mean(done_window),
                 step, action_prob / np.sum(action_prob)]]

        df_cur = pd.DataFrame(data, columns=columns)
        df_all_results = pd.concat([df_all_results, df_cur])

        if imitate:
            df_all_results.to_csv(
                'TreeImitationLearning_DQN_TrainingResults.csv', index=False)

        print(
            '\rTrial No {} Training {} Agents on ({},{}).\t Total Steps {}\t Reward: {:.3f}\t Normalized Reward: {:.3f}\tDones: {:.2f}%\t'.format(
                trials, env.get_num_agents(), x_dim, y_dim,
                step,
                np.mean(reward_window),
                np.mean(scores_window),
                100 * np.mean(done_window)))

        if visuals:
            env_renderer.close_window()

        gc.collect()


if __name__ == '__main__':
    main(sys.argv[1:])
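
# Rough usage sketch (assumption, not part of this script): the JSON files
# written by JsonWriter above can typically be consumed by RLlib as offline
# data by pointing a trainer config at this directory, e.g.
#
#     config = {
#         "input": "./",            # directory containing the *.json batches
#         "input_evaluation": [],   # disable off-policy estimation
#     }
#
# Exact config keys depend on the RLlib version in use.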