From 1317886f691e1620e71f4275145a49fb8f6a6f34 Mon Sep 17 00:00:00 2001 From: hagrid67 <jdhwatson@gmail.com> Date: Fri, 2 Oct 2020 20:17:44 +0100 Subject: [PATCH] add timeout test in test-service notebook --- flatland/evaluators/service.py | 11 +- notebooks/test-service.ipynb | 475 ++++++++++++++++++++++++++------- 2 files changed, 382 insertions(+), 104 deletions(-) diff --git a/flatland/evaluators/service.py b/flatland/evaluators/service.py index edffc419..de17a0c6 100644 --- a/flatland/evaluators/service.py +++ b/flatland/evaluators/service.py @@ -55,7 +55,7 @@ TEST_MIN_PERCENTAGE_COMPLETE_MEAN = 0.25 # After this number of consecutive timeouts, kill the submission: # this probably means the submission has crashed -MAX_SUCCESSIVE_TIMEOUTS = 10 +MAX_SUCCESSIVE_TIMEOUTS = int(os.getenv("FLATLAND_MAX_SUCCESSIVE_TIMEOUTS", 10)) debug_mode = (os.getenv("AICROWD_DEBUG_SUBMISSION", 0) == 1) if debug_mode: @@ -1184,6 +1184,15 @@ class FlatlandRemoteEvaluationService: print(msg, "Evaluation will stop.") self.termination_cause = msg self.evaluation_done = True + # JW - change the command to a submit + print("Creating fake submit message after excessive timeouts.") + command = { + "type":messages.FLATLAND_RL.ENV_SUBMIT, + "payload": {}, + "response_channel": self.previous_command.get("response_channel") } + + return self.handle_env_submit(command) + continue self.timeout_counter = 0 diff --git a/notebooks/test-service.ipynb b/notebooks/test-service.ipynb index 7929b591..6c67d2b7 100644 --- a/notebooks/test-service.ipynb +++ b/notebooks/test-service.ipynb @@ -97,32 +97,37 @@ }, "outputs": [], "source": [ - "import pickle\n", - "\n", "from flatland.envs.rail_env import RailEnv\n", "from flatland.envs.rail_generators import sparse_rail_generator\n", "from flatland.envs.schedule_generators import sparse_schedule_generator\n", "from flatland.envs.malfunction_generators import malfunction_from_file, no_malfunction_generator\n", "from flatland.envs.rail_generators import rail_from_file\n", "from flatland.envs.schedule_generators import schedule_from_file\n", - "from flatland.core.env_observation_builder import DummyObservationBuilder" + "from flatland.core.env_observation_builder import DummyObservationBuilder\n", + "from flatland.envs.persistence import RailEnvPersister\n", + "from flatland.evaluators.client import FlatlandRemoteClient, TimeoutException\n", + "import flatland.evaluators.service as fes" ] }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PU5GkH271guD" + }, "outputs": [], "source": [ - "from flatland.envs.persistence import RailEnvPersister\n", - "from flatland.evaluators.client import FlatlandRemoteClient\n", + "import pickle\n", "import redis\n", "import subprocess as sp\n", "import shlex\n", "import time\n", "import pkg_resources as pr\n", "import importlib_resources as ir\n", - "import sys" + "import sys, os\n", + "import pandas as pd" ] }, { @@ -151,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -170,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -179,7 +184,7 @@ "'/home3/jeremy/projects/aicrowd/rl-trains/flatland5/env_data/tests/service_test/'" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -198,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -207,16 +212,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[b'flatland-rl::FLATLAND_RL_SERVICE_ID::response::65c5cdafbda515c05db3af5b2c7800ce']" + "[b'flatland-rl::FLATLAND_RL_SERVICE_ID::commands',\n", + " b'flatland-rl::FLATLAND_RL_SERVICE_ID::response::9233d209716f4ae78a5dbe124de67e27']" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -228,14 +234,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Deleting: b'flatland-rl::FLATLAND_RL_SERVICE_ID::response::65c5cdafbda515c05db3af5b2c7800ce'\n" + "Deleting: b'flatland-rl::FLATLAND_RL_SERVICE_ID::commands'\n", + "Deleting: b'flatland-rl::FLATLAND_RL_SERVICE_ID::response::9233d209716f4ae78a5dbe124de67e27'\n" ] } ], @@ -249,23 +256,99 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Service python command\n", - "### Kill any old service process" + "### Remove `/tm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### kill any old `service.py` process" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "!ps -ef | grep -i python | grep -i flatland.evaluators.service | awk '{print $2}' | xargs kill" ] }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "osEnv2 = os.environ.copy()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Timeouts copied from service.py" + ] + }, { "cell_type": "code", "execution_count": 15, "metadata": {}, + "outputs": [], + "source": [ + "#MAX_SUCCESSIVE_TIMEOUTS = int(os.getenv(\"FLATLAND_MAX_SUCCESSIVE_TIMEOUTS\", 10))\n", + "\n", + "# 8 hours (will get debug timeout from env variable if applicable)\n", + "#OVERALL_TIMEOUT = int(os.getenv(\n", + "# \"FLATLAND_OVERALL_TIMEOUT\",\n", + "# 8 * 60 * 60))\n", + "\n", + "# 10 mins\n", + "#INTIAL_PLANNING_TIMEOUT = int(os.getenv(\n", + "# \"FLATLAND_INITIAL_PLANNING_TIMEOUT\",\n", + "# 10 * 60))\n", + "\n", + "# 10 seconds\n", + "#PER_STEP_TIMEOUT = int(os.getenv(\n", + "# \"FLATLAND_PER_STEP_TIMEOUT\",\n", + "# 10))\n", + "\n", + "# 5 min - applies to the rest of the commands\n", + "#DEFAULT_COMMAND_TIMEOUT = int(os.getenv(\n", + "# \"FLATLAND_DEFAULT_COMMAND_TIMEOUT\",\n", + "# 5 * 60))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set some short timeouts for testing" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "osEnv2[\"FLATLAND_OVERALL_TIMEOUT\"]=\"10\"\n", + "osEnv2[\"FLATLAND_PER_STEP_TIMEOUT\"] = \"1\"\n", + "osEnv2[\"FLATLAND_MAX_SUCCESSIVE_TIMEOUTS\"] = \"2\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the python command for `service.py`" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -286,42 +369,19 @@ ] }, { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "#wOut = ipw.Output()\n", - "#wOut" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "oPipe = sp.Popen(lsCmd)" + "### Run the command with Popen (output goes to jupyter stdout not notebook)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "subprocess.Popen" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "type(oPipe)" + "oPipe = sp.Popen(lsCmd, env=osEnv2)" ] }, { @@ -402,6 +462,56 @@ { "cell_type": "code", "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def run_submission(slow_ep=1, delay=2):\n", + " episode = 0\n", + " obs = True\n", + " while obs:\n", + " obs, info = oFRC.env_create(obs_builder_object=oObsB)\n", + " if not obs:\n", + " print(\"null observation - all envs completed!\")\n", + " break\n", + " print(f\"Episode : {episode}\")\n", + " \n", + "\n", + " print(oFRC.env.dones['__all__'])\n", + "\n", + " while True:\n", + " if episode < 3:\n", + " action = expert_controller(obs, oFRC.env)\n", + " else:\n", + " action = random_controller(obs, oFRC.env)\n", + " \n", + " time_start = time.time()\n", + " \n", + " if (episode == slow_ep) and (oFRC.env._elapsed_steps > 10):\n", + " time.sleep(2)\n", + " \n", + " try:\n", + " observation, all_rewards, done, info = oFRC.env_step(action)\n", + " time_diff = time.time() - time_start\n", + " print(\".\", end=\"\")\n", + " if done['__all__']:\n", + " print(\"\\nCompleted Episode : \", episode)\n", + " print(\"Reward : \", sum(list(all_rewards.values())))\n", + " break\n", + " except TimeoutException as err:\n", + " print(\"Timeout: \", err)\n", + " break\n", + " \n", + " episode += 1\n", + " \n", + " print(f\"Evaluation Complete - episodes={episode} - send submit message...\")\n", + " print(oFRC.submit())\n", + " print(\"All done.\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 26, "metadata": { "scrolled": false }, @@ -415,76 +525,37 @@ "Episode : 0\n", "False\n", "...........................................................................................................................................................................\n", - "Current Episode : 1\n", - "Episode Done\n", + "Completed Episode : 0\n", "Reward : 10.0\n", "DEPRECATED - use FileMalfunctionGen instead of malfunction_from_file\n", "DEPRECATED - RailEnv arg: malfunction_and_process_data - use malfunction_generator\n", "Episode : 1\n", "False\n", - "...........................................................................................................................................................................\n", - "Current Episode : 2\n", - "Episode Done\n", - "Reward : 10.0\n", - "null observation!\n", - "Evaluation Complete...\n", + "...........Error received: {'type': 'FLATLAND_RL.ENV_STEP_TIMEOUT'}\n", + "Timeout: FLATLAND_RL.ENV_STEP_TIMEOUT\n", + "null observation - all envs completed!\n", + "Evaluation Complete - episodes=2 - send submit message...\n", "====================================================================================================\n", "====================================================================================================\n", "## Client Performance Stats\n", "====================================================================================================\n", - "\t - env_creation_wait_time\t => min: 0.0010077953338623047 || mean: 0.0071858565012613935 || max: 0.014672040939331055\n", - "\t - internal_env_reset_time\t => min: 0.002426624298095703 || mean: 0.0024870634078979492 || max: 0.0025475025177001953\n", - "\t - inference_time(approx)\t => min: 2.1696090698242188e-05 || mean: 4.437164953577589e-05 || max: 0.0003075599670410156\n", - "\t - internal_env_step_time\t => min: 0.00030541419982910156 || mean: 0.0008465407187478588 || max: 0.0026504993438720703\n", + "\t - env_creation_wait_time\t => min: 0.0006797313690185547 || mean: 0.006682872772216797 || max: 0.013753414154052734\n", + "\t - internal_env_reset_time\t => min: 0.001880645751953125 || mean: 0.0022284984588623047 || max: 0.0025763511657714844\n", + "\t - inference_time(approx)\t => min: 2.4080276489257812e-05 || mean: 0.010996600969241616 || max: 2.000370979309082\n", + "\t - internal_env_step_time\t => min: 0.0004317760467529297 || mean: 0.0010453842498443946 || max: 0.0029251575469970703\n", "====================================================================================================\n", - "{'mean_reward': -944.0, 'mean_normalized_reward': 0.80735, 'mean_percentage_complete': 1.0}\n" + "{'mean_reward': 1978.0, 'mean_normalized_reward': 0.40367, 'mean_percentage_complete': 0.5}\n", + "All done.\n" ] } ], "source": [ - "if True:\n", - " episode = 0\n", - " obs = True\n", - " while obs:\n", - " obs, info = oFRC.env_create(\n", - " obs_builder_object=oObsB\n", - " )\n", - " if not obs:\n", - " print(\"null observation!\")\n", - " \"\"\"\n", - " The remote env returns False as the first obs\n", - " when it is done evaluating all the individual episodes\n", - " \"\"\"\n", - " break\n", - " print(\"Episode : {}\".format(episode))\n", - " episode += 1\n", - "\n", - " print(oFRC.env.dones['__all__'])\n", - "\n", - " while True:\n", - " if episode < 3:\n", - " action = expert_controller(obs, oFRC.env)\n", - " else:\n", - " action = random_controller(obs, oFRC.env)\n", - " \n", - " time_start = time.time()\n", - "\n", - " try:\n", - " observation, all_rewards, done, info = oFRC.env_step(action)\n", - " time_diff = time.time() - time_start\n", - " #print(\"Step Time : \", time_diff)\n", - " print(\".\", end=\"\")\n", - " if done['__all__']:\n", - " print(\"\\nCurrent Episode : \", episode)\n", - " print(\"Episode Done\")\n", - " print(\"Reward : \", sum(list(all_rewards.values())))\n", - " break\n", - " except TimeoutException as err:\n", - " print(\"Timeout: \", err)\n", - " break\n", - "\n", - " print(\"Evaluation Complete...\")\n", - " print(oFRC.submit())" + "try:\n", + " run_submission()\n", + "except TimeoutException as timeoutException:\n", + " print(\"Timed out.\")\n", + " print(timeoutException)\n", + " " ] }, { @@ -496,12 +567,210 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "!ps -ef | grep -i python | grep -i flatland.evaluators.service | awk '{print $2}' | xargs kill" ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>filename</th>\n", + " <td>Test_0/Level_0.pkl</td>\n", + " <td>Test_0/Level_1.pkl</td>\n", + " </tr>\n", + " <tr>\n", + " <th>test_id</th>\n", + " <td>Test_0</td>\n", + " <td>Test_0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>env_id</th>\n", + " <td>Level_0</td>\n", + " <td>Level_1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>n_agents</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>x_dim</th>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " </tr>\n", + " <tr>\n", + " <th>y_dim</th>\n", + " <td>25</td>\n", + " <td>25</td>\n", + " </tr>\n", + " <tr>\n", + " <th>n_cities</th>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max_rails_in_city</th>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>malfunction_interval</th>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>n_envs_run</th>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>seed</th>\n", + " <td>11111</td>\n", + " <td>11111</td>\n", + " </tr>\n", + " <tr>\n", + " <th>grid_mode</th>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max_rails_between_cities</th>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>malfunction_duration_min</th>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <th>malfunction_duration_max</th>\n", + " <td>50</td>\n", + " <td>50</td>\n", + " </tr>\n", + " <tr>\n", + " <th>speed_ratios</th>\n", + " <td>{1.0: 1.0}</td>\n", + " <td>{1.0: 1.0}</td>\n", + " </tr>\n", + " <tr>\n", + " <th>reward</th>\n", + " <td>-944</td>\n", + " <td>4900</td>\n", + " </tr>\n", + " <tr>\n", + " <th>normalized_reward</th>\n", + " <td>0.807347</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>percentage_complete</th>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>steps</th>\n", + " <td>171</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <th>simulation_time</th>\n", + " <td>0.258108</td>\n", + " <td>2.0388</td>\n", + " </tr>\n", + " <tr>\n", + " <th>nb_malfunctioning_trains</th>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>controller_inference_time_min</th>\n", + " <td>2.40803e-05</td>\n", + " <td>2.40803e-05</td>\n", + " </tr>\n", + " <tr>\n", + " <th>controller_inference_time_mean</th>\n", + " <td>6.7676e-05</td>\n", + " <td>6.59725e-05</td>\n", + " </tr>\n", + " <tr>\n", + " <th>controller_inference_time_max</th>\n", + " <td>0.000576019</td>\n", + " <td>0.000576019</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1\n", + "filename Test_0/Level_0.pkl Test_0/Level_1.pkl\n", + "test_id Test_0 Test_0\n", + "env_id Level_0 Level_1\n", + "n_agents 5 5\n", + "x_dim 25 25\n", + "y_dim 25 25\n", + "n_cities 2 2\n", + "max_rails_in_city 3 3\n", + "malfunction_interval 50 50\n", + "n_envs_run 50 50\n", + "seed 11111 11111\n", + "grid_mode False False\n", + "max_rails_between_cities 2 2\n", + "malfunction_duration_min 20 20\n", + "malfunction_duration_max 50 50\n", + "speed_ratios {1.0: 1.0} {1.0: 1.0}\n", + "reward -944 4900\n", + "normalized_reward 0.807347 0\n", + "percentage_complete 1 0\n", + "steps 171 12\n", + "simulation_time 0.258108 2.0388\n", + "nb_malfunctioning_trains 2 0\n", + "controller_inference_time_min 2.40803e-05 2.40803e-05\n", + "controller_inference_time_mean 6.7676e-05 6.59725e-05\n", + "controller_inference_time_max 0.000576019 0.000576019" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv(\"/tmp/output.csv\").T" + ] } ], "metadata": { -- GitLab