diff --git a/flatland/cli.py b/flatland/cli.py
index cc7576d16a02b0d0268ecaab201921b5034d7ee0..9bb7107476a0981fda13b05502949549ce8d4d31 100644
--- a/flatland/cli.py
+++ b/flatland/cli.py
@@ -61,7 +61,13 @@ def demo(args=None):
               help="Evaluation Service ID. This has to match the service id on the client.",
               required=False
               )
-def evaluator(tests, service_id):
+@click.option('--results_path',
+              type=click.Path(exists=False),
+              default=None,
+              help="Path where the evaluator should write the results metadata.",
+              required=False
+              )
+def evaluator(tests, service_id, results_path):
     try:
         redis_connection = redis.Redis()
         redis_connection.ping()
@@ -75,6 +81,7 @@ def evaluator(tests, service_id):
         test_env_folder=tests,
         flatland_rl_service_id=service_id,
         visualize=False,
+        result_output_path=results_path,
         verbose=False
     )
     grader.run()
diff --git a/flatland/evaluators/aicrowd_helpers.py b/flatland/evaluators/aicrowd_helpers.py
index 0d46dca01f47cf00185332a1cd0e751b9f1c8d4c..606550d521ea02e7bba305fab46e4d94163a54ab 100644
--- a/flatland/evaluators/aicrowd_helpers.py
+++ b/flatland/evaluators/aicrowd_helpers.py
@@ -3,6 +3,7 @@ import os
 import random
 import subprocess
 import uuid
+import pathlib
 
 ###############################################################
 # Expected Env Variables
@@ -11,7 +12,7 @@ import uuid
 # AICROWD_IS_GRADING : true
 # CROWDAI_IS_GRADING : true
 # S3_BUCKET : aicrowd-production
-# S3_UPLOAD_PATH_TEMPLATE : misc/flatland-rl-Media/{}.mp4
+# S3_UPLOAD_PATH_TEMPLATE : misc/flatland-rl-Media/{}
 # AWS_ACCESS_KEY_ID
 # AWS_SECRET_ACCESS_KEY
 # http_proxy
@@ -20,7 +21,7 @@ import uuid
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", False)
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", False)
 S3_BUCKET = os.getenv("S3_BUCKET", "aicrowd-production")
-S3_UPLOAD_PATH_TEMPLATE = os.getenv("S3_UPLOAD_PATH_TEMPLATE", "misc/flatland-rl-Media/{}.mp4")
+S3_UPLOAD_PATH_TEMPLATE = os.getenv("S3_UPLOAD_PATH_TEMPLATE", "misc/flatland-rl-Media/{}")
 
 
 def get_boto_client():
@@ -62,7 +63,7 @@ def upload_random_frame_to_s3(frames_folder):
     if not S3_BUCKET:
         raise Exception("S3_BUCKET not provided...")
 
-    image_target_key = S3_UPLOAD_PATH_TEMPLATE.replace(".mp4", ".png").format(str(uuid.uuid4()))
+    image_target_key = (S3_UPLOAD_PATH_TEMPLATE + ".png").format(str(uuid.uuid4()))
     s3.put_object(
         ACL="public-read",
         Bucket=S3_BUCKET,
@@ -79,14 +80,17 @@ def upload_to_s3(localpath):
     if not S3_BUCKET:
         raise Exception("S3_BUCKET not provided...")
 
-    image_target_key = S3_UPLOAD_PATH_TEMPLATE.format(str(uuid.uuid4()))
+    file_suffix = str(pathlib.Path(localpath).suffix)
+    file_target_key = (S3_UPLOAD_PATH_TEMPLATE + file_suffix).format(
+        str(uuid.uuid4())
+    )
     s3.put_object(
         ACL="public-read",
         Bucket=S3_BUCKET,
-        Key=image_target_key,
+        Key=file_target_key,
         Body=open(localpath, 'rb')
     )
-    return image_target_key
+    return file_target_key
 
 
 def make_subprocess_call(command, shell=False):
diff --git a/flatland/evaluators/service.py b/flatland/evaluators/service.py
index 09b98b4d9d0081b3da4651e0142f103383df1f8b..25fdeee51d4b1bc28a76ecc7ffb715eee80f79a9 100644
--- a/flatland/evaluators/service.py
+++ b/flatland/evaluators/service.py
@@ -284,7 +284,7 @@ class FlatlandRemoteEvaluationService:
             self.evaluation_metadata_df["controller_inference_time_mean"] = np.nan
             self.evaluation_metadata_df["controller_inference_time_max"] = np.nan
         else:
-            print("[WARNING] metadata.csv not found in tests folder. Granular metric collection is hence Disabled.")
+            raise Exception("metadata.csv not found in tests folder. Please use an updated version of the test set.")
 
     def update_evaluation_metadata(self):
         """
@@ -565,9 +565,9 @@ class FlatlandRemoteEvaluationService:
             progress = np.clip(
                 self.simulation_count * 1.0 / len(self.env_file_paths),
                 0, 1)
-            mean_reward = round(np.mean(self.simulation_rewards), 2)
-            mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-            mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+
+            mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores()
+
             self.evaluation_state["state"] = "IN_PROGRESS"
             self.evaluation_state["progress"] = progress
             self.evaluation_state["simulation_count"] = self.simulation_count
@@ -688,9 +688,7 @@ class FlatlandRemoteEvaluationService:
                 """
             )
 
-        mean_reward = round(np.mean(self.simulation_rewards), 2)
-        mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-        mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+        mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores()
 
         if self.visualize and len(os.listdir(self.vizualization_folder_name)) > 0:
             # Generate the video
@@ -729,12 +727,15 @@
         # Write Results to a file (if applicable)
         #####################################################################
         if self.result_output_path:
-            if self.evaluation_metadata_df is not None:
-                self.evaluation_metadata_df.to_csv(self.result_output_path)
-                print("Wrote output results to : {}".format(self.result_output_path))
-            else:
-                print("[WARING] Unable to write final results to the specified path"
-                      " as metadata.csv is not provided in the tests_folder")
+            self.evaluation_metadata_df.to_csv(self.result_output_path)
+            print("Wrote output results to : {}".format(self.result_output_path))
+
+            # Upload the metadata file to S3
+            if aicrowd_helpers.is_grading() and aicrowd_helpers.is_aws_configured():
+                metadata_s3_key = aicrowd_helpers.upload_to_s3(
+                    self.result_output_path
+                )
+                self.evaluation_state["meta"]["private_metadata_s3_key"] = metadata_s3_key
 
         _command_response = {}
         _command_response['type'] = messages.FLATLAND_RL.ENV_SUBMIT_RESPONSE
@@ -751,9 +752,11 @@
         self.evaluation_state["state"] = "FINISHED"
         self.evaluation_state["progress"] = 1.0
         self.evaluation_state["simulation_count"] = self.simulation_count
-        self.evaluation_state["score"]["score"] = mean_percentage_complete
-        self.evaluation_state["score"]["score_secondary"] = mean_reward
+        self.evaluation_state["score"]["score"] = mean_normalized_reward
+        self.evaluation_state["score"]["score_secondary"] = mean_percentage_complete
         self.evaluation_state["meta"]["normalized_reward"] = mean_normalized_reward
+        self.evaluation_state["meta"]["reward"] = mean_reward
+        self.evaluation_state["meta"]["percentage_complete"] = mean_percentage_complete
         self.handle_aicrowd_success_event(self.evaluation_state)
         print("#" * 100)
         print("EVALUATION COMPLETE !!")
@@ -764,6 +767,30 @@
         print("#" * 100)
         print("#" * 100)
 
+    def compute_mean_scores(self):
+        #################################################################################
+        #################################################################################
+        # Compute the mean rewards, mean normalized_reward and mean_percentage_complete
+        # we group all the results by the test_ids
+        # so we first compute the mean in each of the test_id groups,
+        # and then we compute the mean across each of the test_id groups
+        #
+        #
+        #################################################################################
+        #################################################################################
+        source_df = self.evaluation_metadata_df.dropna()
+        grouped_df = source_df.groupby(['test_id']).mean()
+
+        mean_reward = grouped_df["reward"].mean()
+        mean_normalized_reward = grouped_df["normalized_reward"].mean()
+        mean_percentage_complete = grouped_df["percentage_complete"].mean()
+        # Round off the reward values
+        mean_reward = round(mean_reward, 2)
+        mean_normalized_reward = round(mean_normalized_reward, 5)
+        mean_percentage_complete = round(mean_percentage_complete, 3)
+
+        return mean_reward, mean_normalized_reward, mean_percentage_complete
+
     def report_error(self, error_message, command_response_channel):
         """
         A helper function used to report error back to the client
diff --git a/requirements_continuous_integration.txt b/requirements_continuous_integration.txt
index 06199e95d5dbba7bf58f40abe7e26e54c39078b2..21a58438209f4e3da065de0a642a83ec950bff23 100644
--- a/requirements_continuous_integration.txt
+++ b/requirements_continuous_integration.txt
@@ -16,6 +16,7 @@ pydeps>=1.7.2
 jupyter>=1.0.0
 jupyter-core>=4.5.0
 notebook>=5.7.8
+PyVirtualDisplay==0.2.5
 pytest-xvfb>=1.2.0
 git+https://github.com/who8mylunch/Jupyter_Canvas_Widget.git@bd151ae1509c50b5809944dd3294f58b7b069c86
 m2r>=0.2.1
diff --git a/tox.ini b/tox.ini
index b7a359b43c79cbb5d274755b35fd9f6a7873daff..e22607811c9741a347cdc67303ba459087e02d7c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36, py37, examples, notebooks, flake8, docs, coverage
+envlist = py36, py37, examples, notebooks, docs, coverage
 
 [travis]
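
Usage sketch (not part of the patch, for illustration only): the new --results_path CLI option above is passed through to the result_output_path argument of FlatlandRemoteEvaluationService, so the evaluator can also be started directly from Python. The test-set folder and output CSV path below are placeholder assumptions.

    # Assumes flatland-rl is installed and a local Redis server is running,
    # as the evaluator requires; paths are illustrative placeholders.
    from flatland.evaluators.service import FlatlandRemoteEvaluationService

    grader = FlatlandRemoteEvaluationService(
        test_env_folder="./scratch/test-envs",        # folder with the test envs and metadata.csv
        flatland_rl_service_id="FLATLAND_RL_SERVICE_ID",
        visualize=False,
        result_output_path="./results.csv",           # new: where the results metadata CSV is written
        verbose=False
    )
    grader.run()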