diff --git a/flatland/cli.py b/flatland/cli.py
index cc7576d16a02b0d0268ecaab201921b5034d7ee0..7f3b95fd1666ed6eafd6bb623d3546083b7f898d 100644
--- a/flatland/cli.py
+++ b/flatland/cli.py
@@ -61,7 +61,13 @@ def demo(args=None):
               help="Evaluation Service ID. This has to match the service id on the client.",
               required=False
               )
-def evaluator(tests, service_id):
+@click.option('--results_path',
+              type=click.Path(exists=False),
+              default=False,
+              help="Path where the evaluator should write the results metadata.",
+              required=False
+              )
+def evaluator(tests, service_id, results_path):
     try:
         redis_connection = redis.Redis()
         redis_connection.ping()
@@ -75,6 +81,7 @@ def evaluator(tests, service_id):
         test_env_folder=tests,
         flatland_rl_service_id=service_id,
         visualize=False,
+        result_output_path=results_path,
         verbose=False
     )
     grader.run()
diff --git a/flatland/evaluators/service.py b/flatland/evaluators/service.py
index 064471535e89d6fe87c16ecefac9127bbf8c6ffa..782dc1856bd6c4e439a7a13b7489c016df9c566c 100644
--- a/flatland/evaluators/service.py
+++ b/flatland/evaluators/service.py
@@ -687,10 +687,27 @@ class FlatlandRemoteEvaluationService:
                 to operate on all the test environments.
                 """
             )
-
-        mean_reward = round(np.mean(self.simulation_rewards), 2)
-        mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-        mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+        #################################################################################
+        #################################################################################
+        # Compute mean_reward, mean_normalized_reward and mean_percentage_complete.
+        # We group all the results by test_id,
+        # first computing the mean within each test_id group,
+        # and then taking the mean across the test_id groups.
+        #
+        # NOTE: this df should not have NaN rows for any of the above
+        # metrics if all the evaluations completed successfully.
+        #
+        #################################################################################
+        #################################################################################
+
+        grouped_df = self.evaluation_metadata_df.groupby(['test_id']).mean()
+        mean_reward = grouped_df["reward"].mean()
+        mean_normalized_reward = grouped_df["normalized_reward"].mean()
+        mean_percentage_complete = grouped_df["percentage_complete"].mean()
+        #
+        mean_reward = round(mean_reward, 2)
+        mean_normalized_reward = round(mean_normalized_reward, 2)
+        mean_percentage_complete = round(mean_percentage_complete, 3)
 
         if self.visualize and len(os.listdir(self.vizualization_folder_name)) > 0:
             # Generate the video
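
A minimal sketch of what the new aggregation in `service.py` computes, assuming `evaluation_metadata_df` holds one row per evaluated episode with columns `test_id`, `reward`, `normalized_reward` and `percentage_complete` (the frame and column names come from the hunk above; the sample values are invented for illustration):

```python
import pandas as pd

# One row per episode; Test_0 has two episodes, Test_1 has one.
df = pd.DataFrame({
    "test_id":             ["Test_0", "Test_0", "Test_1"],
    "reward":              [10.0, 20.0, 30.0],
    "normalized_reward":   [0.10, 0.30, 0.40],
    "percentage_complete": [0.50, 1.00, 0.75],
})

# Mean within each test_id group first, then the mean across groups.
grouped_df = df.groupby(["test_id"]).mean()

mean_reward = round(grouped_df["reward"].mean(), 2)                            # (15.0 + 30.0) / 2 = 22.5
mean_normalized_reward = round(grouped_df["normalized_reward"].mean(), 2)      # (0.20 + 0.40) / 2 = 0.30
mean_percentage_complete = round(grouped_df["percentage_complete"].mean(), 3)  # (0.75 + 0.75) / 2 = 0.75

# The old flat np.mean over all episodes would give (10 + 20 + 30) / 3 = 20.0
# for reward, letting tests with more episodes dominate the score.
```

Grouping first means every test counts equally in the final score regardless of how many episodes it contains, which is exactly the behavior described in the comment block of the hunk.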
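For the `cli.py` change, a hypothetical invocation of the new flag (the paths are placeholders, and `flatland-evaluator` assumes the usual console entry point wired to `evaluator`; a local Redis must be reachable, since the command pings it on startup):

```console
$ redis-server --daemonize yes
$ flatland-evaluator --tests ./test-envs/ --results_path ./results.csv
```

One small review note: `default=False` on a `click.Path` option is an unusual sentinel; it presumably reads as "not set" only because the service truth-tests `result_output_path` downstream, and `default=None` would be the more conventional Click idiom.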