diff --git a/flatland/cli.py b/flatland/cli.py
index cc7576d16a02b0d0268ecaab201921b5034d7ee0..7f3b95fd1666ed6eafd6bb623d3546083b7f898d 100644
--- a/flatland/cli.py
+++ b/flatland/cli.py
@@ -61,7 +61,15 @@ def demo(args=None):
               help="Evaluation Service ID. This has to match the service id on the client.",
               required=False
               )
-def evaluator(tests, service_id):
+@click.option('--results_path',
+              type=click.Path(exists=False),
+              default=None,
+              help="Path where the evaluator should write the results metadata.",
+              required=False
+              )
+def evaluator(tests, service_id, results_path):
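+    # Example invocation (hypothetical paths; flag and entry-point names assumed):
+    #   flatland-evaluator --tests ./test-envs --results_path ./results.csv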
     try:
         redis_connection = redis.Redis()
         redis_connection.ping()
@@ -75,6 +81,7 @@ def evaluator(tests, service_id):
         test_env_folder=tests,
         flatland_rl_service_id=service_id,
         visualize=False,
+        result_output_path=results_path,
         verbose=False
     )
     grader.run()
diff --git a/flatland/evaluators/service.py b/flatland/evaluators/service.py
index 064471535e89d6fe87c16ecefac9127bbf8c6ffa..782dc1856bd6c4e439a7a13b7489c016df9c566c 100644
--- a/flatland/evaluators/service.py
+++ b/flatland/evaluators/service.py
@@ -687,10 +687,31 @@ class FlatlandRemoteEvaluationService:
                 to operate on all the test environments.
                 """
             )
-
-        mean_reward = round(np.mean(self.simulation_rewards), 2)
-        mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-        mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+        #################################################################################
+        #################################################################################
+        # Compute the mean reward, mean normalized reward and mean percentage complete.
+        # All results are grouped by test_id:
+        # we first compute the mean within each test_id group,
+        # and then compute the mean across the test_id groups.
+        #
+        # NOTE : this dataframe should not have NaN rows for any of the above
+        #        metrics if all the evaluations completed successfully.
+        #
+        #################################################################################
+        #################################################################################
+
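+        # Illustration (hypothetical numbers): if Test_0 has per-episode rewards
+        # [-1.0, -3.0] and Test_1 has [-4.0], the per-test means are -2.0 and -4.0,
+        # so mean_reward is reported as -3.0 rather than the flat per-episode mean of
+        # about -2.67; each test_id is weighted equally regardless of episode count.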
+        grouped_df = self.evaluation_metadata_df.groupby(['test_id']).mean()
+        mean_reward = grouped_df["reward"].mean()
+        mean_normalized_reward = grouped_df["normalized_reward"].mean()
+        mean_percentage_complete = grouped_df["percentage_complete"].mean()
+        # Round the aggregated metrics for reporting (2 decimals for rewards, 3 for completion).
+        mean_reward = round(mean_reward, 2)
+        mean_normalized_reward = round(mean_normalized_reward, 2)
+        mean_percentage_complete = round(mean_percentage_complete, 3)
 
         if self.visualize and len(os.listdir(self.vizualization_folder_name)) > 0:
             # Generate the video