diff --git a/flatland/cli.py b/flatland/cli.py
index cc7576d16a02b0d0268ecaab201921b5034d7ee0..9bb7107476a0981fda13b05502949549ce8d4d31 100644
--- a/flatland/cli.py
+++ b/flatland/cli.py
@@ -61,7 +61,13 @@ def demo(args=None):
               help="Evaluation Service ID. This has to match the service id on the client.",
               required=False
               )
-def evaluator(tests, service_id):
+@click.option('--results_path',
+              type=click.Path(exists=False),
+              default=None,
+              help="Path where the evaluator should write the results metadata.",
+              required=False
+              )
+def evaluator(tests, service_id, results_path):
     try:
         redis_connection = redis.Redis()
         redis_connection.ping()
@@ -75,6 +81,7 @@ def evaluator(tests, service_id):
         test_env_folder=tests,
         flatland_rl_service_id=service_id,
         visualize=False,
+        result_output_path=results_path,
         verbose=False
     )
     grader.run()
diff --git a/flatland/evaluators/aicrowd_helpers.py b/flatland/evaluators/aicrowd_helpers.py
index 0d46dca01f47cf00185332a1cd0e751b9f1c8d4c..606550d521ea02e7bba305fab46e4d94163a54ab 100644
--- a/flatland/evaluators/aicrowd_helpers.py
+++ b/flatland/evaluators/aicrowd_helpers.py
@@ -3,6 +3,7 @@ import os
+import pathlib
 import random
 import subprocess
 import uuid
 
 ###############################################################
 # Expected Env Variables
@@ -11,7 +12,7 @@ import uuid
 # AICROWD_IS_GRADING : true
 # CROWDAI_IS_GRADING : true
 # S3_BUCKET : aicrowd-production
-# S3_UPLOAD_PATH_TEMPLATE : misc/flatland-rl-Media/{}.mp4
+# S3_UPLOAD_PATH_TEMPLATE : misc/flatland-rl-Media/{}
 # AWS_ACCESS_KEY_ID
 # AWS_SECRET_ACCESS_KEY
 # http_proxy
@@ -20,7 +21,7 @@ import uuid
 AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", False)
 AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", False)
 S3_BUCKET = os.getenv("S3_BUCKET", "aicrowd-production")
-S3_UPLOAD_PATH_TEMPLATE = os.getenv("S3_UPLOAD_PATH_TEMPLATE", "misc/flatland-rl-Media/{}.mp4")
+S3_UPLOAD_PATH_TEMPLATE = os.getenv("S3_UPLOAD_PATH_TEMPLATE", "misc/flatland-rl-Media/{}")
 
 
 def get_boto_client():
@@ -62,7 +63,7 @@ def upload_random_frame_to_s3(frames_folder):
     if not S3_BUCKET:
         raise Exception("S3_BUCKET not provided...")
 
-    image_target_key = S3_UPLOAD_PATH_TEMPLATE.replace(".mp4", ".png").format(str(uuid.uuid4()))
+    image_target_key = (S3_UPLOAD_PATH_TEMPLATE + ".png").format(str(uuid.uuid4()))
     s3.put_object(
         ACL="public-read",
         Bucket=S3_BUCKET,
@@ -79,14 +80,17 @@ def upload_to_s3(localpath):
     if not S3_BUCKET:
         raise Exception("S3_BUCKET not provided...")
 
-    image_target_key = S3_UPLOAD_PATH_TEMPLATE.format(str(uuid.uuid4()))
+    file_suffix = pathlib.Path(localpath).suffix
+    file_target_key = (S3_UPLOAD_PATH_TEMPLATE + file_suffix).format(
+        str(uuid.uuid4())
+    )
     s3.put_object(
         ACL="public-read",
         Bucket=S3_BUCKET,
-        Key=image_target_key,
+        Key=file_target_key,
         Body=open(localpath, 'rb')
     )
-    return image_target_key
+    return file_target_key
 
 
 def make_subprocess_call(command, shell=False):
diff --git a/flatland/evaluators/service.py b/flatland/evaluators/service.py
index 09b98b4d9d0081b3da4651e0142f103383df1f8b..25fdeee51d4b1bc28a76ecc7ffb715eee80f79a9 100644
--- a/flatland/evaluators/service.py
+++ b/flatland/evaluators/service.py
@@ -284,7 +284,7 @@ class FlatlandRemoteEvaluationService:
             self.evaluation_metadata_df["controller_inference_time_mean"] = np.nan
             self.evaluation_metadata_df["controller_inference_time_max"] = np.nan
         else:
-            print("[WARNING] metadata.csv not found in tests folder. Granular metric collection is hence Disabled.")
+            raise Exception("metadata.csv not found in tests folder. Please use an updated version of the test set.")
 
     def update_evaluation_metadata(self):
         """
@@ -565,9 +565,9 @@ class FlatlandRemoteEvaluationService:
         progress = np.clip(
             self.simulation_count * 1.0 / len(self.env_file_paths),
             0, 1)
-        mean_reward = round(np.mean(self.simulation_rewards), 2)
-        mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-        mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+
+        mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores()
+
         self.evaluation_state["state"] = "IN_PROGRESS"
         self.evaluation_state["progress"] = progress
         self.evaluation_state["simulation_count"] = self.simulation_count
@@ -688,9 +688,7 @@ class FlatlandRemoteEvaluationService:
                 """
             )
 
-        mean_reward = round(np.mean(self.simulation_rewards), 2)
-        mean_normalized_reward = round(np.mean(self.simulation_rewards_normalized), 2)
-        mean_percentage_complete = round(np.mean(self.simulation_percentage_complete), 3)
+        mean_reward, mean_normalized_reward, mean_percentage_complete = self.compute_mean_scores()
 
         if self.visualize and len(os.listdir(self.vizualization_folder_name)) > 0:
             # Generate the video
@@ -729,12 +727,15 @@ class FlatlandRemoteEvaluationService:
         # Write Results to a file (if applicable)
         #####################################################################
         if self.result_output_path:
-            if self.evaluation_metadata_df is not None:
-                self.evaluation_metadata_df.to_csv(self.result_output_path)
-                print("Wrote output results to : {}".format(self.result_output_path))
-            else:
-                print("[WARING] Unable to write final results to the specified path"
-                      " as metadata.csv is not provided in the tests_folder")
+            self.evaluation_metadata_df.to_csv(self.result_output_path)
+            print("Wrote output results to : {}".format(self.result_output_path))
+            
+            # Upload the metadata file to S3 
+            if aicrowd_helpers.is_grading() and aicrowd_helpers.is_aws_configured():
+                metadata_s3_key = aicrowd_helpers.upload_to_s3(
+                    self.result_output_path
+                )
+                self.evaluation_state["meta"]["private_metadata_s3_key"] = metadata_s3_key
 
         _command_response = {}
         _command_response['type'] = messages.FLATLAND_RL.ENV_SUBMIT_RESPONSE
@@ -751,9 +752,11 @@ class FlatlandRemoteEvaluationService:
         self.evaluation_state["state"] = "FINISHED"
         self.evaluation_state["progress"] = 1.0
         self.evaluation_state["simulation_count"] = self.simulation_count
-        self.evaluation_state["score"]["score"] = mean_percentage_complete
-        self.evaluation_state["score"]["score_secondary"] = mean_reward
+        self.evaluation_state["score"]["score"] = mean_normalized_reward
+        self.evaluation_state["score"]["score_secondary"] = mean_percentage_complete
         self.evaluation_state["meta"]["normalized_reward"] = mean_normalized_reward
+        self.evaluation_state["meta"]["reward"] = mean_reward
+        self.evaluation_state["meta"]["percentage_complete"] = mean_percentage_complete
         self.handle_aicrowd_success_event(self.evaluation_state)
         print("#" * 100)
         print("EVALUATION COMPLETE !!")
@@ -764,6 +767,27 @@
         print("#" * 100)
         print("#" * 100)
 
+    def compute_mean_scores(self):
+        """
+        Compute the mean reward, mean normalized reward and mean percentage complete.
+
+        The per-environment results are grouped by test_id: the mean is first
+        computed within each test_id group, and then across the test_id groups.
+        """
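+        # Ignore rows that still contain NaN values (environments not yet evaluated)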
+        source_df = self.evaluation_metadata_df.dropna()
+        grouped_df = source_df.groupby(['test_id']).mean()
+
+        mean_reward = grouped_df["reward"].mean()
+        mean_normalized_reward = grouped_df["normalized_reward"].mean()
+        mean_percentage_complete = grouped_df["percentage_complete"].mean()
+        # Round the aggregated scores
+        mean_reward = round(mean_reward, 2)
+        mean_normalized_reward = round(mean_normalized_reward, 5)
+        mean_percentage_complete = round(mean_percentage_complete, 3)
+
+        return mean_reward, mean_normalized_reward, mean_percentage_complete
+
     def report_error(self, error_message, command_response_channel):
         """
         A helper function used to report error back to the client
diff --git a/requirements_continuous_integration.txt b/requirements_continuous_integration.txt
index 06199e95d5dbba7bf58f40abe7e26e54c39078b2..21a58438209f4e3da065de0a642a83ec950bff23 100644
--- a/requirements_continuous_integration.txt
+++ b/requirements_continuous_integration.txt
@@ -16,6 +16,7 @@ pydeps>=1.7.2
 jupyter>=1.0.0
 jupyter-core>=4.5.0
 notebook>=5.7.8
+PyVirtualDisplay==0.2.5
 pytest-xvfb>=1.2.0
 git+https://github.com/who8mylunch/Jupyter_Canvas_Widget.git@bd151ae1509c50b5809944dd3294f58b7b069c86
 m2r>=0.2.1
diff --git a/tox.ini b/tox.ini
index b7a359b43c79cbb5d274755b35fd9f6a7873daff..e22607811c9741a347cdc67303ba459087e02d7c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36, py37, examples, notebooks, flake8, docs, coverage
+envlist = py36, py37, examples, notebooks, docs, coverage
 
 
 [travis]