diff --git a/local_evaluation.py b/local_evaluation.py
index 7c5eb86167ed5ce4bf48fd3dde5668228c545b71..d6a25282ca64e3a6d7a890b20c673c7f921f156a 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -85,7 +85,8 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
     for row_idx, row in tqdm(
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
-        task_type, metric, ground_truth = (
+        task_name, task_type, metric, ground_truth = (
+            row["task_name"],
             row["task_type"],
             row["metric"],
             row["output_field"],
@@ -94,7 +95,7 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = f"{task_type}---{metric}"
+        # task_name = f"{task_type}---{metric}"
         # Note: In practice, here we are using the task_type-metric pair as a unique identifier, calling it as the task_name.
         # During the actual evaluations, the task names are more semantically defined, meaning, there could be multiple tasks
         # with the same task_type and metric.