From 0a3428e983325a643de34168829cebc16824579d Mon Sep 17 00:00:00 2001
From: yilun_jin <jyl.jal123@gmail.com>
Date: Mon, 18 Mar 2024 21:09:44 +0000
Subject: [PATCH] Update local_evaluation.py

---
 local_evaluation.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/local_evaluation.py b/local_evaluation.py
index 7c5eb86..d6a2528 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -85,7 +85,8 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
     for row_idx, row in tqdm(
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
-        task_type, metric, ground_truth = (
+        task_name, task_type, metric, ground_truth = (
+            row["task_name"],
             row["task_type"],
             row["metric"],
             row["output_field"],
@@ -94,7 +95,8 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = f"{task_type}---{metric}"
+        task_name = f"{task_name}"
+        # task_name = f"{task_type}---{metric}"
         # Note: In practice, here we are using the task_type-metric pair as a unique identifier, calling it as the task_name.
         # During the actual evaluations, the task names are more semantically defined, meaning, there could be multiple tasks
         # with the same task_type and metric.
-- 
GitLab
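
For reference, below is a minimal, self-contained sketch of what the evaluation loop looks like once this patch is applied. The eval_methods table, the sample data_df rows, and the outputs list are hypothetical stand-ins invented for illustration; only the unpacking of row["task_name"] and the metric lookup reflect the patched code.

    # Sketch of the post-patch evaluation loop in local_evaluation.py.
    # Assumptions (not from the patch): eval_methods, data_df, and outputs
    # below are hypothetical placeholders for illustration only.
    import pandas as pd
    from tqdm import tqdm

    # Hypothetical metric implementations keyed by metric name.
    eval_methods = {
        "accuracy": lambda pred, truth: float(pred == truth),
    }

    # Hypothetical rows; the real data_df is loaded from the evaluation dataset.
    data_df = pd.DataFrame(
        [
            {"task_name": "task1", "task_type": "multiple-choice",
             "metric": "accuracy", "output_field": "B"},
            {"task_name": "task2", "task_type": "multiple-choice",
             "metric": "accuracy", "output_field": "A"},
        ]
    )
    outputs = ["B", "C"]  # hypothetical model predictions, one per row

    per_task_scores = {}
    for row_idx, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Evaluating"
    ):
        # Post-patch: task_name is read directly from the row rather than
        # derived as f"{task_type}---{metric}", so multiple tasks may share
        # the same task_type and metric.
        task_name, task_type, metric, ground_truth = (
            row["task_name"],
            row["task_type"],
            row["metric"],
            row["output_field"],
        )
        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        score = eval_methods[metric](outputs[row_idx], ground_truth)
        per_task_scores.setdefault(task_name, []).append(score)

    print(per_task_scores)  # e.g. {'task1': [1.0], 'task2': [0.0]}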