From f80a34b22f84083d1f71f5006e200e2a90ce20a9 Mon Sep 17 00:00:00 2001
From: "S.P. Mohanty" <spmohanty91@gmail.com>
Date: Tue, 19 Mar 2024 02:42:57 +0000
Subject: [PATCH] end to end plumbing for new interface

---
 local_evaluation.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/local_evaluation.py b/local_evaluation.py
index d6a2528..2e0196b 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -57,8 +57,9 @@ def generate_model_outputs(data_df, model):
         data_df.iterrows(), total=len(data_df), desc="Generating Responses"
     ):
         is_multiple_choice = row["task_type"] == "multiple-choice"
-        # the 'task_type' column won't be available during evaluation, so you should use something like
-        # ```is_multiple_choice = row['is_multiple_choice']``
+        # the 'task_type' column won't be available during evaluation,
+        # so please consistently use only the `is_multiple_choice` parameter
+        # passed to the `.predict` method.
         prompt = row["input_field"]
         model_output = model.predict(prompt, is_multiple_choice)
         outputs.append(model_output)
@@ -86,7 +87,7 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
         task_name, task_type, metric, ground_truth = (
-            row['task_name']
+            row["task_name"],
             row["task_type"],
             row["metric"],
             row["output_field"],
@@ -95,12 +96,6 @@
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = f"{task_name}"
-        # task_name = f"{task_type}---{metric}"
-        # Note: In practice, here we are using the task_type-metric pair as a unique identifier, calling it as the task_name.
-        # During the actual evaluations, the task names are more semantically defined, meaning, there could be multiple tasks
-        # with the same task_type and metric.
-
         model_output = task_parsers[task_type].parse(outputs[row_idx])
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, ground_truth)
-- 
GitLab
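
For reference, a model compatible with the interface exercised by this patch only needs to expose a predict(prompt, is_multiple_choice) method. The sketch below is illustrative and not part of the patch: the class name and the dummy response logic are assumptions; only the method signature and the role of the is_multiple_choice flag come from the diff above.

import random


class DummyShopModel:
    """Illustrative stand-in for a participant model (hypothetical name)."""

    def __init__(self, seed: int = 42):
        self.rng = random.Random(seed)

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        # Rely only on the is_multiple_choice flag passed in by the harness;
        # the 'task_type' column is not available during evaluation.
        if is_multiple_choice:
            # Assumption: multiple-choice answers are returned as a short
            # string (here, a random option index) for the task parser.
            return str(self.rng.randint(0, 3))
        # Assumption: other tasks expect a free-form text response.
        return "Placeholder response for: " + prompt[:40]

Under these assumptions, an instance of DummyShopModel could be passed as the model argument to generate_model_outputs(data_df, model) in local_evaluation.py.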