Commit f80a34b2 authored by spmohanty

end to end plumbing for new interface

parent 0a3428e9
@@ -57,8 +57,9 @@ def generate_model_outputs(data_df, model):
data_df.iterrows(), total=len(data_df), desc="Generating Responses"
):
is_multiple_choice = row["task_type"] == "multiple-choice"
# the 'task_type' column won't be available during evaluation, so you should use something like
# ```is_multiple_choice = row['is_multiple_choice']```
# the 'task_type' column won't be available during evaluation
# please consistently use just the `is_multiple_choice` parameter
# passed to the `.predict` method.
prompt = row["input_field"]
model_output = model.predict(prompt, is_multiple_choice)
outputs.append(model_output)
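
For orientation, here is a minimal sketch of a model that satisfies the `.predict(prompt, is_multiple_choice)` interface the comments above describe. The `DummyModel` name and its return values are illustrative assumptions, not part of this commit:

```python
class DummyModel:
    """Illustrative stand-in for a participant model (assumed name, not from this commit)."""

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        # Rely on the is_multiple_choice flag passed in by the harness;
        # row["task_type"] is not available during evaluation.
        if is_multiple_choice:
            return "A"  # e.g. a single option letter for multiple-choice tasks
        return "Sample response to: " + prompt[:32]
```
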
@@ -86,7 +87,7 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
data_df.iterrows(), total=len(data_df), desc="Evaluating"
):
task_name, task_type, metric, ground_truth = (
row['task_name']
row["task_name"],
row["task_type"],
row["metric"],
row["output_field"],
@@ -95,12 +96,6 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
if metric not in eval_methods:
raise NotImplementedError(f"No metric for {metric=}")
task_name = f"{task_name}"
# task_name = f"{task_type}---{metric}"
# Note: In practice, here we are using the task_type-metric pair as a unique identifier and calling it the task_name.
# During the actual evaluations, the task names are more semantically defined, meaning there could be multiple tasks
# with the same task_type and metric.
model_output = task_parsers[task_type].parse(outputs[row_idx])
eval_fn = eval_methods[metric]
metric_score = eval_fn(model_output, ground_truth)
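
Putting the two hunks together, the end-to-end flow this commit plumbs presumably runs along the lines of the sketch below. `generate_model_outputs` and `evaluate_outputs` are the functions modified above and `DummyModel` is the illustrative model from the earlier sketch; the toy row values, and the assumption that a `"multiple-choice"` parser and an `"accuracy"` metric are registered in `task_parsers` and `eval_methods`, are mine, not the commit's:

```python
import pandas as pd

# A single toy row with the columns the two functions read (values are assumptions).
data_df = pd.DataFrame([{
    "task_name": "demo-task",
    "task_type": "multiple-choice",     # assumed to have a registered parser in task_parsers
    "metric": "accuracy",               # assumed to be registered in eval_methods
    "input_field": "Is 2 + 2 equal to 4? A. Yes  B. No",
    "output_field": "A",
}])

model = DummyModel()                              # anything exposing .predict(prompt, is_multiple_choice)
outputs = generate_model_outputs(data_df, model)  # one model output per row
evaluate_outputs(data_df, outputs)                # parses each output and scores it per task
```
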