From f80a34b22f84083d1f71f5006e200e2a90ce20a9 Mon Sep 17 00:00:00 2001
From: "S.P. Mohanty" <spmohanty91@gmail.com>
Date: Tue, 19 Mar 2024 02:42:57 +0000
Subject: [PATCH] end to end plumbing for new interface

---
 local_evaluation.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/local_evaluation.py b/local_evaluation.py
index d6a2528..2e0196b 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -57,8 +57,9 @@ def generate_model_outputs(data_df, model):
         data_df.iterrows(), total=len(data_df), desc="Generating Responses"
     ):
         is_multiple_choice = row["task_type"] == "multiple-choice"
-        # the 'task_type' column won't be available during evaluation, so you should use something like
-        # ```is_multiple_choice = row['is_multiple_choice']``
+        # the 'task_type' column won't be available during evaluation,
+        # so please consistently use only the `is_multiple_choice` parameter
+        # passed to the `.predict` method.
         prompt = row["input_field"]
         model_output = model.predict(prompt, is_multiple_choice)
         outputs.append(model_output)
@@ -86,7 +87,7 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
         task_name, task_type, metric, ground_truth = (
-            row['task_name']
+            row["task_name"],
             row["task_type"],
             row["metric"],
             row["output_field"],
@@ -95,12 +96,6 @@
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = f"{task_name}"
-        # task_name = f"{task_type}---{metric}"
-        # Note: In practice, here we are using the task_type-metric pair as a unique identifier, calling it as the task_name.
-        # During the actual evaluations, the task names are more semantically defined, meaning, there could be multiple tasks
-        # with the same task_type and metric.
-
         model_output = task_parsers[task_type].parse(outputs[row_idx])
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, ground_truth)
-- 
GitLab
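
For reference, a model compatible with the interface exercised by this patch only needs to expose a predict(prompt, is_multiple_choice) method. The sketch below is illustrative and not part of the patch: the class name and the dummy response logic are assumptions; only the method signature and the role of the is_multiple_choice flag come from the diff above.

import random


class DummyShopModel:
    """Illustrative stand-in for a participant model (hypothetical name)."""

    def __init__(self, seed: int = 42):
        self.rng = random.Random(seed)

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        # Rely only on the is_multiple_choice flag passed in by the harness;
        # the 'task_type' column is not available during evaluation.
        if is_multiple_choice:
            # Assumption: multiple-choice answers are returned as a short
            # string (here, a random option index) for the task parser.
            return str(self.rng.randint(0, 3))
        # Assumption: other tasks expect a free-form text response.
        return "Placeholder response for: " + prompt[:40]

Under these assumptions, an instance of DummyShopModel could be passed as the model argument to generate_model_outputs(data_df, model) in local_evaluation.py.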