Commit f80a34b2 authored by spmohanty

end to end plumbing for new interface

parent 0a3428e9
@@ -57,8 +57,9 @@ def generate_model_outputs(data_df, model):
data_df.iterrows(), total=len(data_df), desc="Generating Responses"
):
is_multiple_choice = row["task_type"] == "multiple-choice"
# the 'task_type' column won't be available during evaluation, so you should use something like
# ```is_multiple_choice = row['is_multiple_choice']```
# the 'task_type' column won't be available during evaluation
# please consistently use just the `is_multiple_choice` parameter
# passed to the `.predict` method.
prompt = row["input_field"]
model_output = model.predict(prompt, is_multiple_choice)
outputs.append(model_output)
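
For orientation, here is a minimal sketch of a model that satisfies the `.predict(prompt, is_multiple_choice)` interface the comments above describe. The `DummyModel` name and its return values are illustrative assumptions, not part of this commit:

```python
class DummyModel:
    """Illustrative stand-in for a participant model (assumed name, not from this commit)."""

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        # Rely on the is_multiple_choice flag passed in by the harness;
        # row["task_type"] is not available during evaluation.
        if is_multiple_choice:
            return "A"  # e.g. a single option letter for multiple-choice tasks
        return "Sample response to: " + prompt[:32]
```
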
@@ -86,7 +87,7 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
data_df.iterrows(), total=len(data_df), desc="Evaluating"
):
task_name, task_type, metric, ground_truth = (
row['task_name']
row["task_name"],
row["task_type"],
row["metric"],
row["output_field"],
@@ -95,12 +96,6 @@ def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
if metric not in eval_methods:
raise NotImplementedError(f"No metric for {metric=}")
task_name = f"{task_name}"
# task_name = f"{task_type}---{metric}"
# Note: In practice, here we are using the task_type-metric pair as a unique identifier and calling it the task_name.
# During the actual evaluations, the task names are more semantically defined, meaning there could be multiple tasks
# with the same task_type and metric.
model_output = task_parsers[task_type].parse(outputs[row_idx])
eval_fn = eval_methods[metric]
metric_score = eval_fn(model_output, ground_truth)
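
Putting the two hunks together, the end-to-end flow this commit plumbs presumably runs along the lines of the sketch below. `generate_model_outputs` and `evaluate_outputs` are the functions modified above and `DummyModel` is the illustrative model from the earlier sketch; the toy row values, and the assumption that a `"multiple-choice"` parser and an `"accuracy"` metric are registered in `task_parsers` and `eval_methods`, are mine, not the commit's:

```python
import pandas as pd

# A single toy row with the columns the two functions read (values are assumptions).
data_df = pd.DataFrame([{
    "task_name": "demo-task",
    "task_type": "multiple-choice",     # assumed to have a registered parser in task_parsers
    "metric": "accuracy",               # assumed to be registered in eval_methods
    "input_field": "Is 2 + 2 equal to 4? A. Yes  B. No",
    "output_field": "A",
}])

model = DummyModel()                              # anything exposing .predict(prompt, is_multiple_choice)
outputs = generate_model_outputs(data_df, model)  # one model output per row
evaluate_outputs(data_df, outputs)                # parses each output and scores it per task
```
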