Commit 929296c8 authored by spmohanty

Refactor local eval script

parent 3ef95c87

import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

import metrics
import parsers


def print_sample(idx, generation, truth, metric, score):
    """
    Print a sample's generated output, the ground truth, and its per-sample score.
    """
    print(f"Sample {idx}, generation: {generation}")
    print(f"Sample {idx}, truth: {truth}")
    if isinstance(score, tuple) and len(score) == 3:
        print(
            f"Per Sample Metric Score ({metric}): "
            f"tp {score[0]}, fp {score[1]}, fn {score[2]}"
        )
    else:
        print(f"Per Sample Metric Score ({metric}): {score}")
    print()


# Function to load development data from a JSON file
def load_development_data(filename):
    """
    Load development data from a specified JSON file.

    Parameters:
    - filename: Path to the JSON file containing the development data.

    Returns:
    - A pandas DataFrame containing the loaded data.
    """
    return pd.read_json(filename, lines=True)
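
# For reference, each line of the development file is expected to be a standalone JSON
# record. Judging from the fields accessed by the functions below, a record presumably
# looks roughly like this (illustrative values only):
#   {"task_type": "multiple-choice", "input_field": "<prompt text>",
#    "output_field": "<ground truth>", "metric": "accuracy"}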


# Function to generate model outputs based on the input data
def generate_model_outputs(data_df, model):
    """
    Generate predictions for each entry in the data DataFrame using a given model.

    Parameters:
    - data_df: A pandas DataFrame containing the input data for predictions.
    - model: The model instance used for generating predictions.

    Returns:
    - A list containing the model outputs for each entry in the data DataFrame.
    """
    outputs = []
    for _, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Generating Responses"
    ):
        is_multiple_choice = row["task_type"] == "multiple-choice"
        prompt = row["input_field"]
        model_output = model.predict(prompt, is_multiple_choice)
        outputs.append(model_output)
    return outputs
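
# generate_model_outputs only assumes that the model object exposes a
# `predict(prompt, is_multiple_choice)` method (see models.user_config). A minimal,
# purely hypothetical stand-in for local smoke tests could look like:
#
#     class EchoModel:
#         def predict(self, prompt: str, is_multiple_choice: bool) -> str:
#             # Always pick option "0" for multiple-choice prompts, echo a stub otherwise.
#             return "0" if is_multiple_choice else "placeholder response"
#
#     outputs = generate_model_outputs(data_df, EchoModel())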


# Function to evaluate the generated model outputs
def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
    """
    Evaluate the model outputs against ground truth values using the specified metrics.

    Parameters:
    - data_df: DataFrame containing the development data, including ground truth.
    - outputs: The generated outputs from the model to be evaluated.
    - log_every_n_steps: Log a sample every N steps.

    Returns:
    - A dictionary containing evaluation metrics and scores for each task.
    """
    eval_methods = get_evaluation_methods()
    task_parsers = get_task_parsers()
    per_task_metrics = {}

    for row_idx, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Evaluating"
    ):
        task_type, metric, ground_truth = (
            row["task_type"],
            row["metric"],
            row["output_field"],
        )

        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        # Note: here we use the task_type-metric pair as a unique identifier and call it
        # the task_name. In the actual evaluations the task names are defined more
        # semantically, i.e. there can be multiple tasks with the same task_type and
        # metric.
        task_name = f"{task_type}---{metric}"

        model_output = task_parsers[task_type].parse(outputs[row_idx])
        eval_fn = eval_methods[metric]
        metric_score = eval_fn(model_output, ground_truth)

        if task_name not in per_task_metrics:
            per_task_metrics[task_name] = {
                "task_type": task_type,
                "metric": metric,
                "sample_score": [],
            }
        per_task_metrics[task_name]["sample_score"].append(metric_score)

        if row_idx % log_every_n_steps == 0:
            print_sample(
                row_idx, model_output, ground_truth, metric, metric_score
            )

    return per_task_metrics
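
# The dictionary returned by evaluate_outputs is keyed by task_name; a single entry
# looks like this (values illustrative):
#
#     "multiple-choice---accuracy": {
#         "task_type": "multiple-choice",
#         "metric": "accuracy",
#         "sample_score": [1, 0, 1],  # one score (or tp/fp/fn tuple) per sample
#     }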


# Function to aggregate scores from evaluations
def aggregate_scores(per_task_metrics):
    """
    Aggregate evaluation scores across different tasks and metrics.

    Parameters:
    - per_task_metrics: A dictionary containing raw evaluation scores for each task.

    Returns:
    - A pandas DataFrame summarizing the overall metrics and scores.
    """
    overall_metrics = {
        "task_name": [],
        "task_type": [],
        "metric": [],
        "overall_score": [],
    }
    for task_name, values in per_task_metrics.items():
        task_type, metric, sample_scores = (
            values["task_type"],
            values["metric"],
            values["sample_score"],
        )
        overall_score = (
            np.mean(sample_scores)
            if metric != "micro f1"
            else metrics.compute_f1_score(sample_scores)
        )
        overall_metrics["task_name"].append(task_name)
        overall_metrics["task_type"].append(task_type)
        overall_metrics["metric"].append(metric)
        overall_metrics["overall_score"].append(overall_score)
    return pd.DataFrame(overall_metrics)
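
# Aggregation detail: for the "micro f1" metric each per-sample score is a
# (tp, fp, fn) tuple produced by metrics.tp_fp_fn, so the list is reduced with
# metrics.compute_f1_score instead of a plain mean over per-sample values.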


# Define and return evaluation methods
def get_evaluation_methods():
    """
    Get evaluation methods including accuracy, sentence-transformer similarity, and
    other metrics.

    Returns:
    - A dictionary mapping metric names to their respective evaluation functions.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
    sentence_multilingual = SentenceTransformer(
        "paraphrase-multilingual-MiniLM-L12-v2"
    ).to(device)

    return {
        "accuracy": metrics.accuracy,
        "hit rate@3": metrics.hit_rate_3,
        "rougel": metrics.rougel,
        "sent-transformer": lambda g, t: metrics.sent_transformer(
            g, t, sentence_all_lm
        ),
        "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
            g, t, sentence_multilingual
        ),
        "micro f1": metrics.tp_fp_fn,
        "ndcg": metrics.ndcg_eval,
        "bleu": metrics.bleu,
        "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
    }


# Define and return task parsers
def get_task_parsers():
    """
    Define parsers for different task types to format model outputs accordingly.

    Returns:
    - A dictionary mapping task types to their respective parsers.
    """
    return {
        "multiple-choice": parsers.ShoppingBenchTaskParsers("multichoice"),
        "generation": parsers.ShoppingBenchTaskParsers("generation"),
        "retrieval": parsers.ShoppingBenchTaskParsers("retrieval"),
        "ranking": parsers.ShoppingBenchTaskParsers("ranking"),
        "named_entity_recognition": parsers.ShoppingBenchTaskParsers(
            "named_entity_recognition"
        ),
    }
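
# Usage sketch (hypothetical raw output): a parser normalises the model's raw string
# before it is scored, e.g.
#
#     parsed = get_task_parsers()["multiple-choice"].parse("2")
#
# The exact parsing rules live in parsers.ShoppingBenchTaskParsers.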


# Main execution function to load data, generate model outputs, evaluate, and
# aggregate scores
def main():
    # Load development data
    DATA_FILENAME = "./data/development.json"
    data_df = load_development_data(DATA_FILENAME)

    # Load the model from the user's custom configuration.
    # Note: the evaluator **always** imports UserModel; reference your own class by
    # setting the `UserModel` variable in models.user_config.
    from models.user_config import UserModel

    model = UserModel()

    # Generate model outputs
    outputs = generate_model_outputs(data_df, model)
    data_df["outputs"] = (
        outputs  # Optional: add outputs back to the DataFrame for inspection
    )
    print(data_df.head())

    # Evaluate the generated outputs and calculate metrics
    per_task_metrics = evaluate_outputs(data_df, outputs)

    # Aggregate and display the evaluation scores
    overall_metrics = aggregate_scores(per_task_metrics)
    print("=" * 100)
    print("Task specific metrics: ")
    print(overall_metrics)
    print()

    # Calculate and print the overall score across all tasks and metrics
    overall_score = overall_metrics["overall_score"].mean()
    print(f"Overall Score: {overall_score}")


if __name__ == "__main__":
    main()
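
# To run the evaluation locally, execute this file directly with Python after placing
# the development data at ./data/development.json and pointing `UserModel` in
# models/user_config.py at your own model class.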