From 929296c85e0eea115b1c2afce2f6c6453c14e5bc Mon Sep 17 00:00:00 2001
From: "S.P. Mohanty" <spmohanty91@gmail.com>
Date: Mon, 18 Mar 2024 01:47:10 +0000
Subject: [PATCH] Refactor local eval script

---
 local_evaluation.py | 252 +++++++++++++++++++++++++++++++-------------
 1 file changed, 176 insertions(+), 76 deletions(-)

diff --git a/local_evaluation.py b/local_evaluation.py
index b58248b..221358e 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -1,65 +1,172 @@
+import pandas as pd
+from tqdm import tqdm
 import torch
 import numpy as np
-import pandas as pd
-from tqdm.auto import tqdm
-from sentence_transformers import SentenceTransformer 
+from sentence_transformers import SentenceTransformer
 
 import metrics
 import parsers
 
 
-def print_sample(i, generation, truth, metric, score):
-    print(f"Sample {i}, generation: {generation}")
-    print(f"Sample {i}, truth: {truth}")
+def print_sample(idx, generation, truth, metric, score):
+    """
+    Print a sample's generated output, the truth, and its evaluation score.
+    """
+    print(f"Sample {idx}, generation: {generation}")
+    print(f"Sample {idx}, truth: {truth}")
     if isinstance(score, tuple) and len(score) == 3:
         print(
-            f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
+            f"Per Sample Metric Score ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
         )
     else:
-        print(f"Metric ({metric}): {score}")
+        print(f"Per Sample Metric Score ({metric}): {score}")
     print()
 
 
-if __name__ == "__main__":
+# Function to load development data from a JSON file
+def load_development_data(filename):
+    """
+    Load development data from a specified JSON file.
 
-    # Load Development Data
-    DATA_FILENAME = "./data/development.json"
-    data_df = pd.read_json(DATA_FILENAME, lines=True)
+    Parameters:
+    - filename: Path to the JSON file containing the development data.
 
-    # Load UserModel
-    from models.user_config import UserModel
+    Returns:
+    - A pandas DataFrame containing the loaded data.
+    """
+    return pd.read_json(filename, lines=True)
 
-    model = UserModel()
 
-    # Generate Responses
+# Function to generate model outputs based on the input data
+def generate_model_outputs(data_df, model):
+    """
+    Generate predictions for each entry in the data DataFrame using a given model.
+
+    Parameters:
+    - data_df: A pandas DataFrame containing the input data for predictions.
+    - model: The model instance used for generating predictions.
+
+    Returns:
+    - A list containing the model outputs for each entry in the data DataFrame.
+    """
     outputs = []
-    for _rowd_idx, row in tqdm(
-        data_df.iterrows(),
-        total=len(data_df),
-        desc="Generating Responses",
+    for _, row in tqdm(
+        data_df.iterrows(), total=len(data_df), desc="Generating Responses"
     ):
-        print("=" * 100)
         is_multiple_choice = row["task_type"] == "multiple-choice"
         prompt = row["input_field"]
         model_output = model.predict(prompt, is_multiple_choice)
         outputs.append(model_output)
+    return outputs
 
-        print(prompt, model_output)
 
-    # Merge outputs into DF
-    data_df["outputs"] = outputs
-    print(data_df)
+# Function to evaluate the generated model outputs
+def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
+    """
+    Evaluate the model outputs against ground truth values using specified metrics.
 
-    # Evaluate
-    print_interval = 1
+    Parameters:
+    - data_df: DataFrame containing the development data, including ground truth.
+    - outputs: The generated outputs from the model to be evaluated.
+    - log_every_n_steps: Logs a sample every N steps.
 
+    Returns:
+    - A dictionary containing evaluation metrics and scores for each task.
+    """
+    eval_methods = get_evaluation_methods()
+    task_parsers = get_task_parsers()
+    per_task_metrics = {}
+
+    for row_idx, row in tqdm(
+        data_df.iterrows(), total=len(data_df), desc="Evaluating"
+    ):
+        task_type, metric, ground_truth = (
+            row["task_type"],
+            row["metric"],
+            row["output_field"],
+        )
+
+        if metric not in eval_methods:
+            raise NotImplementedError(f"No metric for {metric=}")
+
+        task_name = f"{task_type}---{metric}"
+        # Note: In practice, we use the task_type-metric pair as a unique identifier and call it the task_name.
+        # During the actual evaluations, task names are defined more semantically, meaning there could be
+        # multiple tasks with the same task_type and metric.
+
+        model_output = task_parsers[task_type].parse(outputs[row_idx])
+        eval_fn = eval_methods[metric]
+        metric_score = eval_fn(model_output, ground_truth)
+
+        if task_name not in per_task_metrics:
+            per_task_metrics[task_name] = {
+                "task_type": task_type,
+                "metric": metric,
+                "sample_score": [],
+            }
+
+        per_task_metrics[task_name]["sample_score"].append(metric_score)
+
+        if row_idx % log_every_n_steps == 0:
+            print_sample(
+                row_idx, model_output, ground_truth, metric, metric_score
+            )
+
+    return per_task_metrics
+
+
+# Function to aggregate scores from evaluations
+def aggregate_scores(per_task_metrics):
+    """
+    Aggregate evaluation scores across different tasks and metrics.
+
+    Parameters:
+    - per_task_metrics: A dictionary containing raw evaluation scores for each task.
+
+    Returns:
+    - A pandas DataFrame summarizing the overall metrics and scores.
+    """
+    overall_metrics = {
+        "task_name": [],
+        "task_type": [],
+        "metric": [],
+        "overall_score": [],
+    }
+    for task_name, values in per_task_metrics.items():
+        task_type, metric, sample_scores = (
+            values["task_type"],
+            values["metric"],
+            values["sample_score"],
+        )
+        overall_score = (
+            np.mean(sample_scores)
+            if metric != "micro f1"
+            else metrics.compute_f1_score(sample_scores)
+        )
+
+        overall_metrics["task_name"].append(task_name)
+        overall_metrics["task_type"].append(task_type)
+        overall_metrics["metric"].append(metric)
+        overall_metrics["overall_score"].append(overall_score)
+
+    return pd.DataFrame(overall_metrics)
+
+
+# Define and return evaluation methods
+def get_evaluation_methods():
+    """
+    Get evaluation methods including accuracy, sentence transformers, and other metrics.
+
+    Returns:
+    - A dictionary mapping metric names to their respective evaluation functions.
+    """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
-    sentece_multilingual = SentenceTransformer(
+    sentence_multilingual = SentenceTransformer(
         "paraphrase-multilingual-MiniLM-L12-v2"
     ).to(device)
 
-    eval_methods = {
+    return {
        "accuracy": metrics.accuracy,
         "hit rate@3": metrics.hit_rate_3,
         "rougel": metrics.rougel,
@@ -67,7 +174,7 @@ if __name__ == "__main__":
             g, t, sentence_all_lm
         ),
         "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
-            g, t, sentece_multilingual
+            g, t, sentence_multilingual
         ),
         "micro f1": metrics.tp_fp_fn,
         "ndcg": metrics.ndcg_eval,
@@ -75,7 +182,16 @@ if __name__ == "__main__":
         "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
     }
 
-    task_parsers = {
+
+# Define and return task parsers
+def get_task_parsers():
+    """
+    Define parsers for different task types to format model outputs accordingly.
+
+    Returns:
+    - A dictionary mapping task types to their respective parsers.
+    """
+    return {
         "multiple-choice": parsers.ShoppingBenchTaskParsers("multichoice"),
         "generation": parsers.ShoppingBenchTaskParsers("generation"),
         "retrieval": parsers.ShoppingBenchTaskParsers("retrieval"),
@@ -85,57 +201,41 @@
         ),
     }
 
-    per_task_metrics = {}
-
-    for row_idx, row in tqdm(
-        data_df.iterrows(), total=len(data_df), desc="Evaluating"
-    ):
-        metric = row["metric"]
-        if metric not in eval_methods:
-            raise NotImplementedError(f"No metric for {metric=}")
-
-        task_type = row["task_type"]
-
-        task_name = f"{task_type}---{metric}"
-        per_task_metrics.setdefault(
-            task_name, {"metric": metric, "sample_score": []}
-        )
-
-        gt = row["output_field"]
-        model_output = task_parsers[task_type].parse(outputs[row_idx])
-        eval_fn = eval_methods[metric]
-        metric_score = eval_fn(model_output, gt)
-        per_task_metrics[task_name]["sample_score"].append(metric_score)
-        per_task_metrics[task_name]["sample_score"].append(metric_score)
+# Main execution function to load data, generate model outputs, evaluate, and aggregate scores
+def main():
+    # Load development data
+    DATA_FILENAME = "./data/development.json"
+    data_df = load_development_data(DATA_FILENAME)
 
-        if row_idx % print_interval == 0:
-            print_sample(row_idx, outputs[row_idx], gt, metric, metric_score)
+    # Load the model from the user's custom configuration
+    # Note: The evaluator **always** imports the UserModel; reference your own class
+    # by setting the `UserModel` variable in models.user_config.
+    from models.user_config import UserModel
 
-    # Aggregate scores
-    for task_name in per_task_metrics:
-        if per_task_metrics[task_name]["metric"] != "micro f1":
-            per_task_metrics[task_name]["overall_metric"] = np.mean(
-                per_task_metrics[task_name]["sample_score"]
-            )
-        else:
-            per_task_metrics[task_name]["overall_metric"] = (
-                metrics.compute_f1_score(
-                    per_task_metrics[task_name]["sample_score"]
-                )
-            )
+    model = UserModel()
 
-    print(per_task_metrics)
+    # Generate model outputs
+    outputs = generate_model_outputs(data_df, model)
+    data_df["outputs"] = (
+        outputs  # Optional: Add outputs back to DataFrame for inspection
+    )
+    print(data_df.head())
 
-    overall_metrics = {"task_name": [], "metric": [], "overall_score": []}
-    for task_name in per_task_metrics:
-        overall_metrics["task_name"].append(task_name)
-        overall_metrics["metric"].append(per_task_metrics[task_name]["metric"])
-        overall_metrics["overall_score"].append(
-            per_task_metrics[task_name]["overall_metric"]
-        )
+    # Evaluate the generated outputs and calculate metrics
+    per_task_metrics = evaluate_outputs(data_df, outputs)
 
-    overall_metrics = pd.DataFrame(overall_metrics)
+    # Aggregate and display the evaluation scores
+    overall_metrics = aggregate_scores(per_task_metrics)
+    print("=" * 100)
+    print("Task specific metrics: ")
     print(overall_metrics)
+
+    print()
+    # Calculate and print the overall score across all tasks and metrics
     overall_score = overall_metrics["overall_score"].mean()
     print(f"Overall Score: {overall_score}")
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab