diff --git a/local_evaluation.py b/local_evaluation.py
index 3484c15a892e2815426a64f7c8b044146ee841d5..b58248b06390b412f50a4481da3971d4590453dd 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -5,6 +5,7 @@ from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer

 import metrics
+import parsers


 def print_sample(i, generation, truth, metric, score):
@@ -19,36 +20,39 @@ def print_sample(i, generation, truth, metric, score):
     print()


-def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
+if __name__ == "__main__":

-    if max_eval_rows < len(data_df):
-        data_df_eval = data_df.sample(max_eval_rows)
-    else:
-        data_df_eval = data_df
+    # Load Development Data
+    DATA_FILENAME = "./data/development.json"
+    data_df = pd.read_json(DATA_FILENAME, lines=True)

-    # Run model
-    outputs = []
-    task_methods = {
-        "multiple-choice": model.task_multichoice,
-        "generation": model.task_generation,
-        "retrieval": model.task_retrieval,
-        "ranking": model.task_ranking,
-        "named_entity_recognition": model.task_named_entity_recognition,
-    }
+    # Load UserModel
+    from models.user_config import UserModel
+
+    model = UserModel()

-    for _, row in tqdm(
-        data_df_eval.iterrows(), total=len(data_df_eval), desc="Processing"
+    # Generate Responses
+    outputs = []
+    for _rowd_idx, row in tqdm(
+        data_df.iterrows(),
+        total=len(data_df),
+        desc="Generating Responses",
     ):
-        task_type = row["task_type"]
-        if task_type not in task_methods:
-            raise NotImplementedError(f"No task method for {task_type=}")
+        print("=" * 100)
+        is_multiple_choice = row["task_type"] == "multiple-choice"
+        prompt = row["input_field"]
+        model_output = model.predict(prompt, is_multiple_choice)
+        outputs.append(model_output)

-        task_prompt = row["input_field"]
-        task_fn = task_methods[task_type]
-        task_output = task_fn(task_prompt)
-        outputs.append(task_output)
+        print(prompt, model_output)
+
+    # Merge outputs into DF
+    data_df["outputs"] = outputs
+    print(data_df)

     # Evaluate
+    print_interval = 1
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
     sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
     sentece_multilingual = SentenceTransformer(
@@ -71,84 +75,67 @@ def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
         "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
     }

+    task_parsers = {
+        "multiple-choice": parsers.ShoppingBenchTaskParsers("multichoice"),
+        "generation": parsers.ShoppingBenchTaskParsers("generation"),
+        "retrieval": parsers.ShoppingBenchTaskParsers("retrieval"),
+        "ranking": parsers.ShoppingBenchTaskParsers("ranking"),
+        "named_entity_recognition": parsers.ShoppingBenchTaskParsers(
+            "named_entity_recognition"
+        ),
+    }
+
     per_task_metrics = {}
-    for ri, row in tqdm(
-        data_df_eval.iterrows(), total=len(data_df_eval), desc="Evaluating"
+    for row_idx, row in tqdm(
+        data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
         metric = row["metric"]
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")

-        task_name = row["task_name"]
+        task_type = row["task_type"]
+
+        task_name = f"{task_type}---{metric}"
         per_task_metrics.setdefault(
             task_name, {"metric": metric, "sample_score": []}
         )

         gt = row["output_field"]
-        model_output = outputs[ri]
+        model_output = task_parsers[task_type].parse(outputs[row_idx])

         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, gt)
         per_task_metrics[task_name]["sample_score"].append(metric_score)

         per_task_metrics[task_name]["sample_score"].append(metric_score)

-        if ri % print_interval == 0:
-            print_sample(ri, model_output, gt, metric, metric_score)
+        if row_idx % print_interval == 0:
+            print_sample(row_idx, outputs[row_idx], gt, metric, metric_score)

     # Aggregate scores
-    for k in per_task_metrics:
-        if per_task_metrics[k]["metric"] != "micro f1":
-            print(k, len(per_task_metrics[k]["sample_score"]))
-            per_task_metrics[k]["overall_metric"] = np.mean(
-                per_task_metrics[k]["sample_score"]
+    for task_name in per_task_metrics:
+        if per_task_metrics[task_name]["metric"] != "micro f1":
+            per_task_metrics[task_name]["overall_metric"] = np.mean(
+                per_task_metrics[task_name]["sample_score"]
             )
         else:
-            per_task_metrics[k]["overall_metric"] = metrics.compute_f1_score(
-                per_task_metrics[k]["sample_score"]
+            per_task_metrics[task_name]["overall_metric"] = (
+                metrics.compute_f1_score(
+                    per_task_metrics[task_name]["sample_score"]
+                )
             )

+    print(per_task_metrics)
+
     overall_metrics = {"task_name": [], "metric": [], "overall_score": []}
-    for k in per_task_metrics:
-        overall_metrics["task_name"].append(k)
-        overall_metrics["metric"].append(per_task_metrics[k]["metric"])
+    for task_name in per_task_metrics:
+        overall_metrics["task_name"].append(task_name)
+        overall_metrics["metric"].append(per_task_metrics[task_name]["metric"])
         overall_metrics["overall_score"].append(
-            per_task_metrics[k]["overall_metric"]
+            per_task_metrics[task_name]["overall_metric"]
         )

-    track_wise_score = np.mean(overall_metrics["overall_score"])
-    overall_metrics["task_name"].append("track_wise")
-    overall_metrics["metric"].append("track_wise")
-    overall_metrics["overall_score"].append(track_wise_score)
-    overall_metrics_df = pd.DataFrame(overall_metrics)
-    overall_metrics_df.to_json("scores.json", orient="records", lines=True)
-    print(f"Overall score {track_wise_score}")
-
-
-if __name__ == "__main__":
-
-    # Load Development Data
-    DATA_FILENAME = "./data/development.json"
-    data_df = pd.read_json(DATA_FILENAME, lines=True)
-
-    # Load UserModel
-    from models.user_config import UserModel
-
-    model = UserModel()
-
-    # Generate Responses
-
-    outputs = []
-    for _rowd_idx, row in tqdm(
-        data_df.iterrows(),
-        total=len(data_df),
-        desc="Generating Responses",
-    ):
-        print("=" * 100)
-        is_multiple_choice = row["task_type"] == "multiple-choice"
-        prompt = row["input_field"]
-        model_output = model.predict(prompt, is_multiple_choice)
-        outputs.append(model_output)
-
-        print(prompt, model_output)
-    # run_and_evaluate(data_df, MAX_EVAL_ROWS)
+    overall_metrics = pd.DataFrame(overall_metrics)
+    print(overall_metrics)
+    overall_score = overall_metrics["overall_score"].mean()
+    print(f"Overall Score: {overall_score}")
diff --git a/metrics.py b/metrics.py
index 7d70b8793e6898e60d91e529cad187c7129befaa..a2f24c9fa204db890b2e62a08a11f5b0cf83b574 100644
--- a/metrics.py
+++ b/metrics.py
@@ -6,28 +6,34 @@ import evaluate
 from typing import List

 print("\nsacrebleu loading...")
-sacrebleu = evaluate.load('sacrebleu')
+sacrebleu = evaluate.load("sacrebleu")
+

 def accuracy(prediction: int, truth: int):
     return prediction == truth

+
 def hit_rate_3(retrieved_int: List[int], truth: List[int]):
     hit = len(set(truth).intersection(set(retrieved_int[:3])))
     hit /= len(truth)
     return hit

+
 def rougel(generation: str, truth: str):
-    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
     scores = scorer.score(generation, truth)
-    return scores['rougeL'].fmeasure
+    return scores["rougeL"].fmeasure

+
 def sent_transformer(generation: str, truth: str, sent_transformer_model):
     generation_embedding = sent_transformer_model.encode([generation])[0]

     if isinstance(truth, str):
         truth_embedding = sent_transformer_model.encode([truth])[0]
-        score = ((generation_embedding * truth_embedding).sum())
-        score /= (np.linalg.norm(generation_embedding, ord=2) * np.linalg.norm(truth_embedding, ord=2))
+        score = (generation_embedding * truth_embedding).sum()
+        score /= np.linalg.norm(generation_embedding, ord=2) * np.linalg.norm(
+            truth_embedding, ord=2
+        )
         if score > 0:
             return score
         else:
@@ -37,17 +43,20 @@ def sent_transformer(generation: str, truth: str, sent_transformer_model):
         for label_item in truth:
             truth_embedding = sent_transformer_model.encode([label_item])[0]
             score_ = (generation_embedding * truth_embedding).sum()
-            score_ /= (np.linalg.norm(generation_embedding, ord=2) * np.linalg.norm(truth_embedding, ord=2))
+            score_ /= np.linalg.norm(
+                generation_embedding, ord=2
+            ) * np.linalg.norm(truth_embedding, ord=2)
             scores.append(score_)
         if np.mean(scores) > 0:
             return np.mean(scores)
         else:
             return 0

+
 def tp_fp_fn(entity_list, truth):
     answer_lower = []
     for a in entity_list:
-        answer_lower.append(a.lower().lstrip(' ').rstrip(' '))
+        answer_lower.append(a.lower().lstrip(" ").rstrip(" "))
     truth_lower = []
     for l in truth:
         truth_lower.append(l.lower())
@@ -56,6 +65,7 @@ def tp_fp_fn(entity_list, truth):
     false_negative = len(truth_lower) - true_positive
     return true_positive, false_positive, false_negative

+
 def compute_f1_score(tp_fp_fn_list):
     total_tp = 0
     total_fp = 0
@@ -70,44 +80,51 @@ def compute_f1_score(tp_fp_fn_list):
             return 0
         else:
             return 2 * precision * recall / (precision + recall)
-
+
+
 def ndcg(ranked_list, weight):
     idcg = 0
     dcg = 0
     for i in range(len(ranked_list)):
-        position = i+1
-        if ranked_list[i]-1 < len(weight):
-            relevance = weight[ranked_list[i]-1]
+        position = i + 1
+        if ranked_list[i] - 1 < len(weight):
+            relevance = weight[ranked_list[i] - 1]
         else:
             relevance = 0
-        dcg += (np.power(2, relevance) - 1)/np.log2(position+1)
+        dcg += (np.power(2, relevance) - 1) / np.log2(position + 1)
     weight.sort(reverse=True)
     for i in range(len(weight)):
-        position = i+1
+        position = i + 1
         relevance = weight[i]
-        idcg += (np.power(2, relevance) - 1)/ np.log2(position+1)
-    return dcg/idcg
+        idcg += (np.power(2, relevance) - 1) / np.log2(position + 1)
+    return dcg / idcg
+

 def ndcg_eval(relevance_scores: List[float], truth: List[float]):
     if len(relevance_scores) > len(truth):
-        relevance_scores = relevance_scores[:len(truth)]
+        relevance_scores = relevance_scores[: len(truth)]
     return ndcg(relevance_scores, truth)

-
-def bleu(generation, truth, jp = False):
-    generation = generation.lstrip('\n').rstrip('\n').split('\n')[0]
+
+def bleu(generation, truth, jp=False):
+    generation = generation.lstrip("\n").rstrip("\n").split("\n")[0]
     candidate = [generation]
     reference = [[truth]]
     if not jp:
-        score = sacrebleu.compute(predictions=candidate, references=reference,
-                lowercase=True)['score']/100
+        score = (
+            sacrebleu.compute(
+                predictions=candidate, references=reference, lowercase=True
+            )["score"]
+            / 100
+        )
     else:
-        score = sacrebleu.compute(predictions=candidate, references=reference,
-                lowercase=True,
-                tokenize='ja-mecab')['score']/100
+        score = (
+            sacrebleu.compute(
+                predictions=candidate,
+                references=reference,
+                lowercase=True,
+                tokenize="ja-mecab",
+            )["score"]
+            / 100
+        )
     return score
-
-
-
-
-
diff --git a/parsers.py b/parsers.py
index ce5636f6afb799bf69c5445081c285ccdfecfd01..2dd7e5e011485a93160f8cb0c97d19e28de8c4d2 100644
--- a/parsers.py
+++ b/parsers.py
@@ -89,7 +89,7 @@ class ShoppingBenchTaskParsers:
         """
         # Keep only numeric characters and specific punctuation.
         cleaned_response = "".join(
-            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+            c for c in response if c.isnumeric() or c in [",", " "]
         )

         # Convert to list of integers
@@ -137,7 +137,7 @@ class ShoppingBenchTaskParsers:
         """
         # Similar to ranking parser, but only returns the first 3 elements.
         cleaned_response = "".join(
-            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+            c for c in response if c.isnumeric() or c in [",", " "]
        )

         # Convert to list of integers
@@ -174,6 +174,10 @@ class ShoppingBenchTaskParsers:
                 isinstance(item, str) for item in entities
             ):
                 return entities
+            else:
+                raise SyntaxError(
+                    "Unexpected Syntax error - fall back to comma separated list."
+                )
         except (SyntaxError, ValueError):
             # Fallback: split the string by commas and strip whitespace.
             return [entity.strip() for entity in response.split(",")]
@@ -198,7 +202,10 @@ if __name__ == "__main__":
         ranking_parser.parse("1, 2, 3, 4, 5")
     )  # Expected output: [1, 2, 3, 4, 5]
     print(
-        ranking_parser.parse("[1, 2, 2, 3]")
+        ranking_parser.parse("[1, 2, 3, 4, 5]")
+    )  # Expected output: [1, 2, 3, 4, 5] - tolerant to [, ]
+    print(
+        ranking_parser.parse("1, 2, 2, 3")
     )  # Expected output (failure case): []  # because of repeating numbers
     print(
         ranking_parser.parse("1, 4, 5, aicrowd, 6")
@@ -243,4 +250,8 @@ if __name__ == "__main__":
     )  # Expected output: ['New York', 'ShopBench', 'Amazon']
     print(
         ner_parser.parse("[New York, ShopBench, Amazon]")
-    )  # Expected output (failure case - extra '[' characters added to boundary elems]): ['[New York', 'ShopBench', 'Amazon]']
+    )  # failure case - not tolerant to [ if quotes not used
+    # - extra '[' characters added to boundary elems]): ['[New York', 'ShopBench', 'Amazon]']
+    # Expected output: ['[New York', 'ShopBench', 'Amazon]']
+
+    print(ner_parser.parse("[4, 1, 2, 3]"))
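
Reviewer note (sketch, not part of the patch): dropping "[" and "]" from the keep-list means brackets are stripped before the numeric conversion, so bracketed ranking/retrieval outputs now parse the same as bare comma-separated ones. The helper below is hypothetical; the split-and-int step is an assumption based on the "# Convert to list of integers" comment in parsers.py, not copied from it.

    # Hypothetical stand-in for the updated cleaning + conversion step in
    # ShoppingBenchTaskParsers (names and int-conversion detail are assumed).
    def clean_and_split(response: str) -> list:
        cleaned = "".join(c for c in response if c.isnumeric() or c in [",", " "])
        return [int(part) for part in cleaned.split(",") if part.strip().isnumeric()]

    print(clean_and_split("[1, 2, 3, 4, 5]"))  # [1, 2, 3, 4, 5] -- brackets tolerated
    print(clean_and_split("1, 2, 3, 4, 5"))    # [1, 2, 3, 4, 5]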