from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
import numpy as np
import evaluate
import torch
from typing import List, Union, Tuple
# Lazily loaded sacrebleu metric (see calculate_bleu_score)
sacrebleu = None
# Cache of loaded SentenceTransformer models, keyed by model name
sentence_transformer_model_cache = {}
def calculate_per_sample_accuracy(prediction: int, truth: int) -> bool:
"""
Computes the accuracy of a single prediction.
This function checks if a given prediction matches the ground truth.
Parameters:
- prediction (int): The predicted value.
- truth (int): The actual ground truth value.
Returns:
- bool: True if the prediction matches the truth, False otherwise.
"""
return prediction == truth
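# Illustrative usage (not part of the upstream file): a prediction only counts
# as correct when it equals the ground truth exactly.
#     calculate_per_sample_accuracy(prediction=3, truth=3)  # -> True
#     calculate_per_sample_accuracy(prediction=3, truth=4)  # -> False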
def calculate_hit_rate_3(retrieved_int: List[int], truth: List[int]) -> float:
"""
Calculates the hit rate within the top 3 retrieved integers.
This function assesses how many of the truth integers are present
within the first three elements of the retrieved list of integers.
Parameters:
- retrieved_int (List[int]): The list of retrieved integers, ordered by relevance.
- truth (List[int]): The list of ground truth integers.
Returns:
- float: The hit rate, calculated as the proportion of truth integers found
in the top 3 retrieved integers, relative to the total number of truth integers.
"""
# Calculate the number of hits within the top 3 retrieved integers
hit = len(set(truth).intersection(set(retrieved_int[:3])))
# Normalize the hit count by the total number of truth integers to get the hit rate
hit_rate = hit / len(truth)
return hit_rate
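# Illustrative usage (hypothetical item IDs): only the first three retrieved items
# count, and hits are divided by the number of ground-truth items.
#     calculate_hit_rate_3([12, 7, 5, 9], truth=[7, 9])  # 1 hit out of 2 truths -> 0.5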
def calculate_rougel(generation: str, truth: str) -> float:
"""
Calculates the ROUGE-L F-measure score between a generated string and the truth string.
ROUGE-L measures the longest common subsequence between the generated text and the truth text,
considering both the precision and recall of the sequences. It is widely used in evaluating
the quality of text generation systems.
Parameters:
- generation (str): The generated text to evaluate.
- truth (str): The ground truth text to compare against.
Returns:
- float: The ROUGE-L F-measure score, indicating the quality of the generated text.
"""
# Initialize the ROUGE scorer with the ROUGE-L metric
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    # Calculate the ROUGE scores; RougeScorer.score expects (target, prediction).
    # The ROUGE-L F-measure is symmetric, so only the argument labels change here.
    scores = scorer.score(truth, generation)
# Extract and return the ROUGE-L F-measure score
return scores["rougeL"].fmeasure
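# Illustrative usage: identical strings give an F-measure of 1.0, and strings with
# no common subsequence give 0.0; intermediate values depend on the longest common subsequence.
#     calculate_rougel("the cat sat on the mat", "the cat sat on the mat")  # -> 1.0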
def load_sentence_transformer_model(model_name: str) -> SentenceTransformer:
"""
Loads a Sentence Transformer model by its name and moves it to the appropriate device.
Parameters:
- model_name (str): The name of the model to load.
Returns:
- SentenceTransformer: The loaded SentenceTransformer model.
"""
global sentence_transformer_model_cache
    # The model cache ensures we do not reload the model on every call
if model_name not in sentence_transformer_model_cache:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name).to(device)
sentence_transformer_model_cache[model_name] = model
return sentence_transformer_model_cache[model_name]
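# Illustrative usage (the model name below is an arbitrary example, not one mandated
# by this file): the first call downloads/loads the weights, later calls hit the cache.
#     model = load_sentence_transformer_model("all-MiniLM-L6-v2")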
def calculate_cosine_similarity(generated_text: str, reference_texts: Union[str, List[str]], model_name: str) -> float:
"""
Computes the cosine similarity score(s) between a generated text and reference text(s) using a sentence embedding model.
This function calculates the cosine similarity between the embedding of the generated text and the embedding(s)
of reference text(s). The embeddings are generated using a specified sentence embedding model. The cosine similarity
score is a measure of similarity between two vectors, ranging from -1 (completely different) to 1 (exactly the same).
Parameters:
- generated_text (str): The text generated by the model.
- reference_texts (Union[str, List[str]]): The reference text(s) for comparison. Can be a single string or a list of strings.
    - model_name (str): The name of the sentence embedding model used to generate the text embeddings.
Returns:
- float: The average cosine similarity score between the generated text and the reference text(s). If reference_texts is a single
string, a single score is returned. If reference_texts is a list of strings, the average score across all references is returned.
The score is bounded between 0 (no similarity) and 1 (identical), with negative scores adjusted to 0.
"""
    # Load the sentence embedding model (cached after the first call)
model = load_sentence_transformer_model(model_name)
# Embedding for the generated text
generated_embedding = model.encode([generated_text])[0]
# Handling a single reference text
if isinstance(reference_texts, str):
# Embedding for the single reference text
reference_embedding = model.encode([reference_texts])[0]
# Compute cosine similarity
similarity_score = np.dot(generated_embedding, reference_embedding) / (np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding))
# Ensure non-negative score
return max(similarity_score, 0)
# Handling multiple reference texts
else:
similarity_scores = []
for reference_text in reference_texts:
# Embedding for each reference text
reference_embedding = model.encode([reference_text])[0]
# Compute cosine similarity for each reference
individual_score = np.dot(generated_embedding, reference_embedding) / (np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding))
similarity_scores.append(individual_score)
# Calculate and ensure non-negative average score
return max(np.mean(similarity_scores), 0)
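# Illustrative usage (the model name is an arbitrary example): the result is the
# cosine similarity clipped at 0, averaged over references when a list is given.
#     calculate_cosine_similarity("a red cotton shirt",
#                                 ["a red t-shirt", "a blue shirt"],
#                                 "all-MiniLM-L6-v2")  # -> a score in [0, 1]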
def calculate_true_positive_false_positives_false_negatives(extracted_entities: List[str], ground_truth_entities: List[str]) -> Tuple[int, int, int]:
"""
Calculates true positives, false positives, and false negatives for entity extraction.
This function compares a list of extracted entities against a list of ground truth entities
to determine the count of true positives (correctly extracted entities), false positives
(incorrectly extracted entities), and false negatives (missed entities).
    The comparison is case-insensitive, and leading/trailing spaces in extracted entities are ignored.
Parameters:
- extracted_entities (List[str]): The list of entities extracted by the model.
- ground_truth_entities (List[str]): The list of actual entities (ground truth).
Returns:
- Tuple[int, int, int]: A tuple containing the counts of true positives, false positives, and false negatives.
"""
# Normalize the extracted entities by making them lowercase and stripping leading/trailing spaces
normalized_extracted_entities = [entity.lower().strip() for entity in extracted_entities]
# Normalize the ground truth entities by making them lowercase
normalized_ground_truth_entities = [entity.lower() for entity in ground_truth_entities]
# Calculate true positives by finding the intersection between extracted and ground truth entities
true_positives = len(set(normalized_extracted_entities).intersection(set(normalized_ground_truth_entities)))
# Calculate false positives as extracted entities not in ground truth
false_positives = len(normalized_extracted_entities) - true_positives
# Calculate false negatives as ground truth entities not extracted
false_negatives = len(normalized_ground_truth_entities) - true_positives
return true_positives, false_positives, false_negatives
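# Illustrative usage: matching is case-insensitive, and extracted entities are
# stripped of surrounding whitespace before comparison.
#     calculate_true_positive_false_positives_false_negatives(
#         ["Apple ", "sony"], ["apple", "samsung"])  # -> (1, 1, 1)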
def calculate_f1_score(metrics_list: List[Tuple[int, int, int]]) -> float:
"""
Calculates the F1 score from a list of tuples containing true positives, false positives, and false negatives.
Parameters:
- metrics_list (List[Tuple[int, int, int]]): A list of tuples, where each tuple contains counts of true positives,
false positives, and false negatives in that order for various classifications or entity extractions.
Returns:
- float: The computed F1 score, ranging from 0 to 1.
"""
total_tp, total_fp, total_fn = 0, 0, 0
# Aggregate total true positives, false positives, and false negatives
for tp, fp, fn in metrics_list:
total_tp += tp
total_fp += fp
total_fn += fn
# Calculate precision and recall
precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
# Calculate F1 score, handling the case where precision + recall equals 0
if precision + recall == 0:
return 0
else:
return 2 * precision * recall / (precision + recall)
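# Illustrative usage: counts are pooled across samples (micro-averaging) before
# precision, recall, and F1 are computed.
#     calculate_f1_score([(1, 1, 1), (2, 0, 1)])
#     # totals: tp=3, fp=1, fn=2 -> precision 0.75, recall 0.6 -> F1 = 0.666...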
def calculate_ndcg(predicted_relevance_scores: List[int], true_relevance_weights: List[float]) -> float:
"""
    Calculates the Normalized Discounted Cumulative Gain (NDCG) score for a predicted ranking
    against true relevance weights. The score is normalized against the ideal ranking, and the
    predicted list is trimmed if necessary to match the length of the true relevance weights.
Parameters:
- predicted_relevance_scores (List[int]): Indices of items ranked by the algorithm, expected to be integers starting from 1.
- true_relevance_weights (List[float]): Actual relevance weights for the items, with higher values indicating greater relevance.
Returns:
- float: The NDCG score, normalized against the ideal ranking, ranging from 0 to 1.
"""
# Trim the predicted scores to match the true scores length if necessary
if len(predicted_relevance_scores) > len(true_relevance_weights):
predicted_relevance_scores = predicted_relevance_scores[:len(true_relevance_weights)]
dcg, idcg = 0.0, 0.0
# Calculate DCG for the predicted ranking
for i, score_index in enumerate(predicted_relevance_scores, start=1):
if score_index - 1 < len(true_relevance_weights):
relevance = true_relevance_weights[score_index - 1]
else:
relevance = 0
dcg += (np.power(2, relevance) - 1) / np.log2(i + 1)
# Calculate IDCG using sorted true relevance weights
for i, weight in enumerate(sorted(true_relevance_weights, reverse=True), start=1):
idcg += (np.power(2, weight) - 1) / np.log2(i + 1)
# Avoid division by zero
return 0 if idcg == 0 else dcg / idcg
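# Illustrative usage (hypothetical ranking): the predicted list holds 1-based item
# indices, so ranking items in order of decreasing true relevance scores 1.0.
#     calculate_ndcg([1, 2, 3], [1.0, 0.5, 0.0])  # -> 1.0 (ideal ordering)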
def calculate_bleu_score(generated_text: str, reference_text: str, is_japanese: bool = False) -> float:
"""
Calculates the BLEU score for a generated text compared to a reference truth text. This function supports
both general text and Japanese-specific evaluation by using the sacrebleu library.
Parameters:
- generated_text (str): The generated text to be evaluated.
- reference_text (str): The reference truth text.
- is_japanese (bool, optional): Flag to indicate whether the text is in Japanese, requiring special tokenization.
Returns:
    - float: The BLEU score on a 0 to 1 scale (sacrebleu's percentage score divided by 100).
"""
global sacrebleu
if sacrebleu is None:
sacrebleu = evaluate.load("sacrebleu")
    # Keep only the first line of the generated text, dropping leading/trailing newlines
    generated_text = generated_text.strip("\n").split("\n")[0]
candidate = [generated_text]
reference = [[reference_text]]
# Compute BLEU score with or without Japanese-specific tokenization
bleu_args = {"predictions": candidate, "references": reference, "lowercase": True}
if is_japanese:
bleu_args["tokenize"] = "ja-mecab"
score = sacrebleu.compute(**bleu_args)["score"] / 100
return score
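# Illustrative usage: requires the sacrebleu package (plus Japanese tokenization support
# when is_japanese=True); only the first line of the generation is scored, case-insensitively.
#     calculate_bleu_score("The quick brown fox", "The quick brown fox")  # -> 1.0 (exact match)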