Commit ad34ac85 authored by Dipam Chakraborty

initial commit

__pycache__
scores.json
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

import metrics
from models.user_config import UserModel


def print_sample(i, generation, truth, metric, score):
    print(f"Sample {i}, generation: {generation}")
    print(f"Sample {i}, truth: {truth}")
    if isinstance(score, tuple) and len(score) == 3:
        print(f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}")
    else:
        print(f"Metric ({metric}): {score}")
    print()
def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
    model = UserModel()

    if max_eval_rows < len(data_df):
        # Reset the index after sampling so the positional lookup into `outputs`
        # during evaluation stays aligned with the sampled rows
        data_df_eval = data_df.sample(max_eval_rows).reset_index(drop=True)
    else:
        data_df_eval = data_df

    # Run model
    outputs = []
    task_methods = {
        'multiple-choice': model.task_multichoice,
        'generation': model.task_generation,
        'retrieval': model.task_retrieval,
        'ranking': model.task_ranking,
        'named_entity_recognition': model.task_named_entity_recognition,
    }
    for _, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Processing'):
        task_type = row['task_type']
        if task_type not in task_methods:
            raise NotImplementedError(f"No task method for {task_type=}")

        task_prompt = row['input_field']
        task_fn = task_methods[task_type]
        task_output = task_fn(task_prompt)
        outputs.append(task_output)
    # Evaluate
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    sentence_all_lm = SentenceTransformer('all-MiniLM-L6-v2').to(device)
    sentence_multilingual = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2').to(device)

    eval_methods = {
        'accuracy': metrics.accuracy,
        'hit rate@3': metrics.hit_rate_3,
        'rougel': metrics.rougel,
        'sent-transformer': lambda g, t: metrics.sent_transformer(g, t, sentence_all_lm),
        'multilingual-sent-transformer': lambda g, t: metrics.sent_transformer(g, t, sentence_multilingual),
        'micro f1': metrics.tp_fp_fn,
        'ndcg': metrics.ndcg_eval,
        'bleu': metrics.bleu,
        'jp-bleu': lambda g, t: metrics.bleu(g, t, jp=True),
    }
    per_task_metrics = {}
    for ri, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Evaluating'):
        metric = row['metric']
        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        task_name = row['task_name']
        per_task_metrics.setdefault(task_name, {
            'metric': metric,
            'sample_score': []
        })

        gt = row['output_field']
        model_output = outputs[ri]

        eval_fn = eval_methods[metric]
        metric_score = eval_fn(model_output, gt)
        per_task_metrics[task_name]['sample_score'].append(metric_score)

        if ri % print_interval == 0:
            print_sample(ri, model_output, gt, metric, metric_score)
    # Aggregate scores
    for k in per_task_metrics:
        if per_task_metrics[k]['metric'] != 'micro f1':
            print(k, len(per_task_metrics[k]['sample_score']))
            per_task_metrics[k]['overall_metric'] = np.mean(per_task_metrics[k]['sample_score'])
        else:
            per_task_metrics[k]['overall_metric'] = metrics.compute_f1_score(per_task_metrics[k]['sample_score'])

    overall_metrics = {
        'task_name': [],
        'metric': [],
        'overall_score': []
    }
    for k in per_task_metrics:
        overall_metrics['task_name'].append(k)
        overall_metrics['metric'].append(per_task_metrics[k]['metric'])
        overall_metrics['overall_score'].append(per_task_metrics[k]['overall_metric'])

    track_wise_score = np.mean(overall_metrics['overall_score'])
    overall_metrics['task_name'].append('track_wise')
    overall_metrics['metric'].append('track_wise')
    overall_metrics['overall_score'].append(track_wise_score)

    overall_metrics_df = pd.DataFrame(overall_metrics)
    overall_metrics_df.to_json("scores.json", orient='records', lines=True)
    print(f"Overall score {track_wise_score}")
if __name__ == "__main__":
    DATA_FILENAME = '../data/tracks/track3_rephrase.json'
    data_df = pd.read_json(DATA_FILENAME, lines=True)

    MAX_EVAL_ROWS = 100000
    run_and_evaluate(data_df, MAX_EVAL_ROWS)
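# Illustrative output shape (hypothetical task names and values): scores.json is
# written as JSON Lines, one record per task plus a final track_wise row, e.g.
#   {"task_name":"task1","metric":"accuracy","overall_score":0.62}
#   {"task_name":"track_wise","metric":"track_wise","overall_score":0.62}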

from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
import numpy as np
import evaluate
from typing import List

print("\nsacrebleu loading...")
sacrebleu = evaluate.load('sacrebleu')


def accuracy(prediction: int, truth: int):
    return prediction == truth


def hit_rate_3(retrieved_int: List[int], truth: List[int]):
    # Fraction of ground-truth items that appear among the top-3 retrieved indexes
    hit = len(set(truth).intersection(set(retrieved_int[:3])))
    hit /= len(truth)
    return hit
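# Example (illustrative values): truth = [2, 5], retrieved_int = [5, 1, 2, 7]
# -> top-3 is [5, 1, 2], both ground-truth items are hit, score = 2 / 2 = 1.0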

def rougel(generation: str, truth: str):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(generation, truth)
    return scores['rougeL'].fmeasure

def sent_transformer(generation: str, truth: str, sent_transformer_model):
    # Cosine similarity between generation and truth embeddings, clipped at 0
    generation_embedding = sent_transformer_model.encode([generation])[0]

    if isinstance(truth, str):
        truth_embedding = sent_transformer_model.encode([truth])[0]
        score = (generation_embedding * truth_embedding).sum()
        score /= (np.linalg.norm(generation_embedding, ord=2) * np.linalg.norm(truth_embedding, ord=2))
        return score if score > 0 else 0
    else:
        # Multiple reference strings: average the similarity over all of them, then clip at 0
        scores = []
        for label_item in truth:
            truth_embedding = sent_transformer_model.encode([label_item])[0]
            score_ = (generation_embedding * truth_embedding).sum()
            score_ /= (np.linalg.norm(generation_embedding, ord=2) * np.linalg.norm(truth_embedding, ord=2))
            scores.append(score_)
        mean_score = np.mean(scores)
        return mean_score if mean_score > 0 else 0
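# Example usage (illustrative strings, model name taken from the evaluation script):
#   model = SentenceTransformer('all-MiniLM-L6-v2')
#   sent_transformer("red running shoes", "crimson jogging sneakers", model)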

def tp_fp_fn(entity_list, truth):
    # Per-sample true positive / false positive / false negative counts over
    # case-insensitive, whitespace-stripped entity strings
    answer_lower = []
    for a in entity_list:
        answer_lower.append(a.lower().strip(' '))
    truth_lower = []
    for l in truth:
        truth_lower.append(l.lower())

    true_positive = len(set(answer_lower).intersection(set(truth_lower)))
    false_positive = len(answer_lower) - true_positive
    false_negative = len(truth_lower) - true_positive
    return true_positive, false_positive, false_negative


def compute_f1_score(tp_fp_fn_list):
    # Micro F1: pool the counts across all samples before computing precision and recall
    total_tp = 0
    total_fp = 0
    total_fn = 0
    for tp, fp, fn in tp_fp_fn_list:
        total_tp += tp
        total_fp += fp
        total_fn += fn

    # Guard against empty prediction or label sets to avoid division by zero
    if total_tp + total_fp == 0 or total_tp + total_fn == 0:
        return 0
    precision = total_tp / (total_tp + total_fp)
    recall = total_tp / (total_tp + total_fn)
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)
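# Worked example (illustrative counts): per-sample tuples (2, 1, 0) and (1, 0, 1)
# pool to tp=3, fp=1, fn=1 -> precision = 3/4, recall = 3/4, micro F1 = 0.75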

def ndcg(ranked_list, weight):
    # DCG of the predicted ranking divided by the ideal DCG of the sorted weights
    idcg = 0
    dcg = 0
    for i in range(len(ranked_list)):
        position = i + 1
        if ranked_list[i] - 1 < len(weight):
            relevance = weight[ranked_list[i] - 1]
        else:
            relevance = 0
        dcg += (np.power(2, relevance) - 1) / np.log2(position + 1)

    # Sort a copy so the caller's ground-truth list is not mutated
    ideal_weight = sorted(weight, reverse=True)
    for i in range(len(ideal_weight)):
        position = i + 1
        relevance = ideal_weight[i]
        idcg += (np.power(2, relevance) - 1) / np.log2(position + 1)
    return dcg / idcg


def ndcg_eval(relevance_scores: List[float], truth: List[float]):
    # Truncate the predicted ranking to the number of ground-truth items before scoring
    if len(relevance_scores) > len(truth):
        relevance_scores = relevance_scores[:len(truth)]
    return ndcg(relevance_scores, truth)
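# Worked example (illustrative values): ndcg_eval([2, 1], [1, 0]) puts the only
# relevant item at position 2, so DCG = 1/log2(3) ~= 0.63 while IDCG = 1,
# giving NDCG ~= 0.63; the perfect ranking [1, 2] would score 1.0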

def bleu(generation, truth, jp=False):
    # Score only the first line of the generation against the single reference
    generation = generation.lstrip('\n').rstrip('\n').split('\n')[0]
    candidate = [generation]
    reference = [[truth]]
    if not jp:
        score = sacrebleu.compute(predictions=candidate, references=reference,
                                  lowercase=True)['score'] / 100
    else:
        # Japanese text is tokenized with MeCab
        score = sacrebleu.compute(predictions=candidate, references=reference,
                                  lowercase=True,
                                  tokenize='ja-mecab')['score'] / 100
    return score
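# Example usage (illustrative strings): bleu("the quick brown fox", "the quick brown fox")
# returns 1.0; for Japanese references call bleu(generation, truth, jp=True) so that
# the 'ja-mecab' tokenizer is used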

from typing import List


class DummyModel:
    """
    Note to participants:
        Example class that shows the different functions to be implemented for each type of task.
        Make sure to follow the data types as mentioned in the function definitions.
    """
    def __init__(self):
        """ Initialize your models here """
        pass

    def task_multichoice(self, task_prompt: str) -> int:
        """
        Task method for multiple-choice questions
        Input - Task prompt (includes the choices)
        Output - Single integer index among the ones given in the input
        """
        return 0

    def task_ranking(self, task_prompt: str) -> List[float]:
        """
        Task method for ranking
        Input - Task prompt (includes the items to rank)
        Output - Ordered list of ranks for each item
        """
        return [1, 0, 2, 3]

    def task_generation(self, task_prompt: str) -> str:
        """
        Task method for generation
        Input - Task prompt describing the required generation
        Output - Generated text as per the task prompt
        """
        return "This is a test"

    def task_retrieval(self, task_prompt: str) -> List[int]:
        """
        Task method for retrieval
        Input - Task prompt describing the items to select from (includes indexes of the items)
        Output - Unordered list of selected indexes (must be a Python list even if single item)
        """
        return [0, 1, 2]

    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
        """
        Task method for named entity recognition
        Input - Task prompt describing the named entity recognition task
        Output - Unordered list of one or more entity names (must be a Python list even if single item)
        """
        return ["food", "gpu"]
from models.dummy_model import DummyModel
UserModel = DummyModel
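# To evaluate your own model instead of the dummy, point UserModel at a class that
# implements the same task methods (module and class names below are hypothetical):
#   from models.my_model import MyModel
#   UserModel = MyModel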
torch
# local eval
pandas
sentence-transformers
rouge_score
evaluate
sacrebleu
sacrebleu[ja]