initial commit

64e4f64e · Dipam Chakraborty · 64e4f64e · 64e4f64e · 64e4f64e · 64e4f64e
Commit 64e4f64e authored 1 year ago by Dipam Chakraborty
--- a/.gitignore
+++ b/.gitignore
+__pycache__
\ No newline at end of file
--- a/README.md
+++ b/README.md
--- a/agents/README.md
+++ b/agents/README.md
+# TODO
--- a/agents/dummy_agent.py
+++ b/agents/dummy_agent.py
+from typing import List, Dict
+
+class DummyResponseAgent(object):
+    def __init__(self):
+        """ Load your model(s) here """
+        pass
+
+    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+        """
+        You will be provided with a batch of upto 50 independent conversations
+        Return a string for every conversation  
+        """
+        return ["THIS IS A TEST REPLY" for _ in test_data]
\ No newline at end of file
--- a/agents/user_config.py
+++ b/agents/user_config.py
+from agents.dummy_agent import DummyResponseAgent
+
+UserAgent = DummyResponseAgent  # the agent class that local_evaluation.py imports and instantiates
\ No newline at end of file
--- a/aicrowd.json
+++ b/aicrowd.json
+{
+    "challenge_id": "task-1-commonsense-dialogue-response-generation",
+    "authors": [
+      "aicrowd-bot"
+    ],
+    "description": "(optional) description about your awesome agent"
+}
+  
\ No newline at end of file
--- a/apt.txt
+++ b/apt.txt
+git
\ No newline at end of file
--- a/dummy_data_task1.json
+++ b/dummy_data_task1.json
--- a/local_evaluation.py
+++ b/local_evaluation.py
+from typing import List, Dict
+import json
+import numpy as np
+
+from metrics import word_f1, bleu
+
+from agents.user_config import UserAgent
+
+
+def load_json_data(file_path: str, keys: List[str], modifications: dict = None) -> List[Dict]:
+    with open(file_path, "r") as fp:
+        data = json.load(fp)
+
+    result = []
+    for dialogue in data:
+        updated_dialogue = {}
+        for turn_id, sample in dialogue.items():
+            sample_data = {key: sample[key] for key in keys}
+            if modifications:
+                for key, value in modifications.items():
+                    sample_data[key] = value(sample_data[key])
+            updated_dialogue[turn_id] = sample_data
+        result.append(updated_dialogue)
+    return result
+
+
+def load_data(file_path: str) -> List[Dict]:
+    keys = ["persona A", "persona B", "dialogue", "gold_reference"]
+    modifications = {"dialogue": lambda x: x[:-1]}
+    return load_json_data(file_path, keys, modifications)
+
+
+def get_responses(agent, test_data, BATCH_SIZE):
+    all_responses = [{} for _ in range(len(test_data))]
+    for batch_idx in np.array_split(range(len(test_data)), BATCH_SIZE):
+        for turn_id in range(7):
+            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
+            responses = agent.generate_responses(batch_inputs)
+            for resp in responses:
+                for bi in batch_idx:
+                    all_responses[bi][f"turn_{turn_id}"] = resp
+    return all_responses
+
+def evaluate(responses, test_data):
+    f1_scores = []
+    bleu_scores = []
+    for response, test_data_single in zip(responses, test_data):
+        for turn_id in range(7):
+            f1 = word_f1(response[f"turn_{turn_id}"],
+                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            bleu_score = bleu(response[f"turn_{turn_id}"],
+                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            f1_scores.append(f1)
+            bleu_scores.append(bleu_score)
+    return np.mean(f1_scores), np.mean(bleu_scores)
+
+if __name__ == "__main__":
+    BATCH_SIZE = 2    
+    data_path = 'dummy_data_task1.json'
+    test_data = load_data(data_path)
+    agent = UserAgent()
+    responses = get_responses(agent, test_data, BATCH_SIZE)
+    f1_score, bleu_score = evaluate(responses, test_data)
+
+    print("Word F1 Score:", f1_score)
+    print("Word Bleu Score:", bleu_score)
+
--- a/metrics.py
+++ b/metrics.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from __future__ import annotations
+from collections import Counter
+from nltk.translate import bleu_score as nltkbleu
+
+import re
+
+from typing import (
+    List,
+    Tuple,
+)
+
+re_art = re.compile(r'\b(a|an|the)\b')  # lowercase articles "a", "an", "the" as whole words
+re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']')  # one ASCII punctuation character; ",-." is a range covering ",", "-" and "."
+
+
+def normalize_answer(s):
+    """
+    Lower text and remove punctuation, articles and extra whitespace.
+    """
+    s = s.lower()
+    s = re_punc.sub(' ', s)
+    s = re_art.sub(' ', s)
+    s = ' '.join(s.split())
+    return s
+
+
+def _word_prec_recall_f1_score(pred_items, gold_items) -> Tuple[float, float, float]:
+    """
+     Compute precision, recall and f1 given a set of gold and prediction items.
+     :param pred_items: iterable of predicted values
+     :param gold_items: iterable of gold values
+     :return: tuple (p, r, f1) for precision, recall, f1
+    """
+    common = Counter(gold_items) & Counter(pred_items)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return 0, 0, 0
+    precision = 1.0 * num_same / len(pred_items)
+    recall = 1.0 * num_same / len(gold_items)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return precision, recall, f1
+
+
+def word_f1(pred_label: str, gold_labels: List[str], expose_p_and_r: bool = False) -> float:
+    if pred_label is None or gold_labels is None:
+        return 0
+    g_tokens = normalize_answer(pred_label).split()
+    scores = [
+        _word_prec_recall_f1_score(g_tokens, normalize_answer(a).split())
+        for a in gold_labels
+    ]
+    max_p, max_r, max_f1 = 0, 0, 0
+    for p, r, f1 in scores:
+        max_p, max_r, max_f1 = max(max_p, p), max(max_r, r), max(f1, max_f1)
+    if expose_p_and_r:
+        return max_p, max_r, max_f1
+    else:
+        return max_f1
+
+
+def bleu(guess: str, answers: List[str], k: int = 4) -> float:
+    # cumulative K-gram BLEU score, 4 by default.
+    weights = [1 / k for _ in range(k)]
+    score = nltkbleu.sentence_bleu(
+        [normalize_answer(a).split(" ") for a in answers],
+        normalize_answer(guess).split(" "),
+        smoothing_function=nltkbleu.SmoothingFunction(epsilon=1e-12).method1,
+        weights=weights,
+    )
+    return score
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+numpy
\ No newline at end of file