From 4384d19fc585f7528eceebf0c9e6e5ac5252d8e0 Mon Sep 17 00:00:00 2001
From: Dipam Chakraborty <dipamc77@gmail.com>
Date: Wed, 6 Dec 2023 16:30:34 +0530
Subject: [PATCH] updates for prompt engineering track

---
 agents/bart_agent.py         |   7 ++-
 agents/dummy_agent.py        |  10 +++-
 agents/prompt_agent.py       |  66 ++++++++++++++++++++++
 agents/user_config.py        |   2 +
 aicrowd.json                 |   2 +-
 local_evaluation.py          |   5 +-
 local_evaluation_with_api.py | 104 +++++++++++++++++++++++++++++++++++
 requirements_local_eval.txt  |   4 ++
 8 files changed, 191 insertions(+), 9 deletions(-)
 create mode 100644 agents/prompt_agent.py
 create mode 100644 local_evaluation_with_api.py
 create mode 100644 requirements_local_eval.txt

diff --git a/agents/bart_agent.py b/agents/bart_agent.py
index 6602f50..4711b8d 100644
--- a/agents/bart_agent.py
+++ b/agents/bart_agent.py
@@ -71,7 +71,7 @@ class BARTResponseAgent(object):
 
         return tensor_input_ids, tensor_attention_mask
 
-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
         """
         You will be provided with a batch of upto 50 independent conversations
 
@@ -127,8 +127,9 @@ class BARTResponseAgent(object):
         self.turn_id = self.turn_id % 7 + 1 # Turn id goes from 1 to 7
 
         response = {
-            "use_api": False, # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data], # Ignored if GPU true is set in aicrowd.json
+            "use_api": False, # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data], # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
             "final_responses": final_responses
         }
         return response
diff --git a/agents/dummy_agent.py b/agents/dummy_agent.py
index 5923503..abdd9f2 100644
--- a/agents/dummy_agent.py
+++ b/agents/dummy_agent.py
@@ -5,7 +5,7 @@ class DummyResponseAgent(object):
         """ Load your model(s) here """
         pass
 
-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
         """
         You will be provided with a batch of upto 50 independent conversations
 
@@ -27,6 +27,9 @@ class DummyResponseAgent(object):
         ]
         Model should return 50 responses for Turn 7
 
+        api_responses - A list of strings output by the api call for each previous prompt response,
+                        Will be a list of blank strings on the first call
+
         Note: Turn numbers will NOT be provided as input
 
         Return a dictionary with the following format
@@ -40,8 +43,9 @@ class DummyResponseAgent(object):
         # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")
 
         response = {
-            "use_api": False, # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data], # Ignored if GPU true is set in aicrowd.json
+            "use_api": False, # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data], # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
             "final_responses": ["THIS IS A TEST REPLY" for _ in test_data]
         }
         return response
\ No newline at end of file
diff --git a/agents/prompt_agent.py b/agents/prompt_agent.py
new file mode 100644
index 0000000..dd79368
--- /dev/null
+++ b/agents/prompt_agent.py
@@ -0,0 +1,66 @@
+from typing import List, Dict
+
+class DummyPromptAgent(object):
+    def __init__(self):
+        """ Can initialize any retrieval models etc here """
+        self.api_limit = 2 # Max number of api calls per utterance
+        self.input_token_limit = 10_000 # Max number of input tokens per dialogue (combined token usage of all 7 utterances)
+        self.output_token_limit = 1_000 # Max number of output tokens per dialogue (combined token usage of all 7 utterances)
+
+        self.max_generated_token_per_call = 20 # Can be set by user as needed, can be different for each utterance and dialogue
+        self.api_usage_count = 0
+
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
+        """
+        You will be provided with a batch of up to 50 independent conversations
+
+        Input 1 (test_data)
+        [
+            {"persona A": ..., "persona B": ... "dialogue": ... }, # conversation 1 Turn 1
+            ...
+            {"persona A": ..., "persona B": ... "dialogue": ... }  # conversation 50 Turn 1
+        ]
+
+        Model should return 50 responses for Turn 1
+
+        ...
+        Input 7 (test_data)
+        [
+            {"persona A": ..., "persona B": ... "dialogue": ... }, # conversation 1 Turn 7
+            ...
+            {"persona A": ..., "persona B": ... "dialogue": ... }  # conversation 50 Turn 7
+        ]
+        Model should return 50 responses for Turn 7
+
+        api_responses - A list of strings output by the api call for each previous prompt response,
+                        Will be a list of blank strings on the first call
+
+        Note: Turn numbers will NOT be provided as input
+
+        Return a dictionary with the following format
+
+        "use_api": True/False - Note that this cannot be used when using GPU
+        "prompts": [ <list of the prompts that go as "content" to the api> ] - Note that every api call is independent and we don't use threads
+        "max_generated_tokens": [ <list of ints for the max generation limit on each call> ] - Note that the submission will fail if the total generation limit is exceeded
+        "final_responses": [ <list of strings with the final responses> ] - Only used when use_api is set to False
+
+        """
+        # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")
+
+        if self.api_usage_count < self.api_limit:
+            self.api_usage_count += 1
+            response = {
+                "use_api": True,
+                "prompts": ["You're a helpful assistant, say this is a test" for _ in test_data],
+                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
+                "final_responses": ["not used" for _ in test_data]
+            }
+        else: # After 2 calls of the api, must return the final responses
+            self.api_usage_count = 0
+            response = {
+                "use_api": False,
+                "prompts": ["" for _ in test_data],
+                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
+                "final_responses": api_responses # Can preprocess in between calls if needed.
+            }
+        return response
\ No newline at end of file
diff --git a/agents/user_config.py b/agents/user_config.py
index e86a726..6a00cc6 100644
--- a/agents/user_config.py
+++ b/agents/user_config.py
@@ -1,5 +1,7 @@
 from agents.dummy_agent import DummyResponseAgent
+from agents.prompt_agent import DummyPromptAgent
 from agents.bart_agent import BARTResponseAgent
 
 # UserAgent = DummyResponseAgent
+# UserAgent = DummyPromptAgent
 UserAgent = BARTResponseAgent
\ No newline at end of file
diff --git a/aicrowd.json b/aicrowd.json
index 4aa7f4b..075cbf0 100644
--- a/aicrowd.json
+++ b/aicrowd.json
@@ -3,7 +3,7 @@
     "authors": [
         "aicrowd-bot"
     ],
-    "gpu": true,
+    "gpu": false,
     "description": "(optional) description about your awesome agent"
 }
diff --git a/local_evaluation.py b/local_evaluation.py
index cc26054..e5b8834 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -36,7 +36,8 @@ def get_responses(agent, test_data, BATCH_SIZE):
     for batch_idx in np.array_split(range(len(test_data)), split_size):
         for turn_id in range(7):
             batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
-            responses = agent.generate_responses(batch_inputs)['final_responses']
+            api_responses = ["" for i in batch_idx]
+            responses = agent.generate_responses(batch_inputs, api_responses)['final_responses']
             for bi, resp in zip(batch_idx, responses):
                 all_responses[bi][f"turn_{turn_id}"] = resp
     return all_responses
@@ -52,7 +53,7 @@ def evaluate(responses, test_data):
                               [test_data_single[f"turn_{turn_id}"]['gold_reference']])
             f1_scores.append(f1)
             bleu_scores.append(bleu_score)
-    return np.mean(f1_scores), np.mean(bleu_scores)
+    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
 
 if __name__ == "__main__":
     BATCH_SIZE = 2
diff --git a/local_evaluation_with_api.py b/local_evaluation_with_api.py
new file mode 100644
index 0000000..b1de213
--- /dev/null
+++ b/local_evaluation_with_api.py
@@ -0,0 +1,104 @@
+from typing import List, Dict
+import json
+import numpy as np
+import math
+from openai import OpenAI
+
+from metrics import word_f1, bleu
+
+from agents.user_config import UserAgent
+
+MAX_API_CALL_PER_UTTERANCE = 2
+
+def load_json_data(file_path: str, keys: List[str]) -> List[Dict]:
+    with open(file_path, "r") as fp:
+        data = json.load(fp)
+
+    result = []
+    for dialogue in data:
+        updated_dialogue = {}
+        for turn_id, sample in dialogue.items():
+            if not isinstance(sample, dict):
+                continue
+            sample_data = {key: sample[key] for key in keys}
+            updated_dialogue[turn_id] = sample_data
+        result.append(updated_dialogue)
+    return result
+
+
+def load_data(file_path: str) -> List[Dict]:
+    # NOTE to participants: Gold reference will not be available during actual evaluations
+    keys = ["persona A", "persona B", "dialogue", "gold_reference"]
+    return load_json_data(file_path, keys)
+
+class LLM_API:
+    def __init__(self):
+        # Please set the OPENAI_API_KEY env variable
+        self.client = OpenAI()
+        self.model = "gpt-3.5-turbo-1106"
+
+    def api_call(self, prompt, max_tokens):
+        """ Simple single prompt api call """
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+        )
+        response_text = response.choices[0].message.content
+        input_tokens = response.usage.prompt_tokens
+        output_tokens = response.usage.completion_tokens
+        return response_text, input_tokens, output_tokens
+
+llm_api = LLM_API()
+
+def get_responses(agent, test_data, BATCH_SIZE):
+    all_responses = [{} for _ in range(len(test_data))]
+    split_size = math.ceil(len(test_data) / BATCH_SIZE)
+    for batch_idx in np.array_split(range(len(test_data)), split_size):
+        for turn_id in range(7):
+            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
+            api_responses = ["" for i in batch_idx]
+            for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
+                agent_response = agent.generate_responses(batch_inputs, api_responses)
+                if not agent_response['use_api']:
+                    break
+                api_responses = []
+                for prompt, max_tokens in zip(agent_response['prompts'], agent_response['max_generated_tokens']):
+                    api_resp, _, _ = llm_api.api_call(prompt, max_tokens)
+                    print("Prompt:", prompt, "\nResponse:", api_resp)
+                    api_responses.append(api_resp)
+
+            responses = agent_response['final_responses']
+            for bi, resp in zip(batch_idx, responses):
+                all_responses[bi][f"turn_{turn_id}"] = resp
+    return all_responses
+
+def evaluate(responses, test_data):
+    f1_scores = []
+    bleu_scores = []
+    for response, test_data_single in zip(responses, test_data):
+        for turn_id in range(7):
+            f1 = word_f1(response[f"turn_{turn_id}"],
+                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            bleu_score = bleu(response[f"turn_{turn_id}"],
+                              [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            f1_scores.append(f1)
+            bleu_scores.append(bleu_score)
+    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
+
+if __name__ == "__main__":
+    ####
+    # To use the local evaluation with API, you need to provide your own OPENAI_API_KEY
+    ####
+
+    BATCH_SIZE = 2
+
+    data_path = 'dummy_data_task1.json'
+    test_data = load_data(data_path)
+    agent = UserAgent()
+    responses = get_responses(agent, test_data, BATCH_SIZE)
+    f1_score, bleu_score = evaluate(responses, test_data)
+
+    print("Word F1 Score:", f1_score)
+    print("Word Bleu Score:", bleu_score)
+
diff --git a/requirements_local_eval.txt b/requirements_local_eval.txt
new file mode 100644
index 0000000..a31a39e
--- /dev/null
+++ b/requirements_local_eval.txt
@@ -0,0 +1,4 @@
+numpy
+scikit-learn
+openai==1.3.6
+tiktoken==0.5.2
\ No newline at end of file
--
GitLab
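
For readers of this patch, the round trip between the evaluator and an agent can be sketched as follows. This is a minimal illustration, not part of the patch itself: the one-conversation batch and the fake_api_call stub are made up so the sketch runs without an OpenAI key, while the response-dict keys and the call loop mirror local_evaluation_with_api.py and the agent docstrings above.

from typing import Dict, List

MAX_API_CALL_PER_UTTERANCE = 2  # same cap used in local_evaluation_with_api.py

class EchoPromptAgent:
    """Toy agent following the generate_responses interface added in this patch."""
    def __init__(self):
        self.api_usage_count = 0

    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> Dict:
        if self.api_usage_count < 1:
            # First call: ask the evaluator to run one prompt per conversation through the API.
            self.api_usage_count += 1
            return {
                "use_api": True,
                "prompts": [f"Reply briefly to: {t['dialogue']}" for t in test_data],
                "max_generated_tokens": [20 for _ in test_data],
                "final_responses": ["not used" for _ in test_data],
            }
        # Next call: the API outputs arrive in api_responses; return them as the final answers.
        self.api_usage_count = 0
        return {
            "use_api": False,
            "prompts": ["" for _ in test_data],
            "max_generated_tokens": [0 for _ in test_data],
            "final_responses": api_responses,
        }

def fake_api_call(prompt: str, max_tokens: int) -> str:
    # Hypothetical stand-in for LLM_API.api_call so this sketch needs no API key.
    return f"(model reply, <= {max_tokens} tokens, to: {prompt})"

if __name__ == "__main__":
    batch = [{"persona A": "...", "persona B": "...", "dialogue": ["Hi there!"]}]  # made-up data
    agent = EchoPromptAgent()
    api_responses = ["" for _ in batch]
    for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
        out = agent.generate_responses(batch, api_responses)
        if not out["use_api"]:
            break
        api_responses = [fake_api_call(p, m)
                         for p, m in zip(out["prompts"], out["max_generated_tokens"])]
    print(out["final_responses"])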