From 4384d19fc585f7528eceebf0c9e6e5ac5252d8e0 Mon Sep 17 00:00:00 2001
From: Dipam Chakraborty <dipamc77@gmail.com>
Date: Wed, 6 Dec 2023 16:30:34 +0530
Subject: [PATCH] updates for prompt engineering track

---
 agents/bart_agent.py         |   7 ++-
 agents/dummy_agent.py        |  10 +++-
 agents/prompt_agent.py       |  66 ++++++++++++++++++++++
 agents/user_config.py        |   2 +
 aicrowd.json                 |   2 +-
 local_evaluation.py          |   5 +-
 local_evaluation_with_api.py | 104 +++++++++++++++++++++++++++++++++++
 requirements_local_eval.txt  |   4 ++
 8 files changed, 191 insertions(+), 9 deletions(-)
 create mode 100644 agents/prompt_agent.py
 create mode 100644 local_evaluation_with_api.py
 create mode 100644 requirements_local_eval.txt

diff --git a/agents/bart_agent.py b/agents/bart_agent.py
index 6602f50..4711b8d 100644
--- a/agents/bart_agent.py
+++ b/agents/bart_agent.py
@@ -71,7 +71,7 @@ class BARTResponseAgent(object):
 
         return tensor_input_ids, tensor_attention_mask
 
-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
         """
         You will be provided with a batch of upto 50 independent conversations
 
@@ -127,8 +127,9 @@ class BARTResponseAgent(object):
         self.turn_id = self.turn_id % 7 + 1 # Turn id goes from 1 to 7
 
         response = {
-            "use_api": False, # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data], # Ignored if GPU true is set in aicrowd.json
+            "use_api": False, # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data], # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
             "final_responses": final_responses
         }
         return response
diff --git a/agents/dummy_agent.py b/agents/dummy_agent.py
index 5923503..abdd9f2 100644
--- a/agents/dummy_agent.py
+++ b/agents/dummy_agent.py
@@ -5,7 +5,7 @@ class DummyResponseAgent(object):
         """ Load your model(s) here """
         pass
 
-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
         """
         You will be provided with a batch of upto 50 independent conversations
 
@@ -27,6 +27,9 @@ class DummyResponseAgent(object):
         ]
         Model should return 50 responses for Turn 7
 
+        api_responses - A list of strings output by the api call for each previous prompt response,
+                        Will be a list of blank strings on the first call
+
         Note: Turn numbers will NOT be provided as input
 
         Return a dictionary with the following format
@@ -40,8 +43,9 @@ class DummyResponseAgent(object):
         # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")
 
         response = {
-            "use_api": False, # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data], # Ignored if GPU true is set in aicrowd.json
+            "use_api": False, # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data], # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
             "final_responses": ["THIS IS A TEST REPLY" for _ in test_data]
         }
         return response
\ No newline at end of file
diff --git a/agents/prompt_agent.py b/agents/prompt_agent.py
new file mode 100644
index 0000000..dd79368
--- /dev/null
+++ b/agents/prompt_agent.py
@@ -0,0 +1,66 @@
+from typing import List, Dict
+
+class DummyPromptAgent(object):
+    def __init__(self):
+        """ Can initialize any retrieval models etc here """
+        self.api_limit = 2 # Max number of api calls per utterance
+        self.input_token_limit = 10_000 # Max number of input tokens per dialogue (combined token usage of all 7 utterances)
+        self.output_token_limit = 1_000 # Max number of output tokens per dialogue (combined token usage of all 7 utterances)
+
+        self.max_generated_token_per_call = 20 # Can be set by user as needed, can be different for each utterance and dialogue
+        self.api_usage_count = 0
+
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
+        """
+        You will be provided with a batch of up to 50 independent conversations
+
+        Input 1 (test_data)
+        [
+            {"persona A": ..., "persona B": ... "dialogue": ... }, # conversation 1 Turn 1
+            ...
+            {"persona A": ..., "persona B": ... "dialogue": ... }  # conversation 50 Turn 1
+        ]
+
+        Model should return 50 responses for Turn 1
+
+        ...
+        Input 7 (test_data)
+        [
+            {"persona A": ..., "persona B": ... "dialogue": ... }, # conversation 1 Turn 7
+            ...
+            {"persona A": ..., "persona B": ... "dialogue": ... }  # conversation 50 Turn 7
+        ]
+        Model should return 50 responses for Turn 7
+
+        api_responses - A list of strings output by the api call for each previous prompt response,
+                        Will be a list of blank strings on the first call
+
+        Note: Turn numbers will NOT be provided as input
+
+        Return a dictionary with the following format
+
+        "use_api": True/False - Note that this cannot be used when using GPU
+        "prompts": [ <list of the prompts that go as "content" to the api> ] - Note that every api call is independent and we don't use threads
+        "max_generated_tokens": [ <list of ints for the max generation limit on each call> ] - Note that the submission will fail if the total generation limit is exceeded
+        "final_responses": [ <list of strings with the final responses> ] - Only used when use_api is set to False
+
+        """
+        # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")
+
+        if self.api_usage_count < self.api_limit:
+            self.api_usage_count += 1
+            response = {
+                "use_api": True,
+                "prompts": ["You're a helpful assistant, say this is a test" for _ in test_data],
+                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
+                "final_responses": ["not used" for _ in test_data]
+            }
+        else: # After 2 calls of the api, must return the final responses
+            self.api_usage_count = 0
+            response = {
+                "use_api": False,
+                "prompts": ["" for _ in test_data],
+                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
+                "final_responses": api_responses # Can preprocess in between calls if needed.
+            }
+        return response
\ No newline at end of file
diff --git a/agents/user_config.py b/agents/user_config.py
index e86a726..6a00cc6 100644
--- a/agents/user_config.py
+++ b/agents/user_config.py
@@ -1,5 +1,7 @@
 from agents.dummy_agent import DummyResponseAgent
+from agents.prompt_agent import DummyPromptAgent
 from agents.bart_agent import BARTResponseAgent
 
 # UserAgent = DummyResponseAgent
+# UserAgent = DummyPromptAgent
 UserAgent = BARTResponseAgent
\ No newline at end of file
diff --git a/aicrowd.json b/aicrowd.json
index 4aa7f4b..075cbf0 100644
--- a/aicrowd.json
+++ b/aicrowd.json
@@ -3,7 +3,7 @@
     "authors": [
         "aicrowd-bot"
     ],
-    "gpu": true,
+    "gpu": false,
     "description": "(optional) description about your awesome agent"
 }
diff --git a/local_evaluation.py b/local_evaluation.py
index cc26054..e5b8834 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -36,7 +36,8 @@ def get_responses(agent, test_data, BATCH_SIZE):
     for batch_idx in np.array_split(range(len(test_data)), split_size):
         for turn_id in range(7):
             batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
-            responses = agent.generate_responses(batch_inputs)['final_responses']
+            api_responses = ["" for i in batch_idx]
+            responses = agent.generate_responses(batch_inputs, api_responses)['final_responses']
             for bi, resp in zip(batch_idx, responses):
                 all_responses[bi][f"turn_{turn_id}"] = resp
     return all_responses
@@ -52,7 +53,7 @@ def evaluate(responses, test_data):
                               [test_data_single[f"turn_{turn_id}"]['gold_reference']])
             f1_scores.append(f1)
             bleu_scores.append(bleu_score)
-    return np.mean(f1_scores), np.mean(bleu_scores)
+    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
 
 if __name__ == "__main__":
     BATCH_SIZE = 2
diff --git a/local_evaluation_with_api.py b/local_evaluation_with_api.py
new file mode 100644
index 0000000..b1de213
--- /dev/null
+++ b/local_evaluation_with_api.py
@@ -0,0 +1,104 @@
+from typing import List, Dict
+import json
+import numpy as np
+import math
+from openai import OpenAI
+
+from metrics import word_f1, bleu
+
+from agents.user_config import UserAgent
+
+MAX_API_CALL_PER_UTTERANCE = 2
+
+def load_json_data(file_path: str, keys: List[str]) -> List[Dict]:
+    with open(file_path, "r") as fp:
+        data = json.load(fp)
+
+    result = []
+    for dialogue in data:
+        updated_dialogue = {}
+        for turn_id, sample in dialogue.items():
+            if not isinstance(sample, dict):
+                continue
+            sample_data = {key: sample[key] for key in keys}
+            updated_dialogue[turn_id] = sample_data
+        result.append(updated_dialogue)
+    return result
+
+
+def load_data(file_path: str) -> List[Dict]:
+    # NOTE to participants: Gold reference will not be available during actual evaluations
+    keys = ["persona A", "persona B", "dialogue", "gold_reference"]
+    return load_json_data(file_path, keys)
+
+class LLM_API:
+    def __init__(self):
+        # Please set the OPENAI_API_KEY env variable
+        self.client = OpenAI()
+        self.model = "gpt-3.5-turbo-1106"
+
+    def api_call(self, prompt, max_tokens):
+        """ Simple single prompt api call """
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=max_tokens,
+        )
+        response_text = response.choices[0].message.content
+        input_tokens = response.usage.prompt_tokens
+        output_tokens = response.usage.completion_tokens
+        return response_text, input_tokens, output_tokens
+
+llm_api = LLM_API()
+
+def get_responses(agent, test_data, BATCH_SIZE):
+    all_responses = [{} for _ in range(len(test_data))]
+    split_size = math.ceil(len(test_data) / BATCH_SIZE)
+    for batch_idx in np.array_split(range(len(test_data)), split_size):
+        for turn_id in range(7):
+            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
+            api_responses = ["" for i in batch_idx]
+            for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
+                agent_response = agent.generate_responses(batch_inputs, api_responses)
+                if not agent_response['use_api']:
+                    break
+                api_responses = []
+                for prompt, max_tokens in zip(agent_response['prompts'], agent_response['max_generated_tokens']):
+                    api_resp, _, _ = llm_api.api_call(prompt, max_tokens)
+                    print("Prompt:", prompt, "\nResponse:", api_resp)
+                    api_responses.append(api_resp)
+
+            responses = agent_response['final_responses']
+            for bi, resp in zip(batch_idx, responses):
+                all_responses[bi][f"turn_{turn_id}"] = resp
+    return all_responses
+
+def evaluate(responses, test_data):
+    f1_scores = []
+    bleu_scores = []
+    for response, test_data_single in zip(responses, test_data):
+        for turn_id in range(7):
+            f1 = word_f1(response[f"turn_{turn_id}"],
+                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            bleu_score = bleu(response[f"turn_{turn_id}"],
+                              [test_data_single[f"turn_{turn_id}"]['gold_reference']])
+            f1_scores.append(f1)
+            bleu_scores.append(bleu_score)
+    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
+
+if __name__ == "__main__":
+    ####
+    # To use the local evaluation with API, you need to provide your own OPENAI_API_KEY
+    ####
+
+    BATCH_SIZE = 2
+
+    data_path = 'dummy_data_task1.json'
+    test_data = load_data(data_path)
+    agent = UserAgent()
+    responses = get_responses(agent, test_data, BATCH_SIZE)
+    f1_score, bleu_score = evaluate(responses, test_data)
+
+    print("Word F1 Score:", f1_score)
+    print("Word Bleu Score:", bleu_score)
+
diff --git a/requirements_local_eval.txt b/requirements_local_eval.txt
new file mode 100644
index 0000000..a31a39e
--- /dev/null
+++ b/requirements_local_eval.txt
@@ -0,0 +1,4 @@
+numpy
+scikit-learn
+openai==1.3.6
+tiktoken==0.5.2
\ No newline at end of file
--
GitLab
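
For readers of this patch, the round trip between the evaluator and an agent can be sketched as follows. This is a minimal illustration, not part of the patch itself: the one-conversation batch and the fake_api_call stub are made up so the sketch runs without an OpenAI key, while the response-dict keys and the call loop mirror local_evaluation_with_api.py and the agent docstrings above.

from typing import Dict, List

MAX_API_CALL_PER_UTTERANCE = 2  # same cap used in local_evaluation_with_api.py

class EchoPromptAgent:
    """Toy agent following the generate_responses interface added in this patch."""
    def __init__(self):
        self.api_usage_count = 0

    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> Dict:
        if self.api_usage_count < 1:
            # First call: ask the evaluator to run one prompt per conversation through the API.
            self.api_usage_count += 1
            return {
                "use_api": True,
                "prompts": [f"Reply briefly to: {t['dialogue']}" for t in test_data],
                "max_generated_tokens": [20 for _ in test_data],
                "final_responses": ["not used" for _ in test_data],
            }
        # Next call: the API outputs arrive in api_responses; return them as the final answers.
        self.api_usage_count = 0
        return {
            "use_api": False,
            "prompts": ["" for _ in test_data],
            "max_generated_tokens": [0 for _ in test_data],
            "final_responses": api_responses,
        }

def fake_api_call(prompt: str, max_tokens: int) -> str:
    # Hypothetical stand-in for LLM_API.api_call so this sketch needs no API key.
    return f"(model reply, <= {max_tokens} tokens, to: {prompt})"

if __name__ == "__main__":
    batch = [{"persona A": "...", "persona B": "...", "dialogue": ["Hi there!"]}]  # made-up data
    agent = EchoPromptAgent()
    api_responses = ["" for _ in batch]
    for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
        out = agent.generate_responses(batch, api_responses)
        if not out["use_api"]:
            break
        api_responses = [fake_api_call(p, m)
                         for p, m in zip(out["prompts"], out["max_generated_tokens"])]
    print(out["final_responses"])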