Commit 4384d19f authored by Dipam Chakraborty

updates for prompt engineering track

parent ac181300
@@ -71,7 +71,7 @@ class BARTResponseAgent(object):
        return tensor_input_ids, tensor_attention_mask

-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
        """
        You will be provided with a batch of up to 50 independent conversations
@@ -127,8 +127,9 @@ class BARTResponseAgent(object):
        self.turn_id = self.turn_id % 7 + 1  # Turn id goes from 1 to 7

        response = {
-            "use_api": False,  # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data],  # Ignored if GPU true is set in aicrowd.json
+            "use_api": False,  # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data],  # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
            "final_responses": final_responses
        }
        return response
@@ -5,7 +5,7 @@ class DummyResponseAgent(object):
        """ Load your model(s) here """
        pass

-    def generate_responses(self, test_data: List[Dict]) -> List[str]:
+    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
        """
        You will be provided with a batch of up to 50 independent conversations
@@ -27,6 +27,9 @@ class DummyResponseAgent(object):
            ]
        Model should return 50 responses for Turn 7

+        api_responses - A list of strings output by the API call for each previous prompt response;
+                        will be a list of blank strings on the first call
+
        Note: Turn numbers will NOT be provided as input

        Return a dictionary with the following format
@@ -40,8 +43,9 @@ class DummyResponseAgent(object):
        # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")

        response = {
-            "use_api": False,  # Ignored if GPU true is set in aicrowd.json
-            "prompts": ["" for _ in test_data],  # Ignored if GPU true is set in aicrowd.json
+            "use_api": False,  # Cannot use API if GPU true is set in aicrowd.json
+            "prompts": ["" for _ in test_data],  # Cannot use API if GPU true is set in aicrowd.json
+            "max_generated_tokens": [0 for _ in test_data],
            "final_responses": ["THIS IS A TEST REPLY" for _ in test_data]
        }
        return response
\ No newline at end of file
from typing import List, Dict


class DummyPromptAgent(object):
    def __init__(self):
        """ Can initialize any retrieval models etc. here """
        self.api_limit = 2  # Max number of API calls per utterance
        self.input_token_limit = 10_000  # Max number of input tokens per dialogue (combined token usage of all 7 utterances)
        self.output_token_limit = 1_000  # Max number of output tokens per dialogue (combined token usage of all 7 utterances)
        self.max_generated_token_per_call = 20  # Can be set by the user as needed; can differ per utterance and dialogue
        self.api_usage_count = 0
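        # With 7 utterances per dialogue and api_limit = 2, an agent can make at most
        # 7 * 2 = 14 API calls per dialogue; the 10_000 input-token and 1_000
        # output-token budgets are shared across all of those calls.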

    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> List[str]:
        """
        You will be provided with a batch of up to 50 independent conversations

        Input 1 (test_data)
            [
                {"persona A": ..., "persona B": ..., "dialogue": ...},  # conversation 1 Turn 1
                ...
                {"persona A": ..., "persona B": ..., "dialogue": ...}   # conversation 50 Turn 1
            ]
        Model should return 50 responses for Turn 1

        ...

        Input 7 (test_data)
            [
                {"persona A": ..., "persona B": ..., "dialogue": ...},  # conversation 1 Turn 7
                ...
                {"persona A": ..., "persona B": ..., "dialogue": ...}   # conversation 50 Turn 7
            ]
        Model should return 50 responses for Turn 7

        api_responses - A list of strings output by the API call for each previous prompt response;
                        will be a list of blank strings on the first call

        Note: Turn numbers will NOT be provided as input

        Return a dictionary with the following format
            "use_api": True/False - Note that the API cannot be used when a GPU is requested
            "prompts": [<list of the prompts that go as "content" to the API>] - Note that every API call is independent and we don't use threads
            "max_generated_tokens": [<list of ints for the max generation limit on each call>] - Note that the submission will fail if the total generation limit is exceeded
            "final_responses": [<list of strings with the final responses>] - Only used when use_api is set to False
        """
        # print(f"{len(test_data)=}, {test_data[0].keys()=}, {len(test_data[-1]['dialogue'])}")

        if self.api_usage_count < self.api_limit:
            self.api_usage_count += 1
            response = {
                "use_api": True,
                "prompts": ["You're a helpful assistant, say this is a test" for _ in test_data],
                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
                "final_responses": ["not used" for _ in test_data]
            }
        else:  # After 2 API calls, the agent must return the final responses
            self.api_usage_count = 0
            response = {
                "use_api": False,
                "prompts": ["" for _ in test_data],
                "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
                "final_responses": api_responses  # Can be post-processed between calls if needed
            }
        return response
\ No newline at end of file
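For the prompt engineering track, a participant agent would build real prompts from the conversation context rather than the fixed test string above. The sketch below is not part of this commit; the class name and prompt wording are illustrative, it relies only on the documented keys ("persona A", "persona B", "dialogue"), and it assumes nothing about the inner structure of "dialogue" beyond it being JSON-serialisable. It uses a single API round per utterance and then returns the API outputs as the final responses.

from typing import Dict, List
import json


class SimplePromptAgent(object):
    """Illustrative sketch only: one prompt per conversation, API outputs become final responses."""

    def __init__(self):
        self.max_generated_token_per_call = 20  # Same illustrative per-call cap as DummyPromptAgent

    def generate_responses(self, test_data: List[Dict], api_responses: List[str]) -> Dict:
        if api_responses and api_responses[0]:
            # Second call for this utterance: the previous API outputs become the final responses.
            return {
                "use_api": False,
                "prompts": ["" for _ in test_data],
                "max_generated_tokens": [0 for _ in test_data],
                "final_responses": [resp.strip() for resp in api_responses],
            }

        prompts = []
        for sample in test_data:
            # Only the documented keys are used; the dialogue is serialised verbatim.
            prompts.append(
                "Persona A: " + json.dumps(sample["persona A"]) + "\n"
                + "Persona B: " + json.dumps(sample["persona B"]) + "\n"
                + "Dialogue so far: " + json.dumps(sample["dialogue"]) + "\n"
                + "Reply with the next utterance only."
            )
        return {
            "use_api": True,
            "prompts": prompts,
            "max_generated_tokens": [self.max_generated_token_per_call for _ in test_data],
            "final_responses": ["not used" for _ in test_data],
        }

A real entry would additionally have to keep its combined prompt lengths within the input- and output-token budgets noted in DummyPromptAgent.__init__.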
from agents.dummy_agent import DummyResponseAgent
from agents.prompt_agent import DummyPromptAgent
from agents.bart_agent import BARTResponseAgent
# UserAgent = DummyResponseAgent
# UserAgent = DummyPromptAgent
UserAgent = BARTResponseAgent
\ No newline at end of file
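To exercise the prompt/API route in local evaluation instead of the GPU-based BART agent, the selection in agents/user_config.py would presumably be switched, e.g.:

# UserAgent = BARTResponseAgent
UserAgent = DummyPromptAgent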
@@ -3,7 +3,7 @@
    "authors": [
        "aicrowd-bot"
    ],
-    "gpu": true,
+    "gpu": false,
    "description": "(optional) description about your awesome agent"
}
@@ -36,7 +36,8 @@ def get_responses(agent, test_data, BATCH_SIZE):
    for batch_idx in np.array_split(range(len(test_data)), split_size):
        for turn_id in range(7):
            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
-            responses = agent.generate_responses(batch_inputs)['final_responses']
+            api_responses = ["" for i in batch_idx]
+            responses = agent.generate_responses(batch_inputs, api_responses)['final_responses']
            for bi, resp in zip(batch_idx, responses):
                all_responses[bi][f"turn_{turn_id}"] = resp
    return all_responses
@@ -52,7 +53,7 @@ def evaluate(responses, test_data):
                              [test_data_single[f"turn_{turn_id}"]['gold_reference']])
            f1_scores.append(f1)
            bleu_scores.append(bleu_score)
-    return np.mean(f1_scores), np.mean(bleu_scores)
+    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
if __name__ == "__main__":
    BATCH_SIZE = 2
from typing import List, Dict
import json
import numpy as np
import math
from openai import OpenAI
from metrics import word_f1, bleu
from agents.user_config import UserAgent
MAX_API_CALL_PER_UTTERANCE = 2
def load_json_data(file_path: str, keys: List[str]) -> List[Dict]:
    with open(file_path, "r") as fp:
        data = json.load(fp)

    result = []
    for dialogue in data:
        updated_dialogue = {}
        for turn_id, sample in dialogue.items():
            if not isinstance(sample, dict):
                continue
            sample_data = {key: sample[key] for key in keys}
            updated_dialogue[turn_id] = sample_data
        result.append(updated_dialogue)
    return result
def load_data(file_path: str) -> List[Dict]:
    # NOTE to participants: Gold references will not be available during the actual evaluations
    keys = ["persona A", "persona B", "dialogue", "gold_reference"]
    return load_json_data(file_path, keys)
class LLM_API:
    def __init__(self):
        # Please set the OPENAI_API_KEY env variable
        self.client = OpenAI()
        self.model = "gpt-3.5-turbo-1106"

    def api_call(self, prompt, max_tokens):
        """ Simple single-prompt API call """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
        response_text = response.choices[0].message.content
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        return response_text, input_tokens, output_tokens
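
# Example usage (requires OPENAI_API_KEY to be set and network access):
#   text, n_in, n_out = LLM_API().api_call("Say this is a test", max_tokens=10)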
llm_api = LLM_API()
def get_responses(agent, test_data, BATCH_SIZE):
    all_responses = [{} for _ in range(len(test_data))]
    split_size = math.ceil(len(test_data) / BATCH_SIZE)

    for batch_idx in np.array_split(range(len(test_data)), split_size):
        for turn_id in range(7):
            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
            api_responses = ["" for i in batch_idx]
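            # The agent may request up to MAX_API_CALL_PER_UTTERANCE rounds of API calls
            # per utterance; the extra iteration (+1) gives it a chance to return
            # final_responses after seeing the outputs of its last round of prompts.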
            for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
                agent_response = agent.generate_responses(batch_inputs, api_responses)
                if not agent_response['use_api']:
                    break
                api_responses = []
                for prompt, max_tokens in zip(agent_response['prompts'], agent_response['max_generated_tokens']):
                    api_resp, _, _ = llm_api.api_call(prompt, max_tokens)
                    print("Prompt:", prompt, "\nResponse:", api_resp)
                    api_responses.append(api_resp)

            responses = agent_response['final_responses']
            for bi, resp in zip(batch_idx, responses):
                all_responses[bi][f"turn_{turn_id}"] = resp
    return all_responses
def evaluate(responses, test_data):
    f1_scores = []
    bleu_scores = []
    for response, test_data_single in zip(responses, test_data):
        for turn_id in range(7):
            f1 = word_f1(response[f"turn_{turn_id}"],
                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
            bleu_score = bleu(response[f"turn_{turn_id}"],
                              [test_data_single[f"turn_{turn_id}"]['gold_reference']])
            f1_scores.append(f1)
            bleu_scores.append(bleu_score)
    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100
if __name__ == "__main__":
    ####
    # To use the local evaluation with the API, you need to provide your own OPENAI_API_KEY
    ####
    BATCH_SIZE = 2
    data_path = 'dummy_data_task1.json'
    test_data = load_data(data_path)
    agent = UserAgent()

    responses = get_responses(agent, test_data, BATCH_SIZE)
    f1_score, bleu_score = evaluate(responses, test_data)

    print("Word F1 Score:", f1_score)
    print("Word BLEU Score:", bleu_score)
numpy
scikit-learn
openai==1.3.6
tiktoken==0.5.2
\ No newline at end of file