from typing import List, Dict
import json
import numpy as np
import math
from openai import OpenAI
from metrics import word_f1, bleu
from agents.user_config import UserAgent
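
# Upper bound on the number of LLM API call rounds an agent may request for a single utterance.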
MAX_API_CALL_PER_UTTERANCE = 2
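

# The input JSON is a list of dialogues; each dialogue maps turn ids ("turn_0" ... "turn_6") to a
# dict from which the requested keys (personas, dialogue history and, locally, the gold reference)
# are extracted.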
def load_json_data(file_path: str, keys: List[str]) -> List[Dict]:
    with open(file_path, "r") as fp:
        data = json.load(fp)

    result = []
    for dialogue in data:
        updated_dialogue = {}
        for turn_id, sample in dialogue.items():
            if not isinstance(sample, dict):
                continue
            sample_data = {key: sample[key] for key in keys}
            updated_dialogue[turn_id] = sample_data
        result.append(updated_dialogue)
    return result


def load_data(file_path: str) -> List[Dict]:
    # NOTE to participants: the gold reference will not be available during the actual evaluations
    keys = ["persona A", "persona B", "dialogue", "gold_reference"]
    return load_json_data(file_path, keys)
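

# Thin wrapper around the OpenAI chat completions endpoint; returns the generated text together
# with the prompt/completion token counts reported by the API.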
class LLM_API:
    def __init__(self):
        # Please set the OPENAI_API_KEY env variable
        self.client = OpenAI()
        self.model = "gpt-3.5-turbo-1106"

    def api_call(self, prompt, max_tokens):
        """Simple single-prompt API call."""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
        response_text = response.choices[0].message.content
        input_tokens = response.usage.prompt_tokens
        output_tokens = response.usage.completion_tokens
        return response_text, input_tokens, output_tokens


llm_api = LLM_API()
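

# Interaction protocol implemented by get_responses below:
#   * agent.generate_responses(batch_inputs, api_responses) must return a dict with the keys
#     'use_api' (bool), 'prompts' (List[str]), 'max_generated_tokens' (List[int]) and
#     'final_responses' (List[str]).
#   * While 'use_api' is True, every prompt is sent to the LLM API and the collected responses
#     are passed back to the agent on the next iteration.
#   * The loop runs MAX_API_CALL_PER_UTTERANCE + 1 times, so the agent can still consume the
#     results of its final API round before its 'final_responses' are recorded.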
def get_responses(agent, test_data, BATCH_SIZE):
    all_responses = [{} for _ in range(len(test_data))]
    split_size = math.ceil(len(test_data) / BATCH_SIZE)
    for batch_idx in np.array_split(range(len(test_data)), split_size):
        for turn_id in range(7):
            batch_inputs = [test_data[i][f"turn_{turn_id}"] for i in batch_idx]
            api_responses = ["" for _ in batch_idx]
            for _ in range(MAX_API_CALL_PER_UTTERANCE + 1):
                agent_response = agent.generate_responses(batch_inputs, api_responses)
                if not agent_response['use_api']:
                    break
                api_responses = []
                for prompt, max_tokens in zip(agent_response['prompts'], agent_response['max_generated_tokens']):
                    api_resp, _, _ = llm_api.api_call(prompt, max_tokens)
                    print("Prompt:", prompt, "\nResponse:", api_resp)
                    api_responses.append(api_resp)

            responses = agent_response['final_responses']
            for bi, resp in zip(batch_idx, responses):
                all_responses[bi][f"turn_{turn_id}"] = resp

    return all_responses
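

# Word-level F1 and BLEU are computed per turn against the gold reference, averaged over all
# turns of all dialogues, and scaled to the 0-100 range.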
def evaluate(responses, test_data):
    f1_scores = []
    bleu_scores = []
    for response, test_data_single in zip(responses, test_data):
        for turn_id in range(7):
            f1 = word_f1(response[f"turn_{turn_id}"],
                         [test_data_single[f"turn_{turn_id}"]['gold_reference']])
            bleu_score = bleu(response[f"turn_{turn_id}"],
                              [test_data_single[f"turn_{turn_id}"]['gold_reference']])
            f1_scores.append(f1)
            bleu_scores.append(bleu_score)

    return np.mean(f1_scores) * 100, np.mean(bleu_scores) * 100


if __name__ == "__main__":
    ####
    # To use the local evaluation with API, you need to provide your own OPENAI_API_KEY
    ####

    BATCH_SIZE = 2

    data_path = 'dummy_data_task1.json'
    test_data = load_data(data_path)
    agent = UserAgent()
    responses = get_responses(agent, test_data, BATCH_SIZE)
    f1_score, bleu_score = evaluate(responses, test_data)
    print("Word F1 Score:", f1_score)
    print("BLEU Score:", bleu_score)