Newer
Older
from typing import List, Union, Any, Dict, List
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
VLLM_TENSOR_PARALLEL_SIZE = 4 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
VLLM_GPU_MEMORY_UTILIZATION = 0.96 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
AICROWD_SUBMISSION_BATCH_SIZE = VLLM_TENSOR_PARALLEL_SIZE*4 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
A dummy model implementation for ShopBench, illustrating how to handle both
multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
This model uses a consistent random seed for reproducible results.
"""Initializes the model and sets the random seed for consistency."""
def initialize_models(self):
# Initialize Meta Llama 3 - 8B Instruct Model
if not os.path.exists(self.model_name):
raise Exception(
f"""
The evaluators expect the model weights to be checked into the repository,
but we could not find the model weights at {self.model_name}
Please follow the instructions in the docs below to download and check in the model weights.
https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
"""
)
# initialize the model with vllm
self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
self.llm = vllm.LLM(
self.model_name,
tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
trust_remote_code=True,
dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
enforce_eager=True,
)
self.tokenizer = self.llm.get_tokenizer()
self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
self.faiss_retrieve_topk = 7
self.faiss_score_filter = 0.885
self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
def get_detailed_instruct(self, task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery: {query}'
def load_rag_module(self, faiss_index_path: str):
# rag_module : embedding + faiss index + reranker
self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')
# self.reranker = FlagReranker('./models/bge-reranker-v2-m3', use_fp16=True, device='cuda:1')
# few shot preprocess
self.few_shot_example_text = []
with open('./models/large_sample_example_0626.jsonl','r',encoding='utf8') as f:
for i in f.readlines():
passage = ''
t_data = json.loads(i.strip())
if "input" in t_data:
passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str( t_data['output']) + '\n'
else:
passage = t_data['instruction'] + str(t_data['output']) + '\n'
passage = passage.replace('\\n','\n')
self.few_shot_example_text.append(passage)
if os.path.exists(faiss_index_path):
self.index = faiss.read_index(faiss_index_path)
else:
self.index = self.train_save_faiss_index(faiss_index_path)
self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]
def train_save_faiss_index(self,
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
dim: int = 1024,
nlist: int = 1024,
index_nprobe: int = 3):
# preprocess train retrieve index and save trained index
# dim : Embedding dimension for intfloat/multilingual-e5-large
# nlist : Number of cluster centroids
fewshot_embeddings = []
quantizer = faiss.IndexFlatIP(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
index.nprobe = index_nprobe
fewshot_embeddings = self.embed_model.encode(self.few_shot_example_text, batch_size=256+128, show_progress_bar=True)
print(f'process few shot example embedding done! {len(self.few_shot_example_text)}')
index.train(fewshot_embeddings.astype(np.float32))
index.add(fewshot_embeddings.astype(np.float32))
faiss.write_index(index, index_save_path)
del fewshot_embeddings
return index
def get_batch_size(self) -> int:
"""
Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
Returns:
int: The batch size, an integer between 1 and 16. This value indicates how many
queries should be processed together in a single batch. It can be dynamic
across different batch_predict calls, or stay a static value.
"""
self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
return self.batch_size
def batch_predict(self, batch: Dict[str, Any], is_multiple_choice:bool) -> List[str]:
Generates a batch of prediction based on associated prompts and task_type
For multiple choice tasks, it randomly selects a choice.
For other tasks, it returns a list of integers as a string,
representing the model's prediction in a format compatible with task-specific parsers.
Parameters:
- batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
- prompt (List[str]): a list of input prompts for the model.
- is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
str: A list of predictions for each of the prompts received in the batch.
Each prediction is
a string representing a single integer[0, 3] for multiple choice tasks,
or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
or a string representing the (unconstrained) generated response for the generation tasks
Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
prompts = batch["prompt"]
# format prompts using the chat template
formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)
# set max new tokens to be generated
if is_multiple_choice:
max_new_tokens = 1 # For MCQ tasks, we only need to generate 1 token
# Generate responses via vllm
responses = self.llm.generate(
formatted_prompts,
vllm.SamplingParams(
n=1, # Number of output sequences to return for each prompt.
# top_p=0.9, # Float that controls the cumulative probability of the top tokens to consider.
# top_k=1,
temperature=0, # randomness of the sampling
seed=AICROWD_RUN_SEED, # Seed for reprodicibility
skip_special_tokens=True, # Whether to skip special tokens in the output.
max_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence.
# stop_token_ids=self.terminators # llama 3 stop token
),
use_tqdm = False
)
# debug logging
print("raw batch generation:", [response.outputs[0].text for response in responses])
# Aggregate answers into List[str]
batch_response = []
for response in responses:
batch_response = [k[0] for k in batch_response]
print("formmated generation: MCQ: ", batch_response)
else:
print("formmated generation:", batch_response)
# # 0625 tmp for ranking task
# if '[' in batch_response[0][0] and ']' in batch_response[0][-1]:
# batch_response = [json.loads(t) for t in batch_response]
# batch_response = (np.argsort(batch_response, axis=1)[:,::-1]+1).tolist()
# batch_response = [str(k) for k in batch_response]
return batch_response
def format_prommpts(self, prompts:List[str], is_multiple_choice:bool):
"""
Formats prompts using the chat_template of the model.
Parameters:
- queries (list of str): A list of queries to be formatted into prompts.
"""
# 1. faiss index retrieve topK few shot example
# 2. rerank few shot example
# 3. select topK few shot example as prompt
# 4. [
# {"role":"system","content":self.system_prompt},
# {"role":"user","content": query + few shot exmaple}
# {"role":"assistant","content": model generate ... ...}
# ]
# faiss vector Retrieve smiliar few shot example
query_embed_batch = self.embed_model.encode([self.get_detailed_instruct(self.retrieve_task_description, query_text) for query_text in prompts])
# scores_indices = [self.index.search(np.array([query_embed]).astype(np.float32), self.faiss_retrieve_topk) for query_embed in query_embed_batch]
scores, indices = self.index.search(np.array(query_embed_batch).astype(np.float32), self.faiss_retrieve_topk)
# print("retrieve total time: {:.2f} s".format(time.time() - start_time))
for prompt_idx, prompt in enumerate(prompts):
for score, retrieved_idx in zip(scores[prompt_idx],indices[prompt_idx]):
if score>=self.faiss_score_filter and len(self.metadata[retrieved_idx]["fewshot_examaple"])<=5000 and "商品仕様】◉サイズ:46cm×27cm×15cm◉重さ:710g◉メイン素材:水、汚れに強い高品質ポリエステルキャンバス、インナー素材:ナイロン◉ブランド:honey&blue◉付属品:ベビーカー吊り下げ用フック 【たっぷりのメイン収納】大きく開く開口部はダ" not in self.metadata[retrieved_idx]["fewshot_examaple"]:
fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
few_shot_exmaple.append(fewshot_examaple)
few_shot_exmaple = few_shot_exmaple[:4] if is_multiple_choice else few_shot_exmaple
prompt_example = '## Here are some similar questions and answers you can refer to:\n'
prompt_example += i+'\n'
prompt_example += '## Now answer the Question:' + prompt
else:
prompt_example = '## Now answer the Question:' + prompt
messages = [
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt_example}
]
chat_prompt = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=False,
# return_tensors="pt"
)
# if "llama" in self.model_name.lower():
# chat_prompt = chat_prompt[len(self.tokenizer.bos_token):] # vllm tokenize will also add bos token
# print(chat_prompt)
## debug logging
print("batch formatted prompt:", formatted_prompts)
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
return formatted_prompts
class aya23_fewshot_VLLM(llama3_8b_FewShot_vllm):
"""
A dummy model implementation for ShopBench, illustrating how to handle both
multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
This model uses a consistent random seed for reproducible results.
"""
def __init__(self):
"""Initializes the model and sets the random seed for consistency."""
random.seed(AICROWD_RUN_SEED)
self.initialize_models()
def initialize_models(self):
# Initialize Meta Llama 3 - 8B Instruct Model
self.model_name = "./models/aya23-8b"
if not os.path.exists(self.model_name):
raise Exception(
f"""
The evaluators expect the model weights to be checked into the repository,
but we could not find the model weights at {self.model_name}
Please follow the instructions in the docs below to download and check in the model weights.
https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
"""
)
# initialize the model with vllm
self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
self.model = AutoModelForCausalLM.from_pretrained("/home/jnu/gxw/amazon-kdd-cup-2024-starter-kit/models/aya23-8b", torch_dtype="auto", device_map="auto")
self.tokenizer = AutoTokenizer.from_pretrained("/home/jnu/gxw/amazon-kdd-cup-2024-starter-kit/models/aya23-8b")
self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
self.faiss_retrieve_topk = 8
self.faiss_score_filter = 0.85
self.bge_rerank_topk = 6
self.bge_score_filter = 0.6
self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
@torch.no_grad()
def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
prompts = batch["prompt"]
# format prompts using the chat template
formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)
# set max new tokens to be generated
max_new_tokens = 140
if is_multiple_choice:
max_new_tokens = 1
input_batch = [self.tokenizer.encode(i,return_tensors="pt").to(1) for i in formatted_prompts]
# Generate responses via vllm
gen_tokens = [self.model.generate(
i,
do_sample=False,
max_new_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence.
) for i in input_batch]
gen_tokens = [i[0][len(j[0]):] for i,j in zip(gen_tokens, input_batch)]
gen_text = [self.tokenizer.decode(i,skip_special_tokens=True) for i in gen_tokens]
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
print(gen_text)
del input_batch, gen_tokens
torch.cuda.empty_cache()
return gen_text
def format_prommpts(self, prompts:List[str], is_multiple_choice:bool):
"""
Formats prompts using the chat_template of the model.
Parameters:
- queries (list of str): A list of queries to be formatted into prompts.
"""
# 1. faiss index retrieve topK few shot example
# 2. rerank few shot example
# 3. select topK few shot example as prompt
# 4. [
# {"role":"system","content":self.system_prompt},
# {"role":"user","content": query + few shot exmaple}
# {"role":"assistant","content": model generate ... ...}
# ]
# faiss vector Retrieve smiliar few shot example
formatted_prompts = []
for prompt in prompts:
query_text = ' ' + prompt
query_embed = self.embed_model.encode([self.get_detailed_instruct(self.retrieve_task_description, query_text)])[0]
scores, indices = self.index.search(np.array([query_embed]).astype(np.float32), self.faiss_retrieve_topk)
# process results
few_shot_exmaple = []
for score, retrieved_idx in zip(scores[0], indices[0]):
if score>=self.faiss_score_filter and len(self.metadata[retrieved_idx]["fewshot_examaple"])<=6000:
fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
few_shot_exmaple.append(fewshot_examaple)
reranked_exmaple_prompt = few_shot_exmaple if is_multiple_choice else few_shot_exmaple[:4]
# rerank
# if len(exmaple_prompt)>0:
# print("before reranke:")
# print(exmaple_prompt[:4])
# # rerank the result
# retrank_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that same (type of task ,languages involved) to the query."
# rerank_scores = self.reranker.compute_score(
# [[query_text,retrieved_fse] for retrieved_fse in exmaple_prompt],
# normalize=True,
# batch_size=32
# )
# reranked_exmaple_prompt = [
# exmaple_prompt[bge_rerank_topK_idx]
# for bge_rerank_topK_idx in np.argsort(rerank_scores)[-bge_rerank_topk:]
# if rerank_scores[bge_rerank_topK_idx]>=bge_score_filter
# ]
# reranked_exmaple_prompt = [t_prompt for t_prompt in reranked_exmaple_prompt if len(t_prompt)<=6000]
# print("reranked:")
# else:
# reranked_exmaple_prompt = []
if len(reranked_exmaple_prompt) > 0:
prompt_example = '## Here are some similar questions and answers you can refer to:\n'
for i in reranked_exmaple_prompt:
prompt_example += i+'\n'
prompt_example += '## Now answer the Question:' + prompt
else:
prompt_example = '## Now answer the Question:' + prompt
messages = [
{"role": "user", "content": prompt_example}
]
chat_prompt = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=False,
# return_tensors="pt"
)
formatted_prompts.append(chat_prompt)
return formatted_prompts
class DummyModel(ShopBenchBaseModel):
"""
A dummy model implementation for ShopBench, illustrating how to handle both
multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
This model uses a consistent random seed for reproducible results.
"""
def __init__(self):
"""Initializes the model and sets the random seed for consistency."""
random.seed(AICROWD_RUN_SEED)
def get_batch_size(self) -> int:
"""
Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
Returns:
int: The batch size, an integer between 1 and 16. This value indicates how many
queries should be processed together in a single batch. It can be dynamic
across different batch_predict calls, or stay a static value.
"""
self.batch_size = 4
return self.batch_size
def batch_predict(self, batch: Dict[str, Any], is_multiple_choice:bool) -> List[str]:
"""
Generates a batch of prediction based on associated prompts and task_type
For multiple choice tasks, it randomly selects a choice.
For other tasks, it returns a list of integers as a string,
representing the model's prediction in a format compatible with task-specific parsers.
Parameters:
- batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
- prompt (List[str]): a list of input prompts for the model.
- is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
Returns:
str: A list of predictions for each of the prompts received in the batch.
Each prediction is
a string representing a single integer[0, 3] for multiple choice tasks,
or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
or a string representing the (unconstrained) generated response for the generation tasks
Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
"""
prompts = batch["prompt"]
possible_responses = [1, 2, 3, 4]
batch_response = []
for prompt in prompts:
if is_multiple_choice:
# Randomly select one of the possible responses for multiple choice tasks
batch_response.append(str(random.choice(possible_responses)))
else:
# For other tasks, shuffle the possible responses and return as a string
random.shuffle(possible_responses)
batch_response.append(str(possible_responses))
# Note: As this is dummy model, we are returning random responses for non-multiple choice tasks.
# For generation tasks, this should ideally return an unconstrained string.
return batch_response
# class DummyModel(ShopBenchBaseModel):
# """
# A dummy model implementation for ShopBench, illustrating how to handle both
# multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
# This model uses a consistent random seed for reproducible results.
# """
# def __init__(self):
# """Initializes the model and sets the random seed for consistency."""
# random.seed(AICROWD_RUN_SEED)
# def predict(self, prompt: str, is_multiple_choice: bool) -> str:
# """
# Generates a prediction based on the input prompt and task type.
# For multiple choice tasks, it randomly selects a choice.
# For other tasks, it returns a list of integers as a string,
# representing the model's prediction in a format compatible with task-specific parsers.
# Args:
# prompt (str): The input prompt for the model.
# is_multiple_choice (bool): Indicates whether the task is a multiple choice question.
# Returns:
# str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
# or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
# or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
# or a string representing the (unconstrained) generated response for the generation tasks
# Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
# """
# possible_responses = [1, 2, 3, 4]
# if is_multiple_choice:
# # Randomly select one of the possible responses for multiple choice tasks
# return str(random.choice(possible_responses))
# else:
# # For other tasks, shuffle the possible responses and return as a string
# random.shuffle(possible_responses)
# return str(possible_responses)
# # Note: As this is dummy model, we are returning random responses for non-multiple choice tasks.
# # For generation tasks, this should ideally return an unconstrained string.
class llama3_8b_FewShot(ShopBenchBaseModel):
def __init__(self):
random.seed(AICROWD_RUN_SEED)
model_path = './models/Meta-Llama-3-8B-Instruct'
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', trust_remote_code=True)
self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n"
self.terminators = [
self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
# self.tokenizer.convert_tokens_to_ids("\\n"),
]
def get_detailed_instruct(self, task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery: {query}'
def load_rag_module(self, faiss_index_path:str):
# rag_module : embedding + faiss index + reranker
self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')
# self.reranker = FlagReranker('./models/bge-reranker-v2-m3', use_fp16=True, device='cuda:1')
# few shot preprocess
dim = 1024 # Embedding dimension for intfloat/multilingual-e5-large
nlist = 1024 # Number of cluster centroids
quantizer = faiss.IndexFlatIP(dim)
self.index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
self.index.nprobe = 3
with open('./models/large_sample_example.jsonl','r',encoding='utf8') as f:
for i in f.readlines():
passage = ''
t_data = json.loads(i.strip())
if "input" in t_data:
passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str( t_data['output']) + '\n'
else:
passage = t_data['instruction'] + str(t_data['output']) + '\n'
passage = passage.replace('\\n','\n')
# preprocess train retrieve index and save trained index
# self.fewshot_embeddings = self.embed_model.encode(self.few_shot_example_text, batch_size=128, show_progress_bar=True)
# print(f'process few shot example embedding done! {len(self.few_shot_example_text)}')
# self.index.train(self.fewshot_embeddings.astype(np.float32))
# self.index.add(self.fewshot_embeddings.astype(np.float32))
# faiss.write_index(self.index, "./models/index.ivf")
self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]
def predict(self, prompt: str, is_multiple_choice: bool) -> str:
faiss_retrieve_topk = 7
faiss_score_filter = 0.88
bge_rerank_topk = 6
bge_score_filter = 0.6
# faiss vector Retrieve smiliar few shot example
task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
query_text = ' ' + prompt
query_embed = self.embed_model.encode([self.get_detailed_instruct(task_description, query_text)])[0]
scores, indices = self.index.search(np.array([query_embed]).astype(np.float32), faiss_retrieve_topk)
if score>=faiss_score_filter and len(self.metadata[idx]["fewshot_examaple"])<=6000:
fewshot_examaple = self.metadata[idx]["fewshot_examaple"]
exmaple_prompt.append(fewshot_examaple)
reranked_exmaple_prompt = exmaple_prompt if is_multiple_choice else exmaple_prompt[:4]
# if len(exmaple_prompt)>0:
# print("before reranke:")
# print(exmaple_prompt[:4])
# # rerank the result
# retrank_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that same (type of task ,languages involved) to the query."
# rerank_scores = self.reranker.compute_score(
# [[query_text,retrieved_fse] for retrieved_fse in exmaple_prompt],
# normalize=True,
# batch_size=32
# )
# reranked_exmaple_prompt = [
# exmaple_prompt[bge_rerank_topK_idx]
# for bge_rerank_topK_idx in np.argsort(rerank_scores)[-bge_rerank_topk:]
# if rerank_scores[bge_rerank_topK_idx]>=bge_score_filter
# ]
# reranked_exmaple_prompt = [t_prompt for t_prompt in reranked_exmaple_prompt if len(t_prompt)<=6000]
# print("reranked:")
# else:
# reranked_exmaple_prompt = []
prompt_example = self.system_prompt + '## Here are some similar questions and answers you can refer to:\n'
prompt_example = self.system_prompt + '\n## Now answer the Question:' + prompt
inputs = self.tokenizer.encode(prompt_example, add_special_tokens=False, return_tensors="pt").cuda()
print("prompt token length: ",len(inputs[0]))
generate_ids = self.model.generate(inputs, max_new_tokens=1, eos_token_id=self.terminators)
result = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
generation = result[len(prompt_example):]
else:
messages = [
{"role": "system", "content": prompt_example[:len(self.system_prompt)]},
{"role": "user", "content": prompt_example[len(self.system_prompt):]},
]
input_ids = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(self.model.device)
)[0][input_ids.shape[-1]:]
generation = self.tokenizer.decode(outputs, skip_special_tokens=True)
"""
A dummy model implementation for ShopBench, illustrating how to handle both
multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
This model uses a consistent random seed for reproducible results.
"""
def __init__(self):
"""Initializes the model and sets the random seed for consistency."""
random.seed(AICROWD_RUN_SEED)
self.initialize_models()
def initialize_models(self):
# Initialize Meta Llama 3 - 8B Instruct Model
if not os.path.exists(self.model_name):
raise Exception(
f"""
The evaluators expect the model weights to be checked into the repository,
but we could not find the model weights at {self.model_name}
Please follow the instructions in the docs below to download and check in the model weights.
https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
"""
)
# initialize the model with vllm
self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
self.llm = vllm.LLM(
self.model_name,
gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
max_model_len=8*1024,
self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
self.faiss_retrieve_topk = 7
self.faiss_score_filter = 0.882
self.bge_rerank_topk = 6
self.bge_score_filter = 0.6
self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
def get_detailed_instruct(self, task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery: {query}'
def load_rag_module(self, faiss_index_path: str):
# rag_module : embedding + faiss index + reranker
self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')
# self.reranker = FlagReranker('./models/bge-reranker-v2-m3', use_fp16=True, device='cuda:1')
# few shot preprocess
self.few_shot_example_text = []
with open('./models/large_sample_example_0626.jsonl','r',encoding='utf8') as f:
for i in f.readlines():
passage = ''
t_data = json.loads(i.strip())
if "input" in t_data:
passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str( t_data['output']) + '\n'
else:
passage = t_data['instruction'] + str(t_data['output']) + '\n'
passage = passage.replace('\\n','\n')
self.few_shot_example_text.append(passage)
if os.path.exists(faiss_index_path):
self.index = faiss.read_index(faiss_index_path)
else:
self.index = self.train_save_faiss_index(faiss_index_path)
self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]
def train_save_faiss_index(self,
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
dim: int = 1024,
nlist: int = 1024,
index_nprobe: int = 3):
# preprocess train retrieve index and save trained index
# dim : Embedding dimension for intfloat/multilingual-e5-large
# nlist : Number of cluster centroids
fewshot_embeddings = []
quantizer = faiss.IndexFlatIP(dim)
index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
index.nprobe = index_nprobe
fewshot_embeddings = self.embed_model.encode(self.few_shot_example_text, batch_size=256+128, show_progress_bar=True)
print(f'process few shot example embedding done! {len(self.few_shot_example_text)}')
index.train(fewshot_embeddings.astype(np.float32))
index.add(fewshot_embeddings.astype(np.float32))
faiss.write_index(index, index_save_path)
del fewshot_embeddings
return index
def get_batch_size(self) -> int:
"""
Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
Returns:
int: The batch size, an integer between 1 and 16. This value indicates how many
queries should be processed together in a single batch. It can be dynamic
across different batch_predict calls, or stay a static value.
"""
self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
return self.batch_size
def batch_predict(self, batch: Dict[str, Any], is_multiple_choice:bool) -> List[str]:
"""
Generates a batch of prediction based on associated prompts and task_type
For multiple choice tasks, it randomly selects a choice.
For other tasks, it returns a list of integers as a string,
representing the model's prediction in a format compatible with task-specific parsers.
Parameters:
- batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
- prompt (List[str]): a list of input prompts for the model.
- is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
Returns:
str: A list of predictions for each of the prompts received in the batch.
Each prediction is
a string representing a single integer[0, 3] for multiple choice tasks,
or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
or a string representing the (unconstrained) generated response for the generation tasks
Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
"""
prompts = batch["prompt"]
# format prompts using the chat template
formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)
# set max new tokens to be generated
max_new_tokens = 2 # For MCQ tasks, we only need to generate 1 token
# Generate responses via vllm
responses = self.llm.generate(
formatted_prompts,
n=1, # Number of output sequences to return for each prompt.
# top_p=0.9, # Float that controls the cumulative probability of the top tokens to consider.
# top_k=1,
temperature=0, # randomness of the sampling
seed=AICROWD_RUN_SEED, # Seed for reprodicibility
skip_special_tokens=True, # Whether to skip special tokens in the output.
max_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence.
# debug logging
print("raw batch generation:", [response.outputs[0].text for response in responses])
# Aggregate answers into List[str]
batch_response = []
for response in responses:
batch_response = [k[0] for k in batch_response]
print("formmated generation: MCQ: ", batch_response)
else:
print("formmated generation:", batch_response)
# # tmp for ranking task
# if '[' in batch_response[0][0] and ']' in batch_response[0][-1]:
# batch_response = [json.loads(t) for t in batch_response]
# batch_response = (np.argsort(batch_response, axis=1)[:,::-1]+1).tolist()
# batch_response = [str(k) for k in batch_response]
return batch_response
def format_prommpts(self, prompts:List[str], is_multiple_choice:bool):
"""
Formats prompts using the chat_template of the model.
Parameters:
- queries (list of str): A list of queries to be formatted into prompts.
"""
# 1. faiss index retrieve topK few shot example
# 2. rerank few shot example
# 3. select topK few shot example as prompt
# 4. [
# {"role":"system","content":self.system_prompt},
# {"role":"user","content": query + few shot exmaple}
# {"role":"assistant","content": model generate ... ...}
# ]
# faiss vector Retrieve smiliar few shot example
query_embed_batch = self.embed_model.encode([self.get_detailed_instruct(self.retrieve_task_description, query_text) for query_text in prompts])
# scores_indices = [self.index.search(np.array([query_embed]).astype(np.float32), self.faiss_retrieve_topk) for query_embed in query_embed_batch]
scores, indices = self.index.search(np.array(query_embed_batch).astype(np.float32), self.faiss_retrieve_topk)
# print("retrieve total time: {:.2f} s".format(time.time() - start_time))
for prompt_idx, prompt in enumerate(prompts):
for score, retrieved_idx in zip(scores[prompt_idx],indices[prompt_idx]):
if score>=self.faiss_score_filter and len(self.metadata[retrieved_idx]["fewshot_examaple"])<=5000 and "商品仕様】◉サイズ:46cm×27cm×15cm◉重さ:710g◉メイン素材:水、汚れに強い高品質ポリエステルキャンバス、インナー素材:ナイロン◉ブランド:honey&blue◉付属品:ベビーカー吊り下げ用フック 【たっぷりのメイン収納】大きく開く開口部はダ" not in self.metadata[retrieved_idx]["fewshot_examaple"]:
fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
few_shot_exmaple.append(fewshot_examaple)
few_shot_exmaple = few_shot_exmaple[:4] if is_multiple_choice else few_shot_exmaple
if len(few_shot_exmaple) > 0:
prompt_example = '## Here are some similar questions and answers you can refer to:\n'
for i in few_shot_exmaple:
prompt_example += i+'\n'
prompt_example += '## Now answer the Question:' + prompt
else:
prompt_example = '## Now answer the Question:' + prompt
messages = [
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt_example}
]
chat_prompt = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=False,
# return_tensors="pt"
)
# if "llama" in self.model_name.lower():
# chat_prompt = chat_prompt[len(self.tokenizer.bos_token):] # vllm tokenize will also add bos token
formatted_prompts.append(chat_prompt)
# print(chat_prompt)
## debug logging
print("batch formatted prompt:", formatted_prompts)
return formatted_prompts
class llm_model_ensemble_vllm(ShopBenchBaseModel):
"""
A dummy model implementation for ShopBench, illustrating how to handle both
multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
This model uses a consistent random seed for reproducible results.
"""
def __init__(self):
"""Initializes the model and sets the random seed for consistency."""
random.seed(AICROWD_RUN_SEED)
self.initialize_models()
def initialize_models(self):
# Initialize Meta Llama 3 - 8B Instruct Model
self.model1_name = "./models/llama-mixed-626"
self.model2_name = "/home/jnu/gxw/glm-4-9b-chat"
if not os.path.exists(self.model1_name) or not os.path.exists(self.model2_name):
raise Exception(
f"""
The evaluators expect the model weights to be checked into the repository,
but we could not find the model weights at {self.model1_name} or {self.model2_name}
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
Please follow the instructions in the docs below to download and check in the model weights.
https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
"""
)
# initialize the model with vllm
self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
# load model method 1 : model1 and model2 share gpu 0,1,2,3 compute and memory
# load model method 2 : model1 load into gpu 0,1 and model2 load into gpu 2,3
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
self.llm1 = vllm.LLM(
self.model1_name,
worker_use_ray=True,
tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
trust_remote_code=True,
dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
enforce_eager=True,
)
self.tokenizer1 = self.llm1.get_tokenizer()
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
self.llm2 = vllm.LLM(
self.model2_name,
worker_use_ray=True,
tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
max_model_len=8*1024,
trust_remote_code=True,
dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
enforce_eager=True,
)
self.tokenizer2 = self.llm2.get_tokenizer()
self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
self.faiss_retrieve_topk = 7
self.faiss_score_filter = 0.882
self.bge_rerank_topk = 6