-VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_TENSOR_PARALLEL_SIZE = 1  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
VLLM_GPU_MEMORY_UTILIZATION = 0.96  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
AICROWD_SUBMISSION_BATCH_SIZE = VLLM_TENSOR_PARALLEL_SIZE * 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
...
...
@@ -57,7 +57,7 @@ class llama3_8b_FewShot_vllm(ShopBenchBaseModel):
        )
        # initialize the model with vllm
-        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
+        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Portuguese, Arabic, Hebrew, Korean, Chinese. You are knowledgeable about various products and adept at providing detailed information, recommendations, and assistance. Respond concisely and accurately to enhance the shopping experience. NOTE:ONLY OUTPUT THE ANSWER!!"
        self.llm = vllm.LLM(
            self.model_name,
...
...
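
# The vllm.LLM(...) call above is truncated in this excerpt, so the following is a
# minimal, hedged sketch of how the tuned constants are typically forwarded to the
# engine; the model path and keyword values shown here are illustrative assumptions,
# not lines from the repository.
import vllm
from transformers import AutoTokenizer

VLLM_TENSOR_PARALLEL_SIZE = 1
VLLM_GPU_MEMORY_UTILIZATION = 0.96

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed path, only for illustration
llm = vllm.LLM(
    model_name,
    tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,      # shard the weights across this many GPUs
    gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,  # fraction of GPU memory vLLM may reserve
)
tokenizer = AutoTokenizer.from_pretrained(model_name)    # later used for apply_chat_template
# AICROWD_SUBMISSION_BATCH_SIZE presumably controls how many queries the evaluator
# sends per batched prediction call; it is not a vLLM argument.
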
@@ -72,7 +72,7 @@ class llama3_8b_FewShot_vllm(ShopBenchBaseModel):
        self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
...
...
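
# The retriever that consumes retrieve_task_description is not shown in this diff.
# As a hedged sketch of the usual pattern, the task description is prepended to the
# user query before embedding, and the most similar stored Question-Answer examples
# are returned as few-shot demonstrations. The embedding model name and the example
# pool below are assumptions made only for illustration.
from sentence_transformers import SentenceTransformer, util

task_description = (
    "Given a online shopping user query, retrieve relevant Question-Answer that "
    "similar (type of task ,languages involved and product) to the query."
)
# Hypothetical pool of previously collected Question-Answer examples.
qa_pool = [
    "Q: Which of these is a wireless mouse? A: Option 2",
    "Q: Translate 'red cotton shirt' into Japanese. A: 赤い綿のシャツ",
]

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not from the repo
pool_emb = embedder.encode(qa_pool, convert_to_tensor=True)

def retrieve_few_shot(query: str, k: int = 2) -> list[str]:
    # Prepend the task description so the instruction steers the embedding, then
    # return the k most similar stored QA examples by cosine similarity.
    q_emb = embedder.encode(task_description + " " + query, convert_to_tensor=True)
    scores = util.cos_sim(q_emb, pool_emb)[0]
    top = scores.topk(min(k, len(qa_pool))).indices.tolist()
    return [qa_pool[i] for i in top]
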
@@ -253,16 +253,19 @@ class llama3_8b_FewShot_vllm(ShopBenchBaseModel):
        else:
            prompt_example = '## Now answer the Question:' + prompt
-        messages = [
-            {"role": "system", "content": self.system_prompt},
-            {"role": "user", "content": prompt_example}
-        ]
-        chat_prompt = self.tokenizer.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=False,
-            # return_tensors="pt"
-        )
+        if is_multiple_choice:
+            chat_prompt = self.system_prompt + prompt_example
+        else:
+            messages = [
+                {"role": "system", "content": self.system_prompt},
+                {"role": "user", "content": prompt_example}
+            ]
+            chat_prompt = self.tokenizer.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=False,
+                # return_tensors="pt"
+            )
        # if "llama" in self.model_name.lower():
        #     chat_prompt = chat_prompt[len(self.tokenizer.bos_token):]  # vllm tokenize will also add bos token
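
# The generation step that consumes chat_prompt is outside this excerpt. Below is a
# minimal sketch of how it is typically passed to vLLM; the sampling values are
# illustrative assumptions, not the repository's settings. When is_multiple_choice is
# True, the prompt is plain text (system prompt + question) and only a very short
# completion such as the option label is needed.
import vllm

def generate_answer(llm: vllm.LLM, chat_prompt: str, is_multiple_choice: bool) -> str:
    sampling_params = vllm.SamplingParams(
        temperature=0.0,                               # greedy decoding for reproducibility
        max_tokens=1 if is_multiple_choice else 128,   # assumed limits, not from the repo
    )
    outputs = llm.generate([chat_prompt], sampling_params)
    return outputs[0].outputs[0].text.strip()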