Skip to content
Snippets Groups Projects 58.4 KiB
Newer Older
der2933's avatar
der2933 committed
import asyncio
spmohanty's avatar
spmohanty committed
import os
der2933's avatar
der2933 committed
import torch
import json
import random
import faiss
der2933's avatar
der2933 committed
import vllm
der2933's avatar
der2933 committed

xw_g's avatar
xw_g committed
import torch.nn.functional as F
der2933's avatar
der2933 committed
from typing import List, Union, Any, Dict, List
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
der2933's avatar
der2933 committed
from sentence_transformers import SentenceTransformer
xw_g's avatar
xw_g committed
from import tqdm
import numpy as np
der2933's avatar
der2933 committed
# from FlagEmbedding import FlagReranker
# import time
xw_g's avatar
xw_g committed

der2933's avatar
der2933 committed
from .base_model import ShopBenchBaseModel
spmohanty's avatar
spmohanty committed
# Seed shared by the models in this file so that runs are reproducible.
# It can be overridden through the AICROWD_RUN_SEED environment variable.
AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 42 * 4096))

# vLLM parameters.
# TUNE THESE VARIABLES depending on the number of GPUs you are requesting
# and the size of your model.
VLLM_TENSOR_PARALLEL_SIZE = 4
VLLM_GPU_MEMORY_UTILIZATION = 0.96
# Batch size reported to the evaluator by get_batch_size().
AICROWD_SUBMISSION_BATCH_SIZE = VLLM_TENSOR_PARALLEL_SIZE * 4
der2933's avatar
der2933 committed

class llama3_8b_FewShot_vllm(ShopBenchBaseModel):
    """
    Retrieval-augmented few-shot model for ShopBench, served through vLLM.

    For every incoming prompt the query is embedded, similar question/answer
    examples are looked up in a FAISS index, and the examples that pass the
    score and length filters are prepended to the user message before
    generation. The same code path handles multiple choice tasks and the other
    task types (Ranking, Retrieval, Named Entity Recognition, generation);
    only the number of generated tokens and the post-processing differ.
    """

    # Few-shot examples containing this text are never used as demonstrations.
    _EXCLUDED_EXAMPLE_MARKER = "商品仕様】◉サイズ:46cm×27cm×15cm◉重さ:710g◉メイン素材:水、汚れに強い高品質ポリエステルキャンバス、インナー素材:ナイロン◉ブランド:honey&blue◉付属品:ベビーカー吊り下げ用フック 【たっぷりのメイン収納】大きく開く開口部はダ"

    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""
        random.seed(AICROWD_RUN_SEED)
        self.initialize_models()

    def initialize_models(self):
        """
        Loads the LLM, its tokenizer and the retrieval module.

        Raises:
            Exception: if the model directory `self.model_name` does not exist.
        """
        # Local checkpoint directory that must be checked into the repository.
        self.model_name = "./models/llama3-0630"

        if not os.path.exists(self.model_name):
            raise Exception(
                f"""
            The evaluators expect the model weights to be checked into the repository,
            but we could not find the model weights at {self.model_name}

            Please follow the instructions in the starter kit documentation to download and check in the model weights.
            """
            )

        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"

        # initialize the model with vllm
        # NOTE(review): only the arguments that could be confirmed are passed
        # here; check whether further engine options (e.g. trust_remote_code,
        # enforce_eager) are required for the deployment.
        self.llm = vllm.LLM(
            self.model_name,
            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
            dtype="half",  # note: bfloat16 is not supported on nvidia-T4 GPUs
        )
        self.tokenizer = self.llm.get_tokenizer()

        # Retrieval settings: how many neighbours to fetch per query and the
        # minimum inner-product score a neighbour needs to be used.
        self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
        self.faiss_retrieve_topk = 7
        self.faiss_score_filter = 0.885

        self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        """Builds the 'Instruct: ... / Query: ...' text that is embedded for retrieval."""
        return f'Instruct: {task_description}\nQuery: {query}'

    def load_rag_module(self, faiss_index_path: str):
        """
        Loads everything needed for retrieval: embedding model, few-shot
        example texts and the FAISS index.

        Sets `self.embed_model`, `self.few_shot_example_text`, `self.index`
        and `self.metadata`.

        Parameters:
            faiss_index_path (str): Path of the serialized FAISS index. If the
                file does not exist, the index is trained from the few-shot
                examples and written to this path.

        Returns:
            The loaded (or freshly trained) FAISS index.
        """
        # rag_module : embedding + faiss index (the reranker is currently disabled)
        self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')

        # few shot preprocess: one JSON object per line, rendered as plain text
        self.few_shot_example_text = []
        with open('./models/large_sample_example_0626.jsonl', 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the jsonl file
                t_data = json.loads(line)
                if "input" in t_data:
                    passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str(t_data['output']) + '\n'
                else:
                    passage = t_data['instruction'] + str(t_data['output']) + '\n'
                # The file stores line breaks as the two characters "\n".
                passage = passage.replace('\\n', '\n')
                self.few_shot_example_text.append(passage)

        if os.path.exists(faiss_index_path):
            self.index = faiss.read_index(faiss_index_path)
        else:
            self.index = self.train_save_faiss_index(faiss_index_path)

        # metadata[i] holds the example text for FAISS id i.
        self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]
        return self.index

    def train_save_faiss_index(self,
                               index_save_path: str = "./models/index_0626.ivf",
                               dim: int = 1024,
                               nlist: int = 1024,
                               index_nprobe: int = 3):
        """
        Embeds the few-shot examples, trains an IVF inner-product index on
        them and writes the index to disk.

        Parameters:
            index_save_path (str): Where the trained index is written.
            dim (int): Embedding dimension of the embedding model.
            nlist (int): Number of cluster centroids.
            index_nprobe (int): Value assigned to the index's `nprobe`.

        Returns:
            The trained FAISS index containing all few-shot examples.
        """
        quantizer = faiss.IndexFlatIP(dim)
        index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
        index.nprobe = index_nprobe

        fewshot_embeddings = self.embed_model.encode(self.few_shot_example_text, batch_size=256+128, show_progress_bar=True)
        print(f'process few shot example embedding done! {len(self.few_shot_example_text)}')

        # An IVF index has to be trained before vectors can be added.
        fewshot_embeddings = np.asarray(fewshot_embeddings, dtype=np.float32)
        index.train(fewshot_embeddings)
        index.add(fewshot_embeddings)
        faiss.write_index(index, index_save_path)
        del fewshot_embeddings

        return index

    def get_batch_size(self) -> int:
        """
        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.

        Returns:
            int: The batch size, an integer between 1 and 16. This value indicates how many
                 queries should be processed together in a single batch. It can be dynamic
                 across different batch_predict calls, or stay a static value.
        """
        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
        return self.batch_size

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates a batch of predictions based on associated prompts and task_type.

        Prompts are first augmented with retrieved few-shot examples and
        wrapped in the chat template, then generated with vLLM using greedy
        decoding (temperature 0).

        Parameters:
            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
                - prompt (List[str]): a list of input prompts for the model.
            - is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
            List[str]: One prediction per prompt, in input order. For multiple
                choice batches only a single token is generated and at most
                the first non-whitespace character of it is returned; for all
                other tasks the generated text is returned.
        """
        prompts = batch["prompt"]
        if not prompts:
            return []

        # format prompts using the chat template
        formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)

        # For MCQ tasks, we only need to generate 1 token
        max_new_tokens = 1 if is_multiple_choice else 150

        # Generate responses via vllm
        responses = self.llm.generate(
            formatted_prompts,
            vllm.SamplingParams(
                n=1,  # Number of output sequences to return for each prompt.
                temperature=0,  # randomness of the sampling
                seed=AICROWD_RUN_SEED,  # Seed for reproducibility
                skip_special_tokens=True,  # Whether to skip special tokens in the output.
                max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
            ),
            use_tqdm=False,
        )

        # debug logging
        print("raw batch generation:", [response.outputs[0].text for response in responses])

        # Aggregate answers into List[str]
        batch_response = [response.outputs[0].text for response in responses]

        if is_multiple_choice:
            # Slicing (instead of indexing) keeps an empty generation from
            # raising IndexError; stripping drops a leading space, if any.
            batch_response = [k.strip()[:1] for k in batch_response]
            print("formmated generation: MCQ: ", batch_response)
        else:
            print("formmated generation:", batch_response)

        return batch_response

    def _build_user_message(self, few_shot_examples: List[str], prompt: str) -> str:
        """Joins the retrieved examples (if any) and the question into the user message."""
        if not few_shot_examples:
            return '## Now answer the Question:' + prompt
        user_message = '## Here are some similar questions and answers you can refer to:\n'
        user_message += ''.join(example + '\n' for example in few_shot_examples)
        return user_message + '## Now answer the Question:' + prompt

    def format_prommpts(self, prompts: List[str], is_multiple_choice: bool):
        """
        Formats prompts using the chat_template of the model.

        Steps:
            1. retrieve the top-K few-shot examples for every prompt from the FAISS index
            2. keep the examples that pass the score / length / content filters
               (at most 4 of them for multiple choice batches)
            3. build [system prompt, user message = examples + question] and
               render it with the tokenizer's chat template

        Parameters:
            - prompts (list of str): A list of queries to be formatted into prompts.
            - is_multiple_choice (bool): Whether the batch is a multiple choice batch.

        Returns:
            list of str: One chat-formatted prompt string per input prompt.
        """
        if not prompts:
            return []

        # faiss vector retrieve similar few shot examples, one search call for the whole batch
        retrieval_queries = [
            self.get_detailed_instruct(self.retrieve_task_description, query_text)
            for query_text in prompts
        ]
        query_embed_batch = self.embed_model.encode(retrieval_queries)
        scores, indices = self.index.search(
            np.asarray(query_embed_batch, dtype=np.float32), self.faiss_retrieve_topk
        )

        formatted_prompts = []
        for prompt_idx, prompt in enumerate(prompts):
            # process results
            few_shot_exmaple = []
            for score, retrieved_idx in zip(scores[prompt_idx], indices[prompt_idx]):
                if retrieved_idx < 0:
                    # FAISS pads with -1 when it finds fewer than K neighbours;
                    # without this guard -1 would silently select the last example.
                    continue
                fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
                if (
                    score >= self.faiss_score_filter
                    and len(fewshot_examaple) <= 5000
                    and self._EXCLUDED_EXAMPLE_MARKER not in fewshot_examaple
                ):
                    few_shot_exmaple.append(fewshot_examaple)

            if is_multiple_choice:
                few_shot_exmaple = few_shot_exmaple[:4]

            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": self._build_user_message(few_shot_exmaple, prompt)},
            ]
            # vLLM receives text, so the template is rendered to a string.
            chat_prompt = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
            formatted_prompts.append(chat_prompt)

        ## debug logging
        print("batch formatted prompt:", formatted_prompts)
        return formatted_prompts

class aya23_fewshot_VLLM(llama3_8b_FewShot_vllm):
    """
    Retrieval-augmented few-shot model for ShopBench based on ./models/aya23-8b.

    Despite the class name, generation goes through Hugging Face transformers
    (`AutoModelForCausalLM.generate`), one prompt at a time, not through vLLM.
    Retrieval of few-shot examples reuses `load_rag_module` from the parent class.
    """

    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""
        random.seed(AICROWD_RUN_SEED)
        self.initialize_models()

    def initialize_models(self):
        """
        Loads the causal LM, its tokenizer and the retrieval module.

        Raises:
            Exception: if the model directory `self.model_name` does not exist.
        """
        self.model_name = "./models/aya23-8b"

        if not os.path.exists(self.model_name):
            raise Exception(
                f"""
            The evaluators expect the model weights to be checked into the repository,
            but we could not find the model weights at {self.model_name}

            Please follow the instructions in the starter kit documentation to download and check in the model weights.
            """
            )

        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"

        # Load from the path that was just checked instead of a developer's
        # absolute home directory, which does not exist on the evaluator.
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
        self.faiss_retrieve_topk = 8
        self.faiss_score_filter = 0.85
        # Reranker settings; the reranking step is currently disabled.
        self.bge_rerank_topk = 6
        self.bge_score_filter = 0.6

        self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates one prediction per prompt with transformers `generate`.

        Parameters:
            - batch (Dict[str, Any]): A dictionary with the key
                - prompt (List[str]): a list of input prompts for the model.
            - is_multiple_choice bool: Whether all items in this batch are multiple choice tasks.

        Returns:
            List[str]: The decoded newly generated tokens for every prompt, in
                input order (a single token for multiple choice batches).
        """
        prompts = batch["prompt"]
        # format prompts using the chat template
        formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)
        # set max new tokens to be generated
        max_new_tokens = 1 if is_multiple_choice else 140

        # Generate responses with transformers, one prompt at a time
        gen_text = []
        for chat_prompt in formatted_prompts:
            # The chat template already inserted the special tokens, so they
            # must not be added a second time when encoding.
            input_ids = self.tokenizer.encode(
                chat_prompt, add_special_tokens=False, return_tensors="pt"
            ).to(self.model.device)  # follow the model's placement, not a fixed GPU id
            # NOTE(review): only max_new_tokens could be confirmed for this
            # call; add sampling options here if the submission relied on them.
            output_ids = self.model.generate(
                input_ids,
                max_new_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
            )
            # generate() returns prompt + continuation; keep the continuation only.
            new_tokens = output_ids[0][input_ids.shape[-1]:]
            gen_text.append(self.tokenizer.decode(new_tokens, skip_special_tokens=True))

        if is_multiple_choice:
            print("MCQ: ", gen_text)

        return gen_text

    def format_prommpts(self, prompts: List[str], is_multiple_choice: bool):
        """
        Formats prompts using the chat_template of the model.

        Steps:
            1. retrieve the top-K few-shot examples for every prompt from the FAISS index
            2. keep the examples that pass the score / length filters
               (all of them for multiple choice batches, the first 4 otherwise)
            3. render a single user message (examples + question) with the
               tokenizer's chat template

        Parameters:
            - prompts (list of str): A list of queries to be formatted into prompts.
            - is_multiple_choice (bool): Whether the batch is a multiple choice batch.

        Returns:
            list of str: One chat-formatted prompt string per input prompt.
        """
        formatted_prompts = []
        for prompt in prompts:
            # faiss vector retrieve similar few shot examples
            query_text = ' ' + prompt
            query_embed = self.embed_model.encode(
                [self.get_detailed_instruct(self.retrieve_task_description, query_text)]
            )[0]
            scores, indices = self.index.search(
                np.array([query_embed]).astype(np.float32), self.faiss_retrieve_topk
            )

            # process results
            few_shot_exmaple = []
            for score, retrieved_idx in zip(scores[0], indices[0]):
                if retrieved_idx < 0:
                    # FAISS pads with -1 when it finds fewer than K neighbours.
                    continue
                fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
                if score >= self.faiss_score_filter and len(fewshot_examaple) <= 6000:
                    few_shot_exmaple.append(fewshot_examaple)
            reranked_exmaple_prompt = few_shot_exmaple if is_multiple_choice else few_shot_exmaple[:4]

            if len(reranked_exmaple_prompt) > 0:
                prompt_example = '## Here are some similar questions and answers you can refer to:\n'
                for i in reranked_exmaple_prompt:
                    prompt_example += i + '\n'
                prompt_example += '## Now answer the Question:' + prompt
            else:
                prompt_example = '## Now answer the Question:' + prompt

            messages = [
                {"role": "user", "content": prompt_example}
            ]
            # batch_predict encodes the text itself, so render to a string here.
            chat_prompt = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
            )
            formatted_prompts.append(chat_prompt)

        return formatted_prompts

class DummyModel(ShopBenchBaseModel):
    """
    A dummy model implementation for ShopBench, illustrating how to handle both
    multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
    This model uses a consistent random seed for reproducible results.
    """

    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""
        random.seed(AICROWD_RUN_SEED)

    def get_batch_size(self) -> int:
        """
        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.

        Returns:
            int: The batch size, an integer between 1 and 16. This value indicates how many
                 queries should be processed together in a single batch. It can be dynamic
                 across different batch_predict calls, or stay a static value.
        """
        self.batch_size = 4
        return self.batch_size

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates a batch of random predictions based on the task type.

        For multiple choice tasks, it randomly selects one of 1, 2, 3, 4.
        For other tasks, it returns a shuffled list of those integers as a string,
        a format compatible with the task-specific parsers.

        Parameters:
            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
                - prompt (List[str]): a list of input prompts for the model.
            - is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
            List[str]: One prediction per prompt in the batch. Each prediction is
                a string holding a single integer for multiple choice tasks,
                or the string form of a list of integers (e.g. "[3, 1, 4, 2]") otherwise.
        """
        prompts = batch["prompt"]

        possible_responses = [1, 2, 3, 4]

        batch_response = []
        for _ in prompts:
            if is_multiple_choice:
                # Randomly select one of the possible responses for multiple choice tasks
                batch_response.append(str(random.choice(possible_responses)))
            else:
                # For other tasks, shuffle the possible responses and return as a string
                # Note: As this is dummy model, we are returning random responses for non-multiple choice tasks.
                # For generation tasks, this should ideally return an unconstrained string.
                random.shuffle(possible_responses)
                batch_response.append(str(possible_responses))

        return batch_response

# class DummyModel(ShopBenchBaseModel):
#     """
#     A dummy model implementation for ShopBench, illustrating how to handle both
#     multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
#     This model uses a consistent random seed for reproducible results.
#     """

#     def __init__(self):
#         """Initializes the model and sets the random seed for consistency."""
#         random.seed(AICROWD_RUN_SEED)

#     def predict(self, prompt: str, is_multiple_choice: bool) -> str:
#         """
#         Generates a prediction based on the input prompt and task type.

#         For multiple choice tasks, it randomly selects a choice.
#         For other tasks, it returns a list of integers as a string,
#         representing the model's prediction in a format compatible with task-specific parsers.

#         Args:
#             prompt (str): The input prompt for the model.
#             is_multiple_choice (bool): Indicates whether the task is a multiple choice question.

#         Returns:
#             str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
#                         or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
#                         or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
#                         or a string representing the (unconstrained) generated response for the generation tasks
#                         Please refer to for more details on how these responses will be parsed by the evaluator.
#         """
#         possible_responses = [1, 2, 3, 4]

#         if is_multiple_choice:
#             # Randomly select one of the possible responses for multiple choice tasks
#             return str(random.choice(possible_responses))
#         else:
#             # For other tasks, shuffle the possible responses and return as a string
#             random.shuffle(possible_responses)
#             return str(possible_responses)
#             # Note: As this is dummy model, we are returning random responses for non-multiple choice tasks.
#             # For generation tasks, this should ideally return an unconstrained string.
xw_g's avatar
xw_g committed

class llama3_8b_FewShot(ShopBenchBaseModel):
    """
    Retrieval-augmented few-shot model for ShopBench that runs
    ./models/Meta-Llama-3-8B-Instruct through Hugging Face transformers.

    Unlike the batch models in this file it answers one prompt at a time via
    `predict`. Similar question/answer examples are retrieved from a FAISS
    index and placed in front of the question.
    """

    def __init__(self):
        """Loads tokenizer, model and the retrieval module."""
        model_path = './models/Meta-Llama-3-8B-Instruct'
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map='auto', trust_remote_code=True)
        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n"
        # Token ids that end generation.
        # NOTE(review): reconstructed as the usual Llama 3 pair (EOS plus
        # "<|eot_id|>") — confirm against the repository.
        self.terminators = [
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ]
        # NOTE(review): index path assumed from the name used when the index
        # was originally written ("./models/index.ivf") — confirm.
        self.load_rag_module(faiss_index_path="./models/index.ivf")

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        """Builds the 'Instruct: ... / Query: ...' text that is embedded for retrieval."""
        return f'Instruct: {task_description}\nQuery: {query}'

    def load_rag_module(self, faiss_index_path: str):
        """
        Loads the embedding model, the few-shot example texts and a
        pre-built FAISS index.

        Sets `self.embed_model`, `self.few_shot_example_text`, `self.index`
        and `self.metadata`.

        Parameters:
            faiss_index_path (str): Path of the serialized FAISS index to read.
        """
        # rag_module : embedding + faiss index (the reranker is currently disabled)
        self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')

        # few shot preprocess: one JSON object per line, rendered as plain text
        self.few_shot_example_text = []
        with open('./models/large_sample_example.jsonl', 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the jsonl file
                t_data = json.loads(line)
                if "input" in t_data:
                    passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str(t_data['output']) + '\n'
                else:
                    passage = t_data['instruction'] + str(t_data['output']) + '\n'
                # The file stores line breaks as the two characters "\n".
                passage = passage.replace('\\n', '\n')
                self.few_shot_example_text.append(passage)

        # The index is built offline; here it is only read. (An empty IVF index
        # used to be constructed first and was immediately overwritten.)
        self.index = faiss.read_index(faiss_index_path)
        # metadata[i] holds the example text for FAISS id i.
        self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        """
        Generates a prediction for a single prompt.

        Parameters:
            prompt (str): The input prompt for the model.
            is_multiple_choice (bool): Indicates whether the task is a multiple choice question.

        Returns:
            str: The generated text. For multiple choice tasks exactly one new
                token is generated from the plain (non chat-templated) prompt;
                otherwise the prompt is wrapped in the chat template first.
        """
        faiss_retrieve_topk = 7
        faiss_score_filter = 0.88

        # faiss vector retrieve similar few shot examples
        task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."
        query_text = ' ' + prompt
        query_embed = self.embed_model.encode([self.get_detailed_instruct(task_description, query_text)])[0]
        scores, indices = self.index.search(np.array([query_embed]).astype(np.float32), faiss_retrieve_topk)

        # process results
        exmaple_prompt = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:
                # FAISS pads with -1 when it finds fewer than K neighbours.
                continue
            fewshot_examaple = self.metadata[idx]["fewshot_examaple"]
            if score >= faiss_score_filter and len(fewshot_examaple) <= 6000:
                exmaple_prompt.append(fewshot_examaple)

        reranked_exmaple_prompt = exmaple_prompt if is_multiple_choice else exmaple_prompt[:4]

        # user_content is everything after the system prompt.
        if len(reranked_exmaple_prompt) > 0:
            user_content = '## Here are some similar questions and answers you can refer to:\n'
            for i in reranked_exmaple_prompt:
                user_content += i + '\n'
            user_content += '## Now answer the Question:' + prompt
        else:
            user_content = '\n## Now answer the Question:' + prompt
        prompt_example = self.system_prompt + user_content

        if is_multiple_choice:
            inputs = self.tokenizer.encode(prompt_example, add_special_tokens=False, return_tensors="pt").to(self.model.device)
            print("prompt token length: ", len(inputs[0]))
            generate_ids = self.model.generate(inputs, max_new_tokens=1, eos_token_id=self.terminators)
            # Decode only the newly generated token. Slicing the decoded text
            # by len(prompt_example) is wrong whenever decoding does not
            # reproduce the prompt character for character.
            generation = self.tokenizer.decode(
                generate_ids[0][inputs.shape[-1]:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
        else:
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_content},
            ]
            input_ids = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to(self.model.device)
            print("prompt token length :", len(input_ids[0]))
            # NOTE(review): max_new_tokens mirrors the 150-token limit of the
            # vLLM variant in this file; other generation options are unknown — confirm.
            outputs = self.model.generate(
                input_ids,
                max_new_tokens=150,
                eos_token_id=self.terminators,
            )
            # generate() returns prompt + continuation; keep the continuation only.
            outputs = outputs[0][input_ids.shape[-1]:]
            generation = self.tokenizer.decode(outputs, skip_special_tokens=True)

        print(f'model generate answer : {generation}')
        return generation
der2933's avatar
der2933 committed
class glm4_9b_FewShot_vllm(ShopBenchBaseModel):
    """
    A few-shot, retrieval-augmented GLM-4-9B-Chat model implementation for ShopBench,
    served through vLLM. It handles both multiple choice and other types of tasks
    like Ranking, Retrieval, and Named Entity Recognition.
    This model uses a consistent random seed for reproducible results.
    """

    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""
        # The body previously consisted of the docstring alone, so neither the
        # seeding nor the model loading promised above ever happened and every
        # attribute read by batch_predict / format_prommpts stayed undefined.
        random.seed(AICROWD_RUN_SEED)
        self.initialize_models()

    def initialize_models(self):
        """Load the GLM-4 chat checkpoint into vLLM and configure few-shot retrieval.

        Sets ``self.model_name``, ``self.system_prompt``, ``self.llm`` and
        ``self.tokenizer``, loads the retrieval resources through
        ``load_rag_module`` and stores the retrieval thresholds.

        Raises:
            Exception: If nothing exists at ``self.model_name``.
        """
        # Initialize the GLM-4 9B Chat model
        # NOTE(review): absolute, machine-specific path - confirm it is valid
        # in the environment where this class is evaluated.
        self.model_name = "/home/jnu/gxw/glm-4-9b-chat"

        if not os.path.exists(self.model_name):
            raise Exception(
                f"""
            The evaluators expect the model weights to be checked into the repository,
            but we could not find the model weights at {self.model_name}

            Please follow the instructions in the docs below to download and check in the model weights.
            """
            )

        # initialize the model with vllm
        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"

        # NOTE(review): only ``dtype`` survived in the damaged original; the
        # other arguments are rebuilt from the module-level VLLM_* tunables.
        # Confirm no further engine options are required.
        self.llm = vllm.LLM(
            self.model_name,
            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
            trust_remote_code=True,  # presumably needed for the GLM-4 custom tokenizer code - confirm
            dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
        )
        self.tokenizer = self.llm.get_tokenizer()

        # load_rag_module populates self.embed_model / self.index / self.metadata
        # as side effects; the value stored in self.faiss is its return value.
        self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
        self.faiss_retrieve_topk = 7
        self.faiss_score_filter = 0.882
        # Reranker settings; the reranker itself is commented out in
        # load_rag_module, so these are not read by the methods of this class.
        self.bge_rerank_topk = 6
        self.bge_score_filter = 0.6

        self.retrieve_task_description = "Given a online shopping user query, retrieve relevant Question-Answer that similar (type of task ,languages involved and product) to the query."

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        """Wrap a query in the ``Instruct: ...`` / ``Query: ...`` two-line template.

        Args:
            task_description: Text placed after ``Instruct: `` on the first line.
            query: Text placed after ``Query: `` on the second line.

        Returns:
            The two labelled parts joined by a single newline.
        """
        template = 'Instruct: {}\nQuery: {}'
        return template.format(task_description, query)
    def load_rag_module(self, faiss_index_path: str):
        """Load the embedding model, the few-shot corpus and the FAISS index.

        Args:
            faiss_index_path: Path of a saved FAISS index. If the file does not
                exist, a new index is built with ``train_save_faiss_index`` and
                written to this path.

        Returns:
            None. Results are stored on the instance: ``self.embed_model``,
            ``self.few_shot_example_text``, ``self.index`` and ``self.metadata``.
        """
        # rag_module : embedding + faiss index + reranker
        self.embed_model = SentenceTransformer("./models/multilingual-e5-large-instruct", device='cpu')
        # self.reranker = FlagReranker('./models/bge-reranker-v2-m3', use_fp16=True, device='cuda:1')

        # few shot preprocess
        self.few_shot_example_text = []
        with open('./models/large_sample_example_0626.jsonl','r',encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank lines instead of failing in json.loads
                    continue
                t_data = json.loads(line)
                # Records with an "input" field render it between the
                # instruction and the answer; the other records have no
                # "input" key, so the two formats must be mutually exclusive.
                if "input" in t_data:
                    passage = t_data['instruction'] + t_data['input'] + '\nOutput:' + str(t_data['output']) + '\n'
                else:
                    passage = t_data['instruction'] + str(t_data['output']) + '\n'
                # turn literal backslash-n sequences into real newlines
                passage = passage.replace('\\n','\n')
                # Position in this list is the id used to look examples up in
                # self.metadata, so every record is appended in file order.
                self.few_shot_example_text.append(passage)

        # Reuse a saved index when available; only build one when it is missing.
        if os.path.exists(faiss_index_path):
            self.index = faiss.read_index(faiss_index_path)
        else:
            self.index = self.train_save_faiss_index(faiss_index_path)
        self.metadata = [{"fewshot_examaple": fewshot_examaple} for fewshot_examaple in self.few_shot_example_text]

    def train_save_faiss_index(self,
                               index_save_path: str = "./models/index_0626.ivf",
                               dim: int = 1024,
                               nlist: int = 1024,
                               index_nprobe: int = 3):
        """Build, train, fill and save an inner-product IVF index of the few-shot examples.

        Args:
            index_save_path: File the finished index is written to.
            dim: Embedding dimension passed to the FAISS index constructors.
            nlist: Number of cluster centroids of the IVF index.
            index_nprobe: Value assigned to ``index.nprobe``.

        Returns:
            The trained ``faiss.IndexIVFFlat`` holding one vector per entry of
            ``self.few_shot_example_text``, in the same order.
        """
        # preprocess train retrieve index and save trained index
        quantizer = faiss.IndexFlatIP(dim)
        index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT)
        index.nprobe = index_nprobe

        fewshot_embeddings = self.embed_model.encode(self.few_shot_example_text, batch_size=256+128, show_progress_bar=True)
        # FAISS works on contiguous float32 matrices.
        fewshot_embeddings = np.ascontiguousarray(fewshot_embeddings, dtype=np.float32)
        print(f'process few shot example embedding done! {len(self.few_shot_example_text)}')

        # An IVF index must learn its centroids before vectors can be added;
        # previously the index was written to disk untrained and empty.
        index.train(fewshot_embeddings)
        index.add(fewshot_embeddings)

        faiss.write_index(index, index_save_path)
        del fewshot_embeddings

        return index

    def get_batch_size(self) -> int:
        """
        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.

        Returns:
            int: The batch size, an integer between 1 and 16. This value indicates how many
                 queries should be processed together in a single batch. It can be dynamic
                 across different batch_predict calls, or stay a static value.
        """
        # Also kept on the instance, as in the original implementation.
        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
        return self.batch_size

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice:bool) -> List[str]:
        """
        Generates a batch of prediction based on associated prompts and task_type

        For multiple choice tasks, only the first character of each generation is returned.
        For other tasks, the generated text is returned unchanged so that it can be
        handled by the task-specific parsers.

        Parameters:
            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys
                - prompt (List[str]): a list of input prompts for the model.
            - is_multiple_choice bool: A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
            str: A list of predictions for each of the prompts received in the batch.
                    Each prediction is
                           a string representing a single integer[0, 3] for multiple choice tasks,
                        or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
                        or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
                        or a string representing the (unconstrained) generated response for the generation tasks
                        Please refer to the challenge documentation for more details on how these responses will be parsed by the evaluator.
        """
        prompts = batch["prompt"]
        # format prompts using the chat template
        formatted_prompts = self.format_prommpts(prompts, is_multiple_choice)
        # set max new tokens to be generated
        max_new_tokens = 150
        if is_multiple_choice:
            # MCQ answers are a single character; two tokens are requested and
            # the output is cut down to its first character below.
            max_new_tokens = 2

        # Generate responses via vllm
        responses = self.llm.generate(
            formatted_prompts,
            vllm.SamplingParams(
                n=1,  # Number of output sequences to return for each prompt.
                # top_p=0.9,  # Float that controls the cumulative probability of the top tokens to consider.
                # top_k=1,
                # repetition_penalty=1.05,
                temperature=0,  # randomness of the sampling
                seed=AICROWD_RUN_SEED, # Seed for reprodicibility
                skip_special_tokens=True,  # Whether to skip special tokens in the output.
                max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
                stop_token_ids=[151329, 151336, 151338] # glm4 stop token
            ),
            use_tqdm = False
        )

        # Aggregate answers into List[str]
        batch_response = [response.outputs[0].text for response in responses]

        # debug logging
        print("raw batch generation:", batch_response)

        if is_multiple_choice:
            # k[:1] instead of k[0]: an empty generation yields "" rather than
            # raising IndexError and aborting the whole batch.
            batch_response = [k[:1] for k in batch_response]
            print("formmated generation: MCQ: ", batch_response)
        else:
            print("formmated generation:", batch_response)
            # # tmp for ranking task
            # if '[' in batch_response[0][0] and ']' in batch_response[0][-1]:
            #     batch_response = [json.loads(t) for t in batch_response]
            #     batch_response = (np.argsort(batch_response, axis=1)[:,::-1]+1).tolist()
            #     batch_response = [str(k) for k in batch_response]

        return batch_response

    def format_prommpts(self, prompts:List[str], is_multiple_choice:bool):
        """
        Formats prompts using the chat_template of the model.

        Parameters:
        - prompts (list of str): A list of queries to be formatted into prompts.
        - is_multiple_choice (bool): When True, at most 4 retrieved examples are
          kept per prompt.

        Returns:
        - list of str: one chat-template-formatted prompt per input query.
        """
        # 1. faiss index retrieve topK few shot example
        # 2. rerank few shot example (currently disabled)
        # 3. select topK few shot example as prompt
        # 4.  [
        #       {"role":"system","content":self.system_prompt},
        #       {"role":"user","content": query + few shot exmaple}
        #       {"role":"assistant","content": model generate ... ...}
        #  ]
        formatted_prompts = []
        if not prompts:
            # nothing to embed or search; avoids a shape error in index.search
            return formatted_prompts

        # faiss vector Retrieve smiliar few shot example
        query_embed_batch = self.embed_model.encode([self.get_detailed_instruct(self.retrieve_task_description, query_text) for query_text in prompts])
        # One batched search for all prompts; FAISS expects a float32 matrix.
        scores, indices = self.index.search(np.asarray(query_embed_batch, dtype=np.float32), self.faiss_retrieve_topk)

        # Examples containing this text are never used as few-shot context.
        # NOTE(review): the reason for excluding it is not visible in this file.
        excluded_text = "商品仕様】◉サイズ:46cm×27cm×15cm◉重さ:710g◉メイン素材:水、汚れに強い高品質ポリエステルキャンバス、インナー素材:ナイロン◉ブランド:honey&blue◉付属品:ベビーカー吊り下げ用フック 【たっぷりのメイン収納】大きく開く開口部はダ"

        for prompt_idx, prompt in enumerate(prompts):
            # process results
            few_shot_exmaple = []
            for score, retrieved_idx in zip(scores[prompt_idx],indices[prompt_idx]):
                if retrieved_idx < 0:
                    # FAISS pads missing neighbours with -1, which would
                    # otherwise index the last metadata entry.
                    continue
                fewshot_examaple = self.metadata[retrieved_idx]["fewshot_examaple"]
                if score>=self.faiss_score_filter and len(fewshot_examaple)<=5000 and excluded_text not in fewshot_examaple:
                    few_shot_exmaple.append(fewshot_examaple)

            few_shot_exmaple = few_shot_exmaple[:4] if  is_multiple_choice else few_shot_exmaple

            if len(few_shot_exmaple) > 0:
                prompt_example = '## Here are some similar questions and answers you can refer to:\n'
                for i in few_shot_exmaple:
                    prompt_example += i+'\n'
                prompt_example += '## Now answer the Question:' + prompt
            else:
                prompt_example = '## Now answer the Question:' + prompt

            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt_example}
            ]
            # vLLM receives plain-text prompts, so the template is rendered to
            # a string (tokenize=False) ending with the assistant turn header.
            chat_prompt = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=False,
                # return_tensors="pt"
            )
            # if "llama" in self.model_name.lower():
                # chat_prompt = chat_prompt[len(self.tokenizer.bos_token):]  # vllm tokenize will also add bos token
            # print(chat_prompt)
            formatted_prompts.append(chat_prompt)

        ## debug logging
        print("batch formatted prompt:", formatted_prompts)
        return formatted_prompts
der2933's avatar
der2933 committed

der2933's avatar
der2933 committed
class llm_model_ensemble_vllm(ShopBenchBaseModel):
    A dummy model implementation for ShopBench, illustrating how to handle both
    multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
    This model uses a consistent random seed for reproducible results.
der2933's avatar
der2933 committed

der2933's avatar
der2933 committed
    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""

    def initialize_models(self):
        # Initialize Meta Llama 3 - 8B Instruct Model
        self.model1_name = "./models/llama-mixed-626"
        self.model2_name = "/home/jnu/gxw/glm-4-9b-chat"

        if not os.path.exists(self.model1_name) or not os.path.exists(self.model2_name):
            raise Exception(
            The evaluators expect the model weights to be checked into the repository,
            but we could not find the model weights at {self.model1_name} or {self.model2_name}
der2933's avatar
der2933 committed
der2933's avatar
der2933 committed
            Please follow the instructions in the docs below to download and check in the model weights.

        # initialize the model with vllm
        self.system_prompt = "You are a helpful and multilingual online shopping assistant. You can understand and respond to user queries in English, German, Italian, French, Japanese, Spanish, Chinese. You are knowledgeable about various products. NOTE:ONLY OUTPUT THE ANSWER!!\n\n"
        # load model method 1 : model1 and model2 share gpu 0,1,2,3 compute and memory
        # load model method 2 : model1 load into gpu 0,1 and model2 load into gpu 2,3
        os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
        self.llm1 = vllm.LLM(
            dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
        self.tokenizer1 = self.llm1.get_tokenizer()

        os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
        self.llm2 = vllm.LLM(
            dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
        self.tokenizer2 = self.llm2.get_tokenizer()

        self.faiss = self.load_rag_module(faiss_index_path="./models/index_0626.ivf")
        self.faiss_retrieve_topk = 7
        self.faiss_score_filter = 0.882
        self.bge_rerank_topk = 6