diff --git a/local_evaluation.py b/local_evaluation.py
index b66a079308bbb4f874d57409fbca6076a2e2890e..72491d3b24402b12cdae5dd4db230e4966cc64eb 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -240,6 +240,8 @@ def main():
 
     data_df = load_development_data(DATA_FILENAME)
 
+    data_df = data_df[data_df["track"] == "amazon-kdd-cup-24-multi-lingual-abilities"].reset_index(drop=True)
+
     # Load the model from the user's custom configuration
     # Note: The evaluator **Always** imports the UserModel, please reference your own class
     # by setting the `UserModel` variable in models.user_config
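The hunk above hard-codes the multi-lingual track for local runs. A minimal sketch of a configurable variant, assuming an `EVAL_TRACK` environment variable and a `filter_track` helper that are illustrative and not part of the starter kit:

import os
import pandas as pd

def filter_track(data_df: pd.DataFrame) -> pd.DataFrame:
    # Restrict local evaluation to a single track only when EVAL_TRACK is set,
    # e.g. EVAL_TRACK=amazon-kdd-cup-24-multi-lingual-abilities python local_evaluation.py
    eval_track = os.getenv("EVAL_TRACK")
    if eval_track:
        data_df = data_df[data_df["track"] == eval_track].reset_index(drop=True)
    return data_df

With such a helper, `data_df = filter_track(load_development_data(DATA_FILENAME))` would replace the hard-coded filter while keeping the full development set as the default.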
diff --git a/models/vanilla_llama3_baseline-Copy2.py b/models/vanilla_llama3_baseline-Copy2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f4d5eae0a31c88af0fac4b6b34d85d9be70a3bc
--- /dev/null
+++ b/models/vanilla_llama3_baseline-Copy2.py
@@ -0,0 +1,173 @@
+import os
+import random
+from typing import Any, Dict, List
+
+import vllm
+
+from .base_model import ShopBenchBaseModel
+
+#### CONFIG PARAMETERS ---
+
+# Set a consistent seed for reproducibility
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 773815))
+
+# Batch size the evaluator will use when calling the `batch_predict` function
+AICROWD_SUBMISSION_BATCH_SIZE = 16  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 1  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+
+class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
+    """
+    A ShopBench model that serves an AWQ-quantized Qwen checkpoint with vLLM, illustrating how to handle
+    both multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
+    This model uses a consistent random seed for reproducible results.
+    """
+
+    def __init__(self):
+        """Initializes the model and sets the random seed for consistency."""
+        random.seed(AICROWD_RUN_SEED)
+        self.initialize_models()
+
+    def initialize_models(self):
+        # Initialize the AWQ-quantized Qwen model checked into the repository
+        self.model_name = "models/full_model_qwen_pad-awq"
+
+        if not os.path.exists(self.model_name):
+            raise Exception(
+                f"""
+            The evaluators expect the model weights to be checked into the repository,
+            but we could not find the model weights at {self.model_name}
+
+            Please follow the instructions in the docs below to download and check in the model weights.
+            https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
+            """
+            )
+
+        # Initialize the model with vLLM
+        self.llm = vllm.LLM(
+            self.model_name,
+            worker_use_ray=True,
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
+            trust_remote_code=True,
+            dtype="half",  # note: bfloat16 is not supported on NVIDIA T4 GPUs
+            enforce_eager=True,
+            max_model_len=4096,
+        )
+        self.tokenizer = self.llm.get_tokenizer()
+
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
+        """
+        Generates a batch of predictions based on the associated prompts and task type.
+
+        Prompts are prefixed with a shopping-assistant system prompt and answered by the vLLM
+        engine, using sampling parameters that depend on the task type.
+
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys:
+                - prompt (List[str]): a list of input prompts for the model.
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
+
+        Returns:
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                    a string representing a single integer in [0, 3] for multiple choice tasks,
+                    or a string representing a comma-separated list of integers for Ranking and Retrieval tasks,
+                    or a string representing a comma-separated list of named entities for Named Entity Recognition tasks,
+                    or a string representing the (unconstrained) generated response for the generation tasks.
+                Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
+        """
+        prompts = batch["prompt"]
+
+        # format prompts using the chat template
+        # formatted_prompts = self.format_prommpts(prompts)
+        # set max new tokens to be generated
+        max_new_tokens = 80
+
+        extra_prompt = "Do not give explanation only Answer. \nOutput:\n"
+        if is_multiple_choice:
+            max_new_tokens = 1  # For MCQ tasks, we only need to generate 1 token
+            extra_prompt = ""
+
+        system_prompt = "You are a helpful online shopping assistant. Please answer the following question about online shopping and follow the given instructions.\n\n"
+        formatted_prompts = []
+        for prompt in prompts:
+            formatted_prompts.append(system_prompt + prompt + extra_prompt)
+
+        if is_multiple_choice:
+            # Generate responses via vLLM
+            responses = self.llm.generate(
+                formatted_prompts,
+                vllm.SamplingParams(
+                    n=1,  # Number of output sequences to return for each prompt.
+                    top_p=1,  # Increased significantly from 0.85
+                    temperature=0.05,  # Reduced significantly from 0.35
+                    top_k=60,  # Increased from 40
+                    seed=AICROWD_RUN_SEED,  # Seed for reproducibility
+                    skip_special_tokens=True,  # Whether to skip special tokens in the output.
+                    max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
+                ),
+                use_tqdm=False,
+            )
+        else:
+            # Generate responses via vLLM
+            responses = self.llm.generate(
+                formatted_prompts,
+                vllm.SamplingParams(
+                    n=1,  # Number of output sequences to return for each prompt.
+                    top_p=0.9284708177506061,  # Float that controls the cumulative probability of the top tokens to consider.
+                    top_k=42,  # top_k must be an integer; truncated from the tuned value 42.53778219798265.
+                    temperature=0.29714063971560145,
+                    seed=AICROWD_RUN_SEED,  # Seed for reproducibility
+                    skip_special_tokens=True,  # Whether to skip special tokens in the output.
+                    max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
+                ),
+                use_tqdm=False,
+            )
+
+        # Aggregate answers into List[str]
+        batch_response = []
+        for response in responses:
+            batch_response.append(response.outputs[0].text.replace("'", ""))
+
+        if is_multiple_choice:
+            print("MCQ: ", batch_response)
+        else:
+            print("NO MCQ: ", batch_response)
+
+        return batch_response
+
+    def format_prommpts(self, prompts):
+        """
+        Formats prompts using the chat_template of the model.
+
+        Parameters:
+            - prompts (list of str): A list of queries to be formatted into prompts.
+        """
+        # Minimal implementation (unused by batch_predict, which builds plain-text prompts):
+        # wrap each query as a user message and apply the tokenizer's chat template.
+        formatted_prompts = [
+            self.tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            for prompt in prompts
+        ]
+        return formatted_prompts
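For reference, a minimal sketch of how an evaluator-style loop could exercise the contract documented in `get_batch_size` and `batch_predict`. The prompt text is made up, the batching loop is an assumption about the harness, and actually running it requires the AWQ weights at models/full_model_qwen_pad-awq plus enough GPUs for tensor_parallel_size=4; the class is imported from the baseline module, which defines the same interface.

from models.vanilla_llama3_baseline import Llama3_8B_ZeroShotModel

model = Llama3_8B_ZeroShotModel()
batch_size = model.get_batch_size()

# Hypothetical MCQ prompt, just to show the expected shapes.
prompts = ["Which of the following products is a phone accessory? 0. charger 1. novel 2. blender 3. sofa"]
for start in range(0, len(prompts), batch_size):
    batch = {"prompt": prompts[start:start + batch_size]}
    predictions = model.batch_predict(batch, is_multiple_choice=True)
    print(predictions)  # e.g. ["0"] -- one single-integer string in [0, 3] per MCQ prompt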
diff --git a/models/vanilla_llama3_baseline.py b/models/vanilla_llama3_baseline.py
index 22554a44e8b3ef3ab96e3b7dc8554f2ed1f38c2a..dd88fa6aa562778e6bc2b66e1fbb386b53bbe2ab 100644
--- a/models/vanilla_llama3_baseline.py
+++ b/models/vanilla_llama3_baseline.py
@@ -102,9 +102,10 @@ class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
         # format prompts using the chat template
         #formatted_prompts = self.format_prommpts(prompts)
         # set max new tokens to be generated
-        max_new_tokens = 80
+        max_new_tokens = 100
 
-        extra_prompt = "Do not give explanation only Answer. \nOutput:\n"
+        #extra_prompt = "Do not give explanation only Answer. \nOutput:\n"
+        extra_prompt = ""
         if is_multiple_choice:
             max_new_tokens = 1  # For MCQ tasks, we only need to generate 1 token
             extra_prompt = ""
@@ -122,9 +123,9 @@ class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
                 formatted_prompts,
                 vllm.SamplingParams(
                     n=1,  # Number of output sequences to return for each prompt.
-                    top_p=1,  # Increased significantly from 0.85
-                    temperature=0.05,  # Reduced significantly from 0.35
-                    top_k=60,  # Increased from 40
+                    top_p=0.95,  # Increased significantly from 0.85
+                    temperature=0.2,  # Reduced significantly from 0.35
+                    top_k=50,  # Increased from 40
                     seed=AICROWD_RUN_SEED,  # Seed for reproducibility
                     skip_special_tokens=True,  # Whether to skip special tokens in the output.
                     max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
@@ -137,8 +138,9 @@ class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
                 formatted_prompts,
                 vllm.SamplingParams(
                     n=1,  # Number of output sequences to return for each prompt.
-                    top_p=0.95,  # Float that controls the cumulative probability of the top tokens to consider.
-                    temperature = 0,
+                    top_p=0.786481930959028,  # Float that controls the cumulative probability of the top tokens to consider.
+                    top_k=67,  # top_k must be an integer; truncated from the tuned value 67.05724315032114.
+                    temperature=0.7170398410763993,
                     seed=AICROWD_RUN_SEED,  # Seed for reproducibility
                     skip_special_tokens=True,  # Whether to skip special tokens in the output.
                     max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
@@ -169,4 +171,4 @@ class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
         """
 
 
-        return formatted_prompts
+        return formatted_prompts
\ No newline at end of file
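The sampling values above come straight out of a hyper-parameter search, which is why `top_k` was originally emitted as a float even though `vllm.SamplingParams` expects an integer. One option is to sanitize searched values in a single place before building the params; a minimal sketch, where the `TUNED` dict and the `build_sampling_params` helper are illustrative names rather than existing code:

import vllm

# Searched values, kept exactly as the tuner produced them.
TUNED = {
    "top_p": 0.786481930959028,
    "top_k": 67.05724315032114,
    "temperature": 0.7170398410763993,
}

def build_sampling_params(seed: int, max_tokens: int) -> vllm.SamplingParams:
    # Coerce the searched values into the types and ranges SamplingParams expects.
    return vllm.SamplingParams(
        n=1,
        top_p=min(max(TUNED["top_p"], 1e-3), 1.0),
        top_k=int(TUNED["top_k"]),  # truncate, matching the integer values used in the diff
        temperature=TUNED["temperature"],
        seed=seed,
        skip_special_tokens=True,
        max_tokens=max_tokens,
    )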
diff --git a/models/vanilla_llama3_baseline_prueba.py b/models/vanilla_llama3_baseline_prueba.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4547224b88570d06b73f54623600d5db8476a49
--- /dev/null
+++ b/models/vanilla_llama3_baseline_prueba.py
@@ -0,0 +1,172 @@
+import os
+import random
+from typing import Any, Dict, List
+
+import vllm
+
+from .base_model import ShopBenchBaseModel
+
+#### CONFIG PARAMETERS ---
+
+# Set a consistent seed for reproducibility
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 773815))
+
+# Batch size the evaluator will use when calling the `batch_predict` function
+AICROWD_SUBMISSION_BATCH_SIZE = 16  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 1  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+
+class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
+    """
+    A ShopBench model that serves an AWQ-quantized Qwen checkpoint with vLLM, illustrating how to handle
+    both multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
+    This model uses a consistent random seed for reproducible results.
+    """
+
+    def __init__(self):
+        """Initializes the model and sets the random seed for consistency."""
+        random.seed(AICROWD_RUN_SEED)
+        self.initialize_models()
+
+    def initialize_models(self):
+        # Initialize the AWQ-quantized Qwen model checked into the repository
+        self.model_name = "models/full_model_qwen_pad-awq"
+
+        if not os.path.exists(self.model_name):
+            raise Exception(
+                f"""
+            The evaluators expect the model weights to be checked into the repository,
+            but we could not find the model weights at {self.model_name}
+
+            Please follow the instructions in the docs below to download and check in the model weights.
+            https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
+            """
+            )
+
+        # Initialize the model with vLLM
+        self.llm = vllm.LLM(
+            self.model_name,
+            worker_use_ray=True,
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
+            trust_remote_code=True,
+            dtype="half",  # note: bfloat16 is not supported on NVIDIA T4 GPUs
+            enforce_eager=True,
+            max_model_len=4096,
+        )
+        self.tokenizer = self.llm.get_tokenizer()
+
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
+        """
+        Generates a batch of predictions based on the associated prompts and task type.
+
+        Prompts are prefixed with a shopping-assistant system prompt and answered by the vLLM
+        engine, using sampling parameters that depend on the task type.
+
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys:
+                - prompt (List[str]): a list of input prompts for the model.
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
+
+        Returns:
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                    a string representing a single integer in [0, 3] for multiple choice tasks,
+                    or a string representing a comma-separated list of integers for Ranking and Retrieval tasks,
+                    or a string representing a comma-separated list of named entities for Named Entity Recognition tasks,
+                    or a string representing the (unconstrained) generated response for the generation tasks.
+                Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
+        """
+        prompts = batch["prompt"]
+
+        # format prompts using the chat template
+        # formatted_prompts = self.format_prommpts(prompts)
+        # set max new tokens to be generated
+        max_new_tokens = 80
+
+        extra_prompt = "Do not give explanation only Answer. \nOutput:\n"
+        if is_multiple_choice:
+            max_new_tokens = 1  # For MCQ tasks, we only need to generate 1 token
+            extra_prompt = ""
+
+        system_prompt = "You are a helpful online shopping assistant. Please answer the following question about online shopping and follow the given instructions.\n\n"
+        formatted_prompts = []
+        for prompt in prompts:
+            formatted_prompts.append(system_prompt + prompt + extra_prompt)
+
+        if is_multiple_choice:
+            # Generate responses via vLLM
+            responses = self.llm.generate(
+                formatted_prompts,
+                vllm.SamplingParams(
+                    n=1,  # Number of output sequences to return for each prompt.
+                    top_p=0.95,  # Increased significantly from 0.85
+                    temperature=0.2,  # Reduced significantly from 0.35
+                    top_k=50,  # Increased from 40
+                    seed=AICROWD_RUN_SEED,  # Seed for reproducibility
+                    skip_special_tokens=True,  # Whether to skip special tokens in the output.
+                    max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
+                ),
+                use_tqdm=False,
+            )
+        else:
+            # Generate responses via vLLM
+            responses = self.llm.generate(
+                formatted_prompts,
+                vllm.SamplingParams(
+                    n=1,  # Number of output sequences to return for each prompt.
+                    top_p=0.95,  # Float that controls the cumulative probability of the top tokens to consider.
+                    temperature=0,
+                    seed=AICROWD_RUN_SEED,  # Seed for reproducibility
+                    skip_special_tokens=True,  # Whether to skip special tokens in the output.
+                    max_tokens=max_new_tokens,  # Maximum number of tokens to generate per output sequence.
+                ),
+                use_tqdm=False,
+            )
+
+        # Aggregate answers into List[str]
+        batch_response = []
+        for response in responses:
+            batch_response.append(response.outputs[0].text.replace("'", ""))
+
+        if is_multiple_choice:
+            print("MCQ: ", batch_response)
+        else:
+            print("NO MCQ: ", batch_response)
+
+        return batch_response
+
+    def format_prommpts(self, prompts):
+        """
+        Formats prompts using the chat_template of the model.
+
+        Parameters:
+            - prompts (list of str): A list of queries to be formatted into prompts.
+        """
+        # Minimal implementation (unused by batch_predict, which builds plain-text prompts):
+        # wrap each query as a user message and apply the tokenizer's chat template.
+        formatted_prompts = [
+            self.tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            for prompt in prompts
+        ]
+        return formatted_prompts
\ No newline at end of file
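The three model files differ only in `max_new_tokens`, `extra_prompt`, and the sampling parameters; as the note in local_evaluation.py above points out, the evaluator always imports `UserModel` from `models.user_config`, so switching between them is a one-line change there. A sketch of what that file would look like (its actual contents are not shown in this diff); note that the hyphen in vanilla_llama3_baseline-Copy2.py means that module cannot be imported with a plain import statement:

# models/user_config.py (sketch)
# Point the evaluator at the class it should run; only one model is active per submission.
from models.vanilla_llama3_baseline import Llama3_8B_ZeroShotModel

UserModel = Llama3_8B_ZeroShotModel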