diff --git a/.dockerignore b/.dockerignore
index f3195f94703b5f9f1f264d4fb2d41c0460fe8257..caee08f0f2965fc28b5213503ff580372f5d5dae 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,3 @@
-models/**
\ No newline at end of file
+.git/
+models/**
+data/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 8615e3cba23873dc72d75cbd01b35a3cdac2b390..a504b0abfa1dfe8fe8ddf04fa62574171ed754f8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
 
 ENV DEBIAN_FRONTEND=noninteractive \
     LANG=en_US.UTF-8 \
diff --git a/docs/download-baseline-model-weights.md b/docs/download-baseline-model-weights.md
new file mode 100644
index 0000000000000000000000000000000000000000..f30882559542c9af1b3f7b13ccd89e62676b18a4
--- /dev/null
+++ b/docs/download-baseline-model-weights.md
@@ -0,0 +1,70 @@
+### Setting Up and Downloading Baseline Model Weights with Hugging Face
+
+This guide outlines the steps to download (and check in) the model weights required for the baseline models.
+We will focus on `Meta-Llama-3-8B-Instruct`,
+but the steps should work equally well for any other model on Hugging Face.
+
+#### Preliminary Steps:
+
+1. **Install the Hugging Face Hub Package**:
+
+    Begin by installing the `huggingface_hub` package, which includes the `hf_transfer` utility, by running the following command in your terminal:
+
+    ```bash
+    pip install huggingface_hub[hf_transfer]
+    ```
+
+2. **Accept the Llama 3 Terms**:
+
+    You must accept the Llama 3 model's terms of use by visiting: [meta-llama/Meta-Llama-3-8B-Instruct Terms](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct).
+
+3. **Create a Hugging Face CLI Token**:
+
+    Generate a CLI token by navigating to: [Hugging Face Token Settings](https://huggingface.co/settings/tokens). You will need this token for authentication.
+
+#### Hugging Face Authentication:
+
+1. **Login via CLI**:
+
+    Authenticate yourself with the Hugging Face CLI using the token created in the previous step. Run:
+
+    ```bash
+    huggingface-cli login
+    ```
+
+    When prompted, enter the token.
+
+#### Model Downloads:
+
+1. **Download the Meta-Llama-3-8B-Instruct Model**:
+
+    Execute the following command to download the `Meta-Llama-3-8B-Instruct` model to a local subdirectory. This command excludes unnecessary files to save space:
+
+    ```bash
+    HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download \
+        meta-llama/Meta-Llama-3-8B-Instruct \
+        --local-dir-use-symlinks False \
+        --local-dir models/meta-llama/Meta-Llama-3-8B-Instruct \
+        --exclude "*.pth" # These are alternates to the safetensors, hence not needed
+    ```
+
+#### Version Control with Git LFS:
+
+1. **Track Model Weights**:
+
+    Use Git Large File Storage (LFS) to track the model directories. This ensures efficient handling of large files:
+
+    ```bash
+    git lfs track "models/meta-llama/*"
+    ```
+
+2. **Commit and Push**:
+
+    Add the models to your Git repository, commit the changes, and push them to your remote repository:
+
+    ```bash
+    git add models/
+    git commit -am "add weights"
+    git push origin master
+    ```
+
+If you are struggling with Git LFS, you are very much encouraged to check out [this post](https://discourse.aicrowd.com/t/how-to-upload-large-files-size-to-your-submission/2304).
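+
+#### Optional: Downloading with Python
+
+If you prefer to script the download, a minimal Python sketch along the following lines (assuming `huggingface_hub` is installed and you have already logged in via `huggingface-cli login`) mirrors the CLI command in the Model Downloads section above:
+
+```python
+import os
+
+# Enable the faster hf_transfer backend before huggingface_hub is imported
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+from huggingface_hub import snapshot_download
+
+snapshot_download(
+    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+    local_dir="models/meta-llama/Meta-Llama-3-8B-Instruct",
+    ignore_patterns=["*.pth"],  # the .pth files are alternates to the safetensors
+)
+```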
diff --git a/local_evaluation.py b/local_evaluation.py
index c4cf6ba6b1dd0e33f0f6076bb31b04a67ff47874..aa1c8e2ba84ac0dfe44fb56a12c1a5e3e3a603ed 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -52,20 +52,36 @@ def generate_model_outputs(data_df, model):
     - A list containing the model outputs for each entry in the data DataFrame.
     """
     outputs = []
-    for _, row in tqdm(
-        data_df.iterrows(), total=len(data_df), desc="Generating Responses"
-    ):
-        is_multiple_choice = row["task_type"] == "multiple-choice"
-        # the 'task_type' column won't be available during evaluation, so you should use something like
-        # ```is_multiple_choice = row['is_multiple_choice']``
-        prompt = row["input_field"]
-        model_output = model.predict(prompt, is_multiple_choice)
-        outputs.append(model_output)
-    return outputs
+    task_grouped_df = data_df.groupby(by=["task_type"])
+
+    for task_type, task_group_data_df in task_grouped_df:
+        task_group_data_df = task_group_data_df.reset_index(drop=True)
+
+        is_multiple_choice = task_type[0] == "multiple-choice"
+        batch_size = model.get_batch_size()
+
+        batches = [task_group_data_df[i:i + batch_size] for i in range(0, len(task_group_data_df), batch_size)]
+
+        for batch_df in batches:
+            batch = {
+                "prompt": batch_df["input_field"].tolist(),
+            }
+            model_output = model.batch_predict(
+                batch,
+                is_multiple_choice
+            )
+            outputs.append(
+                pd.DataFrame({
+                    "input_field": batch["prompt"],
+                    "model_output_str": model_output
+                }))
+
+    df_outputs = pd.concat(outputs)
+    return df_outputs
 
 
 # Function to evaluate the generated model outputs
-def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
+def evaluate_outputs(data_df, log_every_n_steps=1):
     """
     Evaluate the model outputs against ground truth values using specified metrics.
 
@@ -84,17 +100,18 @@ def evaluate_outputs(data_df, log_every_n_steps=1):
     for row_idx, row in tqdm(
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
-        task_name, task_type, metric, ground_truth = (
+        task_name, task_type, metric, ground_truth, model_output_str = (
             row["task_name"],
             row["task_type"],
            row["metric"],
            row["output_field"],
+            row["model_output_str"],
         )
 
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        model_output = task_parsers[task_type].parse(outputs[row_idx])
+        model_output = task_parsers[task_type].parse(model_output_str)
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, ground_truth)
 
@@ -230,14 +247,15 @@ def main():
     model = UserModel()
 
     # Generate model outputs
-    outputs = generate_model_outputs(data_df, model)
-    data_df["outputs"] = (
-        outputs  # Optional: Add outputs back to DataFrame for inspection
-    )
-    print(data_df.head())
+    df_outputs = generate_model_outputs(data_df, model)
+
+    # add outputs to the data_df
+    merged_data_df = pd.merge(data_df, df_outputs, on="input_field")
+
+    print(merged_data_df.head())
 
     # Evaluate the generated outputs and calculate metrics
-    per_task_metrics = evaluate_outputs(data_df, outputs)
+    per_task_metrics = evaluate_outputs(merged_data_df)
 
     # Aggregate and display the evaluation scores
     overall_metrics = aggregate_scores(per_task_metrics)
diff --git a/metrics.py b/metrics.py
index 9d6d3d61fe1e7a10b311252ec48f00a774330ad5..f66703508f9016ebacaee07efb8d76e8a71cb7c3 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,11 +1,12 @@
-from rouge_score import rouge_scorer
-from sentence_transformers import SentenceTransformer
-import numpy as np
-import evaluate
+import os
+from typing import List, Tuple, Union
+
+import evaluate
+import numpy as np
 import torch
-
-from typing import List, Union, Tuple
+from loguru import logger
+from rouge_score import rouge_scorer
+from sentence_transformers import SentenceTransformer
 
 sacrebleu = None
 sentence_transformer_model_cache = {}
diff --git a/models/README.md b/models/README.md
index 486e493d5dfa18dc55289f99f438f53ce64c57f9..e9ff33e6724b18322bb98dc16ebf75cb6003d636 100644
--- a/models/README.md
+++ b/models/README.md
@@ -4,7 +4,7 @@
 For a streamlined experience, we suggest placing the code for all your models within the `models` directory. This is a recommendation for organizational purposes, but it's not a strict requirement.
 
 ## Model Base Class
-Your models should inherit from the `ShopBenchBaseModel` class found in [base_model.py](base_model.py). We provide an example model, `dummy_model.py`, to illustrate how you might structure your own model. Crucially, your model class must implement the `predict` method.
+Your models should inherit from the `ShopBenchBaseModel` class found in [base_model.py](base_model.py). We provide an example model, `dummy_model.py`, to illustrate how you might structure your own model. Crucially, your model class must implement the `batch_predict` method.
 
 ## Configuring Your Model
 To ensure your model is recognized and utilized correctly, please specify your model class name in the [`user_config.py`](user_config.py) file, by following the instructions in the inline comments.
@@ -12,12 +12,14 @@ To ensure your model is recognized and utilized correctly, please specify your m
 ## Model Inputs and Outputs
 
 ### Inputs
-Your model will receive two pieces of information for every task:
-- `prompt` (`str`): This is the specific task's input prompt.
+- `batch` (`Dict[str, Any]`): A batch of inputs as a dictionary, where the dictionary has the following key:
+  - `prompt` (`List[str]`): A list of prompts representing the tasks in a batch.
 - `is_multiple_choice` (`bool`): This indicates whether the task is a multiple choice question.
 
 ### Outputs
-The output from your model's `predict` function should always be a string. Depending on the task, this could be:
+
+The output from your model's `batch_predict` function should be a list of string responses for all the prompts in the input batch.
+Depending on the task, each response could be:
 - A single integer (in the range [0, 3]) for multiple choice tasks.
 - A comma-separated list of integers for ranking tasks.
 - A comma-separated list of named entities for Named Entity Recognition (NER) tasks.
diff --git a/models/base_model.py b/models/base_model.py
index dc41a235da95d5124def29bd923cccfc43d27730..b93cb226f690b9c38ce8b618c11806411f20f1bc 100644
--- a/models/base_model.py
+++ b/models/base_model.py
@@ -1,21 +1,39 @@
+from typing import Any, Dict, List
+
+
 class ShopBenchBaseModel:
     def __init__(self):
         pass
 
-    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        raise NotImplementedError("get_batch_size method not implemented")
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
         """
-        Generates a prediction based on the input prompt and task type.
+        Generates a batch of predictions based on the associated prompts and task type.
 
         For multiple choice tasks, it randomly selects a choice.
         For other tasks, it returns a list of integers as a string,
         representing the model's prediction in a format compatible with task-specific parsers.
 
-        Args:
-            prompt (str): The input prompt for the model.
-            is_multiple_choice (bool): Indicates whether the task is a multiple choice question.
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
 
         Returns:
-            str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
             or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
             or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
             or a string representing the (unconstrained) generated response for the generation tasks
             Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
diff --git a/models/dummy_model.py b/models/dummy_model.py
index 5126746b8968f7ba6a047c76766772c6de08b2ec..46dcb8373f924d0373b170900d5afaa0350c6590 100644
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
@@ -1,6 +1,6 @@
-from typing import List, Union
-import random
 import os
+import random
+from typing import Any, Dict, List
 
 from .base_model import ShopBenchBaseModel
 
@@ -19,34 +19,55 @@ class DummyModel(ShopBenchBaseModel):
         """Initializes the model and sets the random seed for consistency."""
         random.seed(AICROWD_RUN_SEED)
 
-    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = 4
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
         """
-        Generates a prediction based on the input prompt and task type.
+        Generates a batch of predictions based on the associated prompts and task type.
 
         For multiple choice tasks, it randomly selects a choice.
         For other tasks, it returns a list of integers as a string,
         representing the model's prediction in a format compatible with task-specific parsers.
 
-        Args:
-            prompt (str): The input prompt for the model.
-            is_multiple_choice (bool): Indicates whether the task is a multiple choice question.
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
 
         Returns:
-            str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
             or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
             or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
             or a string representing the (unconstrained) generated response for the generation tasks
             Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
         """
+        prompts = batch["prompt"]
+
         possible_responses = [1, 2, 3, 4]
 
-        if is_multiple_choice:
-            # Randomly select one of the possible responses for multiple choice tasks
-            return str(random.choice(possible_responses))
-        else:
-            # For other tasks, shuffle the possible responses and return as a string
-            random.shuffle(possible_responses)
-            return str(possible_responses)
-            # Note: As this is a dummy model, we are returning random responses for non-multiple choice tasks.
-            # For generation tasks, this should ideally return an unconstrained string.
+        batch_response = []
+        for prompt in prompts:
+            if is_multiple_choice:
+                # Randomly select one of the possible responses for multiple choice tasks
+                batch_response.append(str(random.choice(possible_responses)))
+            else:
+                # For other tasks, shuffle the possible responses and return as a string
+                random.shuffle(possible_responses)
+                batch_response.append(str(possible_responses))
+                # Note: As this is a dummy model, we are returning random responses for non-multiple choice tasks.
+                # For generation tasks, this should ideally return an unconstrained string.
+        return batch_response
diff --git a/models/user_config.py b/models/user_config.py
index 3380d478901d3b217a9463587ebf22feee37d526..b1bca5a24e2736a7eb52ad7b449c86c68a8851b0 100644
--- a/models/user_config.py
+++ b/models/user_config.py
@@ -19,3 +19,10 @@ UserModel = DummyModel
 #
 # UserModel = YourModel
+
+# For example, to use the Llama3 8B Instruct baseline, you can uncomment the lines below.
+# Please remember to download the model weights and check them into the repository
+# before submitting.
+
+# from models.vanilla_llama3_baseline import Llama3_8B_ZeroShotModel
+# UserModel = Llama3_8B_ZeroShotModel
diff --git a/models/vanilla_llama3_baseline.py b/models/vanilla_llama3_baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8134b430fd7f534ff066e6158e838de42c266c2
--- /dev/null
+++ b/models/vanilla_llama3_baseline.py
@@ -0,0 +1,170 @@
+import os
+import random
+from typing import Any, Dict, List
+
+import vllm
+from outlines.integrations.vllm import RegexLogitsProcessor
+
+from .base_model import ShopBenchBaseModel
+
+#### CONFIG PARAMETERS ---
+
+# Set a consistent seed for reproducibility
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 773815))
+
+# Batch size you wish the evaluators to use when calling the `batch_predict` function
+AICROWD_SUBMISSION_BATCH_SIZE = 16  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+
+class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
+    """
+    A zero-shot Llama 3 8B Instruct baseline for ShopBench, illustrating how to handle both
+    multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
+    This model uses a consistent random seed for reproducible results.
+    """
+
+    def __init__(self):
+        """Initializes the model and sets the random seed for consistency."""
+        random.seed(AICROWD_RUN_SEED)
+        self.initialize_models()
+
+    def initialize_models(self):
+        # Initialize Meta Llama 3 - 8B Instruct Model
+        self.model_name = "models/meta-llama/Meta-Llama-3-8B-Instruct"
+
+        if not os.path.exists(self.model_name):
+            raise Exception(
+                f"""
+            The evaluators expect the model weights to be checked into the repository,
+            but we could not find the model weights at {self.model_name}
+
+            Please follow the instructions in the docs below to download and check in the model weights.
+            https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
+
+            """
+            )
+
+        # initialize the model with vllm
+        self.llm = vllm.LLM(
+            self.model_name,
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
+            trust_remote_code=True,
+            dtype="half",  # note: bfloat16 is not supported on nvidia-T4 GPUs
+            enforce_eager=True
+        )
+        self.tokenizer = self.llm.get_tokenizer()
+
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
+        """
+        Generates a batch of predictions based on the associated prompts and task type.
+
+        For multiple choice tasks, generation is constrained to integer outputs via a regex logits processor.
+        For other tasks, it returns the generated text,
+        representing the model's prediction in a format compatible with task-specific parsers.
+
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
+
+        Returns:
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
+                or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
+                or a string representing a comma separated list of named entities for Named Entity Recognition tasks,
+                or a string representing the (unconstrained) generated response for the generation tasks.
+                Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
+ """ + prompts = batch["prompt"] + + # format prompts using the chat template + formatted_prompts = self.format_prommpts(prompts) + # set max new tokens to be generated + max_new_tokens = 100 + + # Setup logits processor + logits_processors = [] + if is_multiple_choice: + logits_processors = [ + RegexLogitsProcessor( + regex_string="\d+", # constrain generation to only integers + llm=self.llm) + ] + max_new_tokens = 1 # For MCQ tasks, we only need to generate 1 token + + + # Generate responses via vllm + responses = self.llm.generate( + formatted_prompts, + vllm.SamplingParams( + n=1, # Number of output sequences to return for each prompt. + top_p=0.9, # Float that controls the cumulative probability of the top tokens to consider. + temperature=0, # randomness of the sampling + seed=AICROWD_RUN_SEED, # Seed for reprodicibility + skip_special_tokens=True, # Whether to skip special tokens in the output. + max_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence. + # Note: We are using 50 max new tokens instead of 75, + # because the 75 max token limit is checked using the Llama2 tokenizer. + # The Llama3 model instead uses a differet tokenizer with a larger vocabulary + # This allows it to represent the same content more efficiently, using fewer tokens. + logits_processors=logits_processors, # Use logits processors to do constrained/guided generation in case of MCQ tasks + ), + use_tqdm = False + ) + # Aggregate answers into List[str] + batch_response = [] + for response in responses: + batch_response.append(response.outputs[0].text) + + if is_multiple_choice: + print("MCQ: ", batch_response) + + return batch_response + + def format_prommpts(self, prompts): + """ + Formats prompts using the chat_template of the model. + + Parameters: + - queries (list of str): A list of queries to be formatted into prompts. + + """ + system_prompt = "You are a helpful online shopping assistant. Please answer the following question about online shopping and follow the given instructions." + formatted_prompts = [] + + for _idx, prompt in enumerate(prompts): + user_message = "" + + formatted_prompts.append( + self.tokenizer.apply_chat_template( + [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ], + tokenize=False, + add_generation_prompt=True, + ) + ) + + return formatted_prompts diff --git a/requirements.txt b/requirements.txt index 12c6d5d5eac2aa9e97516ec03233adc7e98b9801..fc1f772a04265396b3337070ac4ebe7ea7b5cdca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,4 @@ torch +vllm>=0.4.2 +outlines>=0.0.41 +loguru