diff --git a/.dockerignore b/.dockerignore
index f3195f94703b5f9f1f264d4fb2d41c0460fe8257..caee08f0f2965fc28b5213503ff580372f5d5dae 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,3 @@
-models/**
\ No newline at end of file
+.git/
+models/**
+data/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 8615e3cba23873dc72d75cbd01b35a3cdac2b390..a504b0abfa1dfe8fe8ddf04fa62574171ed754f8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
 
 ENV DEBIAN_FRONTEND=noninteractive \
     LANG=en_US.UTF-8 \
diff --git a/docs/download-baseline-model-weights.md b/docs/download-baseline-model-weights.md
new file mode 100644
index 0000000000000000000000000000000000000000..f30882559542c9af1b3f7b13ccd89e62676b18a4
--- /dev/null
+++ b/docs/download-baseline-model-weights.md
@@ -0,0 +1,70 @@
+### Setting Up and Downloading Baseline Model Weights with Hugging Face
+
+This guide outlines the steps to download (and check in) the model weights required for the baseline models.
+We will focus on `Meta-Llama-3-8B-Instruct`,
+but the steps should work equally well for any other model on Hugging Face.
+
+#### Preliminary Steps:
+
+1. **Install the Hugging Face Hub Package**:
+
+    Begin by installing the `huggingface_hub` package, which includes the `hf_transfer` utility, by running the following command in your terminal:
+
+    ```bash
+    pip install huggingface_hub[hf_transfer]
+    ```
+
+2. **Accept the Llama 3 Terms**:
+
+    You must accept the Llama 3 model's terms of use by visiting: [meta-llama/Meta-Llama-3-8B-Instruct Terms](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct).
+
+3. **Create a Hugging Face CLI Token**:
+
+    Generate a CLI token by navigating to: [Hugging Face Token Settings](https://huggingface.co/settings/tokens). You will need this token for authentication.
+
+#### Hugging Face Authentication:
+
+1. **Login via CLI**:
+
+    Authenticate yourself with the Hugging Face CLI using the token created in the previous step. Run:
+
+    ```bash
+    huggingface-cli login
+    ```
+
+    When prompted, enter the token.
+
+#### Model Downloads:
+
+1. **Download the Meta-Llama-3-8B-Instruct Model**:
+
+    Execute the following command to download the `Meta-Llama-3-8B-Instruct` model to a local subdirectory. This command excludes unnecessary files to save space:
+
+    ```bash
+    HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download \
+        meta-llama/Meta-Llama-3-8B-Instruct \
+        --local-dir-use-symlinks False \
+        --local-dir models/meta-llama/Meta-Llama-3-8B-Instruct \
+        --exclude "*.pth" # These are alternates to the safetensors, hence not needed
+    ```
+
+#### Version Control with Git LFS:
+
+1. **Track Model Weights**:
+
+    Use Git Large File Storage (LFS) to track the model directories. This ensures efficient handling of large files:
+
+    ```bash
+    git lfs track "models/meta-llama/*"
+    ```
+
+2. **Commit and Push**:
+
+    Add the models to your Git repository, commit the changes, and push them to your remote repository:
+
+    ```bash
+    git add models/
+    git commit -am "add weights"
+    git push origin master
+    ```
+
+If you are struggling with Git LFS, you are very much encouraged to check out [this post](https://discourse.aicrowd.com/t/how-to-upload-large-files-size-to-your-submission/2304).
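+
+#### Optional: Downloading with Python
+
+If you prefer to script the download, a minimal Python sketch along the following lines (assuming `huggingface_hub` is installed and you have already logged in via `huggingface-cli login`) mirrors the CLI command in the Model Downloads section above:
+
+```python
+import os
+
+# Enable the faster hf_transfer backend before huggingface_hub is imported
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+from huggingface_hub import snapshot_download
+
+snapshot_download(
+    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+    local_dir="models/meta-llama/Meta-Llama-3-8B-Instruct",
+    ignore_patterns=["*.pth"],  # the .pth files are alternates to the safetensors
+)
+```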
diff --git a/local_evaluation.py b/local_evaluation.py
index c4cf6ba6b1dd0e33f0f6076bb31b04a67ff47874..aa1c8e2ba84ac0dfe44fb56a12c1a5e3e3a603ed 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -52,20 +52,36 @@ def generate_model_outputs(data_df, model):
     - A list containing the model outputs for each entry in the data DataFrame.
     """
     outputs = []
-    for _, row in tqdm(
-        data_df.iterrows(), total=len(data_df), desc="Generating Responses"
-    ):
-        is_multiple_choice = row["task_type"] == "multiple-choice"
-        # the 'task_type' column won't be available during evaluation, so you should use something like
-        # ```is_multiple_choice = row['is_multiple_choice']``
-        prompt = row["input_field"]
-        model_output = model.predict(prompt, is_multiple_choice)
-        outputs.append(model_output)
-    return outputs
+    task_grouped_df = data_df.groupby(by=["task_type"])
+
+    for task_type, task_group_data_df in task_grouped_df:
+        task_group_data_df = task_group_data_df.reset_index(drop=True)
+
+        is_multiple_choice = task_type[0] == "multiple-choice"
+        batch_size = model.get_batch_size()
+
+        batches = [task_group_data_df[i:i + batch_size] for i in range(0, len(task_group_data_df), batch_size)]
+
+        for batch_df in batches:
+            batch = {
+                "prompt": batch_df["input_field"].tolist(),
+            }
+            model_output = model.batch_predict(
+                batch,
+                is_multiple_choice
+            )
+            outputs.append(
+                pd.DataFrame({
+                    "input_field": batch["prompt"],
+                    "model_output_str": model_output
+                }))
+
+    df_outputs = pd.concat(outputs)
+    return df_outputs
 
 
 # Function to evaluate the generated model outputs
-def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
+def evaluate_outputs(data_df, log_every_n_steps=1):
     """
     Evaluate the model outputs against ground truth values using specified metrics.
 
@@ -84,17 +100,18 @@ def evaluate_outputs(data_df, log_every_n_steps=1):
     for row_idx, row in tqdm(
         data_df.iterrows(), total=len(data_df), desc="Evaluating"
     ):
-        task_name, task_type, metric, ground_truth = (
+        task_name, task_type, metric, ground_truth, model_output_str = (
             row["task_name"],
             row["task_type"],
            row["metric"],
            row["output_field"],
+            row["model_output_str"],
         )
 
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        model_output = task_parsers[task_type].parse(outputs[row_idx])
+        model_output = task_parsers[task_type].parse(model_output_str)
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, ground_truth)
 
@@ -230,14 +247,15 @@ def main():
     model = UserModel()
 
     # Generate model outputs
-    outputs = generate_model_outputs(data_df, model)
-    data_df["outputs"] = (
-        outputs  # Optional: Add outputs back to DataFrame for inspection
-    )
-    print(data_df.head())
+    df_outputs = generate_model_outputs(data_df, model)
+
+    # add outputs to the data_df
+    merged_data_df = pd.merge(data_df, df_outputs, on="input_field")
+
+    print(merged_data_df.head())
 
     # Evaluate the generated outputs and calculate metrics
-    per_task_metrics = evaluate_outputs(data_df, outputs)
+    per_task_metrics = evaluate_outputs(merged_data_df)
 
     # Aggregate and display the evaluation scores
     overall_metrics = aggregate_scores(per_task_metrics)
diff --git a/metrics.py b/metrics.py
index 9d6d3d61fe1e7a10b311252ec48f00a774330ad5..f66703508f9016ebacaee07efb8d76e8a71cb7c3 100644
--- a/metrics.py
+++ b/metrics.py
@@ -1,11 +1,12 @@
-from rouge_score import rouge_scorer
-from sentence_transformers import SentenceTransformer
-import numpy as np
-import evaluate
+import os
+from typing import List, Tuple, Union
+
+import evaluate
+import numpy as np
 import torch
-
-from typing import List, Union, Tuple
+from loguru import logger
+from rouge_score import rouge_scorer
+from sentence_transformers import SentenceTransformer
 
 sacrebleu = None
 sentence_transformer_model_cache = {}
diff --git a/models/README.md b/models/README.md
index 486e493d5dfa18dc55289f99f438f53ce64c57f9..e9ff33e6724b18322bb98dc16ebf75cb6003d636 100644
--- a/models/README.md
+++ b/models/README.md
@@ -4,7 +4,7 @@
 For a streamlined experience, we suggest placing the code for all your models within the `models` directory. This is a recommendation for organizational purposes, but it's not a strict requirement.
 
 ## Model Base Class
-Your models should inherit from the `ShopBenchBaseModel` class found in [base_model.py](base_model.py). We provide an example model, `dummy_model.py`, to illustrate how you might structure your own model. Crucially, your model class must implement the `predict` method.
+Your models should inherit from the `ShopBenchBaseModel` class found in [base_model.py](base_model.py). We provide an example model, `dummy_model.py`, to illustrate how you might structure your own model. Crucially, your model class must implement the `batch_predict` method.
 
 ## Configuring Your Model
 To ensure your model is recognized and utilized correctly, please specify your model class name in the [`user_config.py`](user_config.py) file, by following the instructions in the inline comments.
@@ -12,12 +12,14 @@ To ensure your model is recognized and utilized correctly, please specify your m
 ## Model Inputs and Outputs
 
 ### Inputs
-Your model will receive two pieces of information for every task:
-- `prompt` (`str`): This is the specific task's input prompt.
+- `batch` (`Dict[str, Any]`): A batch of inputs as a dictionary, where the dictionary has the following key:
+  - `prompt` (`List[str]`): A list of prompts representing the tasks in a batch.
 - `is_multiple_choice` (`bool`): This indicates whether the task is a multiple choice question.
 
 ### Outputs
-The output from your model's `predict` function should always be a string. Depending on the task, this could be:
+
+The output from your model's `batch_predict` function should be a list of string responses for all the prompts in the input batch.
+Depending on the task, each response could be:
 - A single integer (in the range [0, 3]) for multiple choice tasks.
 - A comma-separated list of integers for ranking tasks.
 - A comma-separated list of named entities for Named Entity Recognition (NER) tasks.
diff --git a/models/base_model.py b/models/base_model.py
index dc41a235da95d5124def29bd923cccfc43d27730..b93cb226f690b9c38ce8b618c11806411f20f1bc 100644
--- a/models/base_model.py
+++ b/models/base_model.py
@@ -1,21 +1,39 @@
+from typing import Any, Dict, List
+
+
 class ShopBenchBaseModel:
     def __init__(self):
         pass
 
-    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        raise NotImplementedError("get_batch_size method not implemented")
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
         """
-        Generates a prediction based on the input prompt and task type.
+        Generates a batch of predictions based on the associated prompts and task type.
 
         For multiple choice tasks, it randomly selects a choice.
         For other tasks, it returns a list of integers as a string,
         representing the model's prediction in a format compatible with task-specific parsers.
 
-        Args:
-            prompt (str): The input prompt for the model.
-            is_multiple_choice (bool): Indicates whether the task is a multiple choice question.
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
 
         Returns:
-            str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
             or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
             or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
             or a string representing the (unconstrained) generated response for the generation tasks
             Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
diff --git a/models/dummy_model.py b/models/dummy_model.py
index 5126746b8968f7ba6a047c76766772c6de08b2ec..46dcb8373f924d0373b170900d5afaa0350c6590 100644
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
@@ -1,6 +1,6 @@
-from typing import List, Union
-import random
 import os
+import random
+from typing import Any, Dict, List
 
 from .base_model import ShopBenchBaseModel
 
@@ -19,34 +19,55 @@ class DummyModel(ShopBenchBaseModel):
         """Initializes the model and sets the random seed for consistency."""
         random.seed(AICROWD_RUN_SEED)
 
-    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = 4
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
         """
-        Generates a prediction based on the input prompt and task type.
+        Generates a batch of predictions based on the associated prompts and task type.
 
         For multiple choice tasks, it randomly selects a choice.
         For other tasks, it returns a list of integers as a string,
         representing the model's prediction in a format compatible with task-specific parsers.
 
-        Args:
-            prompt (str): The input prompt for the model.
-            is_multiple_choice (bool): Indicates whether the task is a multiple choice question.
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
 
         Returns:
-            str: The prediction as a string representing a single integer[0, 3] for multiple choice tasks,
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
             or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
             or a string representing a comma separated list of named entities for Named Entity Recognition tasks.
             or a string representing the (unconstrained) generated response for the generation tasks
             Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
         """
+        prompts = batch["prompt"]
+
         possible_responses = [1, 2, 3, 4]
 
-        if is_multiple_choice:
-            # Randomly select one of the possible responses for multiple choice tasks
-            return str(random.choice(possible_responses))
-        else:
-            # For other tasks, shuffle the possible responses and return as a string
-            random.shuffle(possible_responses)
-            return str(possible_responses)
-            # Note: As this is a dummy model, we are returning random responses for non-multiple choice tasks.
-            # For generation tasks, this should ideally return an unconstrained string.
+        batch_response = []
+        for prompt in prompts:
+            if is_multiple_choice:
+                # Randomly select one of the possible responses for multiple choice tasks
+                batch_response.append(str(random.choice(possible_responses)))
+            else:
+                # For other tasks, shuffle the possible responses and return as a string
+                random.shuffle(possible_responses)
+                batch_response.append(str(possible_responses))
+                # Note: As this is a dummy model, we are returning random responses for non-multiple choice tasks.
+                # For generation tasks, this should ideally return an unconstrained string.
+        return batch_response
diff --git a/models/user_config.py b/models/user_config.py
index 3380d478901d3b217a9463587ebf22feee37d526..b1bca5a24e2736a7eb52ad7b449c86c68a8851b0 100644
--- a/models/user_config.py
+++ b/models/user_config.py
@@ -19,3 +19,10 @@ UserModel = DummyModel
 #
 # UserModel = YourModel
+
+# For example, to use the Llama3 8B Instruct baseline, you can uncomment the lines below.
+# Please remember to download the model weights and check them into the repository
+# before submitting.
+
+# from models.vanilla_llama3_baseline import Llama3_8B_ZeroShotModel
+# UserModel = Llama3_8B_ZeroShotModel
diff --git a/models/vanilla_llama3_baseline.py b/models/vanilla_llama3_baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8134b430fd7f534ff066e6158e838de42c266c2
--- /dev/null
+++ b/models/vanilla_llama3_baseline.py
@@ -0,0 +1,170 @@
+import os
+import random
+from typing import Any, Dict, List
+
+import vllm
+from outlines.integrations.vllm import RegexLogitsProcessor
+
+from .base_model import ShopBenchBaseModel
+
+#### CONFIG PARAMETERS ---
+
+# Set a consistent seed for reproducibility
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 773815))
+
+# Batch size you wish the evaluators to use when calling the `batch_predict` function
+AICROWD_SUBMISSION_BATCH_SIZE = 16  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+
+class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
+    """
+    A zero-shot Llama 3 8B Instruct baseline for ShopBench, illustrating how to handle both
+    multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
+    This model uses a consistent random seed for reproducible results.
+    """
+
+    def __init__(self):
+        """Initializes the model and sets the random seed for consistency."""
+        random.seed(AICROWD_RUN_SEED)
+        self.initialize_models()
+
+    def initialize_models(self):
+        # Initialize Meta Llama 3 - 8B Instruct Model
+        self.model_name = "models/meta-llama/Meta-Llama-3-8B-Instruct"
+
+        if not os.path.exists(self.model_name):
+            raise Exception(
+                f"""
+            The evaluators expect the model weights to be checked into the repository,
+            but we could not find the model weights at {self.model_name}
+
+            Please follow the instructions in the docs below to download and check in the model weights.
+            https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
+
+            """
+            )
+
+        # initialize the model with vllm
+        self.llm = vllm.LLM(
+            self.model_name,
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
+            trust_remote_code=True,
+            dtype="half",  # note: bfloat16 is not supported on nvidia-T4 GPUs
+            enforce_eager=True
+        )
+        self.tokenizer = self.llm.get_tokenizer()
+
+    def get_batch_size(self) -> int:
+        """
+        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
+
+        Returns:
+            int: The batch size, an integer between 1 and 16. This value indicates how many
+                 queries should be processed together in a single batch. It can be dynamic
+                 across different batch_predict calls, or stay a static value.
+        """
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
+        return self.batch_size
+
+    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
+        """
+        Generates a batch of predictions based on the associated prompts and task type.
+
+        For multiple choice tasks, generation is constrained to integer outputs via a regex logits processor.
+        For other tasks, it returns the generated text,
+        representing the model's prediction in a format compatible with task-specific parsers.
+
+        Parameters:
+            - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following key:
+                - prompt (List[str]): a list of input prompts for the model.
+
+            - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.
+
+        Returns:
+            List[str]: A list of predictions, one for each prompt received in the batch.
+                Each prediction is
+                a string representing a single integer [0, 3] for multiple choice tasks,
+                or a string representing a comma separated list of integers for Ranking, Retrieval tasks,
+                or a string representing a comma separated list of named entities for Named Entity Recognition tasks,
+                or a string representing the (unconstrained) generated response for the generation tasks.
+                Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
+ """ + prompts = batch["prompt"] + + # format prompts using the chat template + formatted_prompts = self.format_prommpts(prompts) + # set max new tokens to be generated + max_new_tokens = 100 + + # Setup logits processor + logits_processors = [] + if is_multiple_choice: + logits_processors = [ + RegexLogitsProcessor( + regex_string="\d+", # constrain generation to only integers + llm=self.llm) + ] + max_new_tokens = 1 # For MCQ tasks, we only need to generate 1 token + + + # Generate responses via vllm + responses = self.llm.generate( + formatted_prompts, + vllm.SamplingParams( + n=1, # Number of output sequences to return for each prompt. + top_p=0.9, # Float that controls the cumulative probability of the top tokens to consider. + temperature=0, # randomness of the sampling + seed=AICROWD_RUN_SEED, # Seed for reprodicibility + skip_special_tokens=True, # Whether to skip special tokens in the output. + max_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence. + # Note: We are using 50 max new tokens instead of 75, + # because the 75 max token limit is checked using the Llama2 tokenizer. + # The Llama3 model instead uses a differet tokenizer with a larger vocabulary + # This allows it to represent the same content more efficiently, using fewer tokens. + logits_processors=logits_processors, # Use logits processors to do constrained/guided generation in case of MCQ tasks + ), + use_tqdm = False + ) + # Aggregate answers into List[str] + batch_response = [] + for response in responses: + batch_response.append(response.outputs[0].text) + + if is_multiple_choice: + print("MCQ: ", batch_response) + + return batch_response + + def format_prommpts(self, prompts): + """ + Formats prompts using the chat_template of the model. + + Parameters: + - queries (list of str): A list of queries to be formatted into prompts. + + """ + system_prompt = "You are a helpful online shopping assistant. Please answer the following question about online shopping and follow the given instructions." + formatted_prompts = [] + + for _idx, prompt in enumerate(prompts): + user_message = "" + + formatted_prompts.append( + self.tokenizer.apply_chat_template( + [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ], + tokenize=False, + add_generation_prompt=True, + ) + ) + + return formatted_prompts diff --git a/requirements.txt b/requirements.txt index 12c6d5d5eac2aa9e97516ec03233adc7e98b9801..fc1f772a04265396b3337070ac4ebe7ea7b5cdca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,4 @@ torch +vllm>=0.4.2 +outlines>=0.0.41 +loguru