Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (98)
Showing with 1611 additions and 222 deletions
.git/
models/**
data/
\ No newline at end of file
__pycache__
scores.json
\ No newline at end of file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
scores.json
data/
*.ipynb
\ No newline at end of file
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
ENV DEBIAN_FRONTEND=noninteractive \
LANG=en_US.UTF-8 \
LANGUAGE=en_US:en \
LC_ALL=en_US.UTF-8 \
USER_NAME=aicrowd \
HOME_DIR=/home/aicrowd \
CONDA_DIR=/home/aicrowd/.conda \
PATH=/home/aicrowd/.conda/bin:${PATH} \
SHELL=/bin/bash
# Install system dependencies and clean up in one layer
COPY apt.txt /tmp/apt.txt
RUN apt -qq update && apt -qq install -y --no-install-recommends `cat /tmp/apt.txt | tr -d '\r'` locales wget build-essential \
&& locale-gen en_US.UTF-8 \
&& rm -rf /var/cache/apt/* /var/lib/apt/lists/* \
&& apt clean
# Set up user
RUN groupadd -g 1001 aicrowd && \
useradd -m -s /bin/bash -u 1001 -g aicrowd -G sudo aicrowd
USER ${USER_NAME}
WORKDIR ${HOME_DIR}
# Install Miniconda and Python packages. You can change the python version by using another Miniconda.
RUN wget -nv -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh \
&& bash miniconda.sh -b -p ${CONDA_DIR} \
&& . ${CONDA_DIR}/etc/profile.d/conda.sh \
&& conda install cmake -y \
&& conda clean -y -a \
&& rm -rf miniconda.sh
COPY --chown=1001:1001 requirements.txt ${HOME_DIR}/requirements.txt
RUN pip install -r requirements.txt --no-cache-dir
COPY --chown=1001:1001 requirements_eval.txt ${HOME_DIR}/requirements_eval.txt
RUN pip install -r requirements_eval.txt --no-cache-dir
## Add your custom commands below
![AMAZON KDD CUP 2024: MULTI-TASK ONLINE SHOPPING CHALLENGE FOR LLMS](https://aicrowd-production.s3.eu-central-1.amazonaws.com/challenge_images/amazon-kdd-cup-2024/amazon-kdd-cup-24-banner.jpg)
[![Discord](https://img.shields.io/discord/565639094860775436.svg)](https://discord.gg/yWurtB2huX)
# 🛒 [Amazon KDD CUP 2024: Multi-Task Online Shopping Challenge for LLMs](https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms) Starter Kit
This repository is the Amazon KDD Cup 2024 **Submission template and Starter kit**! Clone the repository to compete now!
**This repository contains**:
* **Documentation** on how to submit your models to the leaderboard
* **The procedure** for best practices and information on how we evaluate your model, etc.
* **Starter code** for you to get started!
# Table of Contents
1. [Competition Overview](#-competition-overview)
2. [Dataset](#-dataset)
3. [Tasks](#-tasks)
4. [Evaluation Metrics](#-evaluation-metrics)
5. [Getting Started](#-getting-started)
- [How to write your own model?](#️-how-to-write-your-own-model)
- [How to start participating?](#-how-to-start-participating)
- [Setup](#setup)
- [How to make a submission?](#-how-to-make-a-submission)
- [What hardware does my code run on?](#-what-hardware-does-my-code-run-on-)
- [How are my model responses parsed by the evaluators?](#-how-are-my-model-responses-parsed-by-the-evaluators-)
6. [Frequently Asked Questions](#-frequently-asked-questions)
7. [Important Links](#-important-links)
# 📖 Competition Overview
Online shopping is complex, involving various tasks from browsing to purchasing, all requiring insights into customer behavior and intentions. This necessitates multi-task learning models that can leverage shared knowledge across tasks. Yet, many current models are task-specific, increasing development costs and limiting effectiveness. Large language models (LLMs) have the potential to change this by handling multiple tasks through a single model with minor prompt adjustments. Furthermore, LLMs can improve customer experiences by providing interactive and timely recommendations. However, online shopping, as a highly specialized domain, features a wide range of domain-specific concepts (e.g. brands, product lines) and knowledge (e.g. which brand produces which products), making it challenging to adapt existing powerful LLMs from general domains to online shopping.
Motivated by the potential and challenges of LLMs, we present **ShopBench**, a massive challenge for online shopping, with `57 tasks` and `~20000 questions`, derived from real-world Amazon shopping data. All questions in this challenge are re-formulated to a unified text-to-text generation format to accommodate the exploration of LLM-based solutions. ShopBench focuses on four key shopping skills (which will serve as **Tracks 1-4**):
- shopping concept understanding
- shopping knowledge reasoning
- user behavior alignment
- multi-lingual abilities
In addition, we set up **Track 5: All-around** to encourage even more versatile and all-around solutions. Track 5 requires participants to solve all questions in Tracks 1-4 with **a single solution**, which is expected to be more principled and unified than track-specific solutions to Tracks 1-4. We will correspondingly assign larger awards to Track 5.
# 📊 Dataset
ShopBench, the dataset used in this challenge, is an anonymized, multi-task dataset sampled from real-world Amazon shopping data. Statistics of ShopBench are given in the following table.
| # Tasks | # Questions | # Products | # Product Category | # Attributes | # Reviews | # Queries|
| ---------- | ----------- | -------- | ----------------- | ------------- | --------- | ---------|
| 57 | 20598 | ~13300 | 400 | 1032 | ~11200 |~4500 |
ShopBench is split into a few-shot development set and a test set to better mimic real-world applications --- where you never know the customer's questions beforehand. With this setting, we encourage participants to use any resource that is publicly available (e.g. pre-trained models, text datasets) to construct their solutions, instead of overfitting the given development data (e.g. generating pseudo data samples with GPT).
The development dataset will be given in JSON format with the following fields.
- `input_field`: This field contains the instructions and the question that should be answered by the model.
- `output_field`: This field contains the ground truth answer to the question.
- `task_type`: This field contains the type of the task (Details in the next Section, "Tasks")
- `task_name`: This field contains the name of the task. However, the exact task names are redacted, and we only provide participants with hashed task names (e.g. `task1`, `task2`).
- `metric`: This field contains the metric used to evaluate the question (Details in Section "Evaluation Metrics").
- `track`: This field specifies the track the question comes from.
However, the test dataset (which will be hidden from participants) will have a different format with only two fields:
- `input_field`, which is the same as above.
- `is_multiple_choice`: This field contains a `True` or `False` value indicating whether the question is multiple-choice. The detailed `task_type` will not be given to participants.
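To make the schema concrete, the snippet below loads the development set the same way `local_evaluation.py` does (a JSON Lines file read with `pandas.read_json(..., lines=True)`), assuming you have placed the file at `./data/development.json` as described later in this README. It is only a sketch for inspecting the fields listed above.

```python
import pandas as pd

# Load the JSON Lines development set, exactly as load_development_data()
# in local_evaluation.py does.
data_df = pd.read_json("./data/development.json", lines=True)

# Inspect the fields described above; column order may differ, e.g.
# ['input_field', 'output_field', 'task_type', 'task_name', 'metric', 'track']
print(data_df.columns.tolist())

# Peek at one record's task metadata (values depend on the downloaded data).
print(data_df.iloc[0][["task_type", "task_name", "metric", "track"]])
```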
# 👨‍💻👩‍💻 Tasks
ShopBench is constructed to evaluate four important shopping skills, which correspond to Tracks 1-4 of the challenge.
- **Shopping Concept Understanding**: There are many domain-specific concepts in online shopping, such as brands, product lines, etc. Moreover, these concepts often exist in short texts, such as queries, making it even more challenging for models to understand them without adequate contexts. This skill emphasizes the ability of LLMs to understand and answer questions related to these concepts.
- **Shopping Knowledge Reasoning**: Complex reasoning with implicit knowledge is involved when people make shopping decisions, such as numeric reasoning (e.g. calculating the total amount of a product pack) and multi-step reasoning (e.g. identifying whether two products are compatible with each other). This skill focuses on evaluating the model's reasoning ability on products or product attributes with domain-specific implicit knowledge.
- **User Behavior Alignment**: User behavior modeling is of paramount importance in online shopping. However, user behaviors are highly diverse, including browsing, purchasing, query-then-clicking, etc. Moreover, most of them are implicit and not expressed in texts. Therefore, aligning with heterogeneous and implicit shopping behaviors is a unique challenge for language models in online shopping, which is the primary aim of this track.
- **Multi-lingual Abilities**: Multi-lingual models are especially desired in online shopping as they can be deployed in multiple marketplaces without re-training. Therefore, we include a separate multi-lingual track, including multi-lingual concept understanding and user behavior alignment, to evaluate how a single model performs in different shopping locales without re-training.
In addition, we set up Track 5: All-around, requiring participants to solve all questions in Tracks 1-4 with a unified solution to further emphasize the generalizability and the versatility of the solutions.
ShopBench involves a total of 5 types of tasks, all of which are re-formulated to text-to-text generation to accommodate LLM-based solutions.
- **Multiple Choice**: Each question is associated with several choices, and the model is required to output a single correct choice.
- **Retrieval**: Each question is associated with a requirement and a list of candidate items, and the model is required to retrieve all items that satisfy the requirement.
- **Ranking**: Each question is associated with a requirement and a list of candidate items, and the model is required to re-rank all items according to how each item satisfies the requirement.
- **Named Entity Recognition**: Each question is associated with a piece of text and an entity type. The model is required to extract all phrases from the text that fall in the entity type.
- **Generation**: Each question is associated with an instruction and a question, and the model is required to generate text pieces following the instruction to answer the question. There are multiple types of generation questions, including extractive generation, translation, elaboration, etc.
To test the generalization ability of the solutions, the development set will only cover a part of all 57 tasks, resulting in tasks that remain unseen throughout the challenge. However, all 5 task types will be covered in the development set to help participants understand the prompts and output formats.
## 📏 Evaluation Metrics
ShopBench includes multiple types of tasks, each requiring specific metrics for evaluation. The metrics selected are as follows:
- **Multiple Choice:** Accuracy is used to measure the performance for multiple choice questions.
- **Ranking:** Normalized Discounted Cumulative Gain (NDCG) is used to evaluate ranking tasks.
- **Named Entity Recognition (NER):** Micro-F1 score is used to assess NER tasks.
- **Retrieval:** Hit@3 is used to assess retrieval tasks, as the number of positive samples does not exceed 3 across ShopBench.
- **Generation:** Metrics vary based on the task type:
  - Extraction tasks (e.g., keyphrase extraction) use ROUGE-L.
  - Translation tasks use the BLEU score.
  - For other generation tasks, we employ [Sentence Transformer](https://huggingface.co/sentence-transformers) to calculate sentence embeddings of the generated text $x_{gen}$ and the ground truth text $x_{gt}$. We then compute the cosine similarity between $x_{gen}$ and $x_{gt}$ (clipped to [0, 1]) as the metric; a short sketch of this computation is given at the end of this section. This approach focuses on evaluating text semantics rather than just token-level accuracy.
As all tasks are converted into text generation tasks, rule-based parsers will parse the answers from participants' solutions. Answers that parsers cannot process will be scored as 0. The parsers will be available to participants.
Since all these metrics range from [0, 1], we calculate the average metric for all tasks within each track (macro-averaged) to determine the overall score for a track and identify track winners. The overall score of Track 5 will be calculated by averaging scores in Tracks 1-4.
Please refer to [local_evaluation.py](local_evaluation.py) for more details on how we will evaluate your submissions.
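For illustration, here is a minimal sketch of the embedding-based metric used for "other generation tasks" above; it mirrors `calculate_cosine_similarity` in [metrics.py](metrics.py) and assumes the `sentence-transformers` package is installed.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def clipped_cosine_similarity(generated_text: str, reference_text: str) -> float:
    # Encode both texts into sentence embeddings
    gen_emb = model.encode([generated_text])[0]
    ref_emb = model.encode([reference_text])[0]
    # Cosine similarity, clipped to [0, 1]
    score = np.dot(gen_emb, ref_emb) / (np.linalg.norm(gen_emb) * np.linalg.norm(ref_emb))
    return max(float(score), 0.0)

print(clipped_cosine_similarity("a red cotton t-shirt", "red t-shirt made of cotton"))
```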
# 🏁 Getting Started
1. **Sign up** to join the competition [on the AIcrowd website](https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms).
2. **Fork** this starter kit repository. You can use [this link](https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/forks/new) to create a fork.
3. **Clone** your forked repo and start developing your model.
4. **Develop** your model(s) following the template in [how to write your own model](#how-to-write-your-own-model) section.
5. [**Submit**](#-how-to-make-a-submission) your trained models to [AIcrowd Gitlab](https://gitlab.aicrowd.com) for evaluation [(full instructions below)](#-how-to-make-a-submission). The automated evaluation setup will evaluate the submissions on the private datasets and report the metrics on the leaderboard of the competition.
# ✍️ How to write your own model?
Please refer to [models/README.md](models/README.md) for instructions and examples on how to write your own models for this competition.
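For orientation, here is a minimal, illustrative sketch of the interface that the evaluator exercises: it calls `get_batch_size()` and `batch_predict(batch, is_multiple_choice)` (see `generate_model_outputs` in [local_evaluation.py](local_evaluation.py)). The class below is a stand-in only; the authoritative base class and examples live in `models/base_model.py` and `models/README.md`.

```python
from typing import Any, Dict, List


class MyShopBenchModel:
    """Illustrative stub; follow models/base_model.py for the real interface."""

    def get_batch_size(self) -> int:
        # Number of prompts the evaluator passes to batch_predict() at once.
        return 4

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        # batch["prompt"] is a list of input strings; return one answer string per prompt.
        prompts = batch["prompt"]
        if is_multiple_choice:
            return ["0"] * len(prompts)            # placeholder: always pick the first choice
        return ["I don't know"] * len(prompts)     # placeholder free-text answer
```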
# 🚴 How to start participating?
## Setup
1. **Add your SSH key** to AIcrowd GitLab
You can add your SSH Keys to your GitLab account by going to your profile settings [here](https://gitlab.aicrowd.com/-/profile/keys). If you do not have SSH Keys, you will first need to [generate one](https://docs.gitlab.com/ee/user/ssh.html).
2. **Fork the repository**. You can use [this link](https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/forks/new) to create a fork.
3. **Clone the repository**
```bash
git clone git@gitlab.aicrowd.com:<YOUR-AICROWD-USER-NAME>/amazon-kdd-cup-2024-starter-kit.git
cd amazon-kdd-cup-2024-starter-kit
```
4. **Install** competition specific dependencies!
```bash
cd amazon-kdd-cup-2024-starter-kit
pip install -r requirements.txt
# and to run local_evaluation.py
pip install -r requirements_eval.txt
```
5. Write your own model as described in [How to write your own model](#how-to-write-your-own-model) section.
6. Test your model locally using `python local_evaluation.py`.
7. Accept the Challenge Rules on the main [challenge page](https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms) by clicking on the **Participate** button. Also accept the Challenge Rules on the Task specific page (link on the challenge page) that you want to submit to.
8. Make a submission as described in [How to make a submission](#-how-to-make-a-submission) section.
## 📮 How to make a submission?
Please follow the instructions in [docs/submission.md](docs/submission.md) to make your first submission.
This also includes instructions on [specifying your software runtime](docs/submission.md#specifying-software-runtime-and-dependencies), [code structure](docs/submission.md#code-structure-guidelines), [submitting to different tracks](docs/submission.md#submitting-to-different-tracks).
**Note**: **Remember to accept the Challenge Rules** on the challenge page, **and** the task page before making your first submission.
## 💻 What hardware does my code run on ?
You can find more details about the hardware and system configuration in [docs/hardware-and-system-config.md](docs/hardware-and-system-config.md).
In summary, we provide you with `4` x [NVIDIA T4 GPUs](https://www.nvidia.com/en-us/data-center/tesla-t4/) in Phase 2.
Your solution will be given a certain amount of time for inference, after which it will be immediately killed and no results will be available. The time limits are set as follows.
| Phase | Track 1 | Track 2 | Track 3 | Track 4 | Track 5 |
| ------ | ------- | ------- | ------- | ------- | ------- |
| **Phase 2**| 70 minutes | 20 minutes | 30 minutes | 20 minutes | 140 minutes |
For reference, the baseline solution with zero-shot Llama-3-8B-Instruct consumes the following amounts of time.
| Phase | Track 1 | Track 2 | Track 3 | Track 4 |
| ------ | ------- | ------- | ------- | ------- |
| **Phase 2**| 1490s | 397s | 576s | 359s |
We limit the prediction time of each sample to at most **10 seconds**. This limit applies at a batch level. For example, for a batch of 8 samples, you should return the prediction after at most 80 seconds. Otherwise, your submission will be killed.
Your maximum repo size is 200GB.
## 🧩 How are my model responses parsed by the evaluators ?
Please refer to [parsers.py](parsers.py) for more details on how we parse your model responses.
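Concretely, the evaluator constructs one parser per task type and calls `.parse()` on your raw output string (see `get_task_parsers` in [local_evaluation.py](local_evaluation.py)). A minimal sketch, with the response string made up purely for illustration:

```python
import parsers

# One parser per task type, exactly as get_task_parsers() wires them up.
ranking_parser = parsers.ShoppingBenchTaskParsers("ranking")

# A hypothetical raw model response to a ranking question.
raw_response = "2, 4, 1, 3"

# The parsed result is what gets passed to the corresponding metric (NDCG here).
print(ranking_parser.parse(raw_response))
```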
# ❓ Frequently Asked Questions
## Which track is this starter kit for ?
This starter kit can be used to submit to any of the tracks. You can find more information in [docs/submission.md#submitting-to-different-tracks](docs/submission.md#submitting-to-different-tracks).
**Best of Luck** :tada: :tada:
# 📎 Important links
- 💪 Challenge Page: https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms
- 🗣 Discussion Forum: https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms/discussion
- 🏆 Leaderboard: https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms/leaderboards
{
"challenge_id": "amazon-kdd-cup-24-understanding-shopping-concepts",
"authors": [
"your-aicrowd-username"
],
"gpu": false,
"description": "(optional) description about your custom model"
}
\ No newline at end of file
git
\ No newline at end of file
#!/bin/bash
# This script builds a Docker image from the current directory
# and runs a container from this image, executing local_evaluation.py
# with the current directory mounted at /submission inside the container.
# Step 1: Define the name of the Docker image.
LAST_COMMIT_HASH=$(git rev-parse --short HEAD)
IMAGE_NAME="aicrowd/amazon-kddcup24-submission:${LAST_COMMIT_HASH}"
# Step 2: Build the Docker image.
# The '.' at the end specifies that the Docker context is the current directory.
# This means Docker will look for a Dockerfile in the current directory to build the image.
START_TIME=$(date +%s)
DOCKER_BUILDKIT=1 docker build -t $IMAGE_NAME .
BUILD_STATUS=$?
if [ $BUILD_STATUS -ne 0 ]; then
echo "Docker build failed. Exiting..."
exit $BUILD_STATUS
fi
END_TIME=$(date +%s)
BUILD_TIME=$((END_TIME - START_TIME))
echo "Total build time: $BUILD_TIME seconds"
# Step 3: Run the Docker container.
# -v "$(pwd)":/submission mounts the current directory ($(pwd) outputs the current directory path)
# to /submission inside the container. This way, the container can access the contents
# of the current directory as if they were located at /submission inside the container.
# 'python /submission/local_evaluation.py' is the command executed inside the container.
# The -w flag sets the working directory to /submission.
# It then runs local_evaluation.py using the software runtime set up in the Dockerfile.
docker run \
--gpus all \
-v "$(pwd)":/submission \
-w /submission \
--shm-size=10.24gb \
$IMAGE_NAME python local_evaluation.py
# Note: We assume you have nvidia-container-toolkit installed and configured
# to use the --gpus all flag. If you are not using GPUs, you can remove this flag.
# Note 1: Please refer to the Dockerfile to understand how the software runtime is set up.
# The Dockerfile should include all necessary commands to install Python, the necessary
# dependencies, and any other software required to run local_evaluation.py.
# Note 2: Note the .dockerignore file in the root of this directory.
# In the .dockerignore file, specify any files or directories that should not be included
# in the Docker context. This typically includes large files, models, or datasets that
# are not necessary for building the Docker image. Excluding these can significantly
# speed up the build process by reducing the size of the build context sent to the Docker daemon.
# Ensure your Dockerfile and .dockerignore are properly set up before running this script.
### Setting Up and Downloading Baseline Model Weights with Hugging Face
This guide outlines the steps to download (and check in) the model weights required for the baseline models.
We will focus on `Meta-Llama-3-8B-Instruct`,
but the steps should work equally well for any other model on Hugging Face.
#### Preliminary Steps:
1. **Install the Hugging Face Hub Package**:
Begin by installing the `huggingface_hub` package, which includes the `hf_transfer` utility, by running the following command in your terminal:
```bash
pip install huggingface_hub[hf_transfer]
```
2. **Accept the LLaMA Terms**:
You must accept the LLaMA model's terms of use by visiting: [meta-llama/Meta-Llama-3-8B-Instruct Terms](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct).
3. **Create a Hugging Face CLI Token**:
Generate a CLI token by navigating to: [Hugging Face Token Settings](https://huggingface.co/settings/tokens). You will need this token for authentication.
#### Hugging Face Authentication:
1. **Login via CLI**:
Authenticate yourself with the Hugging Face CLI using the token created in the previous step. Run:
```bash
huggingface-cli login
```
When prompted, enter the token.
#### Model Downloads:
1. **Download the Meta-Llama-3-8B-Instruct Model**:
Execute the following command to download the `Meta-Llama-3-8B-Instruct` model to a local subdirectory. This command excludes unnecessary files to save space:
```bash
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download \
meta-llama/Meta-Llama-3-8B-Instruct \
--local-dir-use-symlinks False \
--local-dir models/meta-llama/Meta-Llama-3-8B-Instruct \
--exclude *.pth # These are alternates to the safetensors hence not needed
```
#### Version Control with Git LFS:
1. **Track Model Weights**:
Use Git Large File Storage (LFS) to track the model directories. This ensures efficient handling of large files:
```bash
git lfs track "models/meta-llama/*"
```
2. **Commit and Push**:
Add the models to your Git repository, commit the changes, and push them to your remote repository:
```bash
git add models/
git commit -am "add weights"
git push origin master
```
If you are struggling with Git LFS, you are very much encouraged to check out [this post](https://discourse.aicrowd.com/t/how-to-upload-large-files-size-to-your-submission/2304).
## Hardware and System Configuration
We apply a limit on the hardware available to each participant to run their solutions. Specifically,
- All solutions will be run on [AWS g4dn.12xlarge](https://aws.amazon.com/ec2/instance-types/g4/) instances equipped with [NVIDIA T4 GPUs](https://www.nvidia.com/en-us/data-center/tesla-t4/).
- Solutions for Phase 1 will have access to:
  - `2` x [NVIDIA T4 GPUs](https://www.nvidia.com/en-us/data-center/tesla-t4/).
  - `20` x vCPU (`10` physical CPU cores)
  - `90GB` RAM
- Solutions for Phase 2 will have access to:
  - `4` x [NVIDIA T4 GPUs](https://www.nvidia.com/en-us/data-center/tesla-t4/).
  - `40` x vCPU (`20` physical CPU cores)
  - `180GB` RAM
**Note**: When running in `gpu:false` mode, you will have access to `4` x vCPUs (`2` physical cores) and `8GB` RAM.
Please note that the NVIDIA T4 uses a somewhat outdated architecture and is thus not compatible with certain acceleration toolkits (e.g. Flash Attention), so please be careful about compatibility.
Besides, the following restrictions will also be imposed:
- Network connection will be disabled.
- Each submission will be assigned a certain amount of time to run. Submissions that exceed the time limits will be killed and will not be evaluated. The tentative time limit is set as follows.
| Phase | Track 1 | Track 2 | Track 3 | Track 4 | Track 5 |
| ------ | ------- | ------- | ------- | ------- | ------- |
| **Phase 1**| 140 minutes | 40 minutes | 60 minutes | 60 minutes | 5 hours |
- Each team will be able to make up to **2 submissions per week** per track for Tracks 1-4, and **1 submission per week** for Track 5 (All-around).
Based on the hardware and system configuration, we recommend that participants begin with 7B models. According to our experiments, 7B models such as Vicuna-7B and Mistral-7B can run inference smoothly on 2 NVIDIA T4 GPUs, while 13B models will run out of memory (OOM).
## Adding your runtime
This repository is a valid submission (and submission structure).
You can simply add your dependencies on top of this repository.
A few of the most common ways are as follows:
* `requirements.txt` -- The `pip3` packages used by your inference code. As you add new pip3 packages to your inference procedure, either manually add them to `requirements.txt` or, if your software runtime is simple, run:
```
# Put ALL of the current pip3 packages on your system in the submission
>> pip3 freeze >> requirements.txt
>> cat requirements.txt
aicrowd_api
coloredlogs
matplotlib
pandas
[...]
```
We suggest keeping `requirements.txt` to a minimum, listing only the packages you actually need. The more (unnecessary) packages you include, the more likely you are to hit an installation error on one of them.
* `apt.txt` -- The Debian packages (via aptitude) used by your inference code!
These files are used to construct your **AIcrowd submission docker containers** in which your code will run.
* `Dockerfile` -- `Dockerfile` gives you more flexibility on defining the software runtime used during evaluations. The `Dockerfile` under the root path of the starter kit will be used to build your solution. Feel free to modify anything in it, and test it locally.
----
To test your image builds locally, you can use [repo2docker](https://github.com/jupyterhub/repo2docker)
# Guide to Making Your First Submission
This document is designed to assist you in making your initial submission smoothly. Below, you'll find step-by-step instructions on specifying your software runtime and dependencies, structuring your code, and finally, submitting your project. Follow these guidelines to ensure a smooth submission process.
# Table of Contents
1. [Specifying Software Runtime and Dependencies](#specifying-software-runtime-and-dependencies)
2. [Code Structure Guidelines](#code-structure-guidelines)
3. [Submitting to Different Tracks](#submitting-to-different-tracks)
4. [Submission Entry Point](#submission-entry-point)
5. [Setting Up SSH Keys](#setting-up-ssh-keys)
6. [Managing Large Model Files with Git LFS](#managing-large-model-files-with-git-lfs)
- [Why Use Git LFS?](#why-use-git-lfs)
- [Steps to Use Git LFS](#steps-to-use-git-lfs)
- [Handling Previously Committed Large Files](#handling-previously-committed-large-files)
7. [How to Submit Your Code](#how-to-submit-your-code)
## Specifying Software Runtime and Dependencies
Our platform supports custom runtime environments. This means you have the flexibility to choose any libraries or frameworks necessary for your project. Here’s how you can specify your runtime and dependencies:
- **`requirements.txt`**: List any PyPI packages your project needs. **Do specify versions, as we observe significant differences in inference time between different `transformers` versions.**
- **`apt.txt`**: Include any apt packages required.
- **`Dockerfile`**: The one located at the root will be used by default to build your submission. **You can specify the python version here if you need specific ones**.
For detailed setup instructions regarding runtime dependencies, refer to the documentation in the `docs/runtime.md` file.
## Code Structure Guidelines
Your project should follow the structure outlined in the starter kit. Here’s a brief overview of what each component represents:
```
.
├── .dockerignore # Please specify the paths to your model checkpoints so that the large files won't be built into the docker image.
├── README.md # Project documentation and setup instructions
├── aicrowd.json # Submission meta information - like your username, track name
├── data
│ └── development.json # Development dataset local testing
├── docs
│ └── runtime.md # Documentation on the runtime environment setup, dependency configs
├── Dockerfile # The Dockerfile that will be used to build your submission and all dependencies. The default one will work fine, but you can write your own.
├── docker_run.sh # This script builds your submission locally and calls `local_evaluation.py`. It can be used to debug (if your submission fails to build).
├── local_evaluation.py # Use this to check your model evaluation flow locally
├── metrics.py # Scripts to calculate evaluation metrics for your model's performance
├── models
│ ├── README.md # Documentation specific to the implementation of model interfaces
│ ├── base_model.py # Base model class
│ ├── dummy_model.py # A simple or placeholder model for demonstration or testing. We also implement a simple Vicuna-7B baseline here.
│ └── user_config.py # IMPORTANT: Configuration file to specify your model
├── parsers.py # Model output parser
├── requirements.txt # Python packages to be installed for model development
├── requirements_eval.txt # Additional Python packages to be installed for local evaluation
└── utilities
└── _Dockerfile # Example Dockerfile for specifying runtime via Docker
```
Remember, **your submission metadata JSON (`aicrowd.json`)** is crucial for mapping your submission to the challenge. Ensure it contains the correct `challenge_id`, `authors`, and other necessary information. **To utilize GPUs, set the `"gpu": true` flag in your `aicrowd.json`.**
## Submitting to Different Tracks
Specify the track by setting the appropriate `challenge_id` in your [aicrowd.json](aicrowd.json). Here are the challenge IDs for various tracks:
| Track Name | Challenge ID |
|-----------------------------------|-----------------------------------------------------|
| Understanding Shopping Concepts | `amazon-kdd-cup-24-understanding-shopping-concepts` |
| Shopping Knowledge Reasoning | `amazon-kdd-cup-24-shopping-knowledge-reasoning` |
| User Behavior Alignment | `amazon-kdd-cup-24-user-behavior-alignment` |
| Multi-Lingual Abilities | `amazon-kdd-cup-24-multi-lingual-abilities` |
| All-Around | `amazon-kdd-cup-24-all-around` |
## Submission Entry Point
The evaluation process will instantiate a model from `models/user_config.py` for evaluation. Ensure this configuration is set correctly.
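As a minimal sketch, `models/user_config.py` only needs to expose your class under the name `UserModel`; the module path and class name below are illustrative (the starter kit ships a `models/dummy_model.py` you can adapt):

```python
# models/user_config.py
# Point UserModel at whichever class implements the required interface.
from models.dummy_model import DummyModel  # illustrative import; use your own model class

UserModel = DummyModel
```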
## Setting Up SSH Keys
You will have to add your SSH Keys to your GitLab account by going to your profile settings [here](https://gitlab.aicrowd.com/profile/keys). If you do not have SSH Keys, you will first need to [generate one](https://docs.gitlab.com/ee/ssh/README.html#generating-a-new-ssh-key-pair).
## Managing Large Model Files with Git LFS
When preparing your submission, it's crucial to ensure all necessary models and files required by your inference code are properly saved and included. Due to the potentially large size of model weight files, we highly recommend using Git Large File Storage (Git LFS) to manage these files efficiently.
### Why Use Git LFS?
Git LFS is designed to handle large files more effectively than Git's default handling of large files. This ensures smoother operations and avoids common errors associated with large files, such as:
- `fatal: the remote end hung up unexpectedly`
- `remote: fatal: pack exceeds maximum allowed size`
These errors typically occur when large files are directly checked into the Git repository without Git LFS, leading to challenges in handling and transferring those files.
### Steps to Use Git LFS
1. **Install Git LFS**: If you haven't already, install Git LFS on your machine. Detailed instructions can be found [here](https://git-lfs.github.com/).
2. **Track Large Files**: Use Git LFS to track the large files within your project. You can do this by running `git lfs track "*.model"` (replace `*.model` with your file type).
3. **Add and Commit**: After tracking the large files with Git LFS, add and commit them as you would with any other file. Git LFS will automatically handle these files differently to optimize their storage and transfer.
4. **Push to Repository**: When you push your changes to the repository, Git LFS will manage the large files, ensuring a smooth push process.
### Handling Previously Committed Large Files
If you have already committed large files directly to your Git repository without using Git LFS, you may encounter issues. These files, even if not present in the current working directory, could still be in the Git history, leading to errors.
To resolve this, ensure that the large files are removed from the Git history and then re-add and commit them using Git LFS. This process cleans up the repository's history and avoids the aforementioned errors.
For more information on how to upload large files to your submission and detailed guidance on using Git LFS, please refer to [this detailed guide](https://discourse.aicrowd.com/t/how-to-upload-large-files-size-to-your-submission/2304).
**Note**: Properly managing large files not only facilitates smoother operations for you but also ensures that the evaluation process can proceed without hindrances.
## How to Submit Your Code
To submit your code, push a tag beginning with "submission-" to your repository on [GitLab](https://gitlab.aicrowd.com/). Follow these steps to make a submission:
Assuming you have already cloned the repo by following the instructions [here](../README.md#setup) and made your changes:
1. Commit your changes with `git commit -am "Your commit message"`.
2. Tag your submission (e.g., `git tag -am "submission-v0.1" submission-v0.1`).
3. Push your changes and tags to the AIcrowd repository (e.g. `git push origin submission-v0.1`)
After pushing your tag, you can view your submission details at `https://gitlab.aicrowd.com/<YOUR-AICROWD-USER-NAME>/amazon-kdd-cup-2024-starter-kit/issues`. It may take about **30 minutes** for each submission to build and begin evaluation, so please be patient.
Ensure your `aicrowd.json` is correctly filled with the necessary metadata, and you've replaced `<YOUR-AICROWD-USER-NAME>` with your GitLab username in the provided URL.
import os

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

import metrics
import parsers

VERSION = "0.1.0"
def print_sample(idx, generation, truth, metric, score):
    """
    Print a sample's generated output, the truth, and its evaluation score.
    """
    print(f"Sample {idx}, generation: {generation}")
    print(f"Sample {idx}, truth: {truth}")
    if isinstance(score, tuple) and len(score) == 3:
        print(
            f"Per Sample Metric Score ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
        )
    else:
        print(f"Per Sample Metric Score ({metric}): {score}")
    print()
# Function to load development data from a JSON file
def load_development_data(filename):
    """
    Load development data from a specified JSON file.

    Parameters:
    - filename: Path to the JSON file containing the development data.

    Returns:
    - A pandas DataFrame containing the loaded data.
    """
    return pd.read_json(filename, lines=True)


# Function to generate model outputs based on the input data
def generate_model_outputs(data_df, model):
    """
    Generate predictions for each entry in the data DataFrame using a given model.

    Parameters:
    - data_df: A pandas DataFrame containing the input data for predictions.
    - model: The model instance used for generating predictions.

    Returns:
    - A DataFrame containing the model outputs (one row per entry in the data DataFrame).
    """
    outputs = []
    task_grouped_df = data_df.groupby(by=["task_type"])
    for task_type, task_group_data_df in task_grouped_df:
        task_group_data_df = task_group_data_df.reset_index(drop=True)

        is_multiple_choice = task_type[0] == "multiple-choice"
        batch_size = model.get_batch_size()

        batches = [
            task_group_data_df[i : i + batch_size]
            for i in range(0, len(task_group_data_df), batch_size)
        ]

        for batch_df in batches:
            batch = {
                "prompt": batch_df["input_field"].tolist(),
            }
            model_output = model.batch_predict(
                batch,
                is_multiple_choice,
            )
            outputs.append(
                pd.DataFrame(
                    {
                        "input_field": batch["prompt"],
                        "model_output_str": model_output,
                    }
                )
            )

    df_outputs = pd.concat(outputs)
    return df_outputs
# Function to evaluate the generated model outputs
def evaluate_outputs(data_df, log_every_n_steps=1):
    """
    Evaluate the model outputs against ground truth values using specified metrics.

    Parameters:
    - data_df: DataFrame containing the development data (including ground truth) merged with the model outputs.
    - log_every_n_steps: Logs samples every N steps.

    Returns:
    - A dictionary containing evaluation metrics and scores for each task.
    """
    eval_methods = get_evaluation_methods()
    task_parsers = get_task_parsers()

    per_task_metrics = {}

    for row_idx, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Evaluating"
    ):
        task_name, task_type, metric, ground_truth, model_output_str = (
            row["task_name"],
            row["task_type"],
            row["metric"],
            row["output_field"],
            row["model_output_str"],
        )

        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        model_output = task_parsers[task_type].parse(model_output_str)
        eval_fn = eval_methods[metric]
        metric_score = eval_fn(model_output, ground_truth)

        if task_name not in per_task_metrics:
            per_task_metrics[task_name] = {
                "task_type": task_type,
                "metric": metric,
                "sample_score": [],
            }
        per_task_metrics[task_name]["sample_score"].append(metric_score)

        if (row_idx + 1) % log_every_n_steps == 0:
            print_sample(
                row_idx + 1, model_output, ground_truth, metric, metric_score
            )

    return per_task_metrics
# Function to aggregate scores from evaluations
def aggregate_scores(per_task_metrics):
    """
    Aggregate evaluation scores across different tasks and metrics.

    Parameters:
    - per_task_metrics: A dictionary containing raw evaluation scores for each task.

    Returns:
    - A pandas DataFrame summarizing the overall metrics and scores.
    """
    overall_metrics = {
        "task_name": [],
        "task_type": [],
        "metric": [],
        "num_samples": [],
        "overall_score": [],
    }
    for task_name, values in per_task_metrics.items():
        task_type, metric, sample_scores = (
            values["task_type"],
            values["metric"],
            values["sample_score"],
        )
        overall_score = (
            np.mean(sample_scores)
            if metric != "micro f1"
            else metrics.calculate_f1_score(sample_scores)
        )

        overall_metrics["task_name"].append(task_name)
        overall_metrics["task_type"].append(task_type)
        overall_metrics["metric"].append(metric)
        overall_metrics["num_samples"].append(len(sample_scores))
        overall_metrics["overall_score"].append(overall_score)

    return pd.DataFrame(overall_metrics)


# Define and return evaluation methods
def get_evaluation_methods():
    """
    Get evaluation methods including accuracy, sentence transformers, and other metrics.

    Returns:
    - A dictionary mapping metric names to their respective evaluation functions.
    """
    return {
        "accuracy": metrics.calculate_per_sample_accuracy,
        "hit rate@3": metrics.calculate_hit_rate_3,
        "rougel": metrics.calculate_rougel,
        "sent-transformer": lambda generated_text, reference_texts: metrics.calculate_cosine_similarity(
            generated_text=generated_text,
            reference_texts=reference_texts,
            model_name="all-MiniLM-L6-v2",
        ),
        "multilingual-sent-transformer": lambda generated_text, reference_texts: metrics.calculate_cosine_similarity(
            generated_text=generated_text,
            reference_texts=reference_texts,
            model_name="paraphrase-multilingual-MiniLM-L12-v2",
        ),
        "micro f1": metrics.calculate_true_positive_false_positives_false_negatives,
        "ndcg": metrics.calculate_ndcg,
        "bleu": metrics.calculate_bleu_score,
        "jp-bleu": lambda generated_text, reference_text: metrics.calculate_bleu_score(
            generated_text=generated_text,
            reference_text=reference_text,
            is_japanese=True,
        ),
    }


# Define and return task parsers
def get_task_parsers():
    """
    Define parsers for different task types to format model outputs accordingly.

    Returns:
    - A dictionary mapping task types to their respective parsers.
    """
    return {
        "multiple-choice": parsers.ShoppingBenchTaskParsers("multichoice"),
        "generation": parsers.ShoppingBenchTaskParsers("generation"),
        "retrieval": parsers.ShoppingBenchTaskParsers("retrieval"),
        "ranking": parsers.ShoppingBenchTaskParsers("ranking"),
        "named_entity_recognition": parsers.ShoppingBenchTaskParsers(
            "named_entity_recognition"
        ),
    }
# Main execution function to load data, generate model outputs, evaluate, and aggregate scores
def main():
    # Load development data
    # Please download the development data from:
    # https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms/dataset_files
    # and place it at: ./data/development.json
    DATA_FILENAME = "./data/development.json"

    if not os.path.exists(DATA_FILENAME):
        raise FileNotFoundError(
            f"Development data file not found at {DATA_FILENAME}. "
            "Please download the development data from: https://www.aicrowd.com/challenges/amazon-kdd-cup-2024-multi-task-online-shopping-challenge-for-llms/dataset_files "
            "and place it at: ./data/development.json"
        )

    data_df = load_development_data(DATA_FILENAME)

    # Load the model from the user's custom configuration
    # Note: The evaluator **always** imports the UserModel; please reference your own class
    # by setting the `UserModel` variable in models.user_config
    from models.user_config import UserModel

    model = UserModel()

    # Generate model outputs
    df_outputs = generate_model_outputs(data_df, model)

    # Add the outputs to the data_df
    merged_data_df = pd.merge(data_df, df_outputs, on="input_field")
    print(merged_data_df.head())

    # Evaluate the generated outputs and calculate metrics
    per_task_metrics = evaluate_outputs(merged_data_df)

    # Aggregate and display the evaluation scores
    overall_metrics = aggregate_scores(per_task_metrics)
    print("=" * 100)
    print("Task specific metrics: ")
    print(overall_metrics)
    print()

    # Calculate and print the overall score across all tasks and metrics
    overall_score = overall_metrics["overall_score"].mean()
    print(f"Overall Score: {overall_score}")


if __name__ == "__main__":
    main()
import os
from typing import List, Tuple, Union

import evaluate
import numpy as np
import torch
from loguru import logger
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer

sacrebleu = None
sentence_transformer_model_cache = {}
def calculate_per_sample_accuracy(prediction: int, truth: int) -> bool:
    """
    Computes the accuracy of a single prediction.

    This function checks if a given prediction matches the ground truth.

    Parameters:
    - prediction (int): The predicted value.
    - truth (int): The actual ground truth value.

    Returns:
    - bool: True if the prediction matches the truth, False otherwise.
    """
    return prediction == truth
def calculate_hit_rate_3(retrieved_int: List[int], truth: List[int]) -> float:
    """
    Calculates the hit rate within the top 3 retrieved integers.

    This function assesses how many of the truth integers are present
    within the first three elements of the retrieved list of integers.

    Parameters:
    - retrieved_int (List[int]): The list of retrieved integers, ordered by relevance.
    - truth (List[int]): The list of ground truth integers.

    Returns:
    - float: The hit rate, calculated as the proportion of truth integers found
      in the top 3 retrieved integers, relative to the total number of truth integers.
    """
    # Calculate the number of hits within the top 3 retrieved integers
    hit = len(set(truth).intersection(set(retrieved_int[:3])))
    # Normalize the hit count by the total number of truth integers to get the hit rate
    hit_rate = hit / len(truth)
    return hit_rate
def calculate_rougel(generation: str, truth: str) -> float:
    """
    Calculates the ROUGE-L F-measure score between a generated string and the truth string.

    ROUGE-L measures the longest common subsequence between the generated text and the truth text,
    considering both the precision and recall of the sequences. It is widely used in evaluating
    the quality of text generation systems.

    Parameters:
    - generation (str): The generated text to evaluate.
    - truth (str): The ground truth text to compare against.

    Returns:
    - float: The ROUGE-L F-measure score, indicating the quality of the generated text.
    """
    # Initialize the ROUGE scorer with the ROUGE-L metric
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    # Calculate the ROUGE scores between the generated text and the truth text
    scores = scorer.score(generation, truth)
    # Extract and return the ROUGE-L F-measure score
    return scores["rougeL"].fmeasure
def load_sentence_transformer_model(model_name: str) -> SentenceTransformer:
    """
    Loads a Sentence Transformer model by its name and moves it to the appropriate device.

    Parameters:
    - model_name (str): The name of the model to load.

    Returns:
    - SentenceTransformer: The loaded SentenceTransformer model.
    """
    global sentence_transformer_model_cache
    # A model cache ensures we do not load the model on every call
    if model_name not in sentence_transformer_model_cache:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = SentenceTransformer(model_name).to(device)
        sentence_transformer_model_cache[model_name] = model
    return sentence_transformer_model_cache[model_name]


def calculate_cosine_similarity(generated_text: str, reference_texts: Union[str, List[str]], model_name) -> float:
    """
    Computes the cosine similarity score(s) between a generated text and reference text(s) using a sentence embedding model.

    This function calculates the cosine similarity between the embedding of the generated text and the embedding(s)
    of reference text(s). The embeddings are generated using a specified sentence embedding model. The cosine similarity
    score is a measure of similarity between two vectors, ranging from -1 (completely different) to 1 (exactly the same).

    Parameters:
    - generated_text (str): The text generated by the model.
    - reference_texts (Union[str, List[str]]): The reference text(s) for comparison. Can be a single string or a list of strings.
    - model_name: The sentence embedding model used to generate text embeddings.

    Returns:
    - float: The average cosine similarity score between the generated text and the reference text(s). If reference_texts is a single
      string, a single score is returned. If reference_texts is a list of strings, the average score across all references is returned.
      The score is bounded between 0 (no similarity) and 1 (identical), with negative scores adjusted to 0.
    """
    # Load/Reference model
    model = load_sentence_transformer_model(model_name)

    # Embedding for the generated text
    generated_embedding = model.encode([generated_text])[0]

    # Handling a single reference text
    if isinstance(reference_texts, str):
        # Embedding for the single reference text
        reference_embedding = model.encode([reference_texts])[0]
        # Compute cosine similarity
        similarity_score = np.dot(generated_embedding, reference_embedding) / (np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding))
        # Ensure non-negative score
        return max(similarity_score, 0)

    # Handling multiple reference texts
    else:
        similarity_scores = []
        for reference_text in reference_texts:
            # Embedding for each reference text
            reference_embedding = model.encode([reference_text])[0]
            # Compute cosine similarity for each reference
            individual_score = np.dot(generated_embedding, reference_embedding) / (np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding))
            similarity_scores.append(individual_score)
        # Calculate and ensure non-negative average score
        return max(np.mean(similarity_scores), 0)
def calculate_true_positive_false_positives_false_negatives(extracted_entities: List[str], ground_truth_entities: List[str]) -> Tuple[int, int, int]:
    """
    Calculates true positives, false positives, and false negatives for entity extraction.

    This function compares a list of extracted entities against a list of ground truth entities
    to determine the count of true positives (correctly extracted entities), false positives
    (incorrectly extracted entities), and false negatives (missed entities).

    Both lists are case-insensitive, and leading/trailing spaces in extracted entities are ignored.

    Parameters:
    - extracted_entities (List[str]): The list of entities extracted by the model.
    - ground_truth_entities (List[str]): The list of actual entities (ground truth).

    Returns:
    - Tuple[int, int, int]: A tuple containing the counts of true positives, false positives, and false negatives.
    """
    # Normalize the extracted entities by making them lowercase and stripping leading/trailing spaces
    normalized_extracted_entities = [entity.lower().strip() for entity in extracted_entities]

    # Normalize the ground truth entities by making them lowercase
    normalized_ground_truth_entities = [entity.lower() for entity in ground_truth_entities]

    # Calculate true positives by finding the intersection between extracted and ground truth entities
    true_positives = len(set(normalized_extracted_entities).intersection(set(normalized_ground_truth_entities)))

    # Calculate false positives as extracted entities not in ground truth
    false_positives = len(normalized_extracted_entities) - true_positives

    # Calculate false negatives as ground truth entities not extracted
    false_negatives = len(normalized_ground_truth_entities) - true_positives

    return true_positives, false_positives, false_negatives
def calculate_f1_score(metrics_list: List[Tuple[int, int, int]]) -> float:
"""
Calculates the F1 score from a list of tuples containing true positives, false positives, and false negatives.
Parameters:
- metrics_list (List[Tuple[int, int, int]]): A list of tuples, where each tuple contains counts of true positives,
false positives, and false negatives in that order for various classifications or entity extractions.
Returns:
- float: The computed F1 score, ranging from 0 to 1.
"""
total_tp, total_fp, total_fn = 0, 0, 0
# Aggregate total true positives, false positives, and false negatives
for tp, fp, fn in metrics_list:
total_tp += tp
total_fp += fp
total_fn += fn
    # Calculate precision and recall, guarding against division by zero
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
# Calculate F1 score, handling the case where precision + recall equals 0
if precision + recall == 0:
return 0
else:
return 2 * precision * recall / (precision + recall)
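# Worked example (a minimal sketch with made-up entities): for a sample with
# extracted entities ["Apple", " banana "] and ground truth ["apple", "cherry"],
# calculate_true_positive_false_positives_false_negatives returns (1, 1, 1)
# after lowercasing and stripping. Feeding a list of such tuples into
# calculate_f1_score aggregates the counts before computing precision and recall:
#
#   metrics = [calculate_true_positive_false_positives_false_negatives(
#       ["Apple", " banana "], ["apple", "cherry"])]
#   f1 = calculate_f1_score(metrics)
#   # precision = recall = 1 / 2, so f1 = 0.5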
def calculate_ndcg(predicted_relevance_scores: List[int], true_relevance_weights: List[float]) -> float:
    """
    Calculates and evaluates the Normalized Discounted Cumulative Gain (NDCG) score directly from predicted relevance scores
    against true relevance weights. It normalizes the scores to ensure a fair comparison, trimming the predicted scores
    if necessary to match the length of the true relevance weights.

    Parameters:
    - predicted_relevance_scores (List[int]): Indices of items ranked by the algorithm, expected to be integers starting from 1.
    - true_relevance_weights (List[float]): Actual relevance weights for the items, with higher values indicating greater relevance.

    Returns:
    - float: The NDCG score, normalized against the ideal ranking, ranging from 0 to 1.
    """
    # Trim the predicted scores to match the true scores length if necessary
    if len(predicted_relevance_scores) > len(true_relevance_weights):
        predicted_relevance_scores = predicted_relevance_scores[:len(true_relevance_weights)]
    dcg, idcg = 0.0, 0.0
    # Calculate DCG for the predicted ranking
    for i, score_index in enumerate(predicted_relevance_scores, start=1):
        if score_index - 1 < len(true_relevance_weights):
            relevance = true_relevance_weights[score_index - 1]
        else:
            relevance = 0
        dcg += (np.power(2, relevance) - 1) / np.log2(i + 1)
    # Calculate IDCG using sorted true relevance weights
    for i, weight in enumerate(sorted(true_relevance_weights, reverse=True), start=1):
        idcg += (np.power(2, weight) - 1) / np.log2(i + 1)
    # Avoid division by zero
    return 0 if idcg == 0 else dcg / idcg
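# Worked example (a minimal sketch with illustrative numbers): with predicted
# ranking [1, 2] and true relevance weights [0.5, 1.0], the DCG is
# (2**0.5 - 1)/log2(2) + (2**1.0 - 1)/log2(3) ≈ 1.045, while the ideal ordering
# [2, 1] gives IDCG = (2**1.0 - 1)/log2(2) + (2**0.5 - 1)/log2(3) ≈ 1.261,
# so calculate_ndcg([1, 2], [0.5, 1.0]) ≈ 0.83.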
def calculate_bleu_score(generated_text: str, reference_text: str, is_japanese: bool = False) -> float:
"""
Calculates the BLEU score for a generated text compared to a reference truth text. This function supports
both general text and Japanese-specific evaluation by using the sacrebleu library.
Parameters:
- generated_text (str): The generated text to be evaluated.
- reference_text (str): The reference truth text.
- is_japanese (bool, optional): Flag to indicate whether the text is in Japanese, requiring special tokenization.
Returns:
    - float: The BLEU score on a 0 to 1 scale for the generated text against the reference truth.
"""
global sacrebleu
if sacrebleu is None:
sacrebleu = evaluate.load("sacrebleu")
# Preprocess input texts
generated_text = generated_text.lstrip("\n").rstrip("\n").split("\n")[0]
candidate = [generated_text]
reference = [[reference_text]]
# Compute BLEU score with or without Japanese-specific tokenization
bleu_args = {"predictions": candidate, "references": reference, "lowercase": True}
if is_japanese:
bleu_args["tokenize"] = "ja-mecab"
score = sacrebleu.compute(**bleu_args)["score"] / 100
return score
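# Example usage (a minimal sketch; requires the `evaluate` package, which loads
# sacrebleu on first use):
#
#   bleu = calculate_bleu_score("a red cotton t-shirt", "red cotton t-shirt")
#   # `bleu` is a float on a 0-1 scale; pass is_japanese=True to switch to the
#   # ja-mecab tokenizer for Japanese text.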
# Guide to Writing Your Own Models
## Model Code Organization
For a streamlined experience, we suggest placing the code for all your models within the `models` directory. This is a recommendation for organizational purposes, but it's not a strict requirement.
## Model Base Class
Your models should inherit from the `ShopBenchBaseModel` class found in [base_model.py](base_model.py). We provide an example model, `dummy_model.py`, to illustrate how you might structure your own model. Crucially, your model class must implement the `batch_predict` method.
## Configuring Your Model
To ensure your model is recognized and utilized correctly, please specify your model class name in the [`user_config.py`](user_config.py) file, by following the instructions in the inline comments.
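For example, if your model class were named `YourModel` and lived in `models/your_model.py` (placeholder names, matching the pattern described in the inline comments), the relevant lines of `user_config.py` would look roughly like this:

```python
# user_config.py
from models.your_model import YourModel

UserModel = YourModel
```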
## Model Inputs and Outputs
### Inputs
- `batch` (`Dict[str, Any]`): A batch of inputs as a dictionary, with the following key:
  - `prompt` (`List[str]`): A list of prompts representing the tasks in the batch.
- `is_multiple_choice` (`bool`): Indicates whether the tasks in this batch are multiple choice questions.
### Outputs
The output from your model's `batch_predict` function should be a list of string responses for all the prompts in the input batch.
Depending on the task, each response could be:
- A single integer (in the range [0, 3]) for multiple choice tasks.
- A comma-separated list of integers for ranking tasks.
- A comma-separated list of named entities for Named Entity Recognition (NER) tasks.
- An (unconstrained) generated response for generation tasks.
For more information on how these responses are processed, please see [parsers.py](../parsers.py). A minimal sketch of a conforming model is included at the end of this section.
**Note** that the `task_type` will not be explicitly provided to your model. However, the information about the `task_type` is implicitly available in the prompt provided.
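The sketch below is purely illustrative (it is not part of the starter kit): saved as, say, `models/echo_model.py`, it shows the shape of a conforming model that inherits from `ShopBenchBaseModel`, reports a batch size, and returns one correctly formatted string per prompt, inferring the expected output format only from `is_multiple_choice` and the prompt text.

```python
from typing import Any, Dict, List

from .base_model import ShopBenchBaseModel


class EchoModel(ShopBenchBaseModel):
    """Toy model that returns fixed, correctly formatted answers."""

    def get_batch_size(self) -> int:
        return 8  # any value between 1 and 16

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        responses = []
        for prompt in batch["prompt"]:
            if is_multiple_choice:
                responses.append("0")            # single integer in [0, 3]
            elif "rank" in prompt.lower():
                responses.append("1, 2, 3, 4")   # comma-separated integers
            else:
                responses.append("a plausible free-form answer")
        return responses
```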
## Internet Access
Your model will not have access to the internet during evaluation. As such, you'll need to include any necessary model weights directly in your repository before submission. Ensure that your Model class is self-contained and fully operational without internet access.
from typing import Any, Dict, List
class ShopBenchBaseModel:
def __init__(self):
pass
def get_batch_size(self) -> int:
"""
Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
Returns:
int: The batch size, an integer between 1 and 16. This value indicates how many
queries should be processed together in a single batch. It can be dynamic
across different batch_predict calls, or stay a static value.
"""
raise NotImplementedError("get_batch_size method not implemented")
    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates a batch of predictions based on the associated prompts and task type.
        For multiple choice tasks, it randomly selects a choice.
        For other tasks, it returns a list of integers as a string,
        representing the model's prediction in a format compatible with task-specific parsers.

        Parameters:
        - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys:
            - prompt (List[str]): a list of input prompts for the model.
        - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
        List[str]: A list of predictions, one for each prompt received in the batch.
        Each prediction is
            a string representing a single integer in [0, 3] for multiple choice tasks,
            or a string representing a comma separated list of integers for Ranking and Retrieval tasks,
            or a string representing a comma separated list of named entities for Named Entity Recognition tasks,
            or a string representing the (unconstrained) generated response for generation tasks.
Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
"""
raise NotImplementedError("predict method not implemented")
import os
import random
from typing import Any, Dict, List

from .base_model import ShopBenchBaseModel

# Set a consistent seed for reproducibility
AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 3142))

class DummyModel(ShopBenchBaseModel):
    """
    A dummy model implementation for ShopBench, illustrating how to handle both
    multiple choice and other types of tasks like Ranking, Retrieval, and Named Entity Recognition.
    This model uses a consistent random seed for reproducible results.
    """
    def __init__(self):
        """Initializes the model and sets the random seed for consistency."""
        random.seed(AICROWD_RUN_SEED)

    def get_batch_size(self) -> int:
        """
        Determines the batch size that is used by the evaluator when calling the `batch_predict` function.

        Returns:
        int: The batch size, an integer between 1 and 16. This value indicates how many
            queries should be processed together in a single batch. It can be dynamic
            across different batch_predict calls, or stay a static value.
        """
        self.batch_size = 4
        return self.batch_size

    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates a batch of predictions based on the associated prompts and task type.
        For multiple choice tasks, it randomly selects a choice.
        For other tasks, it returns a list of integers as a string,
        representing the model's prediction in a format compatible with task-specific parsers.

        Parameters:
        - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys:
            - prompt (List[str]): a list of input prompts for the model.
        - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
        List[str]: A list of predictions, one for each prompt received in the batch.
        Each prediction is
            a string representing a single integer in [0, 3] for multiple choice tasks,
            or a string representing a comma separated list of integers for Ranking and Retrieval tasks,
            or a string representing a comma separated list of named entities for Named Entity Recognition tasks,
            or a string representing the (unconstrained) generated response for generation tasks.
        Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
        """
prompts = batch["prompt"]
possible_responses = [1, 2, 3, 4]
batch_response = []
for prompt in prompts:
if is_multiple_choice:
# Randomly select one of the possible responses for multiple choice tasks
batch_response.append(str(random.choice(possible_responses)))
else:
# For other tasks, shuffle the possible responses and return as a string
random.shuffle(possible_responses)
batch_response.append(str(possible_responses))
# Note: As this is dummy model, we are returning random responses for non-multiple choice tasks.
# For generation tasks, this should ideally return an unconstrained string.
return batch_response
# Importing DummyModel from the models package.
# The DummyModel class is located in the dummy_model.py file inside the 'models' directory.
from models.dummy_model import DummyModel
# This line establishes an alias for the DummyModel class to be used within this script.
# Instead of directly using DummyModel everywhere in the code, we're assigning it to 'UserModel'.
# This approach allows for easier reference to your model class when evaluating your models.
UserModel = DummyModel
# When implementing your own model please follow this pattern:
#
# from models.your_model import YourModel
#
# Replace 'your_model' with the name of your Python file containing the model class
# and 'YourModel' with the class name of your model.
#
# Finally, assign YourModel to UserModel as shown below to use it throughout your script.
#
# UserModel = YourModel
# For example, to use the Llama3 8B Instruct baseline, you can uncomment the lines below;
# please remember to download the model weights and check them into the repository
# before submitting.
# from models.vanilla_llama3_baseline import Llama3_8B_ZeroShotModel
# UserModel = Llama3_8B_ZeroShotModel
import os
import random
from typing import Any, Dict, List
import vllm
from .base_model import ShopBenchBaseModel
#### CONFIG PARAMETERS ---
# Set a consistent seed for reproducibility
AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 773815))
# Batch size you wish the evaluators will use to call the `batch_generate_answer` function
AICROWD_SUBMISSION_BATCH_SIZE = 16 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
# VLLM Parameters
VLLM_TENSOR_PARALLEL_SIZE = 4 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
VLLM_GPU_MEMORY_UTILIZATION = 0.85 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
class Llama3_8B_ZeroShotModel(ShopBenchBaseModel):
"""
    A zero-shot baseline for ShopBench that wraps the Meta Llama 3 8B Instruct model served with vLLM.
    It handles multiple choice tasks as well as Ranking, Retrieval, Named Entity Recognition, and generation tasks,
    and uses a consistent random seed for reproducible results.
"""
def __init__(self):
"""Initializes the model and sets the random seed for consistency."""
random.seed(AICROWD_RUN_SEED)
self.initialize_models()
def initialize_models(self):
# Initialize Meta Llama 3 - 8B Instruct Model
self.model_name = "models/meta-llama/Meta-Llama-3-8B-Instruct"
if not os.path.exists(self.model_name):
raise Exception(
f"""
The evaluators expect the model weights to be checked into the repository,
but we could not find the model weights at {self.model_name}
Please follow the instructions in the docs below to download and check in the model weights.
https://gitlab.aicrowd.com/aicrowd/challenges/amazon-kdd-cup-2024/amazon-kdd-cup-2024-starter-kit/-/blob/master/docs/download-baseline-model-weights.md
"""
)
# initialize the model with vllm
self.llm = vllm.LLM(
self.model_name,
worker_use_ray=True,
tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
trust_remote_code=True,
dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
enforce_eager=True
)
self.tokenizer = self.llm.get_tokenizer()
def get_batch_size(self) -> int:
"""
Determines the batch size that is used by the evaluator when calling the `batch_predict` function.
Returns:
int: The batch size, an integer between 1 and 16. This value indicates how many
queries should be processed together in a single batch. It can be dynamic
across different batch_predict calls, or stay a static value.
"""
self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
return self.batch_size
    def batch_predict(self, batch: Dict[str, Any], is_multiple_choice: bool) -> List[str]:
        """
        Generates a batch of predictions with the Llama 3 model based on the associated prompts and task type.
        For multiple choice tasks, only a single new token is generated;
        for all other tasks, up to 100 new tokens are generated and returned
        in a format compatible with the task-specific parsers.

        Parameters:
        - batch (Dict[str, Any]): A dictionary containing a batch of input prompts with the following keys:
            - prompt (List[str]): a list of input prompts for the model.
        - is_multiple_choice (bool): A boolean flag indicating if all the items in this batch belong to multiple choice tasks.

        Returns:
        List[str]: A list of predictions, one for each prompt received in the batch.
        Each prediction is
            a string representing a single integer in [0, 3] for multiple choice tasks,
            or a string representing a comma separated list of integers for Ranking and Retrieval tasks,
            or a string representing a comma separated list of named entities for Named Entity Recognition tasks,
            or a string representing the (unconstrained) generated response for generation tasks.
        Please refer to parsers.py for more details on how these responses will be parsed by the evaluator.
"""
prompts = batch["prompt"]
        # prepend the system prompt to each query
        formatted_prompts = self.format_prompts(prompts)
# set max new tokens to be generated
max_new_tokens = 100
if is_multiple_choice:
max_new_tokens = 1 # For MCQ tasks, we only need to generate 1 token
# Generate responses via vllm
responses = self.llm.generate(
formatted_prompts,
vllm.SamplingParams(
n=1, # Number of output sequences to return for each prompt.
top_p=0.9, # Float that controls the cumulative probability of the top tokens to consider.
temperature=0, # randomness of the sampling
                seed=AICROWD_RUN_SEED,  # Seed for reproducibility
skip_special_tokens=True, # Whether to skip special tokens in the output.
max_tokens=max_new_tokens, # Maximum number of tokens to generate per output sequence.
),
use_tqdm = False
)
# Aggregate answers into List[str]
batch_response = []
for response in responses:
batch_response.append(response.outputs[0].text)
if is_multiple_choice:
print("MCQ: ", batch_response)
return batch_response
    def format_prompts(self, prompts):
        """
        Formats prompts by prepending the shopping-assistant system prompt to each query.

        Parameters:
        - prompts (list of str): A list of queries to be formatted into prompts.

        Returns:
        - list of str: The formatted prompts that are passed to the model.
        """
system_prompt = "You are a helpful online shopping assistant. Please answer the following question about online shopping and follow the given instructions.\n\n"
formatted_prompts = []
for prompt in prompts:
formatted_prompts.append(system_prompt + prompt)
return formatted_prompts
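# A rough sketch (not the actual evaluator code) of how this model is driven
# during evaluation: the harness instantiates the class configured in
# user_config.py, asks for a batch size, and then repeatedly calls
# batch_predict on chunks of prompts.
#
#   model = Llama3_8B_ZeroShotModel()
#   batch_size = model.get_batch_size()
#   for start in range(0, len(prompts), batch_size):
#       batch = {"prompt": prompts[start:start + batch_size]}
#       predictions = model.batch_predict(batch, is_multiple_choice=False)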