initial commit

a443997f · Dipam Chakraborty · a443997f · a443997f · a443997f · a443997f
Commit a443997f authored 1 year ago by Dipam Chakraborty
--- a/.gitattributes
+++ b/.gitattributes
+example_data/qa.json filter=lfs diff=lfs merge=lfs -text
+example_data/web.json filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
+.vscode
+__pycache__
+api_responses/*.json
\ No newline at end of file
--- a/Dockerfile
+++ b/Dockerfile
+## This is an example Dockerfile you can change to make submissions on aicrowd
+## To use it, place it in the base of the repo, and remove the underscore (_) from the filename
+
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+
+# Keep apt from prompting for input during the image build
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install the system packages listed in apt.txt
+COPY apt.txt /tmp/apt.txt
+RUN apt -qq update && apt -qq install -y --no-install-recommends `cat /tmp/apt.txt` \
+ && rm -rf /var/cache/*
+# locales is needed for locale-gen below; wget is used to download Miniconda
+RUN apt install -y locales wget
+
+# Unicode support:
+RUN locale-gen en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
+
+# Create user home directory - This is needed for aicrowd submissions
+ENV USER_NAME aicrowd 
+ENV HOME_DIR /home/$USER_NAME
+
+# Replace HOST_UID/HOST_GID with your user / group id
+# NOTE(review): HOST_GID is declared but not passed to adduser below, and the
+# COPY --chown flags further down hard-code 1001:1001 instead of referencing
+# these variables - keep them in sync if you change the ids.
+ENV HOST_UID 1001
+ENV HOST_GID 1001
+
+# Use bash as default shell, rather than sh
+ENV SHELL /bin/bash
+
+# Set up user
+RUN adduser --disabled-password \
+    --gecos "Default user" \
+    --uid ${HOST_UID} \
+    ${USER_NAME}
+
+# Everything from here on runs as the unprivileged user inside its home directory
+USER ${USER_NAME}
+WORKDIR ${HOME_DIR}
+
+ENV CONDA_DIR ${HOME_DIR}/.conda
+
+
+# Install Miniconda (py38 build, 22.11.1-1) into CONDA_DIR and delete the installer
+RUN wget -nv -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh \
+ && bash miniconda.sh -b -p ${CONDA_DIR} \
+ && . ${CONDA_DIR}/etc/profile.d/conda.sh \
+ && conda clean -y -a \
+ && rm -rf miniconda.sh
+
+# Put the conda binaries first on PATH so the conda/pip calls below use them
+ENV PATH ${CONDA_DIR}/bin:${PATH}
+
+RUN conda install cmake -y && conda clean -y -a
+# requirements.txt is copied and installed before the rest of the repo is copied in
+COPY --chown=1001:1001 requirements.txt ${HOME_DIR}/requirements.txt
+RUN pip install -r requirements.txt --no-cache-dir
+
+# Copy the whole repository into the user's home directory
+COPY --chown=1001:1001 . ${HOME_DIR}
+
+## Add your custom commands below
--- a/README.md
+++ b/README.md
--- a/api_responses/.gitkeep
+++ b/api_responses/.gitkeep
--- a/apt.txt
+++ b/apt.txt
+git
\ No newline at end of file
--- a/example_data/qa.json
+++ b/example_data/qa.json
--- a/example_data/web.json
+++ b/example_data/web.json
--- a/local_evaluation.py
+++ b/local_evaluation.py
+import json
+import os
+from tqdm.auto import tqdm
+from openai import OpenAI, APIConnectionError, RateLimitError
+from datetime import datetime
+
+from models.user_config import UserModel
+
+def get_system_message():
+    """Build the system prompt for the LLM judge used by ``gpt_eval``.
+
+    The prompt is the grading instructions followed by worked examples. Each
+    example ends with an ``Accuracy: True/False`` line and a
+    ``Missing: True/False`` line; ``gpt_eval`` looks for the substrings
+    ``"Accuracy: True"`` and ``"Missing: True"`` in the judge's reply, so the
+    prompt asks for exactly that output format.
+
+    Returns:
+        str: The full system message (instructions + in-context examples).
+    """
+    INSTRUCTIONS = """
+    You are given a question, a list of ground truth answers, and a prediction. Check whether the prediction is correct by comparing it to the list of ground truth answers. You should evaluate for Accuracy and Missing. 
+    - For Missing, check whether the prediction returns any concrete answer. If the prediction is "I don't know", "I don't have enough information to answer", or similar responses, Missing should be True, otherwise Missing should be False.  
+    - For Accuracy, check whether a prediction is "correct" according to the ground truth answers. If the prediction is correct, Accuracy should be "True"; if the prediction is wrong, Accuracy should be "False". If the ground truth answer contains a number, the prediction needs to predict a number that matches the ground truth answer for the accuracy to be True.\n 
+    """
+
+    # The requested reply format must stay in sync with the substring checks
+    # in gpt_eval ("Accuracy: True" / "Missing: True").
+    IN_CONTEXT_EXAMPLES = """
+    You need to check whether the prediction of a question-answering system to a question is Accurate or Missing. You should make the judgment based on a list of ground truth answers provided to you. Your response should consist of exactly two lines, "Accuracy: True" or "Accuracy: False" followed by "Missing: True" or "Missing: False", in the same format as the examples below.
+    Examples:
+    Question: Who authored The Taming of the Shrew (published in 2002)?
+    Ground truth: ["William Shakespeare", "Roma Gill"]
+    Prediction: W Shakespeare
+    Accuracy: True
+    Missing: False
+
+    Question: how many seconds is 3 minutes 15 seconds?
+    Ground truth: ["195 seconds"]
+    Prediction: 3 minutes 15 seconds is 195 seconds.
+    Accuracy: True
+    Missing: False
+
+    Question: Who authored The Taming of the Shrew (published in 2002)?
+    Ground truth: ["William Shakespeare", "Roma Gill"]
+    Prediction: The author to The Taming of the Shrew is Roma Shakespeare.
+    Accuracy: False
+    Missing: False
+    
+    Question: Who played Sheldon in Big Bang Theory?
+    Ground truth: ["Jim Parsons", "Iain Armitage"]
+    Prediction: I am sorry I don't know.
+    Accuracy: False
+    Missing: True
+    
+    Question: What country is Maharashtra Metro Rail Corporation Limited
+    located in?
+    Ground truth: ["India"]
+    Prediction: Maharashtra
+    Accuracy: False
+    Missing: False
+    
+    Question: What’s the job of Song Kang-ho in Parasite (2019)?
+    Ground truth: ["actor"]
+    Prediction: He plays the role of Kim Ki-taek, the patriarch of the Kim
+    family. He also plays in the TV Show Friends.
+    Accuracy: True
+    Missing: False
+    
+    Question: Which era did Michael Oakeshott belong to?
+    Ground truth: ["20th-century philosophy"]
+    Prediction: 20th century.
+    Accuracy: True
+    Missing: False
+    
+    Question: What the Meta stock price now?
+    Ground truth: ["382.63 dollars"]
+    Prediction: I apologize, but I couldn't find any information on this.
+    Accuracy: False
+    Missing: True
+    
+    Question: How long is the great wall of China?
+    Ground truth: ["13171 miles"]
+    Prediction: The great wall of China is 13271 miles.
+    Accuracy: False
+    Missing: False
+    
+    Question: When did Brian Fleming release his last song?
+    Ground truth: ["2023"]
+    Prediction: Brian Fleming has not released any songs as a musician. He is a playwright and actor, not a musician. My apologies for the confusion.
+    Accuracy: False
+    Missing: False
+    
+    Question: How old is Maria Carrie?
+    Ground truth: ["53 year old"]
+    Prediction: Could you please provide more context or information about who or what this person or thing is?
+    Accuracy: False
+    Missing: True
+    
+    Question: What is the best role of Mayim Bialik?
+    Ground truth: ["Amy Farrah Fowler in Big Bang Theory"]
+    Prediction: Mayim Bialik is not a known artist. It's possible that you intended to ask about Meryl Streep.
+    Accuracy: False
+    Missing: True
+    """
+
+    return INSTRUCTIONS + IN_CONTEXT_EXAMPLES
+
+def gpt_eval(client, model_name, query, gt, pred):
+    """Ask a chat model to grade one prediction against the ground truth.
+
+    The request messages and the raw reply are also written to a JSON file
+    under ``api_responses/`` for later inspection.
+
+    Args:
+        client: Object exposing ``chat.completions.create`` (``evaluate``
+            passes an ``OpenAI`` client).
+        model_name: Name of the chat model used as the judge.
+        query: The question text.
+        gt: Ground truth answer(s); interpolated into the prompt as-is.
+        pred: The prediction to grade.
+
+    Returns:
+        tuple[bool, bool]: ``(miss, correct)`` - whether the reply contains
+        ``"Missing: True"`` and ``"Accuracy: True"`` respectively.
+
+    Raises:
+        RuntimeError: If no reply text was obtained after ``MAX_RETRIES``
+            attempts (previously this surfaced as an UnboundLocalError).
+    """
+    import time
+    import traceback
+
+    messages = [
+        {
+            "role": "system",
+            "content": get_system_message(),
+        },
+        {
+            "role": "user",
+            "content": f"Question: {query}\n Ground truth: {gt}\n Prediction: {pred}\n",
+        },
+    ]
+
+    MAX_RETRIES = 10
+    response = None
+    for attempt in range(MAX_RETRIES):
+        try:
+            response = client.chat.completions.create(model=model_name, messages=messages).choices[0].message.content
+        except (APIConnectionError, RateLimitError):
+            # Transient failures: log, then back off before retrying so we do
+            # not hammer the API (no sleep after the final attempt).
+            print(traceback.format_exc())
+            if attempt < MAX_RETRIES - 1:
+                time.sleep(min(2 ** attempt, 30))
+            continue
+        except Exception:
+            # Any other error is logged and retried; unlike a bare ``except``
+            # this lets KeyboardInterrupt / SystemExit propagate.
+            print(traceback.format_exc())
+            continue
+        if response is not None:
+            break
+
+    if response is None:
+        raise RuntimeError(
+            f"No response received from model {model_name!r} after {MAX_RETRIES} attempts"
+        )
+
+    # Microsecond resolution keeps several calls within the same second from
+    # overwriting each other's log file.
+    os.makedirs("api_responses", exist_ok=True)
+    fname = datetime.now().strftime("%d-%m-%Y-%H-%M-%S-%f")
+    with open(f"api_responses/{fname}.json", 'w') as f:
+        json.dump({
+            "messages": messages,
+            "response": response,
+        }, f)
+
+    miss = "Missing: True" in response
+    correct = "Accuracy: True" in response
+    return miss, correct
+
+def evaluate(dataset_path, model_name):
+    """Run the participant model over a dataset and print aggregate scores.
+
+    Reads ``qa.json`` and ``web.json`` from ``dataset_path``, generates one
+    answer per question with ``UserModel``, grades each (trimmed) answer with
+    ``gpt_eval``, and reports exact-match accuracy, judged accuracy and the
+    missing rate.
+
+    Args:
+        dataset_path: Directory containing ``qa.json`` and ``web.json``.
+        model_name: Name of the OpenAI chat model used as the judge.
+
+    Returns:
+        dict: The printed results, with keys ``"Exact Accuracy"``,
+        ``"Accuracy"``, ``"Missing"`` and ``"Total"``.
+
+    Raises:
+        ValueError: If ``web.json`` has fewer entries than ``qa.json``.
+    """
+    # Load dataset
+    with open(os.path.join(dataset_path, 'qa.json')) as f:
+        qa = json.load(f)
+    with open(os.path.join(dataset_path, 'web.json')) as f:
+        web_results = json.load(f)
+
+    total = len(qa)
+    # Search results are looked up by question index, so fail early with a
+    # clear message instead of an IndexError halfway through the run.
+    if len(web_results) < total:
+        raise ValueError(
+            f"web.json has {len(web_results)} entries but qa.json has {total}"
+        )
+
+    # Setup
+    openai_client = OpenAI()
+    participant_model = UserModel()
+    char_lim = 50  # TODO: Set actual character limit based on query
+
+    n_miss, n_correct, n_exact = 0, 0, 0
+
+    # Eval loop
+    for i, qdict in tqdm(enumerate(qa), total=total):
+        query = qdict['q']
+        gt = qdict['fact_ans']  # assumes a single answer string - TODO confirm
+        query_web_res = web_results[i]
+        prediction = participant_model.generate_answer(query, query_web_res, character_limit=char_lim)
+        prediction_trimmed = prediction[:char_lim]
+        miss, correct = gpt_eval(openai_client, model_name, query, gt, prediction_trimmed)
+        # Accumulate (the counter used to be overwritten on every iteration,
+        # so only the last question counted towards exact accuracy).
+        n_exact += (prediction.strip() == gt.strip())
+        n_miss += miss
+        n_correct += correct
+
+    # Scores (an empty dataset yields 0.0 rather than a ZeroDivisionError)
+    denom = total if total else 1
+    results = {
+        "Exact Accuracy": n_exact / denom,
+        "Accuracy": n_correct / denom,
+        "Missing": n_miss / denom,
+        "Total": total
+    }
+    print(results)
+    return results
+
+
+if __name__ == '__main__':
+    # Script entry point: evaluate the configured UserModel on the bundled
+    # example data, using the named OpenAI chat model as the judge.
+    DATASET_PATH = "example_data/"
+    MODEL_NAME = "gpt-4"
+    evaluate(DATASET_PATH, MODEL_NAME)
\ No newline at end of file
--- a/models/README.md
+++ b/models/README.md
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
+from typing import List
+
+class DummyModel:
+    """Placeholder model that replies to every query with one fixed refusal."""
+
+    def __init__(self):
+        """ Initialize your models here """
+
+    def generate_answer(self, query: str, search_results: List[str], character_limit: int) -> str:
+        """
+        Produce an answer for a query given its pre-cached search results.
+
+        Inputs -
+            query - The question, as a string
+            search_results - List of strings, each the scraped HTML text of one search hit
+            character_limit - Maximum number of characters allowed in the answer (can vary per query)
+        Returns -
+            string response - Plain-text answer; anything beyond the character limit
+                              will be trimmed by the caller
+        """
+        # The dummy ignores all of its inputs and always declines.
+        return "I'm sorry, I can't help with that."
\ No newline at end of file
--- a/models/user_config.py
+++ b/models/user_config.py
+from models.dummy_model import DummyModel
+
+# local_evaluation.py imports and instantiates `UserModel`; it currently
+# aliases the placeholder DummyModel.
+UserModel = DummyModel
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+torch
+transformers
+# Imported by local_evaluation.py (uses the `OpenAI` client class from openai 1.x)
+openai>=1.0
+tqdm
\ No newline at end of file