Skip to content
Snippets Groups Projects
Commit 1b510bb5 authored by spmohanty's avatar spmohanty
Browse files

add tokenizer + response trimming

parent 9a7c5b04
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,9 @@ from loguru import logger
from openai import APIConnectionError, OpenAI, RateLimitError
from prompts.templates import IN_CONTEXT_EXAMPLES, INSTRUCTIONS
from tqdm.auto import tqdm
from transformers import LlamaTokenizerFast
# Load the Llama tokenizer from the local "tokenizer" directory.
# NOTE(review): the path is relative to the current working directory —
# confirm the evaluation harness always runs from the repo root.
tokenizer = LlamaTokenizerFast.from_pretrained("tokenizer")
def load_json_file(file_path):
......@@ -41,10 +44,12 @@ def attempt_api_call(client, model_name, messages, max_retries=10):
return None
def log_response(messages, response):
def log_response(messages, response, output_directory="api_responses"):
    """Save an API request/response pair to a timestamped JSON file.

    Args:
        messages: The chat messages that were sent to the API.
        response: The response payload returned by the API.
        output_directory: Directory the JSON log is written to; created
            if it does not already exist. Defaults to "api_responses".
    """
    os.makedirs(output_directory, exist_ok=True)
    # Timestamped filename; second-level resolution may collide if two
    # responses are logged within the same second.
    file_name = datetime.now().strftime("%d-%m-%Y-%H-%M-%S.json")
    file_path = os.path.join(output_directory, file_name)
    # Explicit UTF-8 so the log encoding does not depend on the platform locale.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump({"messages": messages, "response": response}, f)
......@@ -71,6 +76,13 @@ def parse_response(resp: str):
except:
return -1
def trim_predictions_to_max_token_length(prediction, max_token_length=75):
    """Trim a prediction string to at most ``max_token_length`` tokens.

    Args:
        prediction: The model's raw answer text.
        max_token_length: Maximum number of content tokens to keep.
            Defaults to 75, the original hard-coded limit.

    Returns:
        The prediction re-decoded from its first ``max_token_length``
        content tokens (unchanged if it was already short enough).
    """
    tokenized_prediction = tokenizer.encode(prediction)
    # Skip index 0: the tokenizer prepends a BOS token (add_bos_token=True in
    # tokenizer_config.json), which is not part of the prediction text.
    trimmed_tokenized_prediction = tokenized_prediction[1 : max_token_length + 1]
    trimmed_prediction = tokenizer.decode(trimmed_tokenized_prediction)
    return trimmed_prediction
def generate_predictions(dataset_path, participant_model):
qa = load_json_file(os.path.join(dataset_path, "qa.json"))
......@@ -82,6 +94,8 @@ def generate_predictions(dataset_path, participant_model):
prediction = participant_model.generate_answer(
query, query_web_search_results
)
# trim prediction to 75 tokens
prediction = trim_predictions_to_max_token_length(prediction)
predictions.append(
{
"query": query,
......
import os
from typing import List
from models.utils import trim_predictions_to_max_token_length
# Load the environment variable that specifies the URL of the MockAPI. This URL is essential
# for accessing the correct API endpoint in Task 2 and Task 3. The value of this environment variable
# may vary across different evaluation settings, emphasizing the importance of dynamically obtaining
......@@ -44,4 +46,7 @@ class DummyModel:
# Default response when unsure about the answer
answer = "i don't know"
return answer
# Trim prediction to a max of 75 tokens
trimmed_answer = trim_predictions_to_max_token_length(answer)
return trimmed_answer
#!/usr/bin/env python
import os
from transformers import LlamaTokenizerFast
# Resolve the bundled tokenizer directory relative to this file so loading
# works regardless of the current working directory.
tokenizer_path = os.path.join(os.path.dirname(__file__), "..", "tokenizer")
tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_path)
def trim_predictions_to_max_token_length(prediction, max_token_length=75):
    """Trim a prediction string to at most ``max_token_length`` tokens.

    Args:
        prediction: The model's raw answer text.
        max_token_length: Maximum number of content tokens to keep.
            Defaults to 75, the original hard-coded limit.

    Returns:
        The prediction re-decoded from its first ``max_token_length``
        content tokens (unchanged if it was already short enough).
    """
    tokenized_prediction = tokenizer.encode(prediction)
    # Skip index 0: the tokenizer prepends a BOS token (add_bos_token=True in
    # tokenizer_config.json), which is not part of the prediction text.
    trimmed_tokenized_prediction = tokenized_prediction[1 : max_token_length + 1]
    trimmed_prediction = tokenizer.decode(trimmed_tokenized_prediction)
    return trimmed_prediction
\ No newline at end of file
# hf-internal-testing/llama-tokenizer
This tokenizer has been obtained from: https://huggingface.co/hf-internal-testing/llama-tokenizer
\ No newline at end of file
{
"bos_token": {
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}
source diff could not be displayed: it is too large. Options to address this: view the blob.
File added
{
"add_bos_token": true,
"add_eos_token": false,
"bos_token": {
"__type": "AddedToken",
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"clean_up_tokenization_spaces": false,
"eos_token": {
"__type": "AddedToken",
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"model_max_length": 2048,
"pad_token": null,
"sp_model_kwargs": {},
"tokenizer_class": "LlamaTokenizer",
"unk_token": {
"__type": "AddedToken",
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment