From 048f5f45c7d5756576150b7a8a6a694904ead99f Mon Sep 17 00:00:00 2001
From: "S.P. Mohanty" <spmohanty91@gmail.com>
Date: Sun, 21 Apr 2024 23:59:35 +0000
Subject: [PATCH] use lxml parser in baseline

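Switch the BeautifulSoup parser in the RAG baseline from the stdlib
"html.parser" to "lxml", which is noticeably faster when extracting text
from the crawled HTML pages, and add lxml to requirements.txt so the
dependency is installed in the evaluation environment.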
---
 models/rag_llama_baseline.py | 6 ++----
 requirements.txt             | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/models/rag_llama_baseline.py b/models/rag_llama_baseline.py
index 1e1f11d..0f16d3a 100644
--- a/models/rag_llama_baseline.py
+++ b/models/rag_llama_baseline.py
@@ -126,9 +126,7 @@ class RAGModel:
         # Process each HTML text from the search results to extract text content.
         for html_text in search_results:
             # Parse the HTML content to extract text.
-            soup = BeautifulSoup(
-                html_text["page_result"], features="html.parser"
-            )
+            soup = BeautifulSoup(html_text["page_result"], features="lxml")
             text = soup.get_text().replace("\n", "")
             if len(text) > 0:
                 # Convert the text into sentences and extract their offsets.
@@ -179,7 +177,7 @@ class RAGModel:
             # If the model fails to generate an answer, return a default response.
             answer = "I don't know"
 
-        # Trim the prediction to a maximum of 75 tokens (this function needs to be defined).
+        # Trim the prediction to a maximum of 75 tokens to meet the submission requirements.
         trimmed_answer = trim_predictions_to_max_token_length(answer)
 
         return trimmed_answer
diff --git a/requirements.txt b/requirements.txt
index 951fd3e..5fc9efd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,7 @@ blingfire
 hf-transfer
 huggingface-hub
 loguru
+lxml
 openai==1.13.3
 sentence_transformers
 torch
-- 
GitLab