AIcrowd / Meta Comprehensive RAG Benchmark - KDD Cup 2024 / Meta Comphrehensive RAG Benchmark starter kit

Commit 5e714bdf, authored 10 months ago by spmohanty:
aggregate config params in a single place

Parent: 82917a01
No related branches found. No related tags found.
1 merge request: !4 Batch predict interface v0

Showing 2 changed files, with 33 additions and 8 deletions:

    models/rag_llama_baseline.py       +19 −5
    models/vanilla_llama_baseline.py   +14 −3
models/rag_llama_baseline.py  (+19 −5)
@@ -38,6 +38,9 @@ from sentence_transformers import SentenceTransformer
 # **Note**: This environment variable will not be available for Task 1 evaluations.
 CRAG_MOCK_API_URL = os.getenv("CRAG_MOCK_API_URL", "http://localhost:8000")
+#### CONFIG PARAMETERS ---
+# Define the number of context sentences to consider for generating an answer.
+NUM_CONTEXT_SENTENCES = 20
 # Set the maximum length for each context sentence (in characters).
@@ -45,6 +48,18 @@ MAX_CONTEXT_SENTENCE_LENGTH = 1000
 # Set the maximum context references length (in characters).
 MAX_CONTEXT_REFERENCES_LENGTH = 4000
+
+# Batch size you wish the evaluators will use to call the `batch_generate_answer` function
+AICROWD_SUBMISSION_BATCH_SIZE = 8  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# Sentence Transformer Parameters
+SENTENTENCE_TRANSFORMER_BATCH_SIZE = 128  # TUNE THIS VARIABLE depending on the size of your embedding model and GPU mem available
+
+#### CONFIG PARAMETERS END---

 class ChunkExtractor:

     @ray.remote
@@ -173,8 +188,8 @@ class RAGModel:
         # Initialize the model with vllm
         self.llm = vllm.LLM(
             self.model_name,
-            tensor_parallel_size=4, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
-            gpu_memory_utilization=0.85, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
             trust_remote_code=True,
             dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
             enforce_eager=True
@@ -188,7 +203,6 @@ class RAGModel:
                 "cuda" if torch.cuda.is_available() else "cpu"
             ),
         )
-        self.sentence_model_inference_batch_size = 128 # TUNE THIS VARIABLE depending on the size of your embedding model and GPU mem available

     def calculate_embeddings(self, sentences):
         """
@@ -207,7 +221,7 @@ class RAGModel:
         embeddings = self.sentence_model.encode(
             sentences=sentences,
             normalize_embeddings=True,
-            batch_size=self.sentence_model_inference_batch_size,
+            batch_size=SENTENTENCE_TRANSFORMER_BATCH_SIZE,
         )
         # Note: There is an opportunity to parallelize the embedding generation across 4 GPUs
         #       but sentence_model.encode_multi_process seems to interefere with Ray
@@ -228,7 +242,7 @@ class RAGModel:
             int: The batch size, an integer between 1 and 16. It can be dynamic
                  across different batch_generate_answer calls, or stay a static value.
         """
-        self.batch_size = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
         return self.batch_size

     def batch_generate_answer(self, batch: Dict[str, Any]) -> List[str]:
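With the constants hoisted to module scope, the batch interface in this file is driven entirely by AICROWD_SUBMISSION_BATCH_SIZE. The sketch below shows one way a local harness might exercise that interface; it is illustrative only, and the get_batch_size method name, the no-argument RAGModel() constructor, and the dict-of-parallel-lists batch shape are assumptions inferred from the docstrings visible in this diff, not part of the commit.

# Hypothetical local driver for the RAG baseline (sketch only, not the official
# AIcrowd evaluator). Assumes `dataset` is a list of per-query dicts that all
# share the same keys; each slice is collated into a dict of parallel lists
# before being handed to batch_generate_answer.
from models.rag_llama_baseline import RAGModel

def run_local_eval(dataset):
    model = RAGModel()
    batch_size = model.get_batch_size()  # now returns AICROWD_SUBMISSION_BATCH_SIZE (8)
    predictions = []
    for start in range(0, len(dataset), batch_size):
        rows = dataset[start:start + batch_size]
        # Collate the per-query dicts into a dict of parallel lists.
        batch = {key: [row[key] for row in rows] for key in rows[0]}
        predictions.extend(model.batch_generate_answer(batch))
    return predictions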
models/vanilla_llama_baseline.py  (+14 −3)
@@ -35,6 +35,17 @@ from models.utils import trim_predictions_to_max_token_length
 CRAG_MOCK_API_URL = os.getenv("CRAG_MOCK_API_URL", "http://localhost:8000")
+
+#### CONFIG PARAMETERS ---
+# Batch size you wish the evaluators will use to call the `batch_generate_answer` function
+AICROWD_SUBMISSION_BATCH_SIZE = 8  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85  # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+#### CONFIG PARAMETERS END---
+
 class InstructModel:
     def __init__(self):
         """
@@ -63,8 +74,8 @@ class InstructModel:
         # initialize the model with vllm
         self.llm = vllm.LLM(
             self.model_name,
-            tensor_parallel_size=4, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
-            gpu_memory_utilization=0.85, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
             trust_remote_code=True,
             dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
             enforce_eager=True
@@ -80,7 +91,7 @@ class InstructModel:
             queries should be processed together in a single batch. It can be dynamic
             across different batch_generate_answer calls, or stay a static value.
         """
-        self.batch_size = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
         return self.batch_size

     def batch_generate_answer(self, batch: Dict[str, Any]) -> List[str]:
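Because both baselines now read these knobs from a single block, retuning for a different GPU request means editing one place per file. The file already reads CRAG_MOCK_API_URL through os.getenv; the snippet below is a sketch of applying the same pattern to the new constants so they could be tuned without code edits. The environment-variable names are hypothetical and not part of this commit.

import os

# Sketch only: the committed code hardcodes these values; the getenv fallback
# keeps the committed defaults. Variable names are hypothetical.
AICROWD_SUBMISSION_BATCH_SIZE = int(os.getenv("AICROWD_SUBMISSION_BATCH_SIZE", "8"))
VLLM_TENSOR_PARALLEL_SIZE = int(os.getenv("VLLM_TENSOR_PARALLEL_SIZE", "4"))  # e.g. 1 when requesting a single GPU
VLLM_GPU_MEMORY_UTILIZATION = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.85"))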