diff --git a/models/rag_llama_baseline.py b/models/rag_llama_baseline.py
index bd48c92cee3ba04ff0a13ab7d30514d1f2cea6b5..80df19a7c12268819602c4b35aa182d45303262d 100644
--- a/models/rag_llama_baseline.py
+++ b/models/rag_llama_baseline.py
@@ -38,6 +38,9 @@ from sentence_transformers import SentenceTransformer
 # **Note**: This environment variable will not be available for Task 1 evaluations.
 CRAG_MOCK_API_URL = os.getenv("CRAG_MOCK_API_URL", "http://localhost:8000")
 
+
+#### CONFIG PARAMETERS ---
+
 # Define the number of context sentences to consider for generating an answer.
 NUM_CONTEXT_SENTENCES = 20
 # Set the maximum length for each context sentence (in characters).
@@ -45,6 +48,18 @@ MAX_CONTEXT_SENTENCE_LENGTH = 1000
 # Set the maximum context references length (in characters).
 MAX_CONTEXT_REFERENCES_LENGTH = 4000
 
+# Batch size you wish the evaluators will use to call the `batch_generate_answer` function
+AICROWD_SUBMISSION_BATCH_SIZE = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# Sentence Transformer Parameters
+SENTENTENCE_TRANSFORMER_BATCH_SIZE = 128 # TUNE THIS VARIABLE depending on the size of your embedding model and GPU mem available
+
+#### CONFIG PARAMETERS END---
+
 class ChunkExtractor:
 
     @ray.remote
@@ -173,8 +188,8 @@ class RAGModel:
         # Initialize the model with vllm
         self.llm = vllm.LLM(
             self.model_name,
-            tensor_parallel_size=4, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
-            gpu_memory_utilization=0.85, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
             trust_remote_code=True,
             dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
             enforce_eager=True
@@ -188,7 +203,6 @@ class RAGModel:
                 "cuda" if torch.cuda.is_available() else "cpu"
             ),
         )
-        self.sentence_model_inference_batch_size = 128 # TUNE THIS VARIABLE depending on the size of your embedding model and GPU mem available
 
     def calculate_embeddings(self, sentences):
         """
@@ -207,7 +221,7 @@ class RAGModel:
         embeddings = self.sentence_model.encode(
             sentences=sentences,
             normalize_embeddings=True,
-            batch_size=self.sentence_model_inference_batch_size,
+            batch_size=SENTENTENCE_TRANSFORMER_BATCH_SIZE,
         )
         # Note: There is an opportunity to parallelize the embedding generation across 4 GPUs
         # but sentence_model.encode_multi_process seems to interefere with Ray
@@ -228,7 +242,7 @@ class RAGModel:
             int: The batch size, an integer between 1 and 16. It can be dynamic
                  across different batch_generate_answer calls, or stay a static value.
         """
-        self.batch_size = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
        return self.batch_size
 
    def batch_generate_answer(self, batch: Dict[str, Any]) -> List[str]:
diff --git a/models/vanilla_llama_baseline.py b/models/vanilla_llama_baseline.py
index 767397a8c98cbf703b015ae40829d3edfc8641f5..24a9424ca8d5a895005f0e0ad83d8b6feaf7fd9f 100644
--- a/models/vanilla_llama_baseline.py
+++ b/models/vanilla_llama_baseline.py
@@ -35,6 +35,17 @@ from models.utils import trim_predictions_to_max_token_length
 CRAG_MOCK_API_URL = os.getenv("CRAG_MOCK_API_URL", "http://localhost:8000")
 
+#### CONFIG PARAMETERS ---
+
+# Batch size you wish the evaluators will use to call the `batch_generate_answer` function
+AICROWD_SUBMISSION_BATCH_SIZE = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+# VLLM Parameters
+VLLM_TENSOR_PARALLEL_SIZE = 4 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+VLLM_GPU_MEMORY_UTILIZATION = 0.85 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+
+#### CONFIG PARAMETERS END---
+
 
 class InstructModel:
     def __init__(self):
         """
@@ -63,8 +74,8 @@ class InstructModel:
         # initialize the model with vllm
         self.llm = vllm.LLM(
             self.model_name,
-            tensor_parallel_size=4, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
-            gpu_memory_utilization=0.85, # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+            tensor_parallel_size=VLLM_TENSOR_PARALLEL_SIZE,
+            gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
             trust_remote_code=True,
             dtype="half", # note: bfloat16 is not supported on nvidia-T4 GPUs
             enforce_eager=True
@@ -80,7 +91,7 @@ class InstructModel:
                 queries should be processed together in a single batch. It can be dynamic
                 across different batch_generate_answer calls, or stay a static value.
         """
-        self.batch_size = 8 # TUNE THIS VARIABLE depending on the number of GPUs you are requesting and the size of your model.
+        self.batch_size = AICROWD_SUBMISSION_BATCH_SIZE
         return self.batch_size
 
     def batch_generate_answer(self, batch: Dict[str, Any]) -> List[str]:
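Reviewer note, not part of the diff: the new `#### CONFIG PARAMETERS ---` blocks still hold hard-coded values that the `# TUNE THIS VARIABLE` comments ask each team to adjust by hand. Below is a minimal sketch of how such values could instead be derived at runtime. The environment-variable names and fallback defaults are illustrative assumptions; only `torch.cuda.device_count()` and `os.getenv()` are standard APIs, and the 1..16 bound on the batch size comes from the `get_batch_size` docstrings above.

```python
# Sketch only (assumed env-var names, not part of the baselines): derive the
# tunable config values from the runtime environment instead of hard-coding them.
import os

import torch  # already a dependency of both baseline files

# GPUs visible to this process; fall back to 1 on CPU-only machines.
num_gpus = max(torch.cuda.device_count(), 1)

# vLLM tensor parallelism: the model's attention-head count must be divisible
# by this value, so the visible GPU count is only a starting point.
VLLM_TENSOR_PARALLEL_SIZE = int(os.getenv("VLLM_TENSOR_PARALLEL_SIZE", num_gpus))
VLLM_GPU_MEMORY_UTILIZATION = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.85"))

# The get_batch_size() docstrings bound the evaluator batch size to 1..16.
AICROWD_SUBMISSION_BATCH_SIZE = min(
    max(int(os.getenv("AICROWD_SUBMISSION_BATCH_SIZE", "8")), 1), 16
)
SENTENTENCE_TRANSFORMER_BATCH_SIZE = int(
    os.getenv("SENTENTENCE_TRANSFORMER_BATCH_SIZE", "128")
)
```

Keeping all of these in a single clearly delimited block per file, as this change does, also makes a later consolidation into one shared config module straightforward.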