diff --git a/local_evaluation.py b/local_evaluation.py
index c40f7215b6d3cf68b270b5fd0b91ad567fd1f73b..3484c15a892e2815426a64f7c8b044146ee841d5 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -5,19 +5,21 @@ from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer
 
 import metrics
-from models.user_config import UserModel
+
 
 def print_sample(i, generation, truth, metric, score):
     print(f"Sample {i}, generation: {generation}")
     print(f"Sample {i}, truth: {truth}")
     if isinstance(score, tuple) and len(score) == 3:
-        print(f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}")
+        print(
+            f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
+        )
     else:
         print(f"Metric ({metric}): {score}")
     print()
 
+
-def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
-    model = UserModel()
+def run_and_evaluate(data_df, model, max_eval_rows, print_interval=200):
 
     if max_eval_rows < len(data_df):
         data_df_eval = data_df.sample(max_eval_rows)
@@ -27,91 +29,126 @@ def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
     # Run model
     outputs = []
     task_methods = {
-        'multiple-choice': model.task_multichoice,
-        'generation': model.task_generation,
-        'retrieval': model.task_retrieval,
-        'ranking': model.task_ranking,
-        'named_entity_recognition': model.task_named_entity_recognition,
+        "multiple-choice": model.task_multichoice,
+        "generation": model.task_generation,
+        "retrieval": model.task_retrieval,
+        "ranking": model.task_ranking,
+        "named_entity_recognition": model.task_named_entity_recognition,
     }
 
-    for _, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Processing'):
-        task_type = row['task_type']
+    for _, row in tqdm(
+        data_df_eval.iterrows(), total=len(data_df_eval), desc="Processing"
+    ):
+        task_type = row["task_type"]
         if task_type not in task_methods:
             raise NotImplementedError(f"No task method for {task_type=}")
-        
-        task_prompt = row['input_field']
+
+        task_prompt = row["input_field"]
         task_fn = task_methods[task_type]
         task_output = task_fn(task_prompt)
         outputs.append(task_output)
 
     # Evaluate
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    sentence_all_lm = SentenceTransformer('all-MiniLM-L6-v2').to(device)
-    sentece_multilingual = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2').to(device)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
+    sentence_multilingual = SentenceTransformer(
+        "paraphrase-multilingual-MiniLM-L12-v2"
+    ).to(device)
 
     eval_methods = {
-        'accuracy': metrics.accuracy,
-        'hit rate@3': metrics.hit_rate_3,
-        'rougel': metrics.rougel,
-        'sent-transformer': lambda g,t: metrics.sent_transformer(g, t, sentence_all_lm),
-        'multilingual-sent-transformer': lambda g,t: metrics.sent_transformer(g, t, sentece_multilingual),
-        'micro f1': metrics.tp_fp_fn,
-        'ndcg': metrics.ndcg_eval,
-        'bleu': metrics.bleu,
-        'jp-bleu': lambda g,t: metrics.bleu(g,t, jp=True)
+        "accuracy": metrics.accuracy,
+        "hit rate@3": metrics.hit_rate_3,
+        "rougel": metrics.rougel,
+        "sent-transformer": lambda g, t: metrics.sent_transformer(
+            g, t, sentence_all_lm
+        ),
+        "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
+            g, t, sentence_multilingual
+        ),
+        "micro f1": metrics.tp_fp_fn,
+        "ndcg": metrics.ndcg_eval,
+        "bleu": metrics.bleu,
+        "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
     }
 
     per_task_metrics = {}
 
-    for ri, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Evaluating'):
-        metric = row['metric']
+    for ri, row in tqdm(
+        data_df_eval.iterrows(), total=len(data_df_eval), desc="Evaluating"
+    ):
+        metric = row["metric"]
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = row['task_name']
-        per_task_metrics.setdefault(task_name, {
-            'metric': metric,
-            'sample_score': []
-        })
-        
-        gt = row['output_field']
+        task_name = row["task_name"]
+        per_task_metrics.setdefault(
+            task_name, {"metric": metric, "sample_score": []}
+        )
+
+        gt = row["output_field"]
         model_output = outputs[ri]
 
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, gt)
-        per_task_metrics[task_name]['sample_score'].append(metric_score)
-        per_task_metrics[task_name]['sample_score'].append(metric_score)
-        
+        per_task_metrics[task_name]["sample_score"].append(metric_score)
+
         if ri % print_interval == 0:
             print_sample(ri, model_output, gt, metric, metric_score)
 
     # Aggregate scores
     for k in per_task_metrics:
-        if per_task_metrics[k]['metric'] != 'micro f1':
-            print(k, len(per_task_metrics[k]['sample_score']))
-            per_task_metrics[k]['overall_metric'] = np.mean(per_task_metrics[k]['sample_score'])
+        if per_task_metrics[k]["metric"] != "micro f1":
+            print(k, len(per_task_metrics[k]["sample_score"]))
+            per_task_metrics[k]["overall_metric"] = np.mean(
+                per_task_metrics[k]["sample_score"]
+            )
         else:
-            per_task_metrics[k]['overall_metric'] = metrics.compute_f1_score(per_task_metrics[k]['sample_score'])
+            per_task_metrics[k]["overall_metric"] = metrics.compute_f1_score(
+                per_task_metrics[k]["sample_score"]
+            )
 
-    overall_metrics = {
-        'task_name': [],
-        'metric': [],
-        'overall_score': []
-    }
+    overall_metrics = {"task_name": [], "metric": [], "overall_score": []}
     for k in per_task_metrics:
-        overall_metrics['task_name'].append(k)
-        overall_metrics['metric'].append(per_task_metrics[k]['metric'])
-        overall_metrics['overall_score'].append(per_task_metrics[k]['overall_metric'])
-    track_wise_score = np.mean(overall_metrics['overall_score'])
-    overall_metrics['task_name'].append('track_wise')
-    overall_metrics['metric'].append('track_wise')
-    overall_metrics['overall_score'].append(track_wise_score)
+        overall_metrics["task_name"].append(k)
+        overall_metrics["metric"].append(per_task_metrics[k]["metric"])
+        overall_metrics["overall_score"].append(
+            per_task_metrics[k]["overall_metric"]
+        )
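+    # The track-wise score is the unweighted mean of the per-task overall scores.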
+    track_wise_score = np.mean(overall_metrics["overall_score"])
+    overall_metrics["task_name"].append("track_wise")
+    overall_metrics["metric"].append("track_wise")
+    overall_metrics["overall_score"].append(track_wise_score)
     overall_metrics_df = pd.DataFrame(overall_metrics)
-    overall_metrics_df.to_json("scores.json", orient='records', lines=True)
+    overall_metrics_df.to_json("scores.json", orient="records", lines=True)
     print(f"Overall score {track_wise_score}")
 
+
 if __name__ == "__main__":
-    DATA_FILENAME = './data/phase1_track3.json'
+
+    # Load Development Data
+    DATA_FILENAME = "./data/development.json"
     data_df = pd.read_json(DATA_FILENAME, lines=True)
-    MAX_EVAL_ROWS = 100000
-    run_and_evaluate(data_df, MAX_EVAL_ROWS)
\ No newline at end of file
+
+    # Load UserModel
+    from models.user_config import UserModel
+
+    model = UserModel()
+
+    # Generate Responses
+
+    outputs = []
+    for _row_idx, row in tqdm(
+        data_df.iterrows(),
+        total=len(data_df),
+        desc="Generating Responses",
+    ):
+        print("=" * 100)
+        is_multiple_choice = row["task_type"] == "multiple-choice"
+        prompt = row["input_field"]
+        model_output = model.predict(prompt, is_multiple_choice)
+        outputs.append(model_output)
+
+        print(prompt, model_output)
+
+    # run_and_evaluate(data_df, model, MAX_EVAL_ROWS)
diff --git a/models/dummy_model.py b/models/dummy_model.py
index 2eb84d3e52d0deb7d8f633090b5a703e79636446..0e0bf1c807befbd3c518fe74bec2b12da3f39b5d 100644
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
@@ -1,52 +1,43 @@
 from typing import List
+import random
+import os
+
+# please use this seed consistently across your code
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 3142))
 
 
 class DummyModel:
     """
-    Note to participants:
-        Example class to show the different functions to be implemented for each type of task
-        Make sure to follow the data types as mentioned in the function definitions
+    Note to participants:
+        This is a dummy model that returns random responses. It illustrates
+        the single `predict` interface that your own model must implement.
     """
+
     def __init__(self):
-        """ Initialize your models here """
-        pass
-    
-    def task_multichoice(self, task_prompt: str) -> int:
-        """
-        Task method for Multiple choice questions
-            Input - Task Prompt (includes choices)
-            Output - Single integer index among ones given in the input
-        """
-        return 0
+        """Initialize your models here"""
+        random.seed(AICROWD_RUN_SEED)
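+        # The fixed seed above keeps the dummy responses reproducible across runs.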
 
-    def task_ranking(self, task_prompt: str) -> List[int]:
-        """
-        Task method for Ranking
-            Input - Task Prompt (includes items to rank)
-            Output - Ordered List of ranks for each item
+    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
         """
-        return [1, 0, 2, 3]
+        Standard interface for all tasks and tracks.
 
-    def task_generation(self, task_prompt: str) -> str:
-        """
-        Task method for Generation
-            Input - Task Prompt describing the required generation
-            Output - Generated text as per task prompt
-        """
-        return "This is a test"
+        The goal is for your model to be able to infer the task type,
+        and respond with a string that is compatible with the task-specific parser.
 
-    def task_retrieval(self, task_prompt: str) -> List[int]:
-        """
-       Task method for Generation
-            Input - Task Prompt describing the items which need to be selected from (includes indexes of items)
-            Output - Unordered list of indexes selected (must be a python list even if single item)
-        """
-        return [0, 1, 2]
 
-    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
+        Note: Even though the development dataset includes the task_type
+        information, during the actual evaluations your code will only have
+        access to the prompt and a boolean variable indicating whether it is
+        a multiple-choice question.
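+
+        For example, a valid return value could be "2" for a multiple-choice
+        prompt, or "[1, 2, 3]" for a ranking/retrieval prompt (see parsers.py
+        for how each task type is parsed).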
         """
-        Task method for Named Entity Recognition
-            Input - Task Prompt describing the named entity recognition task
-            Output - Unordered list of one or more entity names (must be a python list even if single item)
-        """
-        return ["food", "gpu"]
\ No newline at end of file
+
+        potential_response = [1, 2, 3, 4]
+        if is_multiple_choice:
+            return str(random.choice(potential_response))
+        else:
+            # For Ranking, Retrieval, and Named Entity Recognition tasks,
+            # the expected response is a string representation of a list,
+            # e.g. "[1, 2, 3]", which the task-specific parsers in
+            # parsers.py can turn back into a Python list.
+            random.shuffle(potential_response)
+            return str(potential_response)
+
+            # Note: For the generation task, the expected response is also a
+            # string. As this is a dummy model, we simply return the shuffled
+            # list as a string; in your case it can be any generated text.
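+
+
+# Illustrative usage (the prompts below are hypothetical; during evaluation the
+# prompts come from the dataset):
+#
+#   model = DummyModel()
+#   model.predict("Which of the following ...?", is_multiple_choice=True)    # e.g. "3"
+#   model.predict("Rank the following products ...", is_multiple_choice=False)  # e.g. "[2, 4, 1, 3]"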
diff --git a/models/dummy_model_old.py b/models/dummy_model_old.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb84d3e52d0deb7d8f633090b5a703e79636446
--- /dev/null
+++ b/models/dummy_model_old.py
@@ -0,0 +1,52 @@
+from typing import List
+
+
+class DummyModel:
+    """
+    Note to participants:
+        Example class to show the different functions to be implemented for each type of task
+        Make sure to follow the data types as mentioned in the function definitions
+    """
+    def __init__(self):
+        """ Initialize your models here """
+        pass
+    
+    def task_multichoice(self, task_prompt: str) -> int:
+        """
+        Task method for Multiple choice questions
+            Input - Task Prompt (includes choices)
+            Output - Single integer index among ones given in the input
+        """
+        return 0
+
+    def task_ranking(self, task_prompt: str) -> List[int]:
+        """
+        Task method for Ranking
+            Input - Task Prompt (includes items to rank)
+            Output - Ordered List of ranks for each item
+        """
+        return [1, 0, 2, 3]
+
+    def task_generation(self, task_prompt: str) -> str:
+        """
+        Task method for Generation
+            Input - Task Prompt describing the required generation
+            Output - Generated text as per task prompt
+        """
+        return "This is a test"
+
+    def task_retrieval(self, task_prompt: str) -> List[int]:
+        """
+       Task method for Generation
+            Input - Task Prompt describing the items which need to be selected from (includes indexes of items)
+            Output - Unordered list of indexes selected (must be a python list even if single item)
+        """
+        return [0, 1, 2]
+
+    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
+        """
+        Task method for Named Entity Recognition
+            Input - Task Prompt describing the named entity recognition task
+            Output - Unordered list of one or more entity names (must be a python list even if single item)
+        """
+        return ["food", "gpu"]
\ No newline at end of file
diff --git a/parsers.py b/parsers.py
index 193c6f06c455994df0a7cd1bd5d2ac1c7f640fa4..ce5636f6afb799bf69c5445081c285ccdfecfd01 100644
--- a/parsers.py
+++ b/parsers.py
@@ -1,261 +1,246 @@
-#!/usr/bin/env python3
 import ast
 
 
 class ShoppingBenchTaskParsers:
     """
-    A class for parsing responses from different types of tasks in a shopping bench scenario.
+    A class designed to parse responses from different task types in
+    the ShopBench - MultiTask Online Shopping Challenge for LLMs.
+    It supports a variety of task types such as multiple choice, ranking, generation, retrieval,
+    and named entity recognition, each with its own specific parsing logic to format the raw
+    response strings into structured data.
 
     Attributes:
-        task_type (str): The type of task for which the parser is instantiated.
+        task_type (str): The type of task the parser is set up to handle. Valid task types
+                         include 'multichoice', 'ranking', 'generation', 'retrieval',
+                         and 'named_entity_recognition'.
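+
+    Example (see the __main__ block below for more):
+        parser = ShoppingBenchTaskParsers("ranking")
+        parser.parse("1, 2, 3, 4, 5")  # -> [1, 2, 3, 4, 5]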
     """
 
     def __init__(self, task_type: str) -> None:
         """
-        Initializes the parser with a specific task type.
+        Initializes the parser for a specific task type.
 
         Parameters:
-            task_type (str): The type of task, e.g., 'multichoice', 'ranking', etc.
+            task_type (str): Specifies the task type this parser instance will handle.
         """
         self.task_type = task_type
 
     def parse(self, response: str) -> any:
         """
-        Parses the response based on the task type.
+        Parses a given response string according to the task type of the parser, and returns
+        a structured representation of that response.
 
         Parameters:
-            response (str): The raw response string from the model.
+            response (str): The raw response string obtained from performing the task.
 
         Returns:
-            The parsed response, formatted according to the task type's requirements.
+            A parsed and appropriately formatted response suitable for the parser's task type.
+            The format of the return value varies with the task type.
         """
-        # Mapping task types to their respective parsing methods.
+        # Map of task types to their corresponding parsing methods.
         task_parser_methods = {
-            "multichoice": self._task_multichoice_parser,
-            "ranking": self._task_ranking_parser,
-            "generation": self._task_generation_parser,
-            "retrieval": self._task_retrieval_parser,
-            "named_entity_recognition": self._task_named_entity_recognition_parser,
+            "multichoice": self._parse_multichoice,
+            "ranking": self._parse_ranking,
+            "generation": self._parse_generation,
+            "retrieval": self._parse_retrieval,
+            "named_entity_recognition": self._parse_named_entity_recognition,
         }
 
-        # Retrieve the parser method based on the task type.
+        # Attempt to retrieve the appropriate parser method for the task type.
         parser_method = task_parser_methods.get(self.task_type)
 
-        if parser_method is not None:
+        # Execute the parser method if found, otherwise raise an error.
+        if parser_method:
             return parser_method(response)
         else:
             raise NotImplementedError(
-                f"Task type {self.task_type} not implemented"
+                f"Task type '{self.task_type}' is not supported."
             )
 
-    def _task_multichoice_parser(self, response: str) -> int:
+    def _parse_multichoice(self, response: str) -> int:
         """
-        Parses a multichoice task response.
+        Parses a response from a multiple-choice task.
+
+        Assumes the first character of the response string indicates the chosen option.
 
         Parameters:
-            response (str): A string representing the selected option's index.
+            response (str): The raw response string.
 
         Returns:
-            int: The index of the selected option, or -1 if the input is invalid.
+            An integer representing the selected option. Returns -1 if the parsing fails due to
+            an invalid response format.
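+
+            For example, "2" parses to 2, while a non-numeric response such as
+            "a" parses to -1.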
         """
         try:
-            return int(response.strip())
+            return int(response.strip()[0])
-        except ValueError:
+        except (ValueError, IndexError):
             return -1
 
-    def _task_ranking_parser(self, response: str) -> list:
+    def _parse_ranking(self, response: str) -> list:
         """
-        Parses a ranking task response.
+        Parses a ranking task response into a list of ranked items.
+
+        Expects a string with numeric values separated by commas, indicating the ranking order.
 
         Parameters:
-            response (str): A string representing the ordered list of ranks.
+            response (str): The raw response string.
 
         Returns:
-            list: A list of ranks if the input is valid, otherwise ignore non numeric list elements.
+            A list of integers representing the items in ranked order. Only the
+            first 5 parsed elements are considered; an empty list is returned if
+            duplicates are found among them or parsing fails.
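+
+            For example, "1, 2, 3, 4, 5" parses to [1, 2, 3, 4, 5], while
+            "[1, 2, 2, 3]" parses to [] because of the repeated 2.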
         """
-        return self._parse_list(response, expected_type=float)
+        # Keep only numeric characters and specific punctuation.
+        cleaned_response = "".join(
+            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+        )
+
+        # Convert to list of integers
+        ranked_items = []
+        for item in cleaned_response.split(","):
+            try:
+                # Strip stray brackets/spaces, then convert to an integer.
+                ranked_items.append(int(item.strip(" []")))
+            except ValueError:
+                pass  # Skip non-numeric items.
+
+        # Consider only the first 5 unique elements.
+        ranked_items = ranked_items[:5]
 
-    def _task_generation_parser(self, response: str) -> str:
+        # If there are duplicates, empty the list
+        if len(ranked_items) != len(set(ranked_items)):
+            ranked_items = []
+        return ranked_items
+
+    def _parse_generation(self, response: str) -> str:
         """
-        Parses a generation task response.
+        Parses a response from a generation task by trimming whitespace.
+
+        This method primarily cleans up the response string for presentation or further processing.
 
         Parameters:
-            response (str): The generated text response.
+            response (str): The raw response string.
 
         Returns:
-            str: The stripped response text.
+            A trimmed version of the response string.
         """
         return response.strip()
 
-    def _task_retrieval_parser(self, response: str) -> list:
+    def _parse_retrieval(self, response: str) -> list:
         """
-        Parses a retrieval task response.
+        Parses a retrieval task response, extracting the identifiers of retrieved items.
+
+        The response is expected to contain numeric values separated by commas.
 
         Parameters:
-            response (str): A string representing the indexes of selected items.
+            response (str): The raw response string.
 
         Returns:
-            list: A list of selected item indexes if the input is valid, otherwise ignore non numeric list elements.
+            A list of integers representing the first 3 retrieved item indices.
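+
+            For example, "100, 200, 300, 400" parses to [100, 200, 300], and
+            "100, 200, jjhg" parses to [100, 200].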
         """
-        return self._parse_list(response, expected_type=int)
+        # Similar to ranking parser, but only returns the first 3 elements.
+        cleaned_response = "".join(
+            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+        )
 
-    def _task_named_entity_recognition_parser(self, response: str) -> list:
-        """
-        Parses a named entity recognition task response.
+        # Convert the comma-separated tokens to a list of integers.
+        numeric_items = []
+        for item in cleaned_response.split(","):
+            try:
+                # Strip stray brackets/spaces, then convert to an integer.
+                numeric_items.append(int(item.strip(" []")))
+            except ValueError:
+                pass  # Skip non-numeric items.
 
-        Parameters:
-            response (str): A string representing the list of identified entities.
+        # Consider only the first 3 elements.
+        retrieved_items = numeric_items[:3]
 
-        Returns:
-            list: A list of entity names if the input is valid.
-        """
-        return self._parse_list(response, expected_type=str)
+        return retrieved_items
 
-    def _parse_list(self, response: str, expected_type: type) -> list:
+    def _parse_named_entity_recognition(self, response: str) -> list:
         """
-        A helper method to parse a string into a list with elements of an expected type.
+        Parses a response from a named entity recognition (NER) task.
+
+        Handles either a list-like string input or comma-separated entities in a plain string.
 
         Parameters:
-            response (str): The string to parse.
-            expected_type (type): The expected type of elements in the list.
+            response (str): The raw response string.
 
         Returns:
-            list: A list of elements of the expected type, or ignore items if parsing fails.
+            A list of named entities extracted from the response. Attempts to parse the response as a
+            literal list; falls back to splitting by commas if that fails.
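+
+            For example, both "['New York', 'ShopBench', 'Amazon']" and
+            "New York, ShopBench, Amazon" parse to
+            ['New York', 'ShopBench', 'Amazon'].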
         """
         try:
-            parsed_response = ast.literal_eval(response)
-            if not isinstance(parsed_response, list):
-                return []
-
-            sanitized_response = []
-            for item in parsed_response:
-                try:
-                    sanitized_response.append(expected_type(item))
-                except (ValueError, TypeError) as e:
-                    pass
-            return sanitized_response
-        except SyntaxError:
-            return []
+            # Attempt to interpret the response as a literal Python list.
+            entities = ast.literal_eval(response)
+            if isinstance(entities, list) and all(
+                isinstance(item, str) for item in entities
+            ):
+                return entities
+        except (SyntaxError, ValueError):
+            pass
+        # Fallback: split the string by commas and strip whitespace.
+        # (Also used when the parsed literal is not a list of strings.)
+        return [entity.strip() for entity in response.split(",")]
 
 
 if __name__ == "__main__":
-    # This section demonstrates the use of the ShoppingBenchTaskParsers class
-    # for different types of tasks. For each task, we initialize a parser,
-    # provide it with a response string, and then output the parsed result.
-
-    # MULTICHOICE TASK EXAMPLE
-    # Initialize the parser for a multichoice task
-    multichoice_parser = ShoppingBenchTaskParsers("multichoice")
-    # Example response string for a multichoice task (correct option is 2)
-    multichoice_response = "2"
-    # Parse the response and print the result
+    # Example usage of the ShoppingBenchTaskParsers class for various task types.
+
+    # MULTICHOICE EXAMPLE
+    multichoice_parser = ShoppingBenchTaskParsers("multichoice")
+    print("Multichoice Example:")
+    print(multichoice_parser.parse("2"))  # Expected output: 2
     print(
-        "Multichoice Task Parsing Result:",
-        multichoice_parser.parse(multichoice_response),
-    )
-    # Expected output: 2
+        multichoice_parser.parse("a")
+    )  # Expected output (failure case): -1
+    print()
 
-    # RANKING TASK EXAMPLE
-    # Initialize the parser for a ranking task
+    # RANKING EXAMPLE
     ranking_parser = ShoppingBenchTaskParsers("ranking")
-    # Example response string for a ranking task (items ranked as 3rd, 1st, 2nd)
-    ranking_response = "[3, 1, 2]"
-    # Parse the response and print the result
+    print("Ranking Example:")
+    print(
+        ranking_parser.parse("1, 2, 3, 4, 5")
+    )  # Expected output: [1, 2, 3, 4, 5]
+    print(
+        ranking_parser.parse("[1, 2, 2, 3]")
+    )  # Expected output (failure case): [] # because of repeating numbers
     print(
-        "Ranking Task Parsing Result:", ranking_parser.parse(ranking_response)
-    )
-    # Expected output: [3.0, 1.0, 2.0]
+        ranking_parser.parse("1, 4, 5, aicrowd, 6")
+    )  # Expected output: [1, 4, 5, 6] # non-numeric tokens are dropped
 
-    # GENERATION TASK EXAMPLE
-    # Initialize the parser for a text generation task
+    print()
+
+    # GENERATION EXAMPLE
     generation_parser = ShoppingBenchTaskParsers("generation")
-    # Example response string for a generation task
-    generation_response = (
-        "This is a generated response based on the input prompt."
-    )
-    # Parse the response and print the result
+    print("Generation Example:")
     print(
-        "Generation Task Parsing Result:",
-        generation_parser.parse(generation_response),
-    )
-    # Expected output: This is a generated response based on the input prompt.
+        generation_parser.parse("This is a generated response")
+    )  # Expected output: 'This is a generated response'
+    print()
 
-    # RETRIEVAL TASK EXAMPLE
-    # Initialize the parser for a retrieval task
+    # RETRIEVAL EXAMPLE
     retrieval_parser = ShoppingBenchTaskParsers("retrieval")
-    # Example response string for a retrieval task (items at indexes 0 and 2 are relevant)
-    retrieval_response = "[0, 2]"
-    # Parse the response and print the result
+    print("Retrieval Example:")
     print(
-        "Retrieval Task Parsing Result:",
-        retrieval_parser.parse(retrieval_response),
-    )
-    # Expected output: [0, 2]
-
-    # NAMED ENTITY RECOGNITION (NER) TASK EXAMPLE
-    # Initialize the parser for a named entity recognition task
-    ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
-    # Example response string for an NER task
-    ner_response = '["New York", "ShopBench"]'
-    # Parse the response and print the result
-    print("NER Task Parsing Result:", ner_parser.parse(ner_response))
-    # Expected output: ['New York', 'ShopBench']
-
-    # This demonstrates the flexible and effective parsing capabilities of the
-    # ShoppingBenchTaskParsers class across a variety of task types.
-
-    # Failure Case Examples for ShoppingBenchTaskParsers
-    # These examples illustrate how the parser handles incorrect or unexpected inputs.
-
-    print("=== FAILURE CASES ===\n")
-
-    # MULTICHOICE TASK FAILURE EXAMPLE
-    # Non-integer response for a multichoice task
-    multichoice_parser = ShoppingBenchTaskParsers("multichoice")
-    multichoice_bad_response = "abc"  # Invalid response (not an integer)
+        retrieval_parser.parse("100, 200, 300")
+    )  # Expected output: [100, 200, 300]
     print(
-        "Multichoice Task Failure Case:",
-        multichoice_parser.parse(multichoice_bad_response),
-    )
-    # Expected output: -1 (indicating an invalid response)
-
-    # RANKING TASK FAILURE EXAMPLE
-    # Non-list response for a ranking task
-    ranking_parser = ShoppingBenchTaskParsers("ranking")
-    ranking_bad_response = "not a valid list"  # Invalid list format
+        retrieval_parser.parse("100, 200")
+    )  # Expected output (shorter than 3): [100, 200]
     print(
-        "Ranking Task Failure Case:",
-        ranking_parser.parse(ranking_bad_response),
-    )
-    # Expected output: [] (indicating an inability to parse the response)
-
-    # GENERATION TASK FAILURE EXAMPLE
-    # Empty or whitespace-only response for a generation task
-    generation_parser = ShoppingBenchTaskParsers("generation")
-    generation_bad_response = "    "  # Only spaces
+        retrieval_parser.parse("100, 200, jjhg")
+    )  # Expected output: [100, 200] # non-numeric tokens are dropped
     print(
-        "Generation Task Failure Case:",
-        f"'{generation_parser.parse(generation_bad_response)}'",
-    )
-    # Expected output: '' (an empty string indicating an invalid or empty response)
+        retrieval_parser.parse("100, 200, 300, 400")
+    )  # Expected output (only the first 3 elements are considered): [100, 200, 300]
 
-    # RETRIEVAL TASK FAILURE EXAMPLE
-    # Incorrect element format for a retrieval task
-    retrieval_parser = ShoppingBenchTaskParsers("retrieval")
-    retrieval_bad_response = "[1, 'a']"  # Contains a non-integer
-    print(
-        "Retrieval Task Failure Case:",
-        retrieval_parser.parse(retrieval_bad_response),
-    )
-    # Expected output: [1] (ignores invalid non-integer values)
+    print()
 
-    # NAMED ENTITY RECOGNITION (NER) TASK FAILURE EXAMPLE
-    # Non-list or incorrect entity format for an NER task
+    # NAMED ENTITY RECOGNITION EXAMPLE
     ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
-    ner_bad_response = '{"entity": "New York"}'  # Not a list, incorrect format
-    print("NER Task Failure Case:", ner_parser.parse(ner_bad_response))
-    # Expected output: [] (indicating the response could not be parsed as a list of entities)
-
+    print("Named Entity Recognition Example:")
+    print(
+        ner_parser.parse("['New York', 'ShopBench', 'Amazon']")
+    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
+    print(
+        ner_parser.parse("New York, ShopBench, Amazon")
+    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
     print(
-        "\nThese examples demonstrate how the parser handles various incorrect inputs."
-    )
+        ner_parser.parse("[New York, ShopBench, Amazon]")
+    )  # Expected output (failure case: stray '[' and ']' remain on the boundary elements): ['[New York', 'ShopBench', 'Amazon]']