diff --git a/local_evaluation.py b/local_evaluation.py
index c40f7215b6d3cf68b270b5fd0b91ad567fd1f73b..3484c15a892e2815426a64f7c8b044146ee841d5 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -5,19 +5,21 @@ from tqdm.auto import tqdm
 from sentence_transformers import SentenceTransformer
 
 import metrics
-from models.user_config import UserModel
+
 
 def print_sample(i, generation, truth, metric, score):
     print(f"Sample {i}, generation: {generation}")
     print(f"Sample {i}, truth: {truth}")
     if isinstance(score, tuple) and len(score) == 3:
-        print(f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}")
+        print(
+            f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
+        )
     else:
         print(f"Metric ({metric}): {score}")
     print()
 
+
 def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
-    model = UserModel()
 
     if max_eval_rows < len(data_df):
         data_df_eval = data_df.sample(max_eval_rows)
@@ -27,91 +29,126 @@ def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
     # Run model
     outputs = []
     task_methods = {
-        'multiple-choice': model.task_multichoice,
-        'generation': model.task_generation,
-        'retrieval': model.task_retrieval,
-        'ranking': model.task_ranking,
-        'named_entity_recognition': model.task_named_entity_recognition,
+        "multiple-choice": model.task_multichoice,
+        "generation": model.task_generation,
+        "retrieval": model.task_retrieval,
+        "ranking": model.task_ranking,
+        "named_entity_recognition": model.task_named_entity_recognition,
     }
 
-    for _, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Processing'):
-        task_type = row['task_type']
+    for _, row in tqdm(
+        data_df_eval.iterrows(), total=len(data_df_eval), desc="Processing"
+    ):
+        task_type = row["task_type"]
         if task_type not in task_methods:
             raise NotImplementedError(f"No task method for {task_type=}")
-        
-        task_prompt = row['input_field']
+
+        task_prompt = row["input_field"]
         task_fn = task_methods[task_type]
 
         task_output = task_fn(task_prompt)
         outputs.append(task_output)
 
     # Evaluate
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    sentence_all_lm = SentenceTransformer('all-MiniLM-L6-v2').to(device)
-    sentece_multilingual = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2').to(device)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
+    sentece_multilingual = SentenceTransformer(
+        "paraphrase-multilingual-MiniLM-L12-v2"
+    ).to(device)
 
     eval_methods = {
-        'accuracy': metrics.accuracy,
-        'hit rate@3': metrics.hit_rate_3,
-        'rougel': metrics.rougel,
-        'sent-transformer': lambda g,t: metrics.sent_transformer(g, t, sentence_all_lm),
-        'multilingual-sent-transformer': lambda g,t: metrics.sent_transformer(g, t, sentece_multilingual),
-        'micro f1': metrics.tp_fp_fn,
-        'ndcg': metrics.ndcg_eval,
-        'bleu': metrics.bleu,
-        'jp-bleu': lambda g,t: metrics.bleu(g,t, jp=True)
+        "accuracy": metrics.accuracy,
+        "hit rate@3": metrics.hit_rate_3,
+        "rougel": metrics.rougel,
+        "sent-transformer": lambda g, t: metrics.sent_transformer(
+            g, t, sentence_all_lm
+        ),
+        "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
+            g, t, sentece_multilingual
+        ),
+        "micro f1": metrics.tp_fp_fn,
+        "ndcg": metrics.ndcg_eval,
+        "bleu": metrics.bleu,
+        "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
    }
 
     per_task_metrics = {}
-    for ri, row in tqdm(data_df_eval.iterrows(), total=len(data_df_eval), desc='Evaluating'):
-        metric = row['metric']
+    for ri, row in tqdm(
+        data_df_eval.iterrows(), total=len(data_df_eval), desc="Evaluating"
+    ):
+        metric = row["metric"]
         if metric not in eval_methods:
             raise NotImplementedError(f"No metric for {metric=}")
 
-        task_name = row['task_name']
-        per_task_metrics.setdefault(task_name, {
-            'metric': metric,
-            'sample_score': []
-        })
-        
-        gt = row['output_field']
+        task_name = row["task_name"]
+        per_task_metrics.setdefault(
+            task_name, {"metric": metric, "sample_score": []}
+        )
+
+        gt = row["output_field"]
         model_output = outputs[ri]
 
         eval_fn = eval_methods[metric]
         metric_score = eval_fn(model_output, gt)
 
-        per_task_metrics[task_name]['sample_score'].append(metric_score)
-        per_task_metrics[task_name]['sample_score'].append(metric_score)
-        
+        per_task_metrics[task_name]["sample_score"].append(metric_score)
+        per_task_metrics[task_name]["sample_score"].append(metric_score)
+
         if ri % print_interval == 0:
             print_sample(ri, model_output, gt, metric, metric_score)
 
     # Aggregate scores
     for k in per_task_metrics:
-        if per_task_metrics[k]['metric'] != 'micro f1':
-            print(k, len(per_task_metrics[k]['sample_score']))
-            per_task_metrics[k]['overall_metric'] = np.mean(per_task_metrics[k]['sample_score'])
+        if per_task_metrics[k]["metric"] != "micro f1":
+            print(k, len(per_task_metrics[k]["sample_score"]))
+            per_task_metrics[k]["overall_metric"] = np.mean(
+                per_task_metrics[k]["sample_score"]
+            )
         else:
-            per_task_metrics[k]['overall_metric'] = metrics.compute_f1_score(per_task_metrics[k]['sample_score'])
+            per_task_metrics[k]["overall_metric"] = metrics.compute_f1_score(
+                per_task_metrics[k]["sample_score"]
+            )
 
-    overall_metrics = {
-        'task_name': [],
-        'metric': [],
-        'overall_score': []
-    }
+    overall_metrics = {"task_name": [], "metric": [], "overall_score": []}
     for k in per_task_metrics:
-        overall_metrics['task_name'].append(k)
-        overall_metrics['metric'].append(per_task_metrics[k]['metric'])
-        overall_metrics['overall_score'].append(per_task_metrics[k]['overall_metric'])
-    track_wise_score = np.mean(overall_metrics['overall_score'])
-    overall_metrics['task_name'].append('track_wise')
-    overall_metrics['metric'].append('track_wise')
-    overall_metrics['overall_score'].append(track_wise_score)
+        overall_metrics["task_name"].append(k)
+        overall_metrics["metric"].append(per_task_metrics[k]["metric"])
+        overall_metrics["overall_score"].append(
+            per_task_metrics[k]["overall_metric"]
+        )
+    track_wise_score = np.mean(overall_metrics["overall_score"])
+    overall_metrics["task_name"].append("track_wise")
+    overall_metrics["metric"].append("track_wise")
+    overall_metrics["overall_score"].append(track_wise_score)
 
     overall_metrics_df = pd.DataFrame(overall_metrics)
-    overall_metrics_df.to_json("scores.json", orient='records', lines=True)
+    overall_metrics_df.to_json("scores.json", orient="records", lines=True)
     print(f"Overall score {track_wise_score}")
 
+
 if __name__ == "__main__":
-    DATA_FILENAME = './data/phase1_track3.json'
+
+    # Load Development Data
+    DATA_FILENAME = "./data/development.json"
     data_df = pd.read_json(DATA_FILENAME, lines=True)
 
-    MAX_EVAL_ROWS = 100000
-    run_and_evaluate(data_df, MAX_EVAL_ROWS)
\ No newline at end of file
+
+    # Load UserModel
+    from models.user_config import UserModel
+
+    model = UserModel()
+
+    # Generate Responses
+
+    outputs = []
+    for _rowd_idx, row in tqdm(
+        data_df.iterrows(),
+        total=len(data_df),
+        desc="Generating Responses",
+    ):
+        print("=" * 100)
+        is_multiple_choice = row["task_type"] == "multiple-choice"
+        prompt = row["input_field"]
+        model_output = model.predict(prompt, is_multiple_choice)
+        outputs.append(model_output)
+
+        print(prompt, model_output)
+
+    # run_and_evaluate(data_df, MAX_EVAL_ROWS)
diff --git a/models/dummy_model.py b/models/dummy_model.py
index 2eb84d3e52d0deb7d8f633090b5a703e79636446..0e0bf1c807befbd3c518fe74bec2b12da3f39b5d 100644
--- a/models/dummy_model.py
+++ b/models/dummy_model.py
@@ -1,52 +1,43 @@
 from typing import List
+import random
+import os
+
+# please use this seed consistently across your code
+AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 3142))
 
 
 class DummyModel:
     """
-    Note to participants:
-    Example class to show the different functions to be implemented for each type of task
-    Make sure to follow the data types as mentioned in the function definitions
+    A dummy model that returns random, correctly formatted responses for each task type.
     """
+
     def __init__(self):
-        """ Initialize your models here """
-        pass
-
-    def task_multichoice(self, task_prompt: str) -> int:
-        """
-        Task method for Multiple choice questions
-        Input - Task Prompt (includes choices)
-        Output - Single integer index among ones given in the input
-        """
-        return 0
+        """Initialize your models here"""
+        random.seed(AICROWD_RUN_SEED)
 
-    def task_ranking(self, task_prompt: str) -> List[int]:
-        """
-        Task method for Ranking
-        Input - Task Prompt (includes items to rank)
-        Output - Ordered List of ranks for each item
+    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
         """
-        return [1, 0, 2, 3]
+        Standard interface for all tasks and tracks.
 
-    def task_generation(self, task_prompt: str) -> str:
-        """
-        Task method for Generation
-        Input - Task Prompt describing the required generation
-        Output - Generated text as per task prompt
-        """
-        return "This is a test"
+        The goal is for your model to be able to infer the task type,
+        and respond with a string that is compatible with the task specific parser.
 
-    def task_retrieval(self, task_prompt: str) -> List[int]:
-        """
-        Task method for Generation
-        Input - Task Prompt describing the items which need to be selected from (includes indexes of items)
-        Output - Unordered list of indexes selected (must be a python list even if single item)
-        """
-        return [0, 1, 2]
-
-    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
+
+        Note: Even if the development dataset has the task_type information,
+        during the actual evaluations, your code will only have access to the prompt
+        and the boolean variable indicating if it's a multiple choice question.
         """
-        Task method for Named Entity Recognition
-        Input - Task Prompt describing the named entity recognition task
-        Output - Unordered list of one or more entity names (must be a python list even if single item)
-        """
-        return ["food", "gpu"]
\ No newline at end of file
+
+        potential_response = [1, 2, 3, 4]
+        if is_multiple_choice:
+            return str(random.choice(potential_response))
+        else:
+            # For Ranking, Retrieval, and Named Entity Recognition tasks
+            # the expected response is a string that can be parsed with
+            # `ast.literal_eval` (see parsers.py for more details)
+            random.shuffle(potential_response)
+            return str(potential_response)
+
+        # Note: For the generation task, the expected response is a string
+        # And, as this is a dummy response, we are just returning the
+        # shuffled version of the list, but in your case, it can be any string
diff --git a/models/dummy_model_old.py b/models/dummy_model_old.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb84d3e52d0deb7d8f633090b5a703e79636446
--- /dev/null
+++ b/models/dummy_model_old.py
@@ -0,0 +1,52 @@
+from typing import List
+
+
+class DummyModel:
+    """
+    Note to participants:
+    Example class to show the different functions to be implemented for each type of task
+    Make sure to follow the data types as mentioned in the function definitions
+    """
+    def __init__(self):
+        """ Initialize your models here """
+        pass
+
+    def task_multichoice(self, task_prompt: str) -> int:
+        """
+        Task method for Multiple choice questions
+        Input - Task Prompt (includes choices)
+        Output - Single integer index among ones given in the input
+        """
+        return 0
+
+    def task_ranking(self, task_prompt: str) -> List[int]:
+        """
+        Task method for Ranking
+        Input - Task Prompt (includes items to rank)
+        Output - Ordered List of ranks for each item
+        """
+        return [1, 0, 2, 3]
+
+    def task_generation(self, task_prompt: str) -> str:
+        """
+        Task method for Generation
+        Input - Task Prompt describing the required generation
+        Output - Generated text as per task prompt
+        """
+        return "This is a test"
+
+    def task_retrieval(self, task_prompt: str) -> List[int]:
+        """
+        Task method for Generation
+        Input - Task Prompt describing the items which need to be selected from (includes indexes of items)
+        Output - Unordered list of indexes selected (must be a python list even if single item)
+        """
+        return [0, 1, 2]
+
+    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
+        """
+        Task method for Named Entity Recognition
+        Input - Task Prompt describing the named entity recognition task
+        Output - Unordered list of one or more entity names (must be a python list even if single item)
+        """
+        return ["food", "gpu"]
\ No newline at end of file
diff --git a/parsers.py b/parsers.py
index 193c6f06c455994df0a7cd1bd5d2ac1c7f640fa4..ce5636f6afb799bf69c5445081c285ccdfecfd01 100644
--- a/parsers.py
+++ b/parsers.py
@@ -1,261 +1,246 @@
-#!/usr/bin/env python3
 import ast
 
 
 class ShoppingBenchTaskParsers:
     """
-    A class for parsing responses from different types of tasks in a shopping bench scenario.
+    A class designed to parse responses from different task types in
+    the ShopBench - MultiTask Online Shopping Challenge for LLMs.
+    It supports a variety of task types such as multiple choice, ranking, generation, retrieval,
+    and named entity recognition, each with its own specific parsing logic to format the raw
+    response strings into structured data.
 
     Attributes:
-        task_type (str): The type of task for which the parser is instantiated.
+        task_type (str): The type of task the parser is set up to handle. Valid task types
+                         include 'multichoice', 'ranking', 'generation', 'retrieval',
+                         and 'named_entity_recognition'.
     """
 
     def __init__(self, task_type: str) -> None:
         """
-        Initializes the parser with a specific task type.
+        Initializes the parser for a specific task type.
 
         Parameters:
-        task_type (str): The type of task, e.g., 'multichoice', 'ranking', etc.
+        task_type (str): Specifies the task type this parser instance will handle.
         """
         self.task_type = task_type
 
     def parse(self, response: str) -> any:
         """
-        Parses the response based on the task type.
+        Parses a given response string according to the task type of the parser, and returns
+        a structured representation of that response.
 
         Parameters:
-        response (str): The raw response string from the model.
+        response (str): The raw response string obtained from performing the task.
 
         Returns:
-        The parsed response, formatted according to the task type's requirements.
+        A parsed and appropriately formatted response suitable for the parser's task type.
+        The format of the return value varies with the task type.
         """
-        # Mapping task types to their respective parsing methods.
+        # Map of task types to their corresponding parsing methods.
         task_parser_methods = {
-            "multichoice": self._task_multichoice_parser,
-            "ranking": self._task_ranking_parser,
-            "generation": self._task_generation_parser,
-            "retrieval": self._task_retrieval_parser,
-            "named_entity_recognition": self._task_named_entity_recognition_parser,
+            "multichoice": self._parse_multichoice,
+            "ranking": self._parse_ranking,
+            "generation": self._parse_generation,
+            "retrieval": self._parse_retrieval,
+            "named_entity_recognition": self._parse_named_entity_recognition,
        }
 
-        # Retrieve the parser method based on the task type.
+        # Attempt to retrieve the appropriate parser method for the task type.
         parser_method = task_parser_methods.get(self.task_type)
 
-        if parser_method is not None:
+        # Execute the parser method if found, otherwise raise an error.
+        if parser_method:
             return parser_method(response)
         else:
             raise NotImplementedError(
-                f"Task type {self.task_type} not implemented"
+                f"Task type '{self.task_type}' is not supported."
             )
 
-    def _task_multichoice_parser(self, response: str) -> int:
+    def _parse_multichoice(self, response: str) -> int:
         """
-        Parses a multichoice task response.
+        Parses a response from a multiple-choice task.
+
+        Assumes the first character of the response string indicates the chosen option.
 
         Parameters:
-        response (str): A string representing the selected option's index.
+        response (str): The raw response string.
 
         Returns:
-        int: The index of the selected option, or -1 if the input is invalid.
+        An integer representing the selected option. Returns -1 if the parsing fails due to
+        an invalid response format.
         """
         try:
-            return int(response.strip())
+            return int(response.strip()[0])
         except ValueError:
             return -1
 
-    def _task_ranking_parser(self, response: str) -> list:
+    def _parse_ranking(self, response: str) -> list:
         """
-        Parses a ranking task response.
+        Parses a ranking task response into a list of ranked items.
+
+        Expects a string with numeric values separated by commas, indicating the ranking order.
 
         Parameters:
-        response (str): A string representing the ordered list of ranks.
+        response (str): The raw response string.
 
         Returns:
-        list: A list of ranks if the input is valid, otherwise ignore non numeric list elements.
+        A list of integers representing the items in ranked order. Limits to the first 5
+        elements. Returns an empty list if duplicates are found or parsing fails.
         """
-        return self._parse_list(response, expected_type=float)
+        # Keep only numeric characters and specific punctuation.
+        cleaned_response = "".join(
+            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+        )
+
+        # Convert to list of integers
+        ranked_items = []
+        for item in cleaned_response.split(","):
+            try:
+                # Attempt to convert each item to an integer and add it to the list.
+                ranked_items.append(int(item))
+            except ValueError:
+                pass  # Skip non-numeric items.
+
+        # Consider only the first 5 elements.
+        ranked_items = ranked_items[:5]
 
-    def _task_generation_parser(self, response: str) -> str:
+        # If there are duplicates, empty the list
+        if len(ranked_items) != len(set(ranked_items)):
+            ranked_items = []
+        return ranked_items
+
+    def _parse_generation(self, response: str) -> str:
         """
-        Parses a generation task response.
+        Parses a response from a generation task by trimming whitespace.
+
+        This method primarily cleans up the response string for presentation or further processing.
 
         Parameters:
-        response (str): The generated text response.
+        response (str): The raw response string.
 
         Returns:
-        str: The stripped response text.
+        A trimmed version of the response string.
         """
         return response.strip()
 
-    def _task_retrieval_parser(self, response: str) -> list:
+    def _parse_retrieval(self, response: str) -> list:
         """
-        Parses a retrieval task response.
+        Parses a retrieval task response, extracting the identifiers of retrieved items.
+
+        The response is expected to contain numeric values separated by commas.
 
         Parameters:
-        response (str): A string representing the indexes of selected items.
+        response (str): The raw response string.
 
         Returns:
-        list: A list of selected item indexes if the input is valid, otherwise ignore non numeric list elements.
+        A list of integers representing the first 3 retrieved item indices.
         """
-        return self._parse_list(response, expected_type=int)
+        # Similar to ranking parser, but only returns the first 3 elements.
+        cleaned_response = "".join(
+            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
+        )
 
-    def _task_named_entity_recognition_parser(self, response: str) -> list:
-        """
-        Parses a named entity recognition task response.
+        # Convert to list of integers
+        response = []
+        for item in cleaned_response.split(","):
+            try:
+                # Attempt to convert each item to an integer and add it to the list.
+                response.append(int(item))
+            except ValueError:
+                pass  # Skip non-numeric items.
 
-        Parameters:
-        response (str): A string representing the list of identified entities.
+        # Consider only the first 3 elements.
+        retrieved_items = response[:3]
 
-        Returns:
-        list: A list of entity names if the input is valid.
-        """
-        return self._parse_list(response, expected_type=str)
+        return retrieved_items
 
-    def _parse_list(self, response: str, expected_type: type) -> list:
+    def _parse_named_entity_recognition(self, response: str) -> list:
         """
-        A helper method to parse a string into a list with elements of an expected type.
+        Parses a response from a named entity recognition (NER) task.
+
+        Can handle both list-like string inputs or comma-separated entities in a plain string.
 
         Parameters:
-        response (str): The string to parse.
-        expected_type (type): The expected type of elements in the list.
+        response (str): The raw response string.
 
         Returns:
-        list: A list of elements of the expected type, or ignore items if parsing fails.
+        A list of named entities extracted from the response. Attempts to parse the response as a
+        literal list; falls back to splitting by commas if that fails.
         """
         try:
-            parsed_response = ast.literal_eval(response)
-            if not isinstance(parsed_response, list):
-                return []
-
-            sanitized_response = []
-            for item in parsed_response:
-                try:
-                    sanitized_response.append(expected_type(item))
-                except (ValueError, TypeError) as e:
-                    pass
-            return sanitized_response
-        except SyntaxError:
-            return []
+            # Attempt to interpret the response as a literal list.
+            entities = ast.literal_eval(response)
+            if isinstance(entities, list) and all(
+                isinstance(item, str) for item in entities
+            ):
+                return entities
+        except (SyntaxError, ValueError):
+            # Fallback: split the string by commas and strip whitespace.
+            return [entity.strip() for entity in response.split(",")]
 
 
 if __name__ == "__main__":
-    # This section demonstrates the use of the ShoppingBenchTaskParsers class
-    # for different types of tasks. For each task, we initialize a parser,
-    # provide it with a response string, and then output the parsed result.
-
-    # MULTICHOICE TASK EXAMPLE
-    # Initialize the parser for a multichoice task
-    multichoice_parser = ShoppingBenchTaskParsers("multichoice")
-    # Example response string for a multichoice task (correct option is 2)
-    multichoice_response = "2"
-    # Parse the response and print the result
+    # Example usage of the ShoppingBenchTaskParsers class for various task types.
+
+    # MULTICHOICE EXAMPLE
+    multi_choice_parser = ShoppingBenchTaskParsers("multichoice")
+    print("Multichoice Example:")
+    print(multi_choice_parser.parse("2"))  # Expected output: 2
     print(
-        "Multichoice Task Parsing Result:",
-        multichoice_parser.parse(multichoice_response),
-    )
-    # Expected output: 2
+        multi_choice_parser.parse("a")
+    )  # Expected output (failure case): -1
+    print()
 
-    # RANKING TASK EXAMPLE
-    # Initialize the parser for a ranking task
+    # RANKING EXAMPLE
     ranking_parser = ShoppingBenchTaskParsers("ranking")
-    # Example response string for a ranking task (items ranked as 3rd, 1st, 2nd)
-    ranking_response = "[3, 1, 2]"
-    # Parse the response and print the result
+    print("Ranking Example:")
+    print(
+        ranking_parser.parse("1, 2, 3, 4, 5")
+    )  # Expected output: [1, 2, 3, 4, 5]
+    print(
+        ranking_parser.parse("[1, 2, 2, 3]")
+    )  # Expected output (failure case): []  # because of repeating numbers
     print(
-        "Ranking Task Parsing Result:", ranking_parser.parse(ranking_response)
-    )
-    # Expected output: [3.0, 1.0, 2.0]
+        ranking_parser.parse("1, 4, 5, aicrowd, 6")
+    )  # Expected output: [1, 4, 5, 6]  # non-numeric items are dropped
 
-    # GENERATION TASK EXAMPLE
-    # Initialize the parser for a text generation task
+    print()
+
+    # GENERATION EXAMPLE
     generation_parser = ShoppingBenchTaskParsers("generation")
-    # Example response string for a generation task
-    generation_response = (
-        "This is a generated response based on the input prompt."
-    )
-    # Parse the response and print the result
+    print("Generation Example:")
     print(
-        "Generation Task Parsing Result:",
-        generation_parser.parse(generation_response),
-    )
-    # Expected output: This is a generated response based on the input prompt.
+        generation_parser.parse("This is a generated response")
+    )  # Expected output: 'This is a generated response'
+    print()
 
-    # RETRIEVAL TASK EXAMPLE
-    # Initialize the parser for a retrieval task
+    # RETRIEVAL EXAMPLE
     retrieval_parser = ShoppingBenchTaskParsers("retrieval")
-    # Example response string for a retrieval task (items at indexes 0 and 2 are relevant)
-    retrieval_response = "[0, 2]"
-    # Parse the response and print the result
+    print("Retrieval Example:")
     print(
-        "Retrieval Task Parsing Result:",
-        retrieval_parser.parse(retrieval_response),
-    )
-    # Expected output: [0, 2]
-
-    # NAMED ENTITY RECOGNITION (NER) TASK EXAMPLE
-    # Initialize the parser for a named entity recognition task
-    ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
-    # Example response string for an NER task
-    ner_response = '["New York", "ShopBench"]'
-    # Parse the response and print the result
-    print("NER Task Parsing Result:", ner_parser.parse(ner_response))
-    # Expected output: ['New York', 'ShopBench']
-
-    # This demonstrates the flexible and effective parsing capabilities of the
-    # ShoppingBenchTaskParsers class across a variety of task types.
-
-    # Failure Case Examples for ShoppingBenchTaskParsers
-    # These examples illustrate how the parser handles incorrect or unexpected inputs.
-
-    print("=== FAILURE CASES ===\n")
-
-    # MULTICHOICE TASK FAILURE EXAMPLE
-    # Non-integer response for a multichoice task
-    multichoice_parser = ShoppingBenchTaskParsers("multichoice")
-    multichoice_bad_response = "abc"  # Invalid response (not an integer)
+        retrieval_parser.parse("100, 200, 300")
+    )  # Expected output: [100, 200, 300]
     print(
-        "Multichoice Task Failure Case:",
-        multichoice_parser.parse(multichoice_bad_response),
-    )
-    # Expected output: -1 (indicating an invalid response)
-
-    # RANKING TASK FAILURE EXAMPLE
-    # Non-list response for a ranking task
-    ranking_parser = ShoppingBenchTaskParsers("ranking")
-    ranking_bad_response = "not a valid list"  # Invalid list format
+        retrieval_parser.parse("100, 200")
+    )  # Expected output (shorter than 3): [100, 200]
     print(
-        "Ranking Task Failure Case:",
-        ranking_parser.parse(ranking_bad_response),
-    )
-    # Expected output: [] (indicating an inability to parse the response)
-
-    # GENERATION TASK FAILURE EXAMPLE
-    # Empty or whitespace-only response for a generation task
-    generation_parser = ShoppingBenchTaskParsers("generation")
-    generation_bad_response = "   "  # Only spaces
+        retrieval_parser.parse("100, 200, jjhg")
+    )  # Expected output (non-numeric items dropped): [100, 200]
     print(
-        "Generation Task Failure Case:",
-        f"'{generation_parser.parse(generation_bad_response)}'",
-    )
-    # Expected output: '' (an empty string indicating an invalid or empty response)
+        retrieval_parser.parse("100, 200, 300, 400")
+    )  # Expected output (only the first 3 elements are kept): [100, 200, 300]
 
-    # RETRIEVAL TASK FAILURE EXAMPLE
-    # Incorrect element format for a retrieval task
-    retrieval_parser = ShoppingBenchTaskParsers("retrieval")
-    retrieval_bad_response = "[1, 'a']"  # Contains a non-integer
-    print(
-        "Retrieval Task Failure Case:",
-        retrieval_parser.parse(retrieval_bad_response),
-    )
-    # Expected output: [1] (ignores invalid non-integer values)
+    print()
 
-    # NAMED ENTITY RECOGNITION (NER) TASK FAILURE EXAMPLE
-    # Non-list or incorrect entity format for an NER task
+    # NAMED ENTITY RECOGNITION EXAMPLE
     ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
-    ner_bad_response = '{"entity": "New York"}'  # Not a list, incorrect format
-    print("NER Task Failure Case:", ner_parser.parse(ner_bad_response))
-    # Expected output: [] (indicating the response could not be parsed as a list of entities)
-
+    print("Named Entity Recognition Example:")
+    print(
+        ner_parser.parse("['New York', 'ShopBench', 'Amazon']")
+    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
+    print(
+        ner_parser.parse("New York, ShopBench, Amazon")
+    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
     print(
-        "\nThese examples demonstrate how the parser handles various incorrect inputs."
-    )
+        ner_parser.parse("[New York, ShopBench, Amazon]")
+    )  # Expected output (failure case: '[' and ']' stick to the boundary elements): ['[New York', 'ShopBench', 'Amazon]']