diff --git a/.gitignore b/.gitignore index 8a34ad7a947a59d495cdb07f78a2afd1a1b6848e..c11a7ef4547dc4eb936293134bec6411ca06217c 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ cython_debug/ scores.json data/ +*.ipynb \ No newline at end of file diff --git a/local_evaluation.py b/local_evaluation.py index 19cc37b84fec00fe4fd64c16bcb693785d82abe7..c40f7215b6d3cf68b270b5fd0b91ad567fd1f73b 100644 --- a/local_evaluation.py +++ b/local_evaluation.py @@ -111,7 +111,7 @@ def run_and_evaluate(data_df, max_eval_rows, print_interval=200): print(f"Overall score {track_wise_score}") if __name__ == "__main__": - DATA_FILENAME = './data/tracks/track3_rephrase.json' + DATA_FILENAME = './data/phase1_track3.json' data_df = pd.read_json(DATA_FILENAME, lines=True) MAX_EVAL_ROWS = 100000 run_and_evaluate(data_df, MAX_EVAL_ROWS) \ No newline at end of file diff --git a/parsers.py b/parsers.py new file mode 100644 index 0000000000000000000000000000000000000000..46b0532f50df82675af79ca81c4ccbd5c26dabed --- /dev/null +++ b/parsers.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 +import ast + + +class ShoppingBenchTaskParsers: + """ + A class for parsing responses from different types of tasks in a shopping bench scenario. + + Attributes: + task_type (str): The type of task for which the parser is instantiated. + """ + + def __init__(self, task_type: str) -> None: + """ + Initializes the parser with a specific task type. + + Parameters: + task_type (str): The type of task, e.g., 'multichoice', 'ranking', etc. + """ + self.task_type = task_type + + def parse(self, response: str) -> any: + """ + Parses the response based on the task type. + + Parameters: + response (str): The raw response string from the model. + + Returns: + The parsed response, formatted according to the task type's requirements. + """ + # Mapping task types to their respective parsing methods. + task_parser_methods = { + "multichoice": self._task_multichoice_parser, + "ranking": self._task_ranking_parser, + "generation": self._task_generation_parser, + "retrieval": self._task_retrieval_parser, + "named_entity_recognition": self._task_named_entity_recognition_parser, + } + + # Retrieve the parser method based on the task type. + parser_method = task_parser_methods.get(self.task_type) + + if parser_method is not None: + return parser_method(response) + else: + raise NotImplementedError( + f"Task type {self.task_type} not implemented" + ) + + def _task_multichoice_parser(self, response: str) -> int: + """ + Parses a multichoice task response. + + Parameters: + response (str): A string representing the selected option's index. + + Returns: + int: The index of the selected option, or -1 if the input is invalid. + """ + try: + return int(response.strip()) + except ValueError: + return -1 + + def _task_ranking_parser(self, response: str) -> list: + """ + Parses a ranking task response. + + Parameters: + response (str): A JSON string representing the ordered list of ranks. + + Returns: + list: A list of ranks if the input is valid, otherwise ignore non numeric list elements. + """ + return self._parse_json_list(response, expected_type=float) + + def _task_generation_parser(self, response: str) -> str: + """ + Parses a generation task response. + + Parameters: + response (str): The generated text response. + + Returns: + str: The stripped response text. + """ + return response.strip() + + def _task_retrieval_parser(self, response: str) -> list: + """ + Parses a retrieval task response. + + Parameters: + response (str): A JSON string representing the indexes of selected items. + + Returns: + list: A list of selected item indexes if the input is valid, otherwise ignore non numeric list elements. + """ + return self._parse_json_list(response, expected_type=int) + + def _task_named_entity_recognition_parser(self, response: str) -> list: + """ + Parses a named entity recognition task response. + + Parameters: + response (str): A JSON string representing the list of identified entities. + + Returns: + list: A list of entity names if the input is valid. + """ + return self._parse_json_list(response, expected_type=str) + + def _parse_json_list(self, response: str, expected_type: type) -> list: + """ + A helper method to parse a JSON string into a list with elements of an expected type. + + Parameters: + response (str): The JSON string to parse. + expected_type (type): The expected type of elements in the list. + + Returns: + list: A list of elements of the expected type, or ignore items if parsing fails. + """ + try: + parsed_response = ast.literal_eval(response) + if not isinstance(parsed_response, list): + return [] + + sanitized_response = [] + for item in parsed_response: + try: + sanitized_response.append(expected_type(item)) + except (ValueError, TypeError) as e: + pass + return sanitized_response + except SyntaxError: + return [] + + +if __name__ == "__main__": + # This section demonstrates the use of the ShoppingBenchTaskParsers class + # for different types of tasks. For each task, we initialize a parser, + # provide it with a response string, and then output the parsed result. + + # MULTICHOICE TASK EXAMPLE + # Initialize the parser for a multichoice task + multichoice_parser = ShoppingBenchTaskParsers("multichoice") + # Example response string for a multichoice task (correct option is 2) + multichoice_response = "2" + # Parse the response and print the result + print( + "Multichoice Task Parsing Result:", + multichoice_parser.parse(multichoice_response), + ) + # Expected output: 2 + + # RANKING TASK EXAMPLE + # Initialize the parser for a ranking task + ranking_parser = ShoppingBenchTaskParsers("ranking") + # Example response string for a ranking task (items ranked as 3rd, 1st, 2nd) + ranking_response = "[3, 1, 2]" + # Parse the response and print the result + print( + "Ranking Task Parsing Result:", ranking_parser.parse(ranking_response) + ) + # Expected output: [3.0, 1.0, 2.0] + + # GENERATION TASK EXAMPLE + # Initialize the parser for a text generation task + generation_parser = ShoppingBenchTaskParsers("generation") + # Example response string for a generation task + generation_response = ( + "This is a generated response based on the input prompt." + ) + # Parse the response and print the result + print( + "Generation Task Parsing Result:", + generation_parser.parse(generation_response), + ) + # Expected output: This is a generated response based on the input prompt. + + # RETRIEVAL TASK EXAMPLE + # Initialize the parser for a retrieval task + retrieval_parser = ShoppingBenchTaskParsers("retrieval") + # Example response string for a retrieval task (items at indexes 0 and 2 are relevant) + retrieval_response = "[0, 2]" + # Parse the response and print the result + print( + "Retrieval Task Parsing Result:", + retrieval_parser.parse(retrieval_response), + ) + # Expected output: [0, 2] + + # NAMED ENTITY RECOGNITION (NER) TASK EXAMPLE + # Initialize the parser for a named entity recognition task + ner_parser = ShoppingBenchTaskParsers("named_entity_recognition") + # Example response string for an NER task + ner_response = '["New York", "ShopBench"]' + # Parse the response and print the result + print("NER Task Parsing Result:", ner_parser.parse(ner_response)) + # Expected output: ['New York', 'ShopBench'] + + # This demonstrates the flexible and effective parsing capabilities of the + # ShoppingBenchTaskParsers class across a variety of task types. + + # Failure Case Examples for ShoppingBenchTaskParsers + # These examples illustrate how the parser handles incorrect or unexpected inputs. + + print("=== FAILURE CASES ===\n") + + # MULTICHOICE TASK FAILURE EXAMPLE + # Non-integer response for a multichoice task + multichoice_parser = ShoppingBenchTaskParsers("multichoice") + multichoice_bad_response = "abc" # Invalid response (not an integer) + print( + "Multichoice Task Failure Case:", + multichoice_parser.parse(multichoice_bad_response), + ) + # Expected output: -1 (indicating an invalid response) + + # RANKING TASK FAILURE EXAMPLE + # Non-JSON response for a ranking task + ranking_parser = ShoppingBenchTaskParsers("ranking") + ranking_bad_response = "not a json string" # Invalid JSON format + print( + "Ranking Task Failure Case:", + ranking_parser.parse(ranking_bad_response), + ) + # Expected output: [] (indicating an inability to parse the response) + + # GENERATION TASK FAILURE EXAMPLE + # Empty or whitespace-only response for a generation task + generation_parser = ShoppingBenchTaskParsers("generation") + generation_bad_response = " " # Only spaces + print( + "Generation Task Failure Case:", + f"'{generation_parser.parse(generation_bad_response)}'", + ) + # Expected output: '' (an empty string indicating an invalid or empty response) + + # RETRIEVAL TASK FAILURE EXAMPLE + # Incorrect JSON format for a retrieval task + retrieval_parser = ShoppingBenchTaskParsers("retrieval") + retrieval_bad_response = "[1, 'a']" # Contains a non-integer + print( + "Retrieval Task Failure Case:", + retrieval_parser.parse(retrieval_bad_response), + ) + # Expected output: [1] (ignores invalid non-integer values) + + # NAMED ENTITY RECOGNITION (NER) TASK FAILURE EXAMPLE + # Non-list JSON or incorrect entity format for an NER task + ner_parser = ShoppingBenchTaskParsers("named_entity_recognition") + ner_bad_response = '{"entity": "New York"}' # Not a list, incorrect format + print("NER Task Failure Case:", ner_parser.parse(ner_bad_response)) + # Expected output: [] (indicating the response could not be parsed as a list of entities) + + print( + "\nThese examples demonstrate how the parser handles various incorrect inputs." + )