Amazon KDD Cup 2024 Starter Kit

Commit 78136551, authored 1 year ago by spmohanty
Parent: f3994b1a
No related branches, tags, or merge requests found.

Update Parser

Showing 4 changed files, with 326 additions and 261 deletions:

    local_evaluation.py         93 additions, 56 deletions
    models/dummy_model.py       30 additions, 39 deletions
    models/dummy_model_old.py   52 additions, 0 deletions (new file)
    parsers.py                  151 additions, 166 deletions
local_evaluation.py (+93, −56). The two changed hunks, as they read after this commit:

@@ -5,19 +5,21 @@ from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

import metrics
from models.user_config import UserModel


def print_sample(i, generation, truth, metric, score):
    print(f"Sample {i}, generation: {generation}")
    print(f"Sample {i}, truth: {truth}")
    if isinstance(score, tuple) and len(score) == 3:
        print(
            f"Metric ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
        )
    else:
        print(f"Metric ({metric}): {score}")
    print()


def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
    model = UserModel()
    if max_eval_rows < len(data_df):
        data_df_eval = data_df.sample(max_eval_rows)

@@ -27,91 +29,126 @@ def run_and_evaluate(data_df, max_eval_rows, print_interval=200):
    # Run model
    outputs = []
    task_methods = {
        "multiple-choice": model.task_multichoice,
        "generation": model.task_generation,
        "retrieval": model.task_retrieval,
        "ranking": model.task_ranking,
        "named_entity_recognition": model.task_named_entity_recognition,
    }

    for _, row in tqdm(
        data_df_eval.iterrows(), total=len(data_df_eval), desc="Processing"
    ):
        task_type = row["task_type"]
        if task_type not in task_methods:
            raise NotImplementedError(f"No task method for {task_type=}")

        task_prompt = row["input_field"]
        task_fn = task_methods[task_type]
        task_output = task_fn(task_prompt)
        outputs.append(task_output)

    # Evaluate
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
    sentece_multilingual = SentenceTransformer(
        "paraphrase-multilingual-MiniLM-L12-v2"
    ).to(device)

    eval_methods = {
        "accuracy": metrics.accuracy,
        "hit rate@3": metrics.hit_rate_3,
        "rougel": metrics.rougel,
        "sent-transformer": lambda g, t: metrics.sent_transformer(
            g, t, sentence_all_lm
        ),
        "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
            g, t, sentece_multilingual
        ),
        "micro f1": metrics.tp_fp_fn,
        "ndcg": metrics.ndcg_eval,
        "bleu": metrics.bleu,
        "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
    }

    per_task_metrics = {}
    for ri, row in tqdm(
        data_df_eval.iterrows(), total=len(data_df_eval), desc="Evaluating"
    ):
        metric = row["metric"]
        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        task_name = row["task_name"]
        per_task_metrics.setdefault(
            task_name, {"metric": metric, "sample_score": []}
        )

        gt = row["output_field"]
        model_output = outputs[ri]

        eval_fn = eval_methods[metric]
        metric_score = eval_fn(model_output, gt)
        per_task_metrics[task_name]["sample_score"].append(metric_score)

        if ri % print_interval == 0:
            print_sample(ri, model_output, gt, metric, metric_score)

    # Aggregate scores
    for k in per_task_metrics:
        if per_task_metrics[k]["metric"] != "micro f1":
            print(k, len(per_task_metrics[k]["sample_score"]))
            per_task_metrics[k]["overall_metric"] = np.mean(
                per_task_metrics[k]["sample_score"]
            )
        else:
            per_task_metrics[k]["overall_metric"] = metrics.compute_f1_score(
                per_task_metrics[k]["sample_score"]
            )

    overall_metrics = {"task_name": [], "metric": [], "overall_score": []}
    for k in per_task_metrics:
        overall_metrics["task_name"].append(k)
        overall_metrics["metric"].append(per_task_metrics[k]["metric"])
        overall_metrics["overall_score"].append(
            per_task_metrics[k]["overall_metric"]
        )

    track_wise_score = np.mean(overall_metrics["overall_score"])
    overall_metrics["task_name"].append("track_wise")
    overall_metrics["metric"].append("track_wise")
    overall_metrics["overall_score"].append(track_wise_score)

    overall_metrics_df = pd.DataFrame(overall_metrics)
    overall_metrics_df.to_json("scores.json", orient="records", lines=True)
    print(f"Overall score {track_wise_score}")


if __name__ == "__main__":
    # Load Development Data
    DATA_FILENAME = "./data/development.json"
    data_df = pd.read_json(DATA_FILENAME, lines=True)

    # Load UserModel
    from models.user_config import UserModel

    model = UserModel()

    # Generate Responses
    outputs = []
    for _rowd_idx, row in tqdm(
        data_df.iterrows(),
        total=len(data_df),
        desc="Generating Responses",
    ):
        print("=" * 100)
        is_multiple_choice = row["task_type"] == "multiple-choice"
        prompt = row["input_field"]
        model_output = model.predict(prompt, is_multiple_choice)
        outputs.append(model_output)
        print(prompt, model_output)

    # run_and_evaluate(data_df, MAX_EVAL_ROWS)
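For anyone exercising this harness outside the full starter kit, the response-generation loop above only reads the task_type and input_field columns. Below is a minimal sketch of that loop with a hand-built DataFrame; StubModel and the two example rows are hypothetical stand-ins for models.user_config.UserModel and the development data.

# Minimal sketch of the response-generation loop in local_evaluation.py.
# StubModel and the example rows are hypothetical; they only mirror the
# columns (task_type, input_field) that the harness reads.
import pandas as pd


class StubModel:
    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        # Return an option index for multiple-choice prompts, free text otherwise.
        return "0" if is_multiple_choice else "a placeholder answer"


data_df = pd.DataFrame(
    [
        {"task_type": "multiple-choice", "input_field": "Pick the best option ..."},
        {"task_type": "generation", "input_field": "Write a short product title ..."},
    ]
)

model = StubModel()
outputs = []
for _, row in data_df.iterrows():
    is_multiple_choice = row["task_type"] == "multiple-choice"
    outputs.append(model.predict(row["input_field"], is_multiple_choice))

print(outputs)  # ['0', 'a placeholder answer']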
models/dummy_model.py (+30, −39), as it reads after this commit:

from typing import List
import random
import os

# please use this seed consistently across your code
AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 3142))


class DummyModel:
    """
    Note to participants:
        Example class to show the different functions to be implemented for each type of task
        Make sure to follow the data types as mentioned in the function definitions
    """

    def __init__(self):
        """
        Initialize your models here
        """
        random.seed(AICROWD_RUN_SEED)

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        """
        Standard interface for all tasks and tracks.

        The goal is for your model to be able to infer the task type,
        and respond with a string that is compatible with the task-specific parser.

        Note: even though the development dataset carries task_type information,
        during the actual evaluations your code will only have access to the prompt
        and the boolean flag indicating whether it is a multiple-choice question.
        """
        potential_response = [1, 2, 3, 4]

        if is_multiple_choice:
            return str(random.choice(potential_response))
        else:
            # For Ranking, Retrieval, and Named Entity Recognition tasks,
            # the expected response is a string that can be parsed with
            # `ast.literal_eval` (see parsers.py for more details).
            random.shuffle(potential_response)
            return str(potential_response)

        # Note: for the Generation task the expected response is also a string.
        # As this is a dummy model, we simply return the shuffled list as a
        # string, but in your case it can be any string.
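The docstring above describes the new single-method contract: during evaluation a submission only sees the prompt and the multiple-choice flag. As a rough illustration of that contract, here is a hypothetical participant model skeleton; the class name and hard-coded responses are placeholders, not part of the starter kit.

# Hypothetical participant model following the same predict() interface.
# A real submission would call its own LLM here and format the answer so
# that the task-specific parser in parsers.py can read it.
import os
import random

AICROWD_RUN_SEED = int(os.getenv("AICROWD_RUN_SEED", 3142))


class MyShopBenchModel:
    def __init__(self):
        random.seed(AICROWD_RUN_SEED)

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        if is_multiple_choice:
            # The multichoice parser reads the first character as the option index.
            return "0"
        # For list-style tasks (ranking, retrieval, NER), return a string the
        # parser can split or evaluate; for generation, any free text works.
        return "1, 2, 3"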
models/dummy_model_old.py (new file, +52):

from typing import List


class DummyModel:
    """
    Note to participants:
        Example class to show the different functions to be implemented for each type of task
        Make sure to follow the data types as mentioned in the function definitions
    """

    def __init__(self):
        """
        Initialize your models here
        """
        pass

    def task_multichoice(self, task_prompt: str) -> int:
        """
        Task method for Multiple Choice Questions
            Input - Task Prompt (includes choices)
            Output - Single integer index among the ones given in the input
        """
        return 0

    def task_ranking(self, task_prompt: str) -> List[int]:
        """
        Task method for Ranking
            Input - Task Prompt (includes items to rank)
            Output - Ordered list of ranks for each item
        """
        return [1, 0, 2, 3]

    def task_generation(self, task_prompt: str) -> str:
        """
        Task method for Generation
            Input - Task Prompt describing the required generation
            Output - Generated text as per the task prompt
        """
        return "This is a test"

    def task_retrieval(self, task_prompt: str) -> List[int]:
        """
        Task method for Retrieval
            Input - Task Prompt describing the items to select from (includes indexes of items)
            Output - Unordered list of selected indexes (must be a Python list even for a single item)
        """
        return [0, 1, 2]

    def task_named_entity_recognition(self, task_prompt: str) -> List[str]:
        """
        Task method for Named Entity Recognition
            Input - Task Prompt describing the named entity recognition task
            Output - Unordered list of one or more entity names (must be a Python list even for a single item)
        """
        return ["food", "gpu"]
parsers.py (+151, −166), as it reads after this commit:

#!/usr/bin/env python3

import ast


class ShoppingBenchTaskParsers:
    """
    A class designed to parse responses from different task types in
    the ShopBench - MultiTask Online Shopping Challenge for LLMs.

    It supports a variety of task types such as multiple choice, ranking, generation,
    retrieval, and named entity recognition, each with its own specific parsing logic
    to format the raw response strings into structured data.

    Attributes:
        task_type (str): The type of task the parser is set up to handle. Valid task types
                         include 'multichoice', 'ranking', 'generation', 'retrieval',
                         and 'named_entity_recognition'.
    """

    def __init__(self, task_type: str) -> None:
        """
        Initializes the parser for a specific task type.

        Parameters:
            task_type (str): Specifies the task type this parser instance will handle.
        """
        self.task_type = task_type

    def parse(self, response: str) -> any:
        """
        Parses a given response string according to the task type of the parser, and
        returns a structured representation of that response.

        Parameters:
            response (str): The raw response string obtained from performing the task.

        Returns:
            A parsed and appropriately formatted response suitable for the parser's task
            type. The format of the return value varies with the task type.
        """
        # Map of task types to their corresponding parsing methods.
        task_parser_methods = {
            "multichoice": self._parse_multichoice,
            "ranking": self._parse_ranking,
            "generation": self._parse_generation,
            "retrieval": self._parse_retrieval,
            "named_entity_recognition": self._parse_named_entity_recognition,
        }

        # Attempt to retrieve the appropriate parser method for the task type.
        parser_method = task_parser_methods.get(self.task_type)

        # Execute the parser method if found, otherwise raise an error.
        if parser_method:
            return parser_method(response)
        else:
            raise NotImplementedError(
                f"Task type '{self.task_type}' is not supported."
            )

    def _parse_multichoice(self, response: str) -> int:
        """
        Parses a response from a multiple-choice task.

        Assumes the first character of the response string indicates the chosen option.

        Parameters:
            response (str): The raw response string.

        Returns:
            An integer representing the selected option. Returns -1 if parsing fails due
            to an invalid response format.
        """
        try:
            return int(response.strip()[0])
        except ValueError:
            return -1

    def _parse_ranking(self, response: str) -> list:
        """
        Parses a ranking task response into a list of ranked items.

        Expects a string with numeric values separated by commas, indicating the ranking order.

        Parameters:
            response (str): The raw response string.

        Returns:
            A list of integers representing the items in ranked order. Limits to the first
            5 unique elements. Returns an empty list if duplicates are found or parsing fails.
        """
        # Keep only numeric characters and specific punctuation.
        cleaned_response = "".join(
            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
        )

        # Convert to a list of integers.
        ranked_items = []
        for item in cleaned_response.split(","):
            try:
                # Attempt to convert each item to an integer and add it to the list.
                ranked_items.append(int(item))
            except ValueError:
                pass  # Skip non-numeric items.

        # Consider only the first 5 unique elements.
        ranked_items = ranked_items[:5]

        # If there are duplicates, empty the list.
        if len(ranked_items) != len(set(ranked_items)):
            ranked_items = []
        return ranked_items

    def _parse_generation(self, response: str) -> str:
        """
        Parses a response from a generation task by trimming whitespace.

        This method primarily cleans up the response string for presentation or further
        processing.

        Parameters:
            response (str): The raw response string.

        Returns:
            A trimmed version of the response string.
        """
        return response.strip()

    def _parse_retrieval(self, response: str) -> list:
        """
        Parses a retrieval task response, extracting the identifiers of retrieved items.

        The response is expected to contain numeric values separated by commas.

        Parameters:
            response (str): The raw response string.

        Returns:
            A list of integers representing the first 3 unique retrieved item indices.
        """
        # Similar to the ranking parser, but only returns the first 3 elements.
        cleaned_response = "".join(
            c for c in response if c.isnumeric() or c in ["[", "]", ",", " "]
        )

        # Convert to a list of integers.
        response = []
        for item in cleaned_response.split(","):
            try:
                # Attempt to convert each item to an integer and add it to the list.
                response.append(int(item))
            except ValueError:
                pass  # Skip non-numeric items.

        # Consider only the first 3 elements.
        retrieved_items = response[:3]

        return retrieved_items

    def _parse_named_entity_recognition(self, response: str) -> list:
        """
        Parses a response from a named entity recognition (NER) task.

        Can handle both list-like string inputs and comma-separated entities in a plain string.

        Parameters:
            response (str): The raw response string.

        Returns:
            A list of named entities extracted from the response. Attempts to parse the
            response as a literal list; falls back to splitting by commas if that fails.
        """
        try:
            # Attempt to interpret the response as a literal list.
            entities = ast.literal_eval(response)
            if isinstance(entities, list) and all(
                isinstance(item, str) for item in entities
            ):
                return entities
        except (SyntaxError, ValueError):
            # Fallback: split the string by commas and strip whitespace.
            return [entity.strip() for entity in response.split(",")]


if __name__ == "__main__":
    # Example usage of the ShoppingBenchTaskParsers class for various task types.

    # MULTICHOICE EXAMPLE
    multic_choice_parser = ShoppingBenchTaskParsers("multichoice")
    print("Multichoice Example:")
    print(multic_choice_parser.parse("2"))  # Expected output: 2
    print(multic_choice_parser.parse("a"))  # Expected output (failure case): -1
    print()

    # RANKING EXAMPLE
    ranking_parser = ShoppingBenchTaskParsers("ranking")
    print("Ranking Example:")
    print(ranking_parser.parse("1, 2, 3, 4, 5"))  # Expected output: [1, 2, 3, 4, 5]
    print(
        ranking_parser.parse("[1, 2, 2, 3]")
    )  # Expected output (failure case): [] because of repeated numbers
    print(
        ranking_parser.parse("1, 4, 5, aicrowd, 6")
    )  # Expected output: [1, 4, 5, 6] (non-numeric items are dropped)
    print()

    # GENERATION EXAMPLE
    generation_parser = ShoppingBenchTaskParsers("generation")
    print("Generation Example:")
    print(
        generation_parser.parse("This is a generated response")
    )  # Expected output: 'This is a generated response'
    print()

    # RETRIEVAL EXAMPLE
    retrieval_parser = ShoppingBenchTaskParsers("retrieval")
    print("Retrieval Example:")
    print(retrieval_parser.parse("100, 200, 300"))  # Expected output: [100, 200, 300]
    print(
        retrieval_parser.parse("100, 200")
    )  # Expected output (shorter than 3): [100, 200]
    print(
        retrieval_parser.parse("100, 200, jjhg")
    )  # Expected output (non-numeric items are dropped): [100, 200]
    print(
        retrieval_parser.parse("100, 200, 300, 400")
    )  # Expected output (only the first 3 elements are considered): [100, 200, 300]
    print()

    # NAMED ENTITY RECOGNITION EXAMPLE
    ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
    print("Named Entity Recognition Example:")
    print(
        ner_parser.parse("['New York', 'ShopBench', 'Amazon']")
    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
    print(
        ner_parser.parse("New York, ShopBench, Amazon")
    )  # Expected output: ['New York', 'ShopBench', 'Amazon']
    print(
        ner_parser.parse("[New York, ShopBench, Amazon]")
    )  # Expected output (failure case: brackets stick to boundary elements): ['[New York', 'ShopBench', 'Amazon]']
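To tie the parsers back to the model interface earlier in this commit, the short sketch below shows two response formats that round-trip cleanly: comma-separated indices for ranking, and a stringified Python list of names for named entity recognition. It assumes parsers.py is importable from the repository root; the example values are illustrative only.

# Illustrative response strings that the task-specific parsers accept.
from parsers import ShoppingBenchTaskParsers

# Ranking: a comma-separated sequence of indices parses into a list of ints.
ranking_parser = ShoppingBenchTaskParsers("ranking")
print(ranking_parser.parse("3, 1, 4, 2"))  # [3, 1, 4, 2]

# NER: str() of a Python list of strings round-trips via ast.literal_eval.
ner_parser = ShoppingBenchTaskParsers("named_entity_recognition")
entities = ["New York", "ShopBench"]
print(ner_parser.parse(str(entities)))  # ['New York', 'ShopBench']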