der2933 / Amazon KDD Cup 2024 Starter Kit / Commits

Commit 929296c8
Authored 1 year ago by spmohanty
Refactor local eval script
Parent: 3ef95c87
Showing 1 changed file: local_evaluation.py (+176 additions, −76 deletions)
This commit replaces the previous inline evaluation logic under the if __name__ == "__main__": block with dedicated helper functions. local_evaluation.py after the change (hunks collapsed in the diff view are marked with # ...):

import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

import metrics
import parsers


def print_sample(idx, generation, truth, metric, score):
    """
    Print a sample's generated output, the truth, and its evaluation score.
    """
    print(f"Sample {idx}, generation: {generation}")
    print(f"Sample {idx}, truth: {truth}")
    if isinstance(score, tuple) and len(score) == 3:
        print(
            f"Per Sample Metric Score ({metric}): tp {score[0]}, fp {score[1]}, fn {score[2]}"
        )
    else:
        print(f"Per Sample Metric Score ({metric}): {score}")
    print()


# Function to load development data from a JSON file
def load_development_data(filename):
    """
    Load development data from a specified JSON file.

    Parameters:
    - filename: Path to the JSON file containing the development data.

    Returns:
    - A pandas DataFrame containing the loaded data.
    """
    return pd.read_json(filename, lines=True)


# Function to generate model outputs based on the input data
def generate_model_outputs(data_df, model):
    """
    Generate predictions for each entry in the data DataFrame using a given model.

    Parameters:
    - data_df: A pandas DataFrame containing the input data for predictions.
    - model: The model instance used for generating predictions.

    Returns:
    - A list containing the model outputs for each entry in the data DataFrame.
    """
    outputs = []
    for _, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Generating Responses"
    ):
        is_multiple_choice = row["task_type"] == "multiple-choice"
        prompt = row["input_field"]
        model_output = model.predict(prompt, is_multiple_choice)
        outputs.append(model_output)
    return outputs


# Function to evaluate the generated model outputs
def evaluate_outputs(data_df, outputs, log_every_n_steps=1):
    """
    Evaluate the model outputs against ground truth values using specified metrics.

    Parameters:
    - data_df: DataFrame containing the development data, including ground truth.
    - outputs: The generated outputs from the model to be evaluated.
    - log_every_n_steps: Log a sample every N steps.

    Returns:
    - A dictionary containing evaluation metrics and scores for each task.
    """
    eval_methods = get_evaluation_methods()
    task_parsers = get_task_parsers()
    per_task_metrics = {}

    for row_idx, row in tqdm(
        data_df.iterrows(), total=len(data_df), desc="Evaluating"
    ):
        task_type, metric, ground_truth = (
            row["task_type"],
            row["metric"],
            row["output_field"],
        )

        if metric not in eval_methods:
            raise NotImplementedError(f"No metric for {metric=}")

        # Note: In practice we use the task_type-metric pair as a unique
        # identifier and call it the task_name. During the actual evaluations
        # the task names are more semantically defined, meaning there could be
        # multiple tasks with the same task_type and metric.
        task_name = f"{task_type}---{metric}"

        model_output = task_parsers[task_type].parse(outputs[row_idx])
        eval_fn = eval_methods[metric]
        metric_score = eval_fn(model_output, ground_truth)

        if task_name not in per_task_metrics:
            per_task_metrics[task_name] = {
                "task_type": task_type,
                "metric": metric,
                "sample_score": [],
            }
        per_task_metrics[task_name]["sample_score"].append(metric_score)

        if row_idx % log_every_n_steps == 0:
            print_sample(row_idx, model_output, ground_truth, metric, metric_score)

    return per_task_metrics


# Function to aggregate scores from evaluations
def aggregate_scores(per_task_metrics):
    """
    Aggregate evaluation scores across different tasks and metrics.

    Parameters:
    - per_task_metrics: A dictionary containing raw evaluation scores for each task.

    Returns:
    - A pandas DataFrame summarizing the overall metrics and scores.
    """
    overall_metrics = {
        "task_name": [],
        "task_type": [],
        "metric": [],
        "overall_score": [],
    }
    for task_name, values in per_task_metrics.items():
        task_type, metric, sample_scores = (
            values["task_type"],
            values["metric"],
            values["sample_score"],
        )
        overall_score = (
            np.mean(sample_scores)
            if metric != "micro f1"
            else metrics.compute_f1_score(sample_scores)
        )

        overall_metrics["task_name"].append(task_name)
        overall_metrics["task_type"].append(task_type)
        overall_metrics["metric"].append(metric)
        overall_metrics["overall_score"].append(overall_score)

    return pd.DataFrame(overall_metrics)


# Define and return evaluation methods
def get_evaluation_methods():
    """
    Get evaluation methods including accuracy, sentence transformers, and other metrics.

    Returns:
    - A dictionary mapping metric names to their respective evaluation functions.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_all_lm = SentenceTransformer("all-MiniLM-L6-v2").to(device)
    sentence_multilingual = SentenceTransformer(
        "paraphrase-multilingual-MiniLM-L12-v2"
    ).to(device)

    return {
        "accuracy": metrics.accuracy,
        "hit rate@3": metrics.hit_rate_3,
        "rougel": metrics.rougel,
        # ... (entries collapsed in the diff view; one of them wraps
        # metrics.sent_transformer(g, t, sentence_all_lm)) ...
        "multilingual-sent-transformer": lambda g, t: metrics.sent_transformer(
            g, t, sentence_multilingual
        ),
        "micro f1": metrics.tp_fp_fn,
        "ndcg": metrics.ndcg_eval,
        # ... (entries collapsed in the diff view) ...
        "jp-bleu": lambda g, t: metrics.bleu(g, t, jp=True),
    }


# Define and return task parsers
def get_task_parsers():
    """
    Define parsers for different task types to format model outputs accordingly.

    Returns:
    - A dictionary mapping task types to their respective parsers.
    """
    return {
        "multiple-choice": parsers.ShoppingBenchTaskParsers("multichoice"),
        "generation": parsers.ShoppingBenchTaskParsers("generation"),
        "retrieval": parsers.ShoppingBenchTaskParsers("retrieval"),
        # ... (entries collapsed in the diff view) ...
    }


# Main execution function to load data, generate model outputs, evaluate, and aggregate scores
def main():
    # Load development data
    DATA_FILENAME = "./data/development.json"
    data_df = load_development_data(DATA_FILENAME)

    # Load the model from the user's custom configuration.
    # Note: The evaluator **always** imports UserModel, so reference your own
    # class by setting the `UserModel` variable in models.user_config.
    from models.user_config import UserModel

    model = UserModel()

    # Generate model outputs
    outputs = generate_model_outputs(data_df, model)
    data_df["outputs"] = (
        outputs  # Optional: add outputs back to the DataFrame for inspection
    )
    print(data_df.head())

    # Evaluate the generated outputs and calculate metrics
    per_task_metrics = evaluate_outputs(data_df, outputs)

    # Aggregate and display the evaluation scores
    overall_metrics = aggregate_scores(per_task_metrics)
    print("=" * 100)
    print("Task specific metrics: ")
    print(overall_metrics)
    print()

    # Calculate and print the overall score across all tasks and metrics
    overall_score = overall_metrics["overall_score"].mean()
    print(f"Overall Score: {overall_score}")


if __name__ == "__main__":
    main()
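For reference, the script reads ./data/development.json as JSON Lines and only touches the task_type, input_field, output_field, and metric columns. Below is a minimal sketch for producing a toy file in that shape; the example values, and the schema beyond those four fields, are assumptions rather than the official dataset.

import os
import pandas as pd

# Hypothetical toy row -- the real development set ships with the challenge data.
toy_rows = [
    {
        "task_type": "multiple-choice",  # drives is_multiple_choice and the parser choice
        "input_field": "Which option best matches the query? 0. A 1. B 2. C 3. D",
        "output_field": 2,               # ground truth handed to the metric function
        "metric": "accuracy",            # must be a key returned by get_evaluation_methods()
    }
]

os.makedirs("./data", exist_ok=True)
pd.DataFrame(toy_rows).to_json(
    "./data/development.json", orient="records", lines=True
)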
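The evaluator assumes a single method on the user's model: predict(prompt, is_multiple_choice). A minimal stand-in for models/user_config.py could look like the sketch below; DummyModel and its return values are illustrative placeholders, not the starter kit's baseline.

# models/user_config.py -- minimal sketch of the interface local_evaluation.py imports.

class DummyModel:
    """Placeholder model satisfying model.predict(prompt, is_multiple_choice)."""

    def predict(self, prompt: str, is_multiple_choice: bool) -> str:
        if is_multiple_choice:
            # Multiple-choice outputs are parsed as an option index.
            return "0"
        # Free-form tasks return a text answer.
        return "Sample response"


# local_evaluation.py does `from models.user_config import UserModel`,
# so point UserModel at your own class:
UserModel = DummyModel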
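One detail worth noting in aggregate_scores: every metric except "micro f1" yields a scalar per sample and is averaged with np.mean, whereas "micro f1" samples are (tp, fp, fn) tuples (see metrics.tp_fp_fn and the tuple branch in print_sample) that must be pooled before computing F1. The sketch below shows that pooling under the standard micro-F1 definition; the starter kit's metrics.compute_f1_score may differ in edge-case handling.

def micro_f1(sample_scores):
    """Pool (tp, fp, fn) counts across samples, then compute F1 once."""
    tp = sum(s[0] for s in sample_scores)
    fp = sum(s[1] for s in sample_scores)
    fn = sum(s[2] for s in sample_scores)
    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0


print(micro_f1([(2, 1, 0), (1, 0, 2)]))  # pooled counts give 6 / 9 = 0.667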