diff --git a/local_evaluation.py b/local_evaluation.py
index 7dd899c9488c7d2245e6bfc6b9181fa9e24b6771..8ed8dd1740c7a0a64578c047fffe847fa98d301e 100644
--- a/local_evaluation.py
+++ b/local_evaluation.py
@@ -117,14 +117,14 @@ def evaluate(LocalEvalConfig):
                               index=instructions_df.InputInstructionWithGameID).to_dict()
     cpreds, cgt = [], []
     for instructionWithGameID, instruction_is_clear in classifier_gt.items():
-        cgt.append(int(instruction_is_clear.lower() == 'yes'))
+        cgt.append(int(instruction_is_clear.lower() == 'no'))
         pred = classifer_preds.get(instructionWithGameID, None)
         if pred is not None:
            cpreds.append(pred)
         else:
             warnings.warn(f"No prediction for instruction + game id {instructionWithGameID}")
-            # if any instruction is not predicted, default value will be taken as 1
-            cpred.append(1)
+            # if any instruction is not predicted, the default value will be taken as 0
+            cpreds.append(0)
 
     clariq_f1_score = f1_score(y_true=cgt, y_pred=cpreds, average='macro')
 
@@ -158,4 +158,4 @@ if __name__ == "__main__":
     RANKER_RESULTS_FILE = './local-eval-ranker-results.json'
     DATA_FOLDER = './public_data'
 
-    evaluate(LocalEvalConfig)
\ No newline at end of file
+    evaluate(LocalEvalConfig)
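
For reference, here is a minimal standalone sketch of the corrected scoring loop after this change. It assumes, per the diff context, that `classifier_gt` maps "instruction + game id" keys to 'yes'/'no' clarity labels and `classifer_preds` maps the same keys to 0/1 predictions; the toy dictionaries below are hypothetical, not from the repository.

```python
import warnings
from sklearn.metrics import f1_score

# Hypothetical ground truth: 'no' means the instruction needs clarification.
classifier_gt = {"inst-1|game-A": "yes", "inst-2|game-A": "no", "inst-3|game-B": "no"}
# Hypothetical predictions; "inst-3|game-B" is deliberately missing to
# exercise the default-value branch.
classifer_preds = {"inst-1|game-A": 0, "inst-2|game-A": 1}

cpreds, cgt = [], []
for instructionWithGameID, instruction_is_clear in classifier_gt.items():
    # After the fix, 'no' (unclear instruction) maps to the positive class 1,
    # so predictions must use the same polarity.
    cgt.append(int(instruction_is_clear.lower() == 'no'))
    pred = classifer_preds.get(instructionWithGameID, None)
    if pred is not None:
        cpreds.append(pred)
    else:
        warnings.warn(f"No prediction for instruction + game id {instructionWithGameID}")
        # Missing predictions now default to 0 ('clear'), consistent with
        # the flipped label encoding above.
        cpreds.append(0)

# Macro-averaged F1 over the two classes, as in local_evaluation.py.
clariq_f1_score = f1_score(y_true=cgt, y_pred=cpreds, average='macro')
print(clariq_f1_score)
```

Keeping the missing-prediction default aligned with the label encoding matters here: with 'no' encoded as 1, defaulting absent predictions to 1 would silently reward submissions that skip hard instructions, whereas defaulting to 0 penalizes the skipped item under macro F1.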