diff --git a/aicrowd.json b/aicrowd.json index 19a8ab99f6b74d00c04ccb48774e8a1db35de13e..3fe7d2e3e00e8123d64fd5bf61ef47bd8bf28ece 100755 --- a/aicrowd.json +++ b/aicrowd.json @@ -1,8 +1,8 @@ { "challenge_id": "esci-challenge-for-improving-product-search", - "task": "task_3_product_substitute_identification", + "task": "task_1_query-product_ranking", "gpu": true, "authors": ["aicrowd-username"], - "description": "task_1_query-product_ranking, task_2_multiclass_product_classification", + "description": "task_3_product_substitute_identification, task_2_multiclass_product_classification", "license": "MIT" } \ No newline at end of file diff --git a/utils/dataset.py b/utils/dataset.py index bc40ed3de10368f39c4fa618ae4cf9e4c1db2b19..e7f68aae6fd2b880e4fb5ee47713074b5f7954bc 100755 --- a/utils/dataset.py +++ b/utils/dataset.py @@ -340,6 +340,9 @@ class Task1DatasetWithDescription(Task1Dataset): "attention_mask": attention_mask, "speical_token_pos": input_ids_pos, } + + if self.model_type == 'bigbird': + feature['token_type_ids'] = torch.zeros_like(input_ids) meta = { "product_id": row['product_id'], diff --git a/utils/lgb_predict_task1.py b/utils/lgb_predict_task1.py index bd0a340579ae52499834862381b1b68160799326..9c3f5560843e5a46c8f4320dd2734a0fca548929 100755 --- a/utils/lgb_predict_task1.py +++ b/utils/lgb_predict_task1.py @@ -6,21 +6,21 @@ __MAP__ = ["irrelevant", "complement", "substitute", "exact"] LGB_CONFIG = { "us": { - "product_feat": "/models/us-product-feat.csv", + "product_feat": "/models/us-product-feat-task-1-only.csv", "model_file": "/models/lgb-us-task-1.txt", "features": ['pred_0','pred_1','pred_2','pred_3', 'label_0','label_1','label_2','label_3', 'query_count','is_isbn','has_isbn','fold'] }, "jp": { - "product_feat": "/models/jp-product-feat.csv", + "product_feat": "/models/es-jp-product-feat-task-1-only.csv", "model_file": "/models/lgb-es-jp-task-1.txt", "features": ['pred_0','pred_1','pred_2','pred_3', 'label_0','label_1','label_2','label_3', 'query_count','is_isbn','has_isbn','fold','locale'] }, "es": { - "product_feat": "/models/es-product-feat.csv", + "product_feat": "/models/es-jp-product-feat-task-1-only.csv", "model_file": "/models/lgb-es-jp-task-1.txt", "features": ['pred_0','pred_1','pred_2','pred_3', 'label_0','label_1','label_2','label_3', @@ -30,7 +30,7 @@ LGB_CONFIG = { LOCALE_MAP = {'jp':0, 'es':1, 'us':2} COL_NAME = "product_id" -WEIGHT_MAP = {0:0.5, 1:0.5, 2:0.5, 3:0.5, 4:0.25, 5:0.25} +WEIGHT_MAP = {0:0.5, 1:0.5, 2:0.5, 3:0.5, 4:0.25, 5:0.25, 6:0.5, 7:0.5} # Need modification def lgb_predict_task1(df, locale): df = df.reset_index(drop=True) diff --git a/utils/onnx_predict.py b/utils/onnx_predict.py index 40ccda576922adb3c838ba2123319864e6ebd551..af4b8b31d90c20ad719be15999f48d233d9d953c 100755 --- a/utils/onnx_predict.py +++ b/utils/onnx_predict.py @@ -43,11 +43,6 @@ def onnx_predict(sub_test, config): all_example = [] for data in tqdm(loader): - # inputs = { - # "input_ids": data["features"]["input_ids"], - # "attention_mask": data["features"]["attention_mask"], - # "speical_token_pos": data["features"]["speical_token_pos"], - # } for i, s in enumerate(session): all_output[i] += list( s.run(output_names=["output"], input_feed=dict(data['features']))[0] # type: ignore @@ -97,14 +92,9 @@ def onnx_predict_task1(sub_test, config): all_product = [] for data in tqdm(loader): - inputs = { - "input_ids": data["features"]["input_ids"], - "attention_mask": data["features"]["attention_mask"], - "speical_token_pos": data["features"]["speical_token_pos"], - } for i, s in enumerate(session): all_output[i] += list( - s.run(output_names=["output"], input_feed=dict(inputs))[0] # type: ignore + s.run(output_names=["output"], input_feed=dict(data['features']))[0] # type: ignore ) all_query += data["meta"]["query_id"] all_product += data['meta']['product_id'] diff --git a/utils/run_task1.py b/utils/run_task1.py index 578c37ef4ae6b700d19947c2aebc137ea6a9262b..09f9bf8725922e8353facf02e11b30254214d54b 100755 --- a/utils/run_task1.py +++ b/utils/run_task1.py @@ -24,7 +24,7 @@ CONFIG = { "product": "/models/product.h5", "key": "us", "type": "deberta", - "fold_offset": 0, + "fold_offset": 0, # fold 0, 1 "description": False, }, { @@ -43,7 +43,7 @@ CONFIG = { "product": "/models/cocolm.h5", "key": "us", "type": "cocolm", - "fold_offset": 2, + "fold_offset": 2, # fold 2, 3 "description": False, }, { @@ -62,9 +62,29 @@ CONFIG = { "product": "/models/distilbart.h5", "key": "us", "type": "distilbart", - "fold_offset": 4, + "fold_offset": 4, # fold 4, 5 "description": False, }, + { + "clean": DeBertaCleanV2, + "encode": { + "query": 12506, + "product_title": 3771, + "product_id": 4787, + "index": 6477, + "product_description": 6865, + "product_bullet_point": 10739, + "product_brand": 4609, + "product_color_name": 3225, + }, + "model": ["/models/us-bigbird-des-0-fp16.onnx", "/models/us-bigbird-des-1-fp16.onnx"], + "product": "/models/bigbird.h5", + "key": "us", + "type": "bigbird", + "tokenizer": "/models/bigbird.model", + "fold_offset": 6, # 6, 7 + "description":True, + }, ], "jp": [ { @@ -85,6 +105,7 @@ CONFIG = { "key": "jp", "type": "mdeberta", "fold_offset": 0, + "description":False, }, ], "es": [ @@ -106,6 +127,7 @@ CONFIG = { "key": "es", "type": "mdeberta", "fold_offset": 0, + "description":False, }, ], }