diff --git a/README.md b/README.md index 10eb647f3047238250aa9bac3992aa1fe23ea78a..77534c7eb559836325cbbf04c831abf93ddf4059 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ The `Private Leaderboard` (computed at the end of Round-2), will use a different This repository contains the Round 2 baseline which contains fast heuristic implementations of some simple ideas -- **Purchase images with more labels** - For multilabel datasets, often having images with more than one label gives a boost for deep learning models. +- **Purchase images with more faults** - Each image can have multiple faults, making the problem a multilabel classification problem. For multilabel datasets, often having images with more than one label gives a boost for deep learning models. Hence we purchase images with a higher number of predicted faults. - **Purchase uncertain images** - Purchase images which have the most uncertainty in their predictions. While many methods exists to measure uncertainty, a simple output probability based heuristic method is used here. - **Purchase images to balance labels** - Well balanced datasets can improve model performance in deep learning. We set a uniform target distribution and try to purchase labels to get closer to that distribution. The provided code can try to purchase labels to any target distribution. @@ -97,7 +97,7 @@ The baseline follows the following steps, please check the code in `run.py`: 1. **Pre-train** - We train a simple Resnet model on the pre-training dataset during the pre-train phase. Model can be found at `baseline_utils\model.py` and training loop under `baseline_utils\training.py` 2. **Random purchase** - 20% of the purchase budget is used to make random purchases. While it may seem wasteful, it will help in case the training and unlabelled datasets distributions vary too much. Possibly conditional ideas can be applied here depending on how much the distributions vary based on these random purchases. 3. 
**Train and Predict** - With the extra labels purchased, further train and predict the labels for the rest of the unlabelled images. -4. **Purchase with more labels** - Use the predicted labels to purchase 30% of the budget for the images that have the most labels predicted. Here a straightforward threshold and count is used, but other ideas like adding the softmax probabilities can be used as well. Code can be found at `purchase_strategies\morelabels_purchase.py` +4. **Purchase with more faults** - Use the predicted labels to spend 30% of the budget on the images that have the most faults predicted. Here a straightforward threshold and count is used, but other ideas like adding the softmax probabilities can be used as well. Code can be found at `purchase_strategies\morefaults_purchase.py` 5. **Purchase uncertain** - 30% of the budget is used to purchase uncertain labels. Here a simple and fast heuristic is used to find uncertainty. This is based on the assumption that the probabilities close to 1 or 0 are "certain". Please note that this is not always the case with deep learning models, so feel free to try out other uncertainty methods. Code can be found at `purchase_strategies\purchase_uncertain.py` 6. **Balance Labels** - The rest of the budget is used to balance the labels. Label balancing can be tricky in multilabel settings because one needs to compute which set of label matches the closest to get the target distribution needed. Here we have setup a heuristic to find the a simple difference in current and target distributions and normalized it to find the closest prediction that matches this distribution. 
Code can be found at `purchase_strategies\balance_labels.py` diff --git a/purchase_strategies/morelabels_purchase.py b/purchase_strategies/morefaults_purchase.py similarity index 87% rename from purchase_strategies/morelabels_purchase.py rename to purchase_strategies/morefaults_purchase.py index 46149cf20eb1494fb9c668ac33f0370f2da1bd50..0eefa015fe6e30c0088e2fded8a8803a94c4c034 100644 --- a/purchase_strategies/morelabels_purchase.py +++ b/purchase_strategies/morefaults_purchase.py @@ -1,8 +1,8 @@ import numpy as np from tqdm.auto import tqdm -def purchase_data_with_more_labels(unlabelled_dataset, predicted_labels, size): - """ Simple strategy to buy the images which have higher number of predicted labels """ +def purchase_data_with_more_faults(unlabelled_dataset, predicted_labels, size): + """ Simple strategy to buy the images which have higher number of predicted faults """ num_predicted_labels = {k: np.sum(v > 0.5) for k, v in predicted_labels.items()} num_predicted_labels_array = np.array([v for k, v in num_predicted_labels.items()]) @@ -13,7 +13,7 @@ def purchase_data_with_more_labels(unlabelled_dataset, predicted_labels, size): dataset_keys = list(num_predicted_labels.keys()) - for array_idx in tqdm(predict_num_sorted_idx[:size], desc="Purchase data with more Labels"): + for array_idx in tqdm(predict_num_sorted_idx[:size], desc="Purchase data with more Faults"): dataset_idx = dataset_keys[array_idx] sample_image = unlabelled_dataset[dataset_idx]["image"] sample_label = np.array(unlabelled_dataset.purchase_label(dataset_idx)) diff --git a/run.py b/run.py index 5139cf6a915e5595c07d1ccd35bcac8c4fad431d..92c05460c39d22504035f8ce0511993d851fc61d 100644 --- a/run.py +++ b/run.py @@ -14,7 +14,7 @@ from baseline_utils.training import train_on_dataset from baseline_utils.dataset import SimpleDataset from purchase_strategies.random_purchase import random_purchase -from purchase_strategies.morelabels_purchase import purchase_data_with_more_labels +from 
purchase_strategies.morefaults_purchase import purchase_data_with_more_faults from purchase_strategies.purchase_uncertain import purchase_uncertain_images from purchase_strategies.balance_labels import match_labels_to_target_dist @@ -183,11 +183,11 @@ class ZEWDPCBaseRun: precicted_labels.pop(label_idx) ####################################################################################################################### - ##### Purchase images with more labels ##### - morelabels_budget = purchase_budget*3//10 # 30% - images, labels = purchase_data_with_more_labels(unlabelled_dataset, + ##### Purchase images with more faults ##### + morefaults_budget = purchase_budget*3//10 # 30% + images, labels = purchase_data_with_more_faults(unlabelled_dataset, precicted_labels, - morelabels_budget) + morefaults_budget) total_unlabelled_images.update(images) total_unlabelled_labels.update(labels)