#!/usr/bin/env python import os import numpy as np from tqdm.auto import tqdm import torch from torchvision import transforms as T from evaluator.dataset import ZEWDPCBaseDataset, ZEWDPCProtectedDataset from baseline_utils.model import ResnetPredictor from baseline_utils.predict import predict_on_dataset from baseline_utils.training import train_on_dataset from baseline_utils.dataset import SimpleDataset from purchase_strategies.random_purchase import random_purchase from purchase_strategies.morefaults_purchase import purchase_data_with_more_faults from purchase_strategies.purchase_uncertain import purchase_uncertain_images from purchase_strategies.balance_labels import match_labels_to_target_dist os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html class Hparams: NUM_CLASSES = 6 USE_PRETRAINED = True FEATURE_EXTRACTING = False NUM_EPOCHS = 4 NUM_EPOCHS_PRETRAIN = 6 BATCH_SIZE = 64 VALIDATION_PERCENTAGE = 0.1 VALIDATION_INTERVAL = 2 DEVICE = 'cuda' class ZEWDPCBaseRun: """ Template Submission Class for the ZEW Data Purchasing Challenge 2022. The submission template follows the following hooks : - pre_training_phase - purchase_phase - prediction_phase - save_checkpoint - load_checkpoint Please refer to the inline documentation for further details. You are allowed to add any other member functions, however you are not allowed to change the names of these hooks, otherwise your submissions will not be evaluated by the automated evaluators. """ def __init__(self): # self._seed(42) self.evaluation_state = {} self.model = ResnetPredictor(use_pretrained=Hparams.USE_PRETRAINED, feature_extracting=Hparams.FEATURE_EXTRACTING, num_classes=Hparams.NUM_CLASSES) self.optimizer = torch.optim.Adam(self.model.parameters(), lr=5e-4) def _seed(self, seed): self.seed = seed torch.manual_seed(seed) torch.use_deterministic_algorithms(True) def pre_training_phase( self, training_dataset: ZEWDPCBaseDataset, compute_budget=10**10, register_progress=lambda x: False, ): """ # Pre-training Phase ------------------------- Pre-train your model on the available training dataset here. Hook for the Pre-Training Phase of the Competition, where you have access to a training_dataset, which is an instance of the `ZEWDPCBaseDataset` class (see `evaluator/dataset.py` for more details). You are allowed to pre-train on this data, while you prepare for the Purchase_Phase of the competition. If you train some models, you can instantiate them as `self.model`, as long as you implement self-contained checkpointing in the `self.save_checkpoint` and `self.load_checkpoint` hooks, as the hooks for the different phases of the competition, can be called in different executions of the BaseRun. The `compute_budget` argument holds a floating point number representing the time available (in seconds) for **BOTH** the pre_training_phase and the `purchase_phase`. Exceeding the time will lead to a TimeOut error. You have access to a `register_progress` function, to which you can pass a value between [0,1] to relay onto the leaderboard your self reported progress on the training phase to be displayed on the submission dashboard during the evaluation. If a value out of these bounds is provided, it will be clipped to this range. """ print("\n================> Pre-Training Phase\n") criterion = torch.nn.BCEWithLogitsLoss() self.model = train_on_dataset( self.model, training_dataset, Hparams.NUM_EPOCHS_PRETRAIN, Hparams.BATCH_SIZE, Hparams.VALIDATION_PERCENTAGE, Hparams.VALIDATION_INTERVAL, Hparams.DEVICE, criterion=criterion, optimizer=self.optimizer, register_progress_fn=register_progress, # [Optional, but recommended] Mark Progress ) print("Execution Complete of Training Phase.") def purchase_phase( self, unlabelled_dataset: ZEWDPCProtectedDataset, training_dataset: ZEWDPCBaseDataset, purchase_budget=1500, compute_budget=10**10, register_progress=lambda x: False, ): """ # Purchase Phase ------------------------- In this phase of the competition, you have access to the unlabelled_dataset (an instance of `ZEWDPCProtectedDataset`) and the training_dataset (an instance of `ZEWDPCBaseDataset`) {see datasets.py for more details}, a purchase budget, and a compute budget. You can iterate over both the datasets and access the images without restrictions. However, you can probe the labels of the unlabelled_dataset only until you run out of the label purchasing budget. The `compute_budget` argument holds a floating point number representing the time available (in seconds) for **BOTH** the pre_training_phase and the `purchase_phase`. Exceeding the time will lead to a TimeOut error. """ print("\n================> Purchase Phase | Budget = {}\n".format(purchase_budget)) register_progress(0.0) # Register Progress ##### Sample a small amount and train further ##### random_sample_budget = purchase_budget*2//10 # 20% images, labels = random_purchase(unlabelled_dataset, random_sample_budget) total_unlabelled_images = images total_unlabelled_labels = labels ####################################################################################################################### ## Further train on the combined data total_unlabelled_dataset = SimpleDataset(total_unlabelled_images, total_unlabelled_labels) combined_dataset = torch.utils.data.ConcatDataset([training_dataset, total_unlabelled_dataset]) criterion = torch.nn.BCEWithLogitsLoss() self.model = train_on_dataset( self.model, combined_dataset, Hparams.NUM_EPOCHS, Hparams.BATCH_SIZE, Hparams.VALIDATION_PERCENTAGE, Hparams.VALIDATION_INTERVAL, Hparams.DEVICE, criterion=criterion, optimizer=self.optimizer, ) register_progress(len(unlabelled_dataset.purchases)/purchase_budget) # Predict on all images precicted_labels = predict_on_dataset(self.model, unlabelled_dataset, Hparams.BATCH_SIZE, Hparams.DEVICE) # Remove already purchased images from prediction list for label_idx in total_unlabelled_labels: precicted_labels.pop(label_idx) ####################################################################################################################### ##### Purchase images with more faults ##### morefaults_budget = purchase_budget*3//10 # 30% images, labels = purchase_data_with_more_faults(unlabelled_dataset, precicted_labels, morefaults_budget) total_unlabelled_images.update(images) total_unlabelled_labels.update(labels) # Remove already purchased images from prediction list for label_idx in labels: precicted_labels.pop(label_idx) register_progress(len(unlabelled_dataset.purchases)/purchase_budget) ###################################################################################################################### #### Purchase uncertain images ##### uncertain_budget = purchase_budget*3//10 # 30% images, labels = purchase_uncertain_images(unlabelled_dataset, precicted_labels, uncertain_budget) total_unlabelled_images.update(images) total_unlabelled_labels.update(labels) # Remove already purchased images from prediction list for label_idx in labels: precicted_labels.pop(label_idx) register_progress(len(unlabelled_dataset.purchases)/purchase_budget) ###################################################################################################################### # Further train on the combined data total_unlabelled_dataset = SimpleDataset(total_unlabelled_images, total_unlabelled_labels) combined_dataset = torch.utils.data.ConcatDataset([training_dataset, total_unlabelled_dataset]) criterion = torch.nn.BCEWithLogitsLoss() self.model = train_on_dataset( self.model, combined_dataset, Hparams.NUM_EPOCHS, Hparams.BATCH_SIZE, Hparams.VALIDATION_PERCENTAGE, Hparams.VALIDATION_INTERVAL, Hparams.DEVICE, criterion=criterion, optimizer=self.optimizer, ) # Predict on all images precicted_labels = predict_on_dataset(self.model, unlabelled_dataset, Hparams.BATCH_SIZE, Hparams.DEVICE) # Remove already purchased images from prediction list for label_idx in total_unlabelled_labels: precicted_labels.pop(label_idx) ####################################################################################################################### ##### Balance the dataset labels with the rest of the purchase balance ###### rebalance_budget = purchase_budget - len(unlabelled_dataset.purchases) target_distribution = [0.166, 0.166, 0.166, 0.166, 0.166, 0.17] assert sum(target_distribution) == 1 images, labels = match_labels_to_target_dist(unlabelled_dataset, total_unlabelled_labels, precicted_labels, target_distribution, rebalance_budget) total_unlabelled_images.update(images) total_unlabelled_labels.update(labels) register_progress(len(unlabelled_dataset.purchases)/purchase_budget) print("Execution Complete of Purchase Phase.") # Participants DO NOT need to return anything in the purchase phase # Their indexes used on unlabelled_dataset.purchase_label(idx) will be registered by the evaluator # These indexes will be used for the respective purchased labels return total_unlabelled_labels def prediction_phase( self, test_dataset: ZEWDPCBaseDataset, register_progress=lambda x: False, ): """ # Prediction Phase ------------------------- In this phase of the competition, you have access to the test dataset, and you are supposed to make predictions using your trained models. Returns: np.ndarray of shape (n, 6) where n is the number of samples in the test set and 6 refers to the 6 labels to be predicted for each sample for the multi-label classification problem. PARTICIPANT_TODO: Add your code here """ print( "\n================> Prediction Phase : - on {} images\n".format( len(test_dataset) ) ) predictions = [] batch_size = 16 self.model.eval() transform = T.Compose( [ T.ToTensor(), *self.model.required_transforms, ] ) test_dataset.set_transform(transform) dataloader = torch.utils.data.DataLoader( test_dataset, batch_size=batch_size, shuffle=False ) outputs = [] device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model.to(device) for data in tqdm(dataloader, total=len(dataloader)): with torch.no_grad(): image = data["image"].to(device) output = self.model(image) output_with_activation = torch.argmax(output, dim=1).cpu().numpy() outputs.append(output_with_activation) outputs = np.concatenate(outputs, axis=0) outputs = outputs > 0.5 register_progress(1.0) predictions = np.array(predictions) # random predictions print("Execution Complete of Purchase Phase.") return outputs def save_checkpoint(self, checkpoint_folder): """ Self-contained checkpoint code to be included here, which can capture the state of your run (including any trained models, etc) at the provided folder path. This is critical to implement, as the execution of the different phases can happen using different instances of the BaseRun. See below for examples. PARTICIPANT_TODO: Add your code here """ checkpoint_path = os.path.join(checkpoint_folder, "model.pth") torch.save({ "model": self.model.state_dict(), "optimizer": self.optimizer.state_dict() }, checkpoint_path) def load_checkpoint(self, checkpoint_folder): """ Self-contained checkpoint code to be included here, which can load the state of your run (including any trained models, etc) from a provided checkpoint_folder path (previously saved using `self.save_checkpoint`) This is critical to implement, as the execution of the different phases can happen using different instances of the BaseRun. See below for examples. PARTICIPANT_TODO: Add your code here """ checkpoint_path = os.path.join(checkpoint_folder, "model.pth") self.model = ResnetPredictor(use_pretrained=False, feature_extracting=Hparams.FEATURE_EXTRACTING) load_dict = torch.load(checkpoint_path) self.model.load_state_dict(load_dict["model"]) self.optimizer.load_state_dict(load_dict["optimizer"]) if __name__ == "__main__": #################################################################################### ## You need to implement `ZEWDPCBaseRun` class in this file for this challenge. ## Code for running all the phases locally is written in `main.py` for illustration ## purposes. ## ## Checkout the inline documentation of `ZEWDPCBaseRun` for more details. #################################################################################### import local_evaluation