diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb1cf3b9ca47c6a77d5d63802e13b9cbd6604614 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c18d2abe5bea310ca77bdeb1bc67d7fc0d3a1a7 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35bc4d792ce302c33d116e8363ab049962ede5b3 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..147c04c78d54f8af0dcf2426cf8308383f904b0d --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py @@ -0,0 +1,85 @@ +from .video_augmentation import SiamVideoResize, \ + SiamVideoColorJitter, SiamVideoCompressionAugment, SiamVideoMotionAugment, \ + SiamVideoMotionBlurAugment, SiamVideoRandomHorizontalFlip, VideoTransformer +from .image_augmentation import ToTensor, ToBGR255 + +import maskrcnn_benchmark.data.transforms as T + + +def build_siam_augmentation(cfg, is_train=True, modality='video'): + + motion_limit = 0.0 + motion_blur_prob = 0.0 + compression_limit = 0.0 + if is_train: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + max_size = cfg.INPUT.MAX_SIZE_TRAIN + flip_horizontal_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + brightness = cfg.INPUT.BRIGHTNESS + contrast = cfg.INPUT.CONTRAST + saturation = cfg.INPUT.SATURATION + hue = cfg.INPUT.HUE + + if modality == 'image': + motion_limit = cfg.INPUT.MOTION_LIMIT + motion_blur_prob = cfg.INPUT.MOTION_BLUR_PROB + compression_limit = cfg.INPUT.COMPRESSION_LIMIT + + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + flip_horizontal_prob = 0.0 + brightness = 0.0 + contrast = 0.0 + saturation = 0.0 + hue = 0.0 + + amodal = cfg.INPUT.AMODAL + SIZE_DIVISIBILITY = cfg.DATALOADER.SIZE_DIVISIBILITY + to_bgr255 = cfg.INPUT.TO_BGR255 + + video_color_jitter = SiamVideoColorJitter( + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue, + ) + + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 + ) + + transform = Compose( + [ + video_color_jitter, + SiamVideoMotionBlurAugment(motion_blur_prob), + SiamVideoCompressionAugment(compression_limit), + SiamVideoMotionAugment(motion_limit, amodal), + SiamVideoResize(min_size, max_size, SIZE_DIVISIBILITY), + SiamVideoRandomHorizontalFlip(prob=flip_horizontal_prob), + # PIL image + VideoTransformer(ToTensor()), + # Torch tensor, CHW (RGB format), and range from [0, 1] + # 
VideoTransformer(ToBGR255(to_bgr255=to_bgr255)) + VideoTransformer(normalize_transform), + ] + ) + return transform + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target=None): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string \ No newline at end of file diff --git a/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..adbc582025bb64d628378ba246c70485d8e9f8e9 --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py @@ -0,0 +1,187 @@ +import torch +import random +import numpy as np +from PIL import Image +from torchvision.transforms import functional as F + +import imgaug.augmenters as iaa + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ImageResize(object): + def __init__(self, min_size, max_size, size_divisibility): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + self.size_divisibility = size_divisibility + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + if self.size_divisibility > 0: + oh = (int(oh / self.size_divisibility) * self.size_divisibility) + ow = (int(ow / self.size_divisibility) * self.size_divisibility) + + return (oh, ow) + + def __call__(self, image, target=None): + size = self.get_size(image.size) + image = F.resize(image, size) + if target is None: + return image, target + target = target.resize(image.size) + return image, target + + +class ImageCropResize(object): + """ + Crop a patch from the image and resize to its original size + """ + def __init__(self, crop_limit=None, amodal=False): + self.crop_limit = crop_limit + self.amodal = amodal + + def remove_invisible_box(self, box: BoxList): + """ + Remove boxes that are not visible (out of image boundary) after motion augmentation + """ + bbox = box.bbox.clone() + xmin_clip = bbox[:, 0].clamp(min=0, max=box.size[0] - 1) + ymin_clip = bbox[:, 1].clamp(min=0, max=box.size[1] - 1) + xmax_clip = bbox[:, 2].clamp(min=0, max=box.size[0] - 1) + ymax_clip = bbox[:, 3].clamp(min=0, max=box.size[1] - 1) + keep = (xmax_clip > xmin_clip) & (ymax_clip > ymin_clip) + + return box[keep] + + def boxlist_crop(self, box: BoxList, x1, y1, x2, y2): + """ + Adjust the coordinate of the bounding box within + image crop specified by (x1, y1, x2, y2) + """ + + w, h = (x2 - x1), (y2 - y1) + xmin, ymin, xmax, ymax = box._split_into_xyxy() + cropped_xmin = (xmin - x1) + cropped_ymin = (ymin - y1) + cropped_xmax = (xmax - x1) + cropped_ymax = (ymax - y1) + cropped_bbox = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + cropped_box = BoxList(cropped_bbox, (w, 
h), mode="xyxy") + for k, v in box.extra_fields.items(): + cropped_box.add_field(k, v) + + if self.amodal: + # amodal allows the corners of bbox go beyond image boundary + cropped_box = self.remove_invisible_box(cropped_box) + else: + # the corners of bbox need to be within image boundary for non-amodal training + cropped_box = cropped_box.clip_to_image(remove_empty=True) + return cropped_box.convert(box.mode) + + def __call__(self, image, target): + w, h = image.size + + tl_x = int(w * (random.random() * self.crop_limit)) + tl_y = int(h * (random.random() * self.crop_limit)) + br_x = int(w - w * (random.random() * self.crop_limit)) + # keep aspect ratio + br_y = int((h / w) * (br_x - tl_x) + tl_y) + + if len(target) > 0: + box = target.bbox + box_w = box[:, 2] - box[:, 0] + box_h = box[:, 3] - box[:, 1] + box_area = box_h * box_w + max_area_idx = torch.argmax(box_area, dim=0) + max_motion_limit_w = int(box_w[max_area_idx] * 0.25) + max_motion_limit_h = int(box_h[max_area_idx] * 0.25) + + # make sure at least one bounding box is preserved + # after motion augmentation + tl_x = min(tl_x, max_motion_limit_w) + tl_y = min(tl_y, max_motion_limit_h) + br_x = max(br_x, w-max_motion_limit_w) + br_y = max(br_y, h-max_motion_limit_h) + + assert (tl_x < br_x) and (tl_y < br_y) + + crop = F.crop(image, tl_y, tl_x, (br_y-tl_y), (br_x-tl_x)) + crop = F.resize(crop, (h, w)) + if len(target) > 0: + target = self.boxlist_crop(target, tl_x, tl_y, br_x, br_y) + target = target.resize(image.size) + + return crop, target + + +class ImageMotionBlur(object): + """ + Perform motion augmentation to an image + """ + def __init__(self): + motion_blur = iaa.MotionBlur(k=10, angle=[-30, 30]) + gaussian_blur = iaa.GaussianBlur(sigma=(0.0, 2.0)) + + self.blur_func_pool = [motion_blur, gaussian_blur] + + pass + + def __call__(self, image): + blur_id = random.choice(list(range(0, len(self.blur_func_pool)))) + blur_func = self.blur_func_pool[blur_id] + np_image = np.asarray(image) + blurred_image = blur_func.augment_image(np_image) + pil_image = Image.fromarray(np.uint8(blurred_image)) + return pil_image + + +class ImageCompression(object): + """ + Perform JPEG compression augmentation to an image + """ + def __init__(self, max_compression): + self.max_compression = max_compression + + def __call__(self, image): + ratio = random.uniform(0, 1) + compression = min(100, int(ratio * self.max_compression)) + np_image = np.asarray(image) + compressed_image = iaa.arithmetic.compress_jpeg(np_image, compression) + pil_image = Image.fromarray(np.uint8(compressed_image)) + return pil_image + + +class ToTensor(object): + def __call__(self, image, target=None): + return F.to_tensor(image), target + + +class ToBGR255(object): + def __init__(self, to_bgr255=True): + self.to_bgr255 = to_bgr255 + + def __call__(self, image, target=None): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + return image, target + diff --git a/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..0f267bf0db912b3e240d82dbc0e1c0cfe9790b37 --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py @@ -0,0 +1,187 @@ +import torch +import random +from torchvision.transforms import functional as F +from torchvision.transforms import ColorJitter as ImageColorJitter + +from .image_augmentation import ImageResize, ImageCropResize, \ + ImageMotionBlur, ImageCompression + + +class 
VideoTransformer(object): + def __init__(self, transform_fn=None): + if transform_fn is None: + raise KeyError('Transform function should not be None.') + self.transform_fn = transform_fn + + def __call__(self, video, target=None): + """ + A data transformation wrapper for video + :param video: a list of images + :param target: a list of BoxList (per image) + """ + if not isinstance(video, (list, tuple)): + return self.transform_fn(video, target) + + new_video = [] + new_target = [] + for (image, image_target) in zip(video, target): + (image, image_target) = self.transform_fn(image, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoResize(ImageResize): + def __init__(self, min_size, max_size, size_divisibility): + super(SiamVideoResize, self).__init__(min_size, max_size, size_divisibility) + + def __call__(self, video, target=None): + + if not isinstance(video, (list, tuple)): + return super(SiamVideoResize, self).__call__(video, target) + + assert len(video) >= 1 + new_size = self.get_size(video[0].size) + + new_video = [] + new_target = [] + for (image, image_target) in zip(video, target): + (image, image_target) = self._resize(image, new_size, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + def _resize(self, image, size, target=None): + image = F.resize(image, size) + target = target.resize(image.size) + return image, target + + +class SiamVideoRandomHorizontalFlip(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, video, target=None): + + if not isinstance(video, (list, tuple)): + return video, target + + new_video = [] + new_target = [] + # All frames should have the same flipping operation + if random.random() < self.prob: + for (image, image_target) in zip(video, target): + new_video.append(F.hflip(image)) + new_target.append(image_target.transpose(0)) + else: + new_video = video + new_target = target + return new_video, new_target + + +class SiamVideoColorJitter(ImageColorJitter): + def __init__(self, + brightness=None, + contrast=None, + saturation=None, + hue=None): + super(SiamVideoColorJitter, self).__init__(brightness, contrast, saturation, hue) + + def __call__(self, video, target=None): + # Color jitter only applies for Siamese Training + if not isinstance(video, (list, tuple)): + return video, target + + idx = random.choice((0, 1)) + # all frames in the video should go through the same transformation + transform = self.get_params(self.brightness, self.contrast, + self.saturation, self.hue) + new_video = [] + new_target = [] + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + image = transform(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoMotionAugment(object): + def __init__(self, motion_limit=None, amodal=False): + # maximum motion augmentation + self.motion_limit = min(0.1, motion_limit) + if motion_limit is None: + self.motion_limit = 0 + self.motion_augment = ImageCropResize(self.motion_limit, amodal) + + def __call__(self, video, target=None): + + # Motion augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.motion_limit == 0: + return video, target + + new_video = [] + new_target = [] + # Only 1 frame go through the motion augmentation, + # the other unchanged + idx = random.choice((0, 1)) + for i, (image, image_target) in enumerate(zip(video, target)): 
+ if i == idx: + (image, image_target) = self.motion_augment(image, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoMotionBlurAugment(object): + def __init__(self, motion_blur_prob=None): + self.motion_blur_prob = motion_blur_prob + if motion_blur_prob is None: + self.motion_blur_prob = 0.0 + self.motion_blur_func = ImageMotionBlur() + + def __call__(self, video, target): + # Blur augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.motion_blur_prob == 0.0: + return video, target + + new_video = [] + new_target = [] + idx = random.choice((0, 1)) + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + random_prob = random.uniform(0, 1) + if random_prob < self.motion_blur_prob: + image = self.motion_blur_func(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoCompressionAugment(object): + def __init__(self, max_compression=None): + self.max_compression = max_compression + if max_compression is None: + self.max_compression = 0.0 + self.compression_func = ImageCompression(self.max_compression) + + def __call__(self, video, target): + # Compression augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.max_compression == 0.0: + return video, target + + idx = random.choice((0, 1)) + new_video = [] + new_target = [] + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + image = self.compression_func(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target \ No newline at end of file diff --git a/siam-mot/siammot/data/adapters/handler/data_filtering.py b/siam-mot/siammot/data/adapters/handler/data_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..9c51b8db6bdd0a41010bfe9454fcc06bacc5347e --- /dev/null +++ b/siam-mot/siammot/data/adapters/handler/data_filtering.py @@ -0,0 +1,140 @@ +import numpy as np + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import AnnoEntity + +from siammot.utils.entity_utils import bbs_iou + + +def build_data_filter_fn(dataset_key: str, *args, **kwargs): + """ + Get dataset specific filter function list, if there is any + """ + filter_fn = None + if dataset_key == 'CRP': + filter_fn = CRPFilter(*args, **kwargs) + elif dataset_key.startswith('MOT'): + filter_fn = MOTFilter(*args, **kwargs) + elif dataset_key == 'AOT': + filter_fn = AOTFilter(*args, **kwargs) + return filter_fn + + +class BaseFilter: + def __init__(self): + pass + + # the default filter does not filter any entity, which is technically doing nothing + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + raise False + + def filter(self, entity:AnnoEntity, ignored_gt_entities=None): + return self._filter(entity, ignored_gt_entities) + + def __call__(self, entities: [AnnoEntity], ignored_entities=None, meta_data=None): + """ + Check each entity whether it is valid or should be filtered (ignored). 
+ :param entities: A list of entities (for a single frame) to be evaluated + :param ignored_entities: A list of ignored entities or a binary mask indicating ignored regions + :param meta_data: The meta data for the frame (or video) + :return: A list of valid entities and a list of filtered (ignored) entities + """ + valid_entities = [] + filtered_entities = [] + + for entity in entities: + if self._filter(entity, ignored_entities): + filtered_entities.append(entity) + else: + valid_entities.append(entity) + + return valid_entities, filtered_entities + + +class CRPFilter(BaseFilter): + """ + A class for filtering JTA dataset entities during evaluation + A gt entity will be filtered (ignored) if its id is -1 (negative) + A predicted entity will be filtered (ignored) if it is matched to a ignored ground truth entity + """ + def __init__(self, iou_thresh=0.2, is_train=False): + """ + :param iou_thresh: a predicted entity which overlaps with any ignored gt entity with at least + iou_thresh would be filtered + """ + self.iou_thresh = iou_thresh + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + if entity.id < 0: + return True + else: + for entity_ in ignored_gt_entities: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + + +class MOTFilter(BaseFilter): + """ + A class for filtering MOT dataset entities + A gt entity will be filtered (ignored) if its visibility ratio is very low + A predicted entity will be filtered (ignored) if it is matched to a ignored ground truth entity + """ + def __init__(self, visibility_thresh=0.1, iou_thresh=0.5, is_train=False): + self.visibility_thresh = visibility_thresh + self.iou_thresh = iou_thresh + self.is_train = is_train + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + if self.is_train: + # any entity whose visibility is below the pre-defined + # threshold should be filtered out + # meanwhile, any entity whose class does not have label + # needs to be filtered + if entity.blob['visibility'] < self.visibility_thresh or \ + not any(k in ('person', '2', '7') for k in entity.labels): + return True + else: + if 'person' not in entity.labels or int(entity.id) < 0: + return True + else: + for entity_ in ignored_gt_entities: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + + +class AOTFilter(BaseFilter): + """ + A class for filtering AOT entities + A gt entity will be filtered if it falls into one the following criterion + 1. tracking id is not Helicopter1 or Airplane1 + 2. 
range distance is larger than 1200 + """ + + def __init__(self, range_distance_thresh=1200, iou_thresh=0.2, is_train=False): + self.range_distance_thresh = range_distance_thresh + self.iou_thresh = iou_thresh + self.is_train = is_train + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + range_distance_m = np.inf + if 'range_distance_m' in entity.blob: + range_distance_m = entity.blob['range_distance_m'] + + labels = [] + if entity.labels is not None: + labels = entity.labels + + if ('intruder' not in labels) or \ + (range_distance_m >= self.range_distance_thresh): + return True + else: + for entity_ in ignored_gt_entities: + if entity_.bbox is not None: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + diff --git a/siam-mot/siammot/data/adapters/utils/data_utils.py b/siam-mot/siammot/data/adapters/utils/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2ce35763b08667824918fb8001b7f43e58bb98 --- /dev/null +++ b/siam-mot/siammot/data/adapters/utils/data_utils.py @@ -0,0 +1,62 @@ +import os + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset +from pycocotools.coco import COCO + +from .dataset_info import dataset_maps + + +def load_motion_anno(dataset_folder, + anno_file, + split_file, + set=None, + ): + """ + Load GluonCVMotionDataset format annotations for downstream training / testing + """ + + dataset = GluonCVMotionDataset(anno_file, + root_path=dataset_folder, + split_file=split_file + ) + + if set == 'train': + dataset = list(dataset.train_samples) + elif set == 'val': + dataset = list(dataset.val_samples) + elif set == 'test': + dataset = list(dataset.test_samples) + + return dataset + + +def load_coco_anno(dataset_folder, + anno_file): + + dataset_anno_path = os.path.join(dataset_folder, anno_file) + dataset = COCO(dataset_anno_path) + return dataset + + +def load_dataset_anno(cfg, dataset_key, set=None): + dataset_folder, anno_file, split_file, modality = dataset_maps[dataset_key] + + dataset_info = dict() + dataset_info['modality'] = modality + + dataset_folder = os.path.join(cfg.DATASETS.ROOT_DIR, dataset_folder) + if modality == 'video': + dataset = load_motion_anno(dataset_folder, + anno_file, + split_file, + set) + elif modality == 'image': + dataset = load_coco_anno(dataset_folder, + anno_file) + image_folder = os.path.join(dataset_folder, split_file) + dataset_info['image_folder'] = image_folder + else: + raise ValueError("dataset has to be video or image.") + + return dataset, dataset_info + diff --git a/siam-mot/siammot/data/adapters/utils/dataset_info.py b/siam-mot/siammot/data/adapters/utils/dataset_info.py new file mode 100644 index 0000000000000000000000000000000000000000..36527d83f8ffad656aa510e0d83cf83fad7a408b --- /dev/null +++ b/siam-mot/siammot/data/adapters/utils/dataset_info.py @@ -0,0 +1,49 @@ +dataset_maps = dict() +""" +each item in the dataset maps are a list of the following info +( +dataset_folder, +annotation file name (video dataset) / path of annotation file (image dataset), +split file name (video dataset) / path of image folder (image dataset) , +modality +) +""" +dataset_maps['TAO'] = ['TAO', + 'anno_person.json', + 'splits_person.json', + 'video'] + +dataset_maps['CRP'] = ['caltech_roadside_pedestrians', + 'anno.json', + 'splits.json', + 'video'] + +dataset_maps['MOT17_DPM'] = ['MOT17', + 'anno.json', + 'splits_DPM.json', + 'video'] + +dataset_maps['MOT17'] = ['MOT17', + 'anno.json', + 'splits.json', 
+ 'video'] + +dataset_maps['AOT'] = ['airbone_object_tracking', + 'anno.json', + 'splits.json', + 'video'] + +dataset_maps['COCO17_train'] = ['mscoco', + 'annotations/MSCOCO2017_train_person.json', + 'images/train2017', # all raw images would be in dataset_root/mscoco/images/train2017 + 'image'] + +dataset_maps['crowdhuman_train_fbox'] = ['CrowdHuman', + 'annotations/annotation_train_fbox.json', + 'Images', # all raw images would be in dataset_root/CrowdHuman/Images + 'image'] + +dataset_maps['crowdhuman_train_vbox'] = ['CrowdHuman', + 'annotations/annotation_train_vbox.json', + 'Images', + 'image'] \ No newline at end of file diff --git a/siam-mot/siammot/data/build_inference_data_loader.py b/siam-mot/siammot/data/build_inference_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..970fca45cec1e52ee0d35328f08bf9626947ffdd --- /dev/null +++ b/siam-mot/siammot/data/build_inference_data_loader.py @@ -0,0 +1,56 @@ +import torch +import torch.utils.data as data + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import DataSample +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class InferenceVideoData(data.Dataset): + """ + Split the video into small chunks (in an non-overlapping fashion) for inference + """ + + def __init__(self, video: DataSample, clip_len=1, transforms=None): + """ + Construct a data loader for inference + :param video: a video stream in DataSample format + :param clip_len: the length of video clips + :param transforms: transform function for video pre-processing + """ + self.video = video + self.video_reader = video.get_data_reader() + self.clip_len = clip_len + self.transforms = transforms + self.clip_idxs = list(range(0, len(self.video), self.clip_len)) + + def __getitem__(self, id): + video_clip = [] + # this is needed for transformation + dummy_boxes = [] + timestamps = [] + start_idx = self.clip_idxs[id] + end_idx = min(len(self.video), start_idx + self.clip_len) + for frame_idx in range(start_idx, end_idx): + (im, timestamp, _) = self.video_reader[frame_idx] + dummy_bbox = torch.tensor([[0, 0, 1, 1]]) + dummy_boxlist = BoxList(dummy_bbox, im.size, mode='xywh') + + video_clip.append(im) + timestamps.append(torch.tensor(timestamp)) + dummy_boxes.append(dummy_boxlist) + + if self.transforms is not None: + video_clip, _ = self.transforms(video_clip, dummy_boxes) + + return torch.stack(video_clip), start_idx, torch.stack(timestamps) + + def __len__(self): + return len(self.clip_idxs) + + +def build_video_loader(cfg, video: DataSample, transforms): + clip_len = cfg.INFERENCE.CLIP_LEN + videodata = InferenceVideoData(video, clip_len=clip_len, transforms=transforms) + videoloader = data.DataLoader(videodata, num_workers=4, batch_size=1, shuffle=False) + + return videoloader diff --git a/siam-mot/siammot/data/build_train_data_loader.py b/siam-mot/siammot/data/build_train_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..20174132ae8ced050df863e3827a60ff35d9f6f6 --- /dev/null +++ b/siam-mot/siammot/data/build_train_data_loader.py @@ -0,0 +1,77 @@ +import torch.utils.data + +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.data.build import make_data_sampler, make_batch_data_sampler +from maskrcnn_benchmark.data.datasets.concat_dataset import ConcatDataset + +from .video_dataset import VideoDataset, VideoDatasetBatchCollator +from .image_dataset import ImageDataset +from .adapters.utils.data_utils import load_dataset_anno +from 
.adapters.augmentation.build_augmentation import build_siam_augmentation +from .adapters.handler.data_filtering import build_data_filter_fn + + +def build_dataset(cfg): + """ + + """ + + dataset_list = cfg.DATASETS.TRAIN + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list) + ) + + datasets = [] + for dataset_key in dataset_list: + dataset_anno, dataset_info = load_dataset_anno(cfg, dataset_key) + modality = dataset_info['modality'] + transforms = build_siam_augmentation(cfg, is_train=True, modality=modality) + data_filter_fn = build_data_filter_fn(dataset_key, is_train=True) + + if modality == 'image': + assert 'image_folder' in dataset_info + _dataset = ImageDataset(dataset_anno, + dataset_info['image_folder'], + transforms=transforms, + frames_per_image=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP, + amodal=cfg.INPUT.AMODAL) + else: + _dataset = VideoDataset(dataset_anno, + sampling_interval=cfg.VIDEO.TEMPORAL_SAMPLING, + clip_len=cfg.VIDEO.TEMPORAL_WINDOW, + transforms=transforms, + filter_fn=data_filter_fn, + frames_in_clip=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP, + amodal=cfg.INPUT.AMODAL) + datasets.append(_dataset) + + dataset = ConcatDataset(datasets) + + return dataset + + +def build_train_data_loader(cfg, is_distributed=False, start_iter=0): + + num_gpus = get_world_size() + + video_clips_per_batch = cfg.SOLVER.VIDEO_CLIPS_PER_BATCH + assert ( + video_clips_per_batch % num_gpus == 0 + ), "SOLVER.VIDEO_CLIPS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( + video_clips_per_batch, num_gpus) + + video_clips_per_gpu = video_clips_per_batch // num_gpus + + dataset = build_dataset(cfg) + num_iters = cfg.SOLVER.MAX_ITER + sampler = make_data_sampler(dataset, True, is_distributed) + batch_sampler = make_batch_data_sampler( + dataset, sampler, [], video_clips_per_gpu, num_iters, start_iter + ) + + num_workers = cfg.DATALOADER.NUM_WORKERS + collator = VideoDatasetBatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + data_loader = torch.utils.data.DataLoader(dataset, num_workers=num_workers, + batch_sampler=batch_sampler, collate_fn=collator) + return data_loader diff --git a/siam-mot/siammot/data/image_dataset.py b/siam-mot/siammot/data/image_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..806e8e6842941517bce6a4d7025695f1c932326d --- /dev/null +++ b/siam-mot/siammot/data/image_dataset.py @@ -0,0 +1,232 @@ +import torch +import os +from tqdm import tqdm +from PIL import Image + +import torch.utils.data as data +from pycocotools.coco import COCO +from gluoncv.utils.bbox import bbox_xywh_to_xyxy, bbox_clip_xyxy + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ImageDataset(data.Dataset): + def __init__(self, + dataset: COCO, + image_dir, + transforms=None, + frames_per_image=1, + amodal=False, + skip_empty=True, + min_object_area=0, + use_crowd=False, + include_bg=False, + ): + """ + :param dataset: the ingested dataset with COCO-format + :param transforms: image transformation + :param frames_per_image: how many image copies are generated from a single image + :param amodal: whether to use amodal ground truth (no image boundary clipping) + :param include_bg: whether to include the full background images during training + """ + + self.dataset = dataset + self.image_dir = image_dir + self.transforms = transforms + self.frames_per_image = frames_per_image + + self._skip_empty = skip_empty + self._min_object_area = min_object_area + 
self._use_crowd = use_crowd + self._amodal = amodal + self._include_bg = include_bg + self._det_classes = [c['name'] for c in self.dataset.loadCats(self.dataset.getCatIds())] + + # These are tha mapping table of COCO labels + self.json_category_id_to_contiguous_id = { + v: i+1 for i, v in enumerate(self.dataset.getCatIds()) + } + + self._labels, self._im_aspect_ratios, self._items, self._ids \ + = self._dataset_preprocess() + + self.id_to_img_map = {k: v for k, v in enumerate(self._ids)} + + def __getitem__(self, index): + img_name = self._items[index] + img_path = os.path.join(self.image_dir, img_name) + + img = Image.open(img_path).convert('RGB') + target = self._get_target(img, index) + + # for tracking purposes, two frames are needed + # the pairs would go into random augmentation to generate fake motion + video_clip = [img for _ in range(self.frames_per_image)] + video_target = [target for _ in range(self.frames_per_image)] + + if self.transforms is not None: + video_clip, video_target = self.transforms(video_clip, video_target) + + return video_clip, video_target, img_name + + def _get_target(self, img, index): + + # a list of label (x1, y1, x2, y2, class_id, instance_id) + labels = self._labels[index] + if len(labels) == 0: + assert self._include_bg is True, "The image does not has ground truth" + bbox = torch.as_tensor(labels).reshape(-1, 4) + class_ids = torch.as_tensor(labels) + instance_ids = torch.as_tensor(labels) + empty_boxlist = BoxList(bbox, img.size, mode="xyxy") + empty_boxlist.add_field("labels", class_ids) + empty_boxlist.add_field("ids", instance_ids) + return empty_boxlist + + labels = torch.as_tensor(labels).reshape(-1, 6) + boxes = labels[:, :4] + target = BoxList(boxes, img.size, mode="xyxy") + + class_ids = labels[:, 4].clone().to(torch.int64) + target.add_field("labels", class_ids) + + instance_ids = labels[:, -1].clone().to(torch.int64) + target.add_field("ids", instance_ids) + + if not self._amodal: + target = target.clip_to_image(remove_empty=True) + + return target + + def _dataset_preprocess(self): + items = [] + labels = [] + ids = [] + im_aspect_ratios = [] + image_ids = sorted(self.dataset.getImgIds()) + instance_id = 0 + rm_redundant = 0 + all_amodal = 0 + + for entry in tqdm(self.dataset.loadImgs(image_ids)): + label, num_instances, num_redundant, num_amodal\ + = self._check_load_bbox(entry, instance_id) + if not label and not self._include_bg: + continue + instance_id += num_instances + rm_redundant += num_redundant + all_amodal += num_amodal + labels.append(label) + ids.append(entry['id']) + items.append(entry['file_name']) + im_aspect_ratios.append(float(entry['width']) / entry['height']) + + print('{} / {} valid images...'.format(len(labels), len(image_ids))) + print('{} instances...'.format(instance_id)) + print('{} redundant instances are removed...'.format(rm_redundant)) + print('{} amodal instances...'.format(all_amodal)) + return labels, im_aspect_ratios, items, ids + + def _check_load_bbox(self, entry, instance_id): + """ + Check and load ground-truth labels + """ + entry_id = entry['id'] + entry_id = [entry_id] if not isinstance(entry_id, (list, tuple)) else entry_id + ann_ids = self.dataset.getAnnIds(imgIds=entry_id, iscrowd=None) + objs = self.dataset.loadAnns(ann_ids) + + # check valid bboxes + valid_objs = [] + width = entry['width'] + height = entry['height'] + _instance_count = 0 + _redudant_count = 0 + _amodal_count = 0 + unique_bbs = set() + for obj in objs: + if obj.get('ignore', 0) == 1: + continue + if not self._use_crowd and 
obj.get('iscrowd', 0): + continue + if self._amodal: + xmin, ymin, xmax, ymax = bbox_xywh_to_xyxy(obj['bbox']) + if xmin < 0 or ymin < 0 or xmax > width or ymax > height: + _amodal_count += 1 + else: + xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height) + + if (xmin, ymin, xmax, ymax) in unique_bbs: + _redudant_count += 1 + continue + + box_w = (xmax - xmin) + box_h = (ymax - ymin) + area = box_w * box_h + if area <= self._min_object_area: + continue + + # require non-zero box area + if xmax > xmin and ymax > ymin: + unique_bbs.add((xmin, ymin, xmax, ymax)) + contiguous_cid = self.json_category_id_to_contiguous_id[obj['category_id']] + valid_objs.append([xmin, ymin, xmax, ymax, contiguous_cid, + instance_id+_instance_count]) + _instance_count += 1 + if not valid_objs: + if not self._skip_empty: + # dummy invalid labels if no valid objects are found + valid_objs.append([-1, -1, -1, -1, -1, -1]) + return valid_objs, _instance_count, _redudant_count, _amodal_count + + def __len__(self): + return len(self._items) + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.dataset.imgs[img_id] + return img_data + + @property + def classes(self): + return self._det_classes + + def get_im_aspect_ratio(self): + return self._im_aspect_ratios + + +if __name__ == "__main__": + + from siammot.configs.defaults import cfg + from siammot.data.video_dataset import VideoDatasetBatchCollator + from siammot.data.adapters.utils.data_utils import load_dataset_anno + from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation + + torch.manual_seed(0) + + dataset_anno, dataset_info = load_dataset_anno('COCO17_train') + collator = VideoDatasetBatchCollator() + transforms = build_siam_augmentation(cfg, modality=dataset_info['modality']) + + dataset = ImageDataset(dataset_anno, + dataset_info['image_folder'], + frames_per_image=2, + transforms=transforms, + amodal=True) + + batch_size = 16 + sampler = torch.utils.data.sampler.RandomSampler(dataset) + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, batch_size, drop_last=False) + dataloader = data.DataLoader(dataset, + num_workers=4, + batch_sampler=batch_sampler, + collate_fn=collator + ) + import time + tic = time.time() + for iteration, (image, target, image_ids) in enumerate(dataloader): + data_time = time.time() - tic + print("Data loading time: {}".format(data_time)) + tic = time.time() + print(image_ids) \ No newline at end of file diff --git a/siam-mot/siammot/data/ingestion/ingest_mot.py b/siam-mot/siammot/data/ingestion/ingest_mot.py new file mode 100644 index 0000000000000000000000000000000000000000..cd101288e563e8260ff678158cac02ed008b4d4f --- /dev/null +++ b/siam-mot/siammot/data/ingestion/ingest_mot.py @@ -0,0 +1,197 @@ +import argparse +import csv +import configparser +import datetime +import glob +import os + +from PIL import Image +from pathlib import Path + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, DataSample, AnnoEntity, FieldNames, SplitNames +from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits + +# From paper, see table 5 and 6: https://arxiv.org/pdf/1603.00831.pdf +MOT_LABEL_MAP = { + 1: "Pedestrian", + 2: "Person on vehicle", + 3: "Car", + 4: "Bicycle", + 5: "Motorbike", + 6: "Non motorized vehicle", + 7: "Static person", + 8: "Distractor", + 9: "Occluder", + 10: "Occluder on the ground", + 11: "Occluder full", + 12: "Reflection", +} + 
+DET_OPTIONS = {"SDP", "FRCNN", "DPM"} + + +def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False): + if sample is None: + id_ = Path(csv_path).stem + sample = DataSample(id_) + else: + sample = sample.get_copy_without_entities() + with open(csv_path, newline='') as f: + reader = csv.reader(f, delimiter=',') + + def coord(x): + return round(float(x)) + + for row in reader: + frame_num = int(row[0]) + obj_id = row[1] + x = coord(row[2]) + y = coord(row[3]) + w = coord(row[4]) + h = coord(row[5]) + conf = float(row[6]) + # If not mot17 the last 3 are 3D coords which are usually -1 + # (see pg. 9 https://arxiv.org/pdf/1504.01942.pdf) + if has_gt and mot17: + label = int(row[7]) + visibility = float(row[8]) + else: + label = 1 + visibility = 1 + + label_text = MOT_LABEL_MAP[label] + + # NOTE: Actually all classes that aren't Pedestrian have confidence 0 and so should be ingested + # but are ignored at evaluation time + # i.e. (label != 1 and conf) is never true + assert not (label != 1 and conf) + has_person_label = label_text in ("Pedestrian") + + time_ms = int((frame_num - 1) / fps * 1000) + entity = AnnoEntity(time=time_ms, id=obj_id) + entity.bbox = [x, y, w, h] + blob = { + "frame_csv": frame_num, + "frame_idx": frame_num - 1, + "visibility": visibility + } + entity.labels = {} + # entity.labels["person"] = 1 + if has_person_label: + entity.labels["person"] = 1 + else: + entity.labels[str(label)] = 1 + entity.labels["vis"] = visibility + + entity.confidence = conf + entity.blob = blob + + sample.add_entity(entity) + return sample + + +def main(args, description="Initial ingestion", det_options=None, mot17=True): + if mot17: + if det_options is not None and not all(x in DET_OPTIONS for x in det_options): + raise ValueError("Det options were {} but must be only: {}".format(det_options, DET_OPTIONS)) + if det_options is None: + det_options = DET_OPTIONS + else: + print("Ingesting MOT15, ignoring det options {}".format(det_options)) + det_options = [""] + + dataset_path = args.dataset_path + out_filename = args.anno_name + + out_dataset = GluonCVMotionDataset(out_filename, dataset_path, load_anno=False) + metadata = { + FieldNames.DESCRIPTION: description, + FieldNames.DATE_MODIFIED: str(datetime.datetime.now()), + } + out_dataset.metadata = metadata + + splits = { + "train": os.path.join(out_dataset.data_root_path, "train"), + "test": os.path.join(out_dataset.data_root_path, "test"), # No gt for MOT test + } + + for det_option in det_options: + for split_name, split_path in splits.items(): + subdirs = glob.glob(os.path.join(split_path, "*" + det_option)) + for i, subdir in enumerate(subdirs): + vid_id = os.path.basename(subdir) + vid_path = os.path.join(split_path, subdir) + + sample = DataSample(vid_id) + + if mot17: + info_path = os.path.join(vid_path, "seqinfo.ini") + config = configparser.ConfigParser() + config.read(info_path) + seq_conf = config["Sequence"] + fps = float(seq_conf['frameRate']) + num_frames = int(seq_conf['seqLength']) + width = int(seq_conf['imWidth']) + height = int(seq_conf['imHeight']) + else: + # Assume 30 fps + fps = 30 + im_paths = glob.glob(os.path.join(vid_path, "img1", "*.jpg")) + num_frames = len(im_paths) + im_example = Image.open(im_paths[0]) + width = im_example.width + height = im_example.height + + rel_base_dir = vid_path.replace(out_dataset.data_root_path, "").lstrip(os.path.sep) + rel_base_dir = os.path.join(rel_base_dir, "img1") + metadata = { + FieldNames.DATA_PATH: rel_base_dir, + FieldNames.FPS: fps, + FieldNames.NUM_FRAMES: 
num_frames, + FieldNames.RESOLUTION: {"width": width, "height": height}, + } + sample.metadata = metadata + + gt_path = os.path.join(vid_path, "gt/gt.txt") + det_path = os.path.join(vid_path, "det/det.txt") + has_gt = os.path.exists(gt_path) + anno_path = gt_path if has_gt else det_path + + sample = sample_from_mot_csv(anno_path, fps, sample, mot17, has_gt) + + out_dataset.add_sample(sample) + + print("Done {} sample {}/{}, {}".format(split_name, i+1, len(subdirs), vid_id)) + + out_dataset.dump() + + return out_dataset + + +def write_data_split(args, dataset): + if dataset is None: + dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path) + + def split_func(sample): + data_path = sample.data_relative_path + if data_path.startswith("train"): + return SplitNames.TRAIN + elif data_path.startswith("test"): + return SplitNames.TEST + + raise Exception("Shouldn't happen") + + process_dataset_splits(dataset, split_func, save=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Ingest mot dataset') + parser.add_argument('--dataset_path', default="", + help="The path of dataset folder") + parser.add_argument('--anno_name', default="anno.json", + help="The file name (with json) of ingested annotation file") + args = parser.parse_args() + + mot17 = "MOT17" in args.dataset_path + dataset = main(args, mot17=mot17) + write_data_split(args, dataset) diff --git a/siam-mot/siammot/data/ingestion/ingest_prim_air.py b/siam-mot/siammot/data/ingestion/ingest_prim_air.py new file mode 100644 index 0000000000000000000000000000000000000000..b973d1d5f87a419c035c0b12c88c7dbbbe608682 --- /dev/null +++ b/siam-mot/siammot/data/ingestion/ingest_prim_air.py @@ -0,0 +1,127 @@ +import argparse +import copy +import datetime +import fire +import string +import tqdm +import os +from pathlib import Path + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, FieldNames, SplitNames +from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits +from gluoncv.torch.data.gluoncv_motion_dataset.utils.serialization_utils import save_json + + +def ingest_dataset(args, renumber_ids=True): + """ + + :param args: Input arguments + :param renumber_ids: rename track identities to integers + """ + dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path, load_anno=False) + dataset.metadata = { + FieldNames.DESCRIPTION: "Initial ingestion", + FieldNames.DATE_MODIFIED: str(datetime.datetime.now()), + } + #raw_anno_paths = sorted(Path(dataset.data_root_path).glob("groundtruth.json")) + raw_anno_paths = sorted(Path('/home/ubuntu/airborne-detection-starter-kit/data/').glob("groundtruth.json")) + + for raw_anno_path in tqdm.tqdm(raw_anno_paths): + # Setting the dataset and samples to None here looks pointless but it allows the memory to be freed, otherwise + # on subsequent iterations it can actually run out of memory as it loads a new dataset while keeping the + # previous one still in memory (happened on c5.xlarge 8GB RAM) + raw_dataset = None + samples = None + # raw_sample and sample have references back to the dataset so have to unset these too + raw_sample = sample = None + raw_dataset = GluonCVMotionDataset(raw_anno_path) + raw_dataset.__version__ = 1 + set_dir = raw_anno_path.parent.parent + images_root_path = Path(dataset.data_root_path) # set_dir / "Images" + + samples = sorted(raw_dataset.samples) + with open ('/home/ubuntu/siam-mot/data/all_flights_val.txt', 'r') as f: + all_flights = f.readlines() + all_flights = 
[flight.rstrip() for flight in all_flights] + + for raw_id, raw_sample in tqdm.tqdm(samples): + if raw_id not in all_flights[200:]: + continue + data_path = images_root_path /raw_id + data_rel_path = str(data_path.relative_to(dataset.data_root_path)) + new_id = data_rel_path + first_img = sorted(data_path.glob("*.png"))[0] + first_timestamp = int(first_img.name.split(raw_id)[0]) + sample = raw_sample.get_copy_without_entities(new_id=new_id) + sample.metadata["orig_path"] = raw_sample.data_relative_path + sample.data_relative_path = data_rel_path + unique_ids = {} + + first_frame = None + for raw_entity in raw_sample.entities: + entity = copy.deepcopy(raw_entity) + orig_frame = entity.blob.pop("frame") + orig_time = entity.time + if first_frame is None: + assert raw_entity.time == first_timestamp + first_frame = orig_frame + rel_frame = orig_frame - first_frame + # rel_ts = raw_entity.time - first_timestamp + # assert rel_ts >= 0 + # rel_ts_msec = rel_ts / 1e6 + # ts_msec_round = int(round(rel_ts_msec / sample.period) * sample.period) + # print(f"frame: {raw_entity.blob.get('frame')} ts_msec: {rel_ts_msec} ts_round {ts_msec_round}") + # print() + # assert abs(rel_ts_msec - ts_msec_round) < sample.period / 10 + # entity.time = ts_msec_round + + entity.time = round(rel_frame / sample.fps * 1000) + if entity.id: + obj_type = entity.id.rstrip(string.digits).lower() + entity.labels[obj_type] = 1 + if entity.id.lower() in ("airplane1", "helicopter1"): + entity.labels["intruder"] = 1 + entity.blob["orig_id"] = entity.id + if renumber_ids: + entity.id = unique_ids.setdefault(entity.id, len(unique_ids)) + entity.blob[FieldNames.FRAME_IDX] = rel_frame + entity.blob["orig_frame"] = orig_frame + entity.blob["orig_time"] = orig_time + if entity.labels and "miss_distance_class" in entity.labels: + entity.blob["miss_distance_class"] = entity.labels.pop("miss_distance_class") + if "range_distance_m" in entity.blob: + entity.blob["range_distance_m"] = round(entity.blob["range_distance_m"], 1) + sample.add_entity(entity) + + # break + dataset.add_sample(sample, dump_directly=True) + + dataset.dump() + + return dataset + + +def write_split(dataset): + def split_func(sample): + # data_path = sample.data_relative_path + orig_path = sample.metadata['orig_path'] + if orig_path.startswith("train"): + return SplitNames.TRAIN + elif orig_path.startswith("val"): + return SplitNames.VAL + + raise Exception("Shouldn't happen") + + process_dataset_splits(dataset, split_func, save=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Ingest Prime Air dataset') + parser.add_argument('--dataset_path', default="/home/ubuntu/airborne-detection-starter-kit/data/val/") + #description="The path of dataset folder") + parser.add_argument('--anno_name', default="anno.json") + #description="The file name (with json) of ingested annotation file") + args = parser.parse_args() + + dataset = ingest_dataset(args, renumber_ids=True) + write_split(dataset) diff --git a/siam-mot/siammot/data/video_dataset.py b/siam-mot/siammot/data/video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a9c3f6acca456d0207f14ec2ca46232a3649bba6 --- /dev/null +++ b/siam-mot/siammot/data/video_dataset.py @@ -0,0 +1,195 @@ +import random +import torch +import itertools +import torch.utils.data as data +from tqdm import tqdm +from collections import defaultdict +from PIL.Image import Image + +from maskrcnn_benchmark.structures.image_list import to_image_list +from 
maskrcnn_benchmark.structures.bounding_box import BoxList + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, AnnoEntity + + +class VideoDataset(data.Dataset): + + def __init__(self, dataset: GluonCVMotionDataset, sampling_interval=250, clip_len=1000, + is_train=True, frames_in_clip=2, transforms=None, filter_fn=None, + amodal=False): + """ + :param dataset: the ingested dataset with GluonCVMotionDataset + :param sampling_interval: the temporal stride (in ms) of sliding window + :param clip_len: the temporal length (in ms) of video clips + :param is_train: a boolean flag indicating whether it is training + :param frames_in_clip: the number of frames sampled in a video clip (for a training example) + :param transforms: frame-level transformation before they are fed into neural networks + :param filter_fn: a callable function to filter entities + :param amodal: whether to clip the bounding box beyond image boundary + """ + + if dataset is None: + raise Exception('dataset should not be None. Call GluonCVMotionDataset to construct dataset first.') + + assert is_train is True, "The dataset class only supports training" + assert (2 >= frames_in_clip > 0), "frames_in_clip has to be 1 or 2" + + self.data = dict(dataset.train_samples) + + self.clip_len = clip_len + self.transforms = transforms + self.filter_fn = filter_fn + self.frames_in_clip = min(clip_len, frames_in_clip) + + # Process dataset to get all valid video clips + self.clips = self.get_video_clips(sampling_interval_ms=sampling_interval) + self.amodal = amodal + + def __getitem__(self, item_id): + + video = [] + target = [] + + (sample_id, clip_frame_ids) = self.clips[item_id] + video_info = self.data[sample_id] + video_reader = video_info.get_data_reader() + + # Randomly sampling self.frames_in_clip frames + # And keep their relative temporal order + rand_idxs = sorted(random.sample(clip_frame_ids, self.frames_in_clip)) + for frame_idx in rand_idxs: + im = video_reader[frame_idx][0] + entities = video_info.get_entities_for_frame_num(frame_idx) + if self.filter_fn is not None: + entities, _ = self.filter_fn(entities, meta_data=video_info.metadata) + boxes = self.entity2target(im, entities) + + video.append(im) + target.append(boxes) + + # Video clip-level augmentation + if self.transforms is not None: + video, target = self.transforms(video, target) + + return video, target, sample_id + + def __len__(self): + return len(self.clips) + + def get_video_clips(self, sampling_interval_ms=250): + """ + Process the long videos to a small video chunk (with self.clip_len seconds) + Video clips are generated in a temporal sliding window fashion + """ + video_clips = [] + for (sample_id, sample) in tqdm(self.data.items()): + frame_idxs_with_anno = sample.get_non_empty_frames(self.filter_fn) + if len(frame_idxs_with_anno) == 0: + continue + # The video clip may not be temporally continuous + start_frame = min(frame_idxs_with_anno) + end_frame = max(frame_idxs_with_anno) + # make sure that the video clip has at least two frames + clip_len_in_frames = max(self.frames_in_clip, int(self.clip_len / 1000. * sample.fps)) + sampling_interval = int(sampling_interval_ms / 1000. 
* sample.fps) + for idx in range(start_frame, end_frame, sampling_interval): + clip_frame_ids = [] + # only include frames with annotation within the video clip + for frame_idx in range(idx, idx + clip_len_in_frames): + if frame_idx in frame_idxs_with_anno: + clip_frame_ids.append(frame_idx) + # Only include video clips that have at least self.frames_in_clip annotating frames + if len(clip_frame_ids) >= self.frames_in_clip: + video_clips.append((sample_id, clip_frame_ids)) + + return video_clips + + def entity2target(self, im: Image, entities: [AnnoEntity]): + """ + Wrap up the entity to maskrcnn-benchmark compatible format - BoxList + """ + boxes = [entity.bbox for entity in entities] + ids = [int(entity.id) for entity in entities] + # we only consider person tracking for now, + # thus all the labels are 1, + # reserve category 0 for background during training + int_labels = [1 for _ in entities] + + boxes = torch.as_tensor(boxes).reshape(-1, 4) + boxes = BoxList(boxes, im.size, mode='xywh').convert('xyxy') + if not self.amodal: + boxes = boxes.clip_to_image(remove_empty=False) + boxes.add_field('labels', torch.as_tensor(int_labels, dtype=torch.int64)) + boxes.add_field('ids', torch.as_tensor(ids, dtype=torch.int64)) + + return boxes + + +class VideoDatasetBatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. + This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + image_batch = list(itertools.chain(*transposed_batch[0])) + image_batch = to_image_list(image_batch, self.size_divisible) + + # to make sure that the id of each instance + # are unique across the whole batch + targets = transposed_batch[1] + video_ids = transposed_batch[2] + uid = 0 + video_id_map = defaultdict(dict) + for targets_per_video, video_id in zip(targets, video_ids): + for targets_per_video_frame in targets_per_video: + if targets_per_video_frame.has_field('ids'): + _ids = targets_per_video_frame.get_field('ids') + _uids = _ids.clone() + for i in range(len(_ids)): + _id = _ids[i].item() + if _id not in video_id_map[video_id]: + video_id_map[video_id][_id] = uid + uid += 1 + _uids[i] = video_id_map[video_id][_id] + targets_per_video_frame.extra_fields['ids'] = _uids + + targets = list(itertools.chain(*targets)) + + return image_batch, targets, video_ids + + +if __name__ == "__main__": + + from siammot.data.adapters.utils.data_utils import load_dataset_anno + + torch.manual_seed(0) + + dataset_anno, dataset_info = load_dataset_anno('MOT17') + collator = VideoDatasetBatchCollator() + + dataset = VideoDataset(dataset_anno, + frames_in_clip=2, + amodal=True) + + batch_size = 16 + sampler = torch.utils.data.sampler.RandomSampler(dataset) + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, batch_size, drop_last=False) + dataloader = data.DataLoader(dataset, + num_workers=4, + batch_sampler=batch_sampler, + collate_fn=collator + ) + import time + tic = time.time() + for iteration, (image, target, image_ids) in enumerate(dataloader): + data_time = time.time() - tic + print("Data loading time: {}".format(data_time)) + tic = time.time() + print(image_ids)
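
The two __main__ blocks above (in image_dataset.py and video_dataset.py) are the only places where this data pipeline is exercised end to end. Below is a minimal, hypothetical sketch of how the pieces added in this diff compose for video training: it assumes MOT17 has already been ingested with ingest_mot.py and that the default cfg from siammot.configs.defaults resolves; note that load_dataset_anno takes cfg as its first argument (the image_dataset.py __main__ above omits it). The batch size and worker count are illustrative, not the training defaults.

import torch
import torch.utils.data as data

from siammot.configs.defaults import cfg
from siammot.data.video_dataset import VideoDataset, VideoDatasetBatchCollator
from siammot.data.adapters.utils.data_utils import load_dataset_anno
from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation
from siammot.data.adapters.handler.data_filtering import build_data_filter_fn

torch.manual_seed(0)

# Load the ingested MOT17 annotations; cfg comes first, per data_utils.py.
dataset_anno, dataset_info = load_dataset_anno(cfg, 'MOT17')

# Clip-level augmentation and the MOT-specific entity filter.
transforms = build_siam_augmentation(cfg, is_train=True, modality=dataset_info['modality'])
filter_fn = build_data_filter_fn('MOT17', is_train=True)

dataset = VideoDataset(dataset_anno,
                       sampling_interval=cfg.VIDEO.TEMPORAL_SAMPLING,
                       clip_len=cfg.VIDEO.TEMPORAL_WINDOW,
                       transforms=transforms,
                       filter_fn=filter_fn,
                       frames_in_clip=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP,
                       amodal=cfg.INPUT.AMODAL)

# The collator batches frames into a padded ImageList and re-maps instance ids
# so that they are unique across the whole batch.
collator = VideoDatasetBatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
loader = data.DataLoader(dataset, batch_size=2, shuffle=True,
                         num_workers=0, collate_fn=collator)

images, targets, video_ids = next(iter(loader))
print(images.tensors.shape, len(targets), video_ids)

For actual training, build_train_data_loader wires the same components through make_data_sampler / make_batch_data_sampler and ConcatDataset instead of the plain RandomSampler used here.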