diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb1cf3b9ca47c6a77d5d63802e13b9cbd6604614 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/build_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c18d2abe5bea310ca77bdeb1bc67d7fc0d3a1a7 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/image_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc b/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35bc4d792ce302c33d116e8363ab049962ede5b3 Binary files /dev/null and b/siam-mot/siammot/data/adapters/augmentation/__pycache__/video_augmentation.cpython-37.pyc differ diff --git a/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..147c04c78d54f8af0dcf2426cf8308383f904b0d --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/build_augmentation.py @@ -0,0 +1,85 @@ +from .video_augmentation import SiamVideoResize, \ + SiamVideoColorJitter, SiamVideoCompressionAugment, SiamVideoMotionAugment, \ + SiamVideoMotionBlurAugment, SiamVideoRandomHorizontalFlip, VideoTransformer +from .image_augmentation import ToTensor, ToBGR255 + +import maskrcnn_benchmark.data.transforms as T + + +def build_siam_augmentation(cfg, is_train=True, modality='video'): + + motion_limit = 0.0 + motion_blur_prob = 0.0 + compression_limit = 0.0 + if is_train: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + max_size = cfg.INPUT.MAX_SIZE_TRAIN + flip_horizontal_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + brightness = cfg.INPUT.BRIGHTNESS + contrast = cfg.INPUT.CONTRAST + saturation = cfg.INPUT.SATURATION + hue = cfg.INPUT.HUE + + if modality == 'image': + motion_limit = cfg.INPUT.MOTION_LIMIT + motion_blur_prob = cfg.INPUT.MOTION_BLUR_PROB + compression_limit = cfg.INPUT.COMPRESSION_LIMIT + + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + flip_horizontal_prob = 0.0 + brightness = 0.0 + contrast = 0.0 + saturation = 0.0 + hue = 0.0 + + amodal = cfg.INPUT.AMODAL + SIZE_DIVISIBILITY = cfg.DATALOADER.SIZE_DIVISIBILITY + to_bgr255 = cfg.INPUT.TO_BGR255 + + video_color_jitter = SiamVideoColorJitter( + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue, + ) + + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 + ) + + transform = Compose( + [ + video_color_jitter, + SiamVideoMotionBlurAugment(motion_blur_prob), + SiamVideoCompressionAugment(compression_limit), + SiamVideoMotionAugment(motion_limit, amodal), + SiamVideoResize(min_size, max_size, SIZE_DIVISIBILITY), + SiamVideoRandomHorizontalFlip(prob=flip_horizontal_prob), + # PIL image + VideoTransformer(ToTensor()), + # Torch tensor, CHW (RGB format), and range from [0, 1] + # 
VideoTransformer(ToBGR255(to_bgr255=to_bgr255)) + VideoTransformer(normalize_transform), + ] + ) + return transform + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target=None): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string \ No newline at end of file diff --git a/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..adbc582025bb64d628378ba246c70485d8e9f8e9 --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/image_augmentation.py @@ -0,0 +1,187 @@ +import torch +import random +import numpy as np +from PIL import Image +from torchvision.transforms import functional as F + +import imgaug.augmenters as iaa + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ImageResize(object): + def __init__(self, min_size, max_size, size_divisibility): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + self.size_divisibility = size_divisibility + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + if self.size_divisibility > 0: + oh = (int(oh / self.size_divisibility) * self.size_divisibility) + ow = (int(ow / self.size_divisibility) * self.size_divisibility) + + return (oh, ow) + + def __call__(self, image, target=None): + size = self.get_size(image.size) + image = F.resize(image, size) + if target is None: + return image, target + target = target.resize(image.size) + return image, target + + +class ImageCropResize(object): + """ + Crop a patch from the image and resize to its original size + """ + def __init__(self, crop_limit=None, amodal=False): + self.crop_limit = crop_limit + self.amodal = amodal + + def remove_invisible_box(self, box: BoxList): + """ + Remove boxes that are not visible (out of image boundary) after motion augmentation + """ + bbox = box.bbox.clone() + xmin_clip = bbox[:, 0].clamp(min=0, max=box.size[0] - 1) + ymin_clip = bbox[:, 1].clamp(min=0, max=box.size[1] - 1) + xmax_clip = bbox[:, 2].clamp(min=0, max=box.size[0] - 1) + ymax_clip = bbox[:, 3].clamp(min=0, max=box.size[1] - 1) + keep = (xmax_clip > xmin_clip) & (ymax_clip > ymin_clip) + + return box[keep] + + def boxlist_crop(self, box: BoxList, x1, y1, x2, y2): + """ + Adjust the coordinate of the bounding box within + image crop specified by (x1, y1, x2, y2) + """ + + w, h = (x2 - x1), (y2 - y1) + xmin, ymin, xmax, ymax = box._split_into_xyxy() + cropped_xmin = (xmin - x1) + cropped_ymin = (ymin - y1) + cropped_xmax = (xmax - x1) + cropped_ymax = (ymax - y1) + cropped_bbox = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + cropped_box = BoxList(cropped_bbox, (w, 
h), mode="xyxy") + for k, v in box.extra_fields.items(): + cropped_box.add_field(k, v) + + if self.amodal: + # amodal allows the corners of bbox go beyond image boundary + cropped_box = self.remove_invisible_box(cropped_box) + else: + # the corners of bbox need to be within image boundary for non-amodal training + cropped_box = cropped_box.clip_to_image(remove_empty=True) + return cropped_box.convert(box.mode) + + def __call__(self, image, target): + w, h = image.size + + tl_x = int(w * (random.random() * self.crop_limit)) + tl_y = int(h * (random.random() * self.crop_limit)) + br_x = int(w - w * (random.random() * self.crop_limit)) + # keep aspect ratio + br_y = int((h / w) * (br_x - tl_x) + tl_y) + + if len(target) > 0: + box = target.bbox + box_w = box[:, 2] - box[:, 0] + box_h = box[:, 3] - box[:, 1] + box_area = box_h * box_w + max_area_idx = torch.argmax(box_area, dim=0) + max_motion_limit_w = int(box_w[max_area_idx] * 0.25) + max_motion_limit_h = int(box_h[max_area_idx] * 0.25) + + # make sure at least one bounding box is preserved + # after motion augmentation + tl_x = min(tl_x, max_motion_limit_w) + tl_y = min(tl_y, max_motion_limit_h) + br_x = max(br_x, w-max_motion_limit_w) + br_y = max(br_y, h-max_motion_limit_h) + + assert (tl_x < br_x) and (tl_y < br_y) + + crop = F.crop(image, tl_y, tl_x, (br_y-tl_y), (br_x-tl_x)) + crop = F.resize(crop, (h, w)) + if len(target) > 0: + target = self.boxlist_crop(target, tl_x, tl_y, br_x, br_y) + target = target.resize(image.size) + + return crop, target + + +class ImageMotionBlur(object): + """ + Perform motion augmentation to an image + """ + def __init__(self): + motion_blur = iaa.MotionBlur(k=10, angle=[-30, 30]) + gaussian_blur = iaa.GaussianBlur(sigma=(0.0, 2.0)) + + self.blur_func_pool = [motion_blur, gaussian_blur] + + pass + + def __call__(self, image): + blur_id = random.choice(list(range(0, len(self.blur_func_pool)))) + blur_func = self.blur_func_pool[blur_id] + np_image = np.asarray(image) + blurred_image = blur_func.augment_image(np_image) + pil_image = Image.fromarray(np.uint8(blurred_image)) + return pil_image + + +class ImageCompression(object): + """ + Perform JPEG compression augmentation to an image + """ + def __init__(self, max_compression): + self.max_compression = max_compression + + def __call__(self, image): + ratio = random.uniform(0, 1) + compression = min(100, int(ratio * self.max_compression)) + np_image = np.asarray(image) + compressed_image = iaa.arithmetic.compress_jpeg(np_image, compression) + pil_image = Image.fromarray(np.uint8(compressed_image)) + return pil_image + + +class ToTensor(object): + def __call__(self, image, target=None): + return F.to_tensor(image), target + + +class ToBGR255(object): + def __init__(self, to_bgr255=True): + self.to_bgr255 = to_bgr255 + + def __call__(self, image, target=None): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + return image, target + diff --git a/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..0f267bf0db912b3e240d82dbc0e1c0cfe9790b37 --- /dev/null +++ b/siam-mot/siammot/data/adapters/augmentation/video_augmentation.py @@ -0,0 +1,187 @@ +import torch +import random +from torchvision.transforms import functional as F +from torchvision.transforms import ColorJitter as ImageColorJitter + +from .image_augmentation import ImageResize, ImageCropResize, \ + ImageMotionBlur, ImageCompression + + +class 
VideoTransformer(object): + def __init__(self, transform_fn=None): + if transform_fn is None: + raise KeyError('Transform function should not be None.') + self.transform_fn = transform_fn + + def __call__(self, video, target=None): + """ + A data transformation wrapper for video + :param video: a list of images + :param target: a list of BoxList (per image) + """ + if not isinstance(video, (list, tuple)): + return self.transform_fn(video, target) + + new_video = [] + new_target = [] + for (image, image_target) in zip(video, target): + (image, image_target) = self.transform_fn(image, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoResize(ImageResize): + def __init__(self, min_size, max_size, size_divisibility): + super(SiamVideoResize, self).__init__(min_size, max_size, size_divisibility) + + def __call__(self, video, target=None): + + if not isinstance(video, (list, tuple)): + return super(SiamVideoResize, self).__call__(video, target) + + assert len(video) >= 1 + new_size = self.get_size(video[0].size) + + new_video = [] + new_target = [] + for (image, image_target) in zip(video, target): + (image, image_target) = self._resize(image, new_size, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + def _resize(self, image, size, target=None): + image = F.resize(image, size) + target = target.resize(image.size) + return image, target + + +class SiamVideoRandomHorizontalFlip(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, video, target=None): + + if not isinstance(video, (list, tuple)): + return video, target + + new_video = [] + new_target = [] + # All frames should have the same flipping operation + if random.random() < self.prob: + for (image, image_target) in zip(video, target): + new_video.append(F.hflip(image)) + new_target.append(image_target.transpose(0)) + else: + new_video = video + new_target = target + return new_video, new_target + + +class SiamVideoColorJitter(ImageColorJitter): + def __init__(self, + brightness=None, + contrast=None, + saturation=None, + hue=None): + super(SiamVideoColorJitter, self).__init__(brightness, contrast, saturation, hue) + + def __call__(self, video, target=None): + # Color jitter only applies for Siamese Training + if not isinstance(video, (list, tuple)): + return video, target + + idx = random.choice((0, 1)) + # all frames in the video should go through the same transformation + transform = self.get_params(self.brightness, self.contrast, + self.saturation, self.hue) + new_video = [] + new_target = [] + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + image = transform(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoMotionAugment(object): + def __init__(self, motion_limit=None, amodal=False): + # maximum motion augmentation + self.motion_limit = min(0.1, motion_limit) + if motion_limit is None: + self.motion_limit = 0 + self.motion_augment = ImageCropResize(self.motion_limit, amodal) + + def __call__(self, video, target=None): + + # Motion augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.motion_limit == 0: + return video, target + + new_video = [] + new_target = [] + # Only 1 frame go through the motion augmentation, + # the other unchanged + idx = random.choice((0, 1)) + for i, (image, image_target) in enumerate(zip(video, target)): 
+ if i == idx: + (image, image_target) = self.motion_augment(image, image_target) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoMotionBlurAugment(object): + def __init__(self, motion_blur_prob=None): + self.motion_blur_prob = motion_blur_prob + if motion_blur_prob is None: + self.motion_blur_prob = 0.0 + self.motion_blur_func = ImageMotionBlur() + + def __call__(self, video, target): + # Blur augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.motion_blur_prob == 0.0: + return video, target + + new_video = [] + new_target = [] + idx = random.choice((0, 1)) + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + random_prob = random.uniform(0, 1) + if random_prob < self.motion_blur_prob: + image = self.motion_blur_func(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target + + +class SiamVideoCompressionAugment(object): + def __init__(self, max_compression=None): + self.max_compression = max_compression + if max_compression is None: + self.max_compression = 0.0 + self.compression_func = ImageCompression(self.max_compression) + + def __call__(self, video, target): + # Compression augmentation only applies for Siamese Training + if not isinstance(video, (list, tuple)) or self.max_compression == 0.0: + return video, target + + idx = random.choice((0, 1)) + new_video = [] + new_target = [] + for i, (image, image_target) in enumerate(zip(video, target)): + if i == idx: + image = self.compression_func(image) + new_video.append(image) + new_target.append(image_target) + + return new_video, new_target \ No newline at end of file diff --git a/siam-mot/siammot/data/adapters/handler/data_filtering.py b/siam-mot/siammot/data/adapters/handler/data_filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..9c51b8db6bdd0a41010bfe9454fcc06bacc5347e --- /dev/null +++ b/siam-mot/siammot/data/adapters/handler/data_filtering.py @@ -0,0 +1,140 @@ +import numpy as np + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import AnnoEntity + +from siammot.utils.entity_utils import bbs_iou + + +def build_data_filter_fn(dataset_key: str, *args, **kwargs): + """ + Get dataset specific filter function list, if there is any + """ + filter_fn = None + if dataset_key == 'CRP': + filter_fn = CRPFilter(*args, **kwargs) + elif dataset_key.startswith('MOT'): + filter_fn = MOTFilter(*args, **kwargs) + elif dataset_key == 'AOT': + filter_fn = AOTFilter(*args, **kwargs) + return filter_fn + + +class BaseFilter: + def __init__(self): + pass + + # the default filter does not filter any entity, which is technically doing nothing + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + raise False + + def filter(self, entity:AnnoEntity, ignored_gt_entities=None): + return self._filter(entity, ignored_gt_entities) + + def __call__(self, entities: [AnnoEntity], ignored_entities=None, meta_data=None): + """ + Check each entity whether it is valid or should be filtered (ignored). 
+ :param entities: A list of entities (for a single frame) to be evaluated + :param ignored_entities: A list of ignored entities or a binary mask indicating ignored regions + :param meta_data: The meta data for the frame (or video) + :return: A list of valid entities and a list of filtered (ignored) entities + """ + valid_entities = [] + filtered_entities = [] + + for entity in entities: + if self._filter(entity, ignored_entities): + filtered_entities.append(entity) + else: + valid_entities.append(entity) + + return valid_entities, filtered_entities + + +class CRPFilter(BaseFilter): + """ + A class for filtering JTA dataset entities during evaluation + A gt entity will be filtered (ignored) if its id is -1 (negative) + A predicted entity will be filtered (ignored) if it is matched to a ignored ground truth entity + """ + def __init__(self, iou_thresh=0.2, is_train=False): + """ + :param iou_thresh: a predicted entity which overlaps with any ignored gt entity with at least + iou_thresh would be filtered + """ + self.iou_thresh = iou_thresh + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + if entity.id < 0: + return True + else: + for entity_ in ignored_gt_entities: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + + +class MOTFilter(BaseFilter): + """ + A class for filtering MOT dataset entities + A gt entity will be filtered (ignored) if its visibility ratio is very low + A predicted entity will be filtered (ignored) if it is matched to a ignored ground truth entity + """ + def __init__(self, visibility_thresh=0.1, iou_thresh=0.5, is_train=False): + self.visibility_thresh = visibility_thresh + self.iou_thresh = iou_thresh + self.is_train = is_train + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + if self.is_train: + # any entity whose visibility is below the pre-defined + # threshold should be filtered out + # meanwhile, any entity whose class does not have label + # needs to be filtered + if entity.blob['visibility'] < self.visibility_thresh or \ + not any(k in ('person', '2', '7') for k in entity.labels): + return True + else: + if 'person' not in entity.labels or int(entity.id) < 0: + return True + else: + for entity_ in ignored_gt_entities: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + + +class AOTFilter(BaseFilter): + """ + A class for filtering AOT entities + A gt entity will be filtered if it falls into one the following criterion + 1. tracking id is not Helicopter1 or Airplane1 + 2. 
range distance is larger than 1200 + """ + + def __init__(self, range_distance_thresh=1200, iou_thresh=0.2, is_train=False): + self.range_distance_thresh = range_distance_thresh + self.iou_thresh = iou_thresh + self.is_train = is_train + + def _filter(self, entity: AnnoEntity, ignored_gt_entities=None): + if ignored_gt_entities is None: + range_distance_m = np.inf + if 'range_distance_m' in entity.blob: + range_distance_m = entity.blob['range_distance_m'] + + labels = [] + if entity.labels is not None: + labels = entity.labels + + if ('intruder' not in labels) or \ + (range_distance_m >= self.range_distance_thresh): + return True + else: + for entity_ in ignored_gt_entities: + if entity_.bbox is not None: + if bbs_iou(entity, entity_) >= self.iou_thresh: + return True + return False + diff --git a/siam-mot/siammot/data/adapters/utils/data_utils.py b/siam-mot/siammot/data/adapters/utils/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d2ce35763b08667824918fb8001b7f43e58bb98 --- /dev/null +++ b/siam-mot/siammot/data/adapters/utils/data_utils.py @@ -0,0 +1,62 @@ +import os + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset +from pycocotools.coco import COCO + +from .dataset_info import dataset_maps + + +def load_motion_anno(dataset_folder, + anno_file, + split_file, + set=None, + ): + """ + Load GluonCVMotionDataset format annotations for downstream training / testing + """ + + dataset = GluonCVMotionDataset(anno_file, + root_path=dataset_folder, + split_file=split_file + ) + + if set == 'train': + dataset = list(dataset.train_samples) + elif set == 'val': + dataset = list(dataset.val_samples) + elif set == 'test': + dataset = list(dataset.test_samples) + + return dataset + + +def load_coco_anno(dataset_folder, + anno_file): + + dataset_anno_path = os.path.join(dataset_folder, anno_file) + dataset = COCO(dataset_anno_path) + return dataset + + +def load_dataset_anno(cfg, dataset_key, set=None): + dataset_folder, anno_file, split_file, modality = dataset_maps[dataset_key] + + dataset_info = dict() + dataset_info['modality'] = modality + + dataset_folder = os.path.join(cfg.DATASETS.ROOT_DIR, dataset_folder) + if modality == 'video': + dataset = load_motion_anno(dataset_folder, + anno_file, + split_file, + set) + elif modality == 'image': + dataset = load_coco_anno(dataset_folder, + anno_file) + image_folder = os.path.join(dataset_folder, split_file) + dataset_info['image_folder'] = image_folder + else: + raise ValueError("dataset has to be video or image.") + + return dataset, dataset_info + diff --git a/siam-mot/siammot/data/adapters/utils/dataset_info.py b/siam-mot/siammot/data/adapters/utils/dataset_info.py new file mode 100644 index 0000000000000000000000000000000000000000..36527d83f8ffad656aa510e0d83cf83fad7a408b --- /dev/null +++ b/siam-mot/siammot/data/adapters/utils/dataset_info.py @@ -0,0 +1,49 @@ +dataset_maps = dict() +""" +each item in the dataset maps are a list of the following info +( +dataset_folder, +annotation file name (video dataset) / path of annotation file (image dataset), +split file name (video dataset) / path of image folder (image dataset) , +modality +) +""" +dataset_maps['TAO'] = ['TAO', + 'anno_person.json', + 'splits_person.json', + 'video'] + +dataset_maps['CRP'] = ['caltech_roadside_pedestrians', + 'anno.json', + 'splits.json', + 'video'] + +dataset_maps['MOT17_DPM'] = ['MOT17', + 'anno.json', + 'splits_DPM.json', + 'video'] + +dataset_maps['MOT17'] = ['MOT17', + 'anno.json', + 'splits.json', 
+ 'video'] + +dataset_maps['AOT'] = ['airbone_object_tracking', + 'anno.json', + 'splits.json', + 'video'] + +dataset_maps['COCO17_train'] = ['mscoco', + 'annotations/MSCOCO2017_train_person.json', + 'images/train2017', # all raw images would be in dataset_root/mscoco/images/train2017 + 'image'] + +dataset_maps['crowdhuman_train_fbox'] = ['CrowdHuman', + 'annotations/annotation_train_fbox.json', + 'Images', # all raw images would be in dataset_root/CrowdHuman/Images + 'image'] + +dataset_maps['crowdhuman_train_vbox'] = ['CrowdHuman', + 'annotations/annotation_train_vbox.json', + 'Images', + 'image'] \ No newline at end of file diff --git a/siam-mot/siammot/data/build_inference_data_loader.py b/siam-mot/siammot/data/build_inference_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..970fca45cec1e52ee0d35328f08bf9626947ffdd --- /dev/null +++ b/siam-mot/siammot/data/build_inference_data_loader.py @@ -0,0 +1,56 @@ +import torch +import torch.utils.data as data + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import DataSample +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class InferenceVideoData(data.Dataset): + """ + Split the video into small chunks (in an non-overlapping fashion) for inference + """ + + def __init__(self, video: DataSample, clip_len=1, transforms=None): + """ + Construct a data loader for inference + :param video: a video stream in DataSample format + :param clip_len: the length of video clips + :param transforms: transform function for video pre-processing + """ + self.video = video + self.video_reader = video.get_data_reader() + self.clip_len = clip_len + self.transforms = transforms + self.clip_idxs = list(range(0, len(self.video), self.clip_len)) + + def __getitem__(self, id): + video_clip = [] + # this is needed for transformation + dummy_boxes = [] + timestamps = [] + start_idx = self.clip_idxs[id] + end_idx = min(len(self.video), start_idx + self.clip_len) + for frame_idx in range(start_idx, end_idx): + (im, timestamp, _) = self.video_reader[frame_idx] + dummy_bbox = torch.tensor([[0, 0, 1, 1]]) + dummy_boxlist = BoxList(dummy_bbox, im.size, mode='xywh') + + video_clip.append(im) + timestamps.append(torch.tensor(timestamp)) + dummy_boxes.append(dummy_boxlist) + + if self.transforms is not None: + video_clip, _ = self.transforms(video_clip, dummy_boxes) + + return torch.stack(video_clip), start_idx, torch.stack(timestamps) + + def __len__(self): + return len(self.clip_idxs) + + +def build_video_loader(cfg, video: DataSample, transforms): + clip_len = cfg.INFERENCE.CLIP_LEN + videodata = InferenceVideoData(video, clip_len=clip_len, transforms=transforms) + videoloader = data.DataLoader(videodata, num_workers=4, batch_size=1, shuffle=False) + + return videoloader diff --git a/siam-mot/siammot/data/build_train_data_loader.py b/siam-mot/siammot/data/build_train_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..20174132ae8ced050df863e3827a60ff35d9f6f6 --- /dev/null +++ b/siam-mot/siammot/data/build_train_data_loader.py @@ -0,0 +1,77 @@ +import torch.utils.data + +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.data.build import make_data_sampler, make_batch_data_sampler +from maskrcnn_benchmark.data.datasets.concat_dataset import ConcatDataset + +from .video_dataset import VideoDataset, VideoDatasetBatchCollator +from .image_dataset import ImageDataset +from .adapters.utils.data_utils import load_dataset_anno +from 
.adapters.augmentation.build_augmentation import build_siam_augmentation +from .adapters.handler.data_filtering import build_data_filter_fn + + +def build_dataset(cfg): + """ + + """ + + dataset_list = cfg.DATASETS.TRAIN + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list) + ) + + datasets = [] + for dataset_key in dataset_list: + dataset_anno, dataset_info = load_dataset_anno(cfg, dataset_key) + modality = dataset_info['modality'] + transforms = build_siam_augmentation(cfg, is_train=True, modality=modality) + data_filter_fn = build_data_filter_fn(dataset_key, is_train=True) + + if modality == 'image': + assert 'image_folder' in dataset_info + _dataset = ImageDataset(dataset_anno, + dataset_info['image_folder'], + transforms=transforms, + frames_per_image=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP, + amodal=cfg.INPUT.AMODAL) + else: + _dataset = VideoDataset(dataset_anno, + sampling_interval=cfg.VIDEO.TEMPORAL_SAMPLING, + clip_len=cfg.VIDEO.TEMPORAL_WINDOW, + transforms=transforms, + filter_fn=data_filter_fn, + frames_in_clip=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP, + amodal=cfg.INPUT.AMODAL) + datasets.append(_dataset) + + dataset = ConcatDataset(datasets) + + return dataset + + +def build_train_data_loader(cfg, is_distributed=False, start_iter=0): + + num_gpus = get_world_size() + + video_clips_per_batch = cfg.SOLVER.VIDEO_CLIPS_PER_BATCH + assert ( + video_clips_per_batch % num_gpus == 0 + ), "SOLVER.VIDEO_CLIPS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( + video_clips_per_batch, num_gpus) + + video_clips_per_gpu = video_clips_per_batch // num_gpus + + dataset = build_dataset(cfg) + num_iters = cfg.SOLVER.MAX_ITER + sampler = make_data_sampler(dataset, True, is_distributed) + batch_sampler = make_batch_data_sampler( + dataset, sampler, [], video_clips_per_gpu, num_iters, start_iter + ) + + num_workers = cfg.DATALOADER.NUM_WORKERS + collator = VideoDatasetBatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + data_loader = torch.utils.data.DataLoader(dataset, num_workers=num_workers, + batch_sampler=batch_sampler, collate_fn=collator) + return data_loader diff --git a/siam-mot/siammot/data/image_dataset.py b/siam-mot/siammot/data/image_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..806e8e6842941517bce6a4d7025695f1c932326d --- /dev/null +++ b/siam-mot/siammot/data/image_dataset.py @@ -0,0 +1,232 @@ +import torch +import os +from tqdm import tqdm +from PIL import Image + +import torch.utils.data as data +from pycocotools.coco import COCO +from gluoncv.utils.bbox import bbox_xywh_to_xyxy, bbox_clip_xyxy + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ImageDataset(data.Dataset): + def __init__(self, + dataset: COCO, + image_dir, + transforms=None, + frames_per_image=1, + amodal=False, + skip_empty=True, + min_object_area=0, + use_crowd=False, + include_bg=False, + ): + """ + :param dataset: the ingested dataset with COCO-format + :param transforms: image transformation + :param frames_per_image: how many image copies are generated from a single image + :param amodal: whether to use amodal ground truth (no image boundary clipping) + :param include_bg: whether to include the full background images during training + """ + + self.dataset = dataset + self.image_dir = image_dir + self.transforms = transforms + self.frames_per_image = frames_per_image + + self._skip_empty = skip_empty + self._min_object_area = min_object_area + 
self._use_crowd = use_crowd + self._amodal = amodal + self._include_bg = include_bg + self._det_classes = [c['name'] for c in self.dataset.loadCats(self.dataset.getCatIds())] + + # These are tha mapping table of COCO labels + self.json_category_id_to_contiguous_id = { + v: i+1 for i, v in enumerate(self.dataset.getCatIds()) + } + + self._labels, self._im_aspect_ratios, self._items, self._ids \ + = self._dataset_preprocess() + + self.id_to_img_map = {k: v for k, v in enumerate(self._ids)} + + def __getitem__(self, index): + img_name = self._items[index] + img_path = os.path.join(self.image_dir, img_name) + + img = Image.open(img_path).convert('RGB') + target = self._get_target(img, index) + + # for tracking purposes, two frames are needed + # the pairs would go into random augmentation to generate fake motion + video_clip = [img for _ in range(self.frames_per_image)] + video_target = [target for _ in range(self.frames_per_image)] + + if self.transforms is not None: + video_clip, video_target = self.transforms(video_clip, video_target) + + return video_clip, video_target, img_name + + def _get_target(self, img, index): + + # a list of label (x1, y1, x2, y2, class_id, instance_id) + labels = self._labels[index] + if len(labels) == 0: + assert self._include_bg is True, "The image does not has ground truth" + bbox = torch.as_tensor(labels).reshape(-1, 4) + class_ids = torch.as_tensor(labels) + instance_ids = torch.as_tensor(labels) + empty_boxlist = BoxList(bbox, img.size, mode="xyxy") + empty_boxlist.add_field("labels", class_ids) + empty_boxlist.add_field("ids", instance_ids) + return empty_boxlist + + labels = torch.as_tensor(labels).reshape(-1, 6) + boxes = labels[:, :4] + target = BoxList(boxes, img.size, mode="xyxy") + + class_ids = labels[:, 4].clone().to(torch.int64) + target.add_field("labels", class_ids) + + instance_ids = labels[:, -1].clone().to(torch.int64) + target.add_field("ids", instance_ids) + + if not self._amodal: + target = target.clip_to_image(remove_empty=True) + + return target + + def _dataset_preprocess(self): + items = [] + labels = [] + ids = [] + im_aspect_ratios = [] + image_ids = sorted(self.dataset.getImgIds()) + instance_id = 0 + rm_redundant = 0 + all_amodal = 0 + + for entry in tqdm(self.dataset.loadImgs(image_ids)): + label, num_instances, num_redundant, num_amodal\ + = self._check_load_bbox(entry, instance_id) + if not label and not self._include_bg: + continue + instance_id += num_instances + rm_redundant += num_redundant + all_amodal += num_amodal + labels.append(label) + ids.append(entry['id']) + items.append(entry['file_name']) + im_aspect_ratios.append(float(entry['width']) / entry['height']) + + print('{} / {} valid images...'.format(len(labels), len(image_ids))) + print('{} instances...'.format(instance_id)) + print('{} redundant instances are removed...'.format(rm_redundant)) + print('{} amodal instances...'.format(all_amodal)) + return labels, im_aspect_ratios, items, ids + + def _check_load_bbox(self, entry, instance_id): + """ + Check and load ground-truth labels + """ + entry_id = entry['id'] + entry_id = [entry_id] if not isinstance(entry_id, (list, tuple)) else entry_id + ann_ids = self.dataset.getAnnIds(imgIds=entry_id, iscrowd=None) + objs = self.dataset.loadAnns(ann_ids) + + # check valid bboxes + valid_objs = [] + width = entry['width'] + height = entry['height'] + _instance_count = 0 + _redudant_count = 0 + _amodal_count = 0 + unique_bbs = set() + for obj in objs: + if obj.get('ignore', 0) == 1: + continue + if not self._use_crowd and 
obj.get('iscrowd', 0): + continue + if self._amodal: + xmin, ymin, xmax, ymax = bbox_xywh_to_xyxy(obj['bbox']) + if xmin < 0 or ymin < 0 or xmax > width or ymax > height: + _amodal_count += 1 + else: + xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height) + + if (xmin, ymin, xmax, ymax) in unique_bbs: + _redudant_count += 1 + continue + + box_w = (xmax - xmin) + box_h = (ymax - ymin) + area = box_w * box_h + if area <= self._min_object_area: + continue + + # require non-zero box area + if xmax > xmin and ymax > ymin: + unique_bbs.add((xmin, ymin, xmax, ymax)) + contiguous_cid = self.json_category_id_to_contiguous_id[obj['category_id']] + valid_objs.append([xmin, ymin, xmax, ymax, contiguous_cid, + instance_id+_instance_count]) + _instance_count += 1 + if not valid_objs: + if not self._skip_empty: + # dummy invalid labels if no valid objects are found + valid_objs.append([-1, -1, -1, -1, -1, -1]) + return valid_objs, _instance_count, _redudant_count, _amodal_count + + def __len__(self): + return len(self._items) + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.dataset.imgs[img_id] + return img_data + + @property + def classes(self): + return self._det_classes + + def get_im_aspect_ratio(self): + return self._im_aspect_ratios + + +if __name__ == "__main__": + + from siammot.configs.defaults import cfg + from siammot.data.video_dataset import VideoDatasetBatchCollator + from siammot.data.adapters.utils.data_utils import load_dataset_anno + from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation + + torch.manual_seed(0) + + dataset_anno, dataset_info = load_dataset_anno('COCO17_train') + collator = VideoDatasetBatchCollator() + transforms = build_siam_augmentation(cfg, modality=dataset_info['modality']) + + dataset = ImageDataset(dataset_anno, + dataset_info['image_folder'], + frames_per_image=2, + transforms=transforms, + amodal=True) + + batch_size = 16 + sampler = torch.utils.data.sampler.RandomSampler(dataset) + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, batch_size, drop_last=False) + dataloader = data.DataLoader(dataset, + num_workers=4, + batch_sampler=batch_sampler, + collate_fn=collator + ) + import time + tic = time.time() + for iteration, (image, target, image_ids) in enumerate(dataloader): + data_time = time.time() - tic + print("Data loading time: {}".format(data_time)) + tic = time.time() + print(image_ids) \ No newline at end of file diff --git a/siam-mot/siammot/data/ingestion/ingest_mot.py b/siam-mot/siammot/data/ingestion/ingest_mot.py new file mode 100644 index 0000000000000000000000000000000000000000..cd101288e563e8260ff678158cac02ed008b4d4f --- /dev/null +++ b/siam-mot/siammot/data/ingestion/ingest_mot.py @@ -0,0 +1,197 @@ +import argparse +import csv +import configparser +import datetime +import glob +import os + +from PIL import Image +from pathlib import Path + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, DataSample, AnnoEntity, FieldNames, SplitNames +from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits + +# From paper, see table 5 and 6: https://arxiv.org/pdf/1603.00831.pdf +MOT_LABEL_MAP = { + 1: "Pedestrian", + 2: "Person on vehicle", + 3: "Car", + 4: "Bicycle", + 5: "Motorbike", + 6: "Non motorized vehicle", + 7: "Static person", + 8: "Distractor", + 9: "Occluder", + 10: "Occluder on the ground", + 11: "Occluder full", + 12: "Reflection", +} + 
+DET_OPTIONS = {"SDP", "FRCNN", "DPM"} + + +def sample_from_mot_csv(csv_path, fps, sample=None, mot17=True, has_gt=False): + if sample is None: + id_ = Path(csv_path).stem + sample = DataSample(id_) + else: + sample = sample.get_copy_without_entities() + with open(csv_path, newline='') as f: + reader = csv.reader(f, delimiter=',') + + def coord(x): + return round(float(x)) + + for row in reader: + frame_num = int(row[0]) + obj_id = row[1] + x = coord(row[2]) + y = coord(row[3]) + w = coord(row[4]) + h = coord(row[5]) + conf = float(row[6]) + # If not mot17 the last 3 are 3D coords which are usually -1 + # (see pg. 9 https://arxiv.org/pdf/1504.01942.pdf) + if has_gt and mot17: + label = int(row[7]) + visibility = float(row[8]) + else: + label = 1 + visibility = 1 + + label_text = MOT_LABEL_MAP[label] + + # NOTE: Actually all classes that aren't Pedestrian have confidence 0 and so should be ingested + # but are ignored at evaluation time + # i.e. (label != 1 and conf) is never true + assert not (label != 1 and conf) + has_person_label = label_text in ("Pedestrian") + + time_ms = int((frame_num - 1) / fps * 1000) + entity = AnnoEntity(time=time_ms, id=obj_id) + entity.bbox = [x, y, w, h] + blob = { + "frame_csv": frame_num, + "frame_idx": frame_num - 1, + "visibility": visibility + } + entity.labels = {} + # entity.labels["person"] = 1 + if has_person_label: + entity.labels["person"] = 1 + else: + entity.labels[str(label)] = 1 + entity.labels["vis"] = visibility + + entity.confidence = conf + entity.blob = blob + + sample.add_entity(entity) + return sample + + +def main(args, description="Initial ingestion", det_options=None, mot17=True): + if mot17: + if det_options is not None and not all(x in DET_OPTIONS for x in det_options): + raise ValueError("Det options were {} but must be only: {}".format(det_options, DET_OPTIONS)) + if det_options is None: + det_options = DET_OPTIONS + else: + print("Ingesting MOT15, ignoring det options {}".format(det_options)) + det_options = [""] + + dataset_path = args.dataset_path + out_filename = args.anno_name + + out_dataset = GluonCVMotionDataset(out_filename, dataset_path, load_anno=False) + metadata = { + FieldNames.DESCRIPTION: description, + FieldNames.DATE_MODIFIED: str(datetime.datetime.now()), + } + out_dataset.metadata = metadata + + splits = { + "train": os.path.join(out_dataset.data_root_path, "train"), + "test": os.path.join(out_dataset.data_root_path, "test"), # No gt for MOT test + } + + for det_option in det_options: + for split_name, split_path in splits.items(): + subdirs = glob.glob(os.path.join(split_path, "*" + det_option)) + for i, subdir in enumerate(subdirs): + vid_id = os.path.basename(subdir) + vid_path = os.path.join(split_path, subdir) + + sample = DataSample(vid_id) + + if mot17: + info_path = os.path.join(vid_path, "seqinfo.ini") + config = configparser.ConfigParser() + config.read(info_path) + seq_conf = config["Sequence"] + fps = float(seq_conf['frameRate']) + num_frames = int(seq_conf['seqLength']) + width = int(seq_conf['imWidth']) + height = int(seq_conf['imHeight']) + else: + # Assume 30 fps + fps = 30 + im_paths = glob.glob(os.path.join(vid_path, "img1", "*.jpg")) + num_frames = len(im_paths) + im_example = Image.open(im_paths[0]) + width = im_example.width + height = im_example.height + + rel_base_dir = vid_path.replace(out_dataset.data_root_path, "").lstrip(os.path.sep) + rel_base_dir = os.path.join(rel_base_dir, "img1") + metadata = { + FieldNames.DATA_PATH: rel_base_dir, + FieldNames.FPS: fps, + FieldNames.NUM_FRAMES: 
num_frames, + FieldNames.RESOLUTION: {"width": width, "height": height}, + } + sample.metadata = metadata + + gt_path = os.path.join(vid_path, "gt/gt.txt") + det_path = os.path.join(vid_path, "det/det.txt") + has_gt = os.path.exists(gt_path) + anno_path = gt_path if has_gt else det_path + + sample = sample_from_mot_csv(anno_path, fps, sample, mot17, has_gt) + + out_dataset.add_sample(sample) + + print("Done {} sample {}/{}, {}".format(split_name, i+1, len(subdirs), vid_id)) + + out_dataset.dump() + + return out_dataset + + +def write_data_split(args, dataset): + if dataset is None: + dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path) + + def split_func(sample): + data_path = sample.data_relative_path + if data_path.startswith("train"): + return SplitNames.TRAIN + elif data_path.startswith("test"): + return SplitNames.TEST + + raise Exception("Shouldn't happen") + + process_dataset_splits(dataset, split_func, save=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Ingest mot dataset') + parser.add_argument('--dataset_path', default="", + help="The path of dataset folder") + parser.add_argument('--anno_name', default="anno.json", + help="The file name (with json) of ingested annotation file") + args = parser.parse_args() + + mot17 = "MOT17" in args.dataset_path + dataset = main(args, mot17=mot17) + write_data_split(args, dataset) diff --git a/siam-mot/siammot/data/ingestion/ingest_prim_air.py b/siam-mot/siammot/data/ingestion/ingest_prim_air.py new file mode 100644 index 0000000000000000000000000000000000000000..b973d1d5f87a419c035c0b12c88c7dbbbe608682 --- /dev/null +++ b/siam-mot/siammot/data/ingestion/ingest_prim_air.py @@ -0,0 +1,127 @@ +import argparse +import copy +import datetime +import fire +import string +import tqdm +import os +from pathlib import Path + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, FieldNames, SplitNames +from gluoncv.torch.data.gluoncv_motion_dataset.utils.ingestion_utils import process_dataset_splits +from gluoncv.torch.data.gluoncv_motion_dataset.utils.serialization_utils import save_json + + +def ingest_dataset(args, renumber_ids=True): + """ + + :param args: Input arguments + :param renumber_ids: rename track identities to integers + """ + dataset = GluonCVMotionDataset(args.anno_name, args.dataset_path, load_anno=False) + dataset.metadata = { + FieldNames.DESCRIPTION: "Initial ingestion", + FieldNames.DATE_MODIFIED: str(datetime.datetime.now()), + } + #raw_anno_paths = sorted(Path(dataset.data_root_path).glob("groundtruth.json")) + raw_anno_paths = sorted(Path('/home/ubuntu/airborne-detection-starter-kit/data/').glob("groundtruth.json")) + + for raw_anno_path in tqdm.tqdm(raw_anno_paths): + # Setting the dataset and samples to None here looks pointless but it allows the memory to be freed, otherwise + # on subsequent iterations it can actually run out of memory as it loads a new dataset while keeping the + # previous one still in memory (happened on c5.xlarge 8GB RAM) + raw_dataset = None + samples = None + # raw_sample and sample have references back to the dataset so have to unset these too + raw_sample = sample = None + raw_dataset = GluonCVMotionDataset(raw_anno_path) + raw_dataset.__version__ = 1 + set_dir = raw_anno_path.parent.parent + images_root_path = Path(dataset.data_root_path) # set_dir / "Images" + + samples = sorted(raw_dataset.samples) + with open ('/home/ubuntu/siam-mot/data/all_flights_val.txt', 'r') as f: + all_flights = f.readlines() + all_flights = 
[flight.rstrip() for flight in all_flights] + + for raw_id, raw_sample in tqdm.tqdm(samples): + if raw_id not in all_flights[200:]: + continue + data_path = images_root_path /raw_id + data_rel_path = str(data_path.relative_to(dataset.data_root_path)) + new_id = data_rel_path + first_img = sorted(data_path.glob("*.png"))[0] + first_timestamp = int(first_img.name.split(raw_id)[0]) + sample = raw_sample.get_copy_without_entities(new_id=new_id) + sample.metadata["orig_path"] = raw_sample.data_relative_path + sample.data_relative_path = data_rel_path + unique_ids = {} + + first_frame = None + for raw_entity in raw_sample.entities: + entity = copy.deepcopy(raw_entity) + orig_frame = entity.blob.pop("frame") + orig_time = entity.time + if first_frame is None: + assert raw_entity.time == first_timestamp + first_frame = orig_frame + rel_frame = orig_frame - first_frame + # rel_ts = raw_entity.time - first_timestamp + # assert rel_ts >= 0 + # rel_ts_msec = rel_ts / 1e6 + # ts_msec_round = int(round(rel_ts_msec / sample.period) * sample.period) + # print(f"frame: {raw_entity.blob.get('frame')} ts_msec: {rel_ts_msec} ts_round {ts_msec_round}") + # print() + # assert abs(rel_ts_msec - ts_msec_round) < sample.period / 10 + # entity.time = ts_msec_round + + entity.time = round(rel_frame / sample.fps * 1000) + if entity.id: + obj_type = entity.id.rstrip(string.digits).lower() + entity.labels[obj_type] = 1 + if entity.id.lower() in ("airplane1", "helicopter1"): + entity.labels["intruder"] = 1 + entity.blob["orig_id"] = entity.id + if renumber_ids: + entity.id = unique_ids.setdefault(entity.id, len(unique_ids)) + entity.blob[FieldNames.FRAME_IDX] = rel_frame + entity.blob["orig_frame"] = orig_frame + entity.blob["orig_time"] = orig_time + if entity.labels and "miss_distance_class" in entity.labels: + entity.blob["miss_distance_class"] = entity.labels.pop("miss_distance_class") + if "range_distance_m" in entity.blob: + entity.blob["range_distance_m"] = round(entity.blob["range_distance_m"], 1) + sample.add_entity(entity) + + # break + dataset.add_sample(sample, dump_directly=True) + + dataset.dump() + + return dataset + + +def write_split(dataset): + def split_func(sample): + # data_path = sample.data_relative_path + orig_path = sample.metadata['orig_path'] + if orig_path.startswith("train"): + return SplitNames.TRAIN + elif orig_path.startswith("val"): + return SplitNames.VAL + + raise Exception("Shouldn't happen") + + process_dataset_splits(dataset, split_func, save=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Ingest Prime Air dataset') + parser.add_argument('--dataset_path', default="/home/ubuntu/airborne-detection-starter-kit/data/val/") + #description="The path of dataset folder") + parser.add_argument('--anno_name', default="anno.json") + #description="The file name (with json) of ingested annotation file") + args = parser.parse_args() + + dataset = ingest_dataset(args, renumber_ids=True) + write_split(dataset) diff --git a/siam-mot/siammot/data/video_dataset.py b/siam-mot/siammot/data/video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a9c3f6acca456d0207f14ec2ca46232a3649bba6 --- /dev/null +++ b/siam-mot/siammot/data/video_dataset.py @@ -0,0 +1,195 @@ +import random +import torch +import itertools +import torch.utils.data as data +from tqdm import tqdm +from collections import defaultdict +from PIL.Image import Image + +from maskrcnn_benchmark.structures.image_list import to_image_list +from 
maskrcnn_benchmark.structures.bounding_box import BoxList + +from gluoncv.torch.data.gluoncv_motion_dataset.dataset import GluonCVMotionDataset, AnnoEntity + + +class VideoDataset(data.Dataset): + + def __init__(self, dataset: GluonCVMotionDataset, sampling_interval=250, clip_len=1000, + is_train=True, frames_in_clip=2, transforms=None, filter_fn=None, + amodal=False): + """ + :param dataset: the ingested dataset with GluonCVMotionDataset + :param sampling_interval: the temporal stride (in ms) of sliding window + :param clip_len: the temporal length (in ms) of video clips + :param is_train: a boolean flag indicating whether it is training + :param frames_in_clip: the number of frames sampled in a video clip (for a training example) + :param transforms: frame-level transformation before they are fed into neural networks + :param filter_fn: a callable function to filter entities + :param amodal: whether to clip the bounding box beyond image boundary + """ + + if dataset is None: + raise Exception('dataset should not be None. Call GluonCVMotionDataset to construct dataset first.') + + assert is_train is True, "The dataset class only supports training" + assert (2 >= frames_in_clip > 0), "frames_in_clip has to be 1 or 2" + + self.data = dict(dataset.train_samples) + + self.clip_len = clip_len + self.transforms = transforms + self.filter_fn = filter_fn + self.frames_in_clip = min(clip_len, frames_in_clip) + + # Process dataset to get all valid video clips + self.clips = self.get_video_clips(sampling_interval_ms=sampling_interval) + self.amodal = amodal + + def __getitem__(self, item_id): + + video = [] + target = [] + + (sample_id, clip_frame_ids) = self.clips[item_id] + video_info = self.data[sample_id] + video_reader = video_info.get_data_reader() + + # Randomly sampling self.frames_in_clip frames + # And keep their relative temporal order + rand_idxs = sorted(random.sample(clip_frame_ids, self.frames_in_clip)) + for frame_idx in rand_idxs: + im = video_reader[frame_idx][0] + entities = video_info.get_entities_for_frame_num(frame_idx) + if self.filter_fn is not None: + entities, _ = self.filter_fn(entities, meta_data=video_info.metadata) + boxes = self.entity2target(im, entities) + + video.append(im) + target.append(boxes) + + # Video clip-level augmentation + if self.transforms is not None: + video, target = self.transforms(video, target) + + return video, target, sample_id + + def __len__(self): + return len(self.clips) + + def get_video_clips(self, sampling_interval_ms=250): + """ + Process the long videos to a small video chunk (with self.clip_len seconds) + Video clips are generated in a temporal sliding window fashion + """ + video_clips = [] + for (sample_id, sample) in tqdm(self.data.items()): + frame_idxs_with_anno = sample.get_non_empty_frames(self.filter_fn) + if len(frame_idxs_with_anno) == 0: + continue + # The video clip may not be temporally continuous + start_frame = min(frame_idxs_with_anno) + end_frame = max(frame_idxs_with_anno) + # make sure that the video clip has at least two frames + clip_len_in_frames = max(self.frames_in_clip, int(self.clip_len / 1000. * sample.fps)) + sampling_interval = int(sampling_interval_ms / 1000. 
* sample.fps) + for idx in range(start_frame, end_frame, sampling_interval): + clip_frame_ids = [] + # only include frames with annotation within the video clip + for frame_idx in range(idx, idx + clip_len_in_frames): + if frame_idx in frame_idxs_with_anno: + clip_frame_ids.append(frame_idx) + # Only include video clips that have at least self.frames_in_clip annotating frames + if len(clip_frame_ids) >= self.frames_in_clip: + video_clips.append((sample_id, clip_frame_ids)) + + return video_clips + + def entity2target(self, im: Image, entities: [AnnoEntity]): + """ + Wrap up the entity to maskrcnn-benchmark compatible format - BoxList + """ + boxes = [entity.bbox for entity in entities] + ids = [int(entity.id) for entity in entities] + # we only consider person tracking for now, + # thus all the labels are 1, + # reserve category 0 for background during training + int_labels = [1 for _ in entities] + + boxes = torch.as_tensor(boxes).reshape(-1, 4) + boxes = BoxList(boxes, im.size, mode='xywh').convert('xyxy') + if not self.amodal: + boxes = boxes.clip_to_image(remove_empty=False) + boxes.add_field('labels', torch.as_tensor(int_labels, dtype=torch.int64)) + boxes.add_field('ids', torch.as_tensor(ids, dtype=torch.int64)) + + return boxes + + +class VideoDatasetBatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. + This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + image_batch = list(itertools.chain(*transposed_batch[0])) + image_batch = to_image_list(image_batch, self.size_divisible) + + # to make sure that the id of each instance + # are unique across the whole batch + targets = transposed_batch[1] + video_ids = transposed_batch[2] + uid = 0 + video_id_map = defaultdict(dict) + for targets_per_video, video_id in zip(targets, video_ids): + for targets_per_video_frame in targets_per_video: + if targets_per_video_frame.has_field('ids'): + _ids = targets_per_video_frame.get_field('ids') + _uids = _ids.clone() + for i in range(len(_ids)): + _id = _ids[i].item() + if _id not in video_id_map[video_id]: + video_id_map[video_id][_id] = uid + uid += 1 + _uids[i] = video_id_map[video_id][_id] + targets_per_video_frame.extra_fields['ids'] = _uids + + targets = list(itertools.chain(*targets)) + + return image_batch, targets, video_ids + + +if __name__ == "__main__": + + from siammot.data.adapters.utils.data_utils import load_dataset_anno + + torch.manual_seed(0) + + dataset_anno, dataset_info = load_dataset_anno('MOT17') + collator = VideoDatasetBatchCollator() + + dataset = VideoDataset(dataset_anno, + frames_in_clip=2, + amodal=True) + + batch_size = 16 + sampler = torch.utils.data.sampler.RandomSampler(dataset) + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, batch_size, drop_last=False) + dataloader = data.DataLoader(dataset, + num_workers=4, + batch_sampler=batch_sampler, + collate_fn=collator + ) + import time + tic = time.time() + for iteration, (image, target, image_ids) in enumerate(dataloader): + data_time = time.time() - tic + print("Data loading time: {}".format(data_time)) + tic = time.time() + print(image_ids)
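
The two __main__ blocks above (in image_dataset.py and video_dataset.py) are the only places where this data pipeline is exercised end to end. Below is a minimal, hypothetical sketch of how the pieces added in this diff compose for video training: it assumes MOT17 has already been ingested with ingest_mot.py and that the default cfg from siammot.configs.defaults resolves; note that load_dataset_anno takes cfg as its first argument (the image_dataset.py __main__ above omits it). The batch size and worker count are illustrative, not the training defaults.

import torch
import torch.utils.data as data

from siammot.configs.defaults import cfg
from siammot.data.video_dataset import VideoDataset, VideoDatasetBatchCollator
from siammot.data.adapters.utils.data_utils import load_dataset_anno
from siammot.data.adapters.augmentation.build_augmentation import build_siam_augmentation
from siammot.data.adapters.handler.data_filtering import build_data_filter_fn

torch.manual_seed(0)

# Load the ingested MOT17 annotations; cfg comes first, per data_utils.py.
dataset_anno, dataset_info = load_dataset_anno(cfg, 'MOT17')

# Clip-level augmentation and the MOT-specific entity filter.
transforms = build_siam_augmentation(cfg, is_train=True, modality=dataset_info['modality'])
filter_fn = build_data_filter_fn('MOT17', is_train=True)

dataset = VideoDataset(dataset_anno,
                       sampling_interval=cfg.VIDEO.TEMPORAL_SAMPLING,
                       clip_len=cfg.VIDEO.TEMPORAL_WINDOW,
                       transforms=transforms,
                       filter_fn=filter_fn,
                       frames_in_clip=cfg.VIDEO.RANDOM_FRAMES_PER_CLIP,
                       amodal=cfg.INPUT.AMODAL)

# The collator batches frames into a padded ImageList and re-maps instance ids
# so that they are unique across the whole batch.
collator = VideoDatasetBatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
loader = data.DataLoader(dataset, batch_size=2, shuffle=True,
                         num_workers=0, collate_fn=collator)

images, targets, video_ids = next(iter(loader))
print(images.tensors.shape, len(targets), video_ids)

For actual training, build_train_data_loader wires the same components through make_data_sampler / make_batch_data_sampler and ConcatDataset instead of the plain RandomSampler used here.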