diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
index 425ea72535a144544f44ebe8b5d63dd31336a54c..75e07097756bd014bbef17294b6803aa83621fd1 100644
--- a/mmdet/datasets/__init__.py
+++ b/mmdet/datasets/__init__.py
@@ -1,8 +1,9 @@
+from .custom import CustomDataset
 from .coco import CocoDataset
 from .loader import GroupSampler, DistributedGroupSampler, build_dataloader
 from .utils import to_tensor, random_scale, show_ann

 __all__ = [
-    'CocoDataset', 'GroupSampler', 'DistributedGroupSampler',
+    'CustomDataset', 'CocoDataset', 'GroupSampler', 'DistributedGroupSampler',
     'build_dataloader', 'to_tensor', 'random_scale', 'show_ann'
 ]
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
index fbb14aa285a6fe33be6edaa395134c322f9daef4..e30a49ebd287d0f18922baeb79b70d9eae8f633f 100644
--- a/mmdet/datasets/coco.py
+++ b/mmdet/datasets/coco.py
@@ -1,113 +1,39 @@
-import os.path as osp
-
-import mmcv
 import numpy as np
-from mmcv.parallel import DataContainer as DC
 from pycocotools.coco import COCO
-from torch.utils.data import Dataset

-from .transforms import (ImageTransform, BboxTransform, MaskTransform,
-                         Numpy2Tensor)
-from .utils import to_tensor, show_ann, random_scale
+from .custom import CustomDataset


-class CocoDataset(Dataset):
+class CocoDataset(CustomDataset):

-    def __init__(self,
-                 ann_file,
-                 img_prefix,
-                 img_scale,
-                 img_norm_cfg,
-                 size_divisor=None,
-                 proposal_file=None,
-                 num_max_proposals=1000,
-                 flip_ratio=0,
-                 with_mask=True,
-                 with_crowd=True,
-                 with_label=True,
-                 test_mode=False,
-                 debug=False):
-        # path of the data file
+    def load_annotations(self, ann_file):
         self.coco = COCO(ann_file)
-        # filter images with no annotation during training
-        if not test_mode:
-            self.img_ids, self.img_infos = self._filter_imgs()
-        else:
-            self.img_ids = self.coco.getImgIds()
-            self.img_infos = [
-                self.coco.loadImgs(idx)[0] for idx in self.img_ids
-            ]
-        assert len(self.img_ids) == len(self.img_infos)
-        # get the mapping from original category ids to labels
-        self.cat_ids = self.coco.getCatIds()
-        self.cat2label = {
-            cat_id: i + 1
-            for i, cat_id in enumerate(self.cat_ids)
-        }
-        # prefix of images path
-        self.img_prefix = img_prefix
-        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
-        self.img_scales = img_scale if isinstance(img_scale,
-                                                  list) else [img_scale]
-        assert mmcv.is_list_of(self.img_scales, tuple)
-        # color channel order and normalize configs
-        self.img_norm_cfg = img_norm_cfg
-        # proposals
-        # TODO: revise _filter_imgs to be more flexible
-        if proposal_file is not None:
-            self.proposals = mmcv.load(proposal_file)
-            ori_ids = self.coco.getImgIds()
-            sorted_idx = [ori_ids.index(id) for id in self.img_ids]
-            self.proposals = [self.proposals[idx] for idx in sorted_idx]
-        else:
-            self.proposals = None
-        self.num_max_proposals = num_max_proposals
-        # flip ratio
-        self.flip_ratio = flip_ratio
-        assert flip_ratio >= 0 and flip_ratio <= 1
-        # padding border to ensure the image size can be divided by
-        # size_divisor (used for FPN)
-        self.size_divisor = size_divisor
-        # with crowd or not, False when using RetinaNet
-        self.with_crowd = with_crowd
-        # with mask or not
-        self.with_mask = with_mask
-        # with label is False for RPN
-        self.with_label = with_label
-        # in test mode or not
-        self.test_mode = test_mode
-        # debug mode or not
-        self.debug = debug
-
-        # set group flag for the sampler
-        self._set_group_flag()
-        # transforms
-        self.img_transform = ImageTransform(
-            size_divisor=self.size_divisor, **self.img_norm_cfg)
-        self.bbox_transform = BboxTransform()
-        self.mask_transform = MaskTransform()
-        self.numpy2tensor = Numpy2Tensor()
-
-    def __len__(self):
-        return len(self.img_ids)
-
-    def _filter_imgs(self, min_size=32):
-        """Filter images too small or without ground truths."""
-        img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()]))
-        valid_ids = []
+        cat_ids = self.coco.getCatIds()
+        self.cat2label = {cat_id: i + 1 for i, cat_id in enumerate(cat_ids)}
+        self.img_ids = self.coco.getImgIds()
         img_infos = []
-        for i in img_ids:
+        for i in self.img_ids:
             info = self.coco.loadImgs(i)[0]
-            if min(info['width'], info['height']) >= min_size:
-                valid_ids.append(i)
-                img_infos.append(info)
-        return valid_ids, img_infos
+            info['filename'] = info['file_name']
+            img_infos.append(info)
+        return img_infos

-    def _load_ann_info(self, idx):
-        img_id = self.img_ids[idx]
+    def get_ann_info(self, idx):
+        img_id = self.img_infos[idx]['id']
         ann_ids = self.coco.getAnnIds(imgIds=img_id)
         ann_info = self.coco.loadAnns(ann_ids)
-        return ann_info
+        return self._parse_ann_info(ann_info)
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images too small or without ground truths."""
+        valid_inds = []
+        ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values())
+        for i, img_info in enumerate(self.img_infos):
+            if self.img_ids[i] not in ids_with_ann:
+                continue
+            if min(img_info['width'], img_info['height']) >= min_size:
+                valid_inds.append(i)
+        return valid_inds

     def _parse_ann_info(self, ann_info, with_mask=True):
         """Parse bbox and mask annotation.
@@ -172,158 +98,3 @@ class CocoDataset(Dataset):
         ann['mask_polys'] = gt_mask_polys
         ann['poly_lens'] = gt_poly_lens
         return ann
-
-    def _set_group_flag(self):
-        """Set flag according to image aspect ratio.
-
-        Images with aspect ratio greater than 1 will be set as group 1,
-        otherwise group 0.
-        """
-        self.flag = np.zeros(len(self.img_ids), dtype=np.uint8)
-        for i in range(len(self.img_ids)):
-            img_info = self.img_infos[i]
-            if img_info['width'] / img_info['height'] > 1:
-                self.flag[i] = 1
-
-    def _rand_another(self, idx):
-        pool = np.where(self.flag == self.flag[idx])[0]
-        return np.random.choice(pool)
-
-    def __getitem__(self, idx):
-        if self.test_mode:
-            return self.prepare_test_img(idx)
-        while True:
-            img_info = self.img_infos[idx]
-            ann_info = self._load_ann_info(idx)
-
-            # load image
-            img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
-            if self.debug:
-                show_ann(self.coco, img, ann_info)
-
-            # load proposals if necessary
-            if self.proposals is not None:
-                proposals = self.proposals[idx][:self.num_max_proposals]
-                # TODO: Handle empty proposals properly. Currently images with
-                # no proposals are just ignored, but they can be used for
-                # training in concept.
-                if len(proposals) == 0:
-                    idx = self._rand_another(idx)
-                    continue
-                if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
-                    raise AssertionError(
-                        'proposals should have shapes (n, 4) or (n, 5), '
-                        'but found {}'.format(proposals.shape))
-                if proposals.shape[1] == 5:
-                    scores = proposals[:, 4, None]
-                    proposals = proposals[:, :4]
-                else:
-                    scores = None
-
-            ann = self._parse_ann_info(ann_info, self.with_mask)
-            gt_bboxes = ann['bboxes']
-            gt_labels = ann['labels']
-            gt_bboxes_ignore = ann['bboxes_ignore']
-            # skip the image if there is no valid gt bbox
-            if len(gt_bboxes) == 0:
-                idx = self._rand_another(idx)
-                continue
-
-            # apply transforms
-            flip = True if np.random.rand() < self.flip_ratio else False
-            img_scale = random_scale(self.img_scales)  # sample a scale
-            img, img_shape, pad_shape, scale_factor = self.img_transform(
-                img, img_scale, flip)
-            if self.proposals is not None:
-                proposals = self.bbox_transform(proposals, img_shape,
-                                                scale_factor, flip)
-                proposals = np.hstack(
-                    [proposals, scores]) if scores is not None else proposals
-            gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
-                                            flip)
-            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
-                                                   scale_factor, flip)
-
-            if self.with_mask:
-                gt_masks = self.mask_transform(ann['masks'], pad_shape,
-                                               scale_factor, flip)
-
-            ori_shape = (img_info['height'], img_info['width'], 3)
-            img_meta = dict(
-                ori_shape=ori_shape,
-                img_shape=img_shape,
-                pad_shape=pad_shape,
-                scale_factor=scale_factor,
-                flip=flip)
-
-            data = dict(
-                img=DC(to_tensor(img), stack=True),
-                img_meta=DC(img_meta, cpu_only=True),
-                gt_bboxes=DC(to_tensor(gt_bboxes)))
-            if self.proposals is not None:
-                data['proposals'] = DC(to_tensor(proposals))
-            if self.with_label:
-                data['gt_labels'] = DC(to_tensor(gt_labels))
-            if self.with_crowd:
-                data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
-            if self.with_mask:
-                data['gt_masks'] = DC(gt_masks, cpu_only=True)
-            return data
-
-    def prepare_test_img(self, idx):
-        """Prepare an image for testing (multi-scale and flipping)"""
-        img_info = self.img_infos[idx]
-        img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
-        if self.proposals is not None:
-            proposal = self.proposals[idx][:self.num_max_proposals]
-            if not (proposal.shape[1] == 4 or proposal.shape[1] == 5):
-                raise AssertionError(
-                    'proposals should have shapes (n, 4) or (n, 5), '
-                    'but found {}'.format(proposal.shape))
-        else:
-            proposal = None
-
-        def prepare_single(img, scale, flip, proposal=None):
-            _img, img_shape, pad_shape, scale_factor = self.img_transform(
-                img, scale, flip)
-            _img = to_tensor(_img)
-            _img_meta = dict(
-                ori_shape=(img_info['height'], img_info['width'], 3),
-                img_shape=img_shape,
-                pad_shape=pad_shape,
-                scale_factor=scale_factor,
-                flip=flip)
-            if proposal is not None:
-                if proposal.shape[1] == 5:
-                    score = proposal[:, 4, None]
-                    proposal = proposal[:, :4]
-                else:
-                    score = None
-                _proposal = self.bbox_transform(proposal, img_shape,
-                                                scale_factor, flip)
-                _proposal = np.hstack(
-                    [_proposal, score]) if score is not None else _proposal
-                _proposal = to_tensor(_proposal)
-            else:
-                _proposal = None
-            return _img, _img_meta, _proposal
-
-        imgs = []
-        img_metas = []
-        proposals = []
-        for scale in self.img_scales:
-            _img, _img_meta, _proposal = prepare_single(
-                img, scale, False, proposal)
-            imgs.append(_img)
-            img_metas.append(DC(_img_meta, cpu_only=True))
-            proposals.append(_proposal)
-            if self.flip_ratio > 0:
-                _img, _img_meta, _proposal = prepare_single(
-                    img, scale, True, proposal)
-                imgs.append(_img)
-                img_metas.append(DC(_img_meta, cpu_only=True))
-                proposals.append(_proposal)
-        data = dict(img=imgs, img_meta=img_metas)
-        if self.proposals is not None:
-            data['proposals'] = proposals
-        return data
diff --git a/mmdet/datasets/custom.py b/mmdet/datasets/custom.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae470443511c1164388d443c2623b454fa20d84
--- /dev/null
+++ b/mmdet/datasets/custom.py
@@ -0,0 +1,274 @@
+import os.path as osp
+
+import mmcv
+import numpy as np
+from mmcv.parallel import DataContainer as DC
+from torch.utils.data import Dataset
+
+from .transforms import (ImageTransform, BboxTransform, MaskTransform,
+                         Numpy2Tensor)
+from .utils import to_tensor, random_scale
+
+
+class CustomDataset(Dataset):
+    """Custom dataset for detection.
+
+    Annotation format:
+    [
+        {
+            'filename': 'a.jpg',
+            'width': 1280,
+            'height': 720,
+            'ann': {
+                'bboxes': <np.ndarray> (n, 4),
+                'labels': <np.ndarray> (n, ),
+                'bboxes_ignore': <np.ndarray> (k, 4),
+                'labels_ignore': <np.ndarray> (k, ) (optional field)
+            }
+        },
+        ...
+    ]
+
+    The `ann` field is optional for testing.
+    """
+
+    def __init__(self,
+                 ann_file,
+                 img_prefix,
+                 img_scale,
+                 img_norm_cfg,
+                 size_divisor=None,
+                 proposal_file=None,
+                 num_max_proposals=1000,
+                 flip_ratio=0,
+                 with_mask=True,
+                 with_crowd=True,
+                 with_label=True,
+                 test_mode=False):
+        # load annotations (and proposals)
+        self.img_infos = self.load_annotations(ann_file)
+        if proposal_file is not None:
+            self.proposals = self.load_proposals(proposal_file)
+        else:
+            self.proposals = None
+        # filter images with no annotation during training
+        if not test_mode:
+            valid_inds = self._filter_imgs()
+            self.img_infos = [self.img_infos[i] for i in valid_inds]
+            if self.proposals is not None:
+                self.proposals = [self.proposals[i] for i in valid_inds]
+
+        # prefix of images path
+        self.img_prefix = img_prefix
+        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
+        self.img_scales = img_scale if isinstance(img_scale,
+                                                  list) else [img_scale]
+        assert mmcv.is_list_of(self.img_scales, tuple)
+        # normalization configs
+        self.img_norm_cfg = img_norm_cfg
+
+        # max proposals per image
+        self.num_max_proposals = num_max_proposals
+        # flip ratio
+        self.flip_ratio = flip_ratio
+        assert flip_ratio >= 0 and flip_ratio <= 1
+        # padding border to ensure the image size can be divided by
+        # size_divisor (used for FPN)
+        self.size_divisor = size_divisor
+
+        # with mask or not (when True, the ann dict must contain 'masks')
+        self.with_mask = with_mask
+        # some datasets provide bbox annotations as ignore/crowd/difficult,
+        # if `with_crowd` is True, then this info is returned.
+        self.with_crowd = with_crowd
+        # with label is False for RPN
+        self.with_label = with_label
+        # in test mode or not
+        self.test_mode = test_mode
+
+        # set group flag for the sampler
+        if not self.test_mode:
+            self._set_group_flag()
+        # transforms
+        self.img_transform = ImageTransform(
+            size_divisor=self.size_divisor, **self.img_norm_cfg)
+        self.bbox_transform = BboxTransform()
+        self.mask_transform = MaskTransform()
+        self.numpy2tensor = Numpy2Tensor()
+
+    def __len__(self):
+        return len(self.img_infos)
+
+    def load_annotations(self, ann_file):
+        return mmcv.load(ann_file)
+
+    def load_proposals(self, proposal_file):
+        return mmcv.load(proposal_file)
+
+    def get_ann_info(self, idx):
+        return self.img_infos[idx]['ann']
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images too small."""
+        valid_inds = []
+        for i, img_info in enumerate(self.img_infos):
+            if min(img_info['width'], img_info['height']) >= min_size:
+                valid_inds.append(i)
+        return valid_inds
+
+    def _set_group_flag(self):
+        """Set flag according to image aspect ratio.
+
+        Images with aspect ratio greater than 1 will be set as group 1,
+        otherwise group 0.
+        """
+        self.flag = np.zeros(len(self), dtype=np.uint8)
+        for i in range(len(self)):
+            img_info = self.img_infos[i]
+            if img_info['width'] / img_info['height'] > 1:
+                self.flag[i] = 1
+
+    def _rand_another(self, idx):
+        pool = np.where(self.flag == self.flag[idx])[0]
+        return np.random.choice(pool)
+
+    def __getitem__(self, idx):
+        if self.test_mode:
+            return self.prepare_test_img(idx)
+        while True:
+            data = self.prepare_train_img(idx)
+            if data is None:
+                idx = self._rand_another(idx)
+                continue
+            return data
+
+    def prepare_train_img(self, idx):
+        img_info = self.img_infos[idx]
+        # load image
+        img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
+        # load proposals if necessary
+        if self.proposals is not None:
+            proposals = self.proposals[idx][:self.num_max_proposals]
+            # TODO: Handle empty proposals properly. Currently images with
+            # no proposals are just ignored, but they can be used for
+            # training in principle.
+            if len(proposals) == 0:
+                return None
+            if not (proposals.shape[1] == 4 or proposals.shape[1] == 5):
+                raise AssertionError(
+                    'proposals should have shapes (n, 4) or (n, 5), '
+                    'but found {}'.format(proposals.shape))
+            if proposals.shape[1] == 5:
+                scores = proposals[:, 4, None]
+                proposals = proposals[:, :4]
+            else:
+                scores = None
+
+        ann = self.get_ann_info(idx)
+        gt_bboxes = ann['bboxes']
+        gt_labels = ann['labels']
+        if self.with_crowd:
+            gt_bboxes_ignore = ann['bboxes_ignore']
+
+        # skip the image if there is no valid gt bbox
+        if len(gt_bboxes) == 0:
+            return None
+
+        # apply transforms
+        flip = True if np.random.rand() < self.flip_ratio else False
+        img_scale = random_scale(self.img_scales)  # sample a scale
+        img, img_shape, pad_shape, scale_factor = self.img_transform(
+            img, img_scale, flip)
+        if self.proposals is not None:
+            proposals = self.bbox_transform(proposals, img_shape, scale_factor,
+                                            flip)
+            proposals = np.hstack(
+                [proposals, scores]) if scores is not None else proposals
+        gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
+                                        flip)
+        if self.with_crowd:
+            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
+                                                   scale_factor, flip)
+        if self.with_mask:
+            gt_masks = self.mask_transform(ann['masks'], pad_shape,
+                                           scale_factor, flip)
+
+        ori_shape = (img_info['height'], img_info['width'], 3)
+        img_meta = dict(
+            ori_shape=ori_shape,
+            img_shape=img_shape,
+            pad_shape=pad_shape,
+            scale_factor=scale_factor,
+            flip=flip)
+
+        data = dict(
+            img=DC(to_tensor(img), stack=True),
+            img_meta=DC(img_meta, cpu_only=True),
+            gt_bboxes=DC(to_tensor(gt_bboxes)))
+        if self.proposals is not None:
+            data['proposals'] = DC(to_tensor(proposals))
+        if self.with_label:
+            data['gt_labels'] = DC(to_tensor(gt_labels))
+        if self.with_crowd:
+            data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
+        if self.with_mask:
+            data['gt_masks'] = DC(gt_masks, cpu_only=True)
+        return data
+
+    def prepare_test_img(self, idx):
+        """Prepare an image for testing (multi-scale and flipping)"""
+        img_info = self.img_infos[idx]
+        img = mmcv.imread(osp.join(self.img_prefix, img_info['filename']))
+        if self.proposals is not None:
+            proposal = self.proposals[idx][:self.num_max_proposals]
+            if not (proposal.shape[1] == 4 or proposal.shape[1] == 5):
+                raise AssertionError(
+                    'proposals should have shapes (n, 4) or (n, 5), '
+                    'but found {}'.format(proposal.shape))
+        else:
+            proposal = None
+
+        def prepare_single(img, scale, flip, proposal=None):
+            _img, img_shape, pad_shape, scale_factor = self.img_transform(
+                img, scale, flip)
+            _img = to_tensor(_img)
+            _img_meta = dict(
+                ori_shape=(img_info['height'], img_info['width'], 3),
+                img_shape=img_shape,
+                pad_shape=pad_shape,
+                scale_factor=scale_factor,
+                flip=flip)
+            if proposal is not None:
+                if proposal.shape[1] == 5:
+                    score = proposal[:, 4, None]
+                    proposal = proposal[:, :4]
+                else:
+                    score = None
+                _proposal = self.bbox_transform(proposal, img_shape,
+                                                scale_factor, flip)
+                _proposal = np.hstack(
+                    [_proposal, score]) if score is not None else _proposal
+                _proposal = to_tensor(_proposal)
+            else:
+                _proposal = None
+            return _img, _img_meta, _proposal
+
+        imgs = []
+        img_metas = []
+        proposals = []
+        for scale in self.img_scales:
+            _img, _img_meta, _proposal = prepare_single(
+                img, scale, False, proposal)
+            imgs.append(_img)
+            img_metas.append(DC(_img_meta, cpu_only=True))
+            proposals.append(_proposal)
+            if self.flip_ratio > 0:
+                _img, _img_meta, _proposal = prepare_single(
+                    img, scale, True, proposal)
+                imgs.append(_img)
+                img_metas.append(DC(_img_meta, cpu_only=True))
+                proposals.append(_proposal)
+        data = dict(img=imgs, img_meta=img_metas)
+        if self.proposals is not None:
+            data['proposals'] = proposals
+        return data
\ No newline at end of file
diff --git a/tools/convert_datasets/pascal_voc.py b/tools/convert_datasets/pascal_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fb5cb4b7080f134287494f7f0283bed42b351cb
--- /dev/null
+++ b/tools/convert_datasets/pascal_voc.py
@@ -0,0 +1,140 @@
+import argparse
+import os.path as osp
+import xml.etree.ElementTree as ET
+
+import mmcv
+import numpy as np
+
+from mmdet.core import voc_classes
+
+label_ids = {name: i + 1 for i, name in enumerate(voc_classes())}
+
+
+def parse_xml(args):
+    xml_path, img_path = args
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    size = root.find('size')
+    w = int(size.find('width').text)
+    h = int(size.find('height').text)
+    bboxes = []
+    labels = []
+    bboxes_ignore = []
+    labels_ignore = []
+    for obj in root.findall('object'):
+        name = obj.find('name').text
+        label = label_ids[name]
+        difficult = int(obj.find('difficult').text)
+        bnd_box = obj.find('bndbox')
+        bbox = [
+            int(bnd_box.find('xmin').text),
+            int(bnd_box.find('ymin').text),
+            int(bnd_box.find('xmax').text),
+            int(bnd_box.find('ymax').text)
+        ]
+        if difficult:
+            bboxes_ignore.append(bbox)
+            labels_ignore.append(label)
+        else:
+            bboxes.append(bbox)
+            labels.append(label)
+    if not bboxes:
+        bboxes = np.zeros((0, 4))
+        labels = np.zeros((0, ))
+    else:
+        bboxes = np.array(bboxes, ndmin=2) - 1
+        labels = np.array(labels)
+    if not bboxes_ignore:
+        bboxes_ignore = np.zeros((0, 4))
+        labels_ignore = np.zeros((0, ))
+    else:
+        bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
+        labels_ignore = np.array(labels_ignore)
+    annotation = {
+        'filename': img_path,
+        'width': w,
+        'height': h,
+        'ann': {
+            'bboxes': bboxes.astype(np.float32),
+            'labels': labels.astype(np.int64),
+            'bboxes_ignore': bboxes_ignore.astype(np.float32),
+            'labels_ignore': labels_ignore.astype(np.int64)
+        }
+    }
+    return annotation
+
+
+def cvt_annotations(devkit_path, years, split, out_file):
+    if not isinstance(years, list):
+        years = [years]
+    annotations = []
+    for year in years:
+        filelist = osp.join(devkit_path, 'VOC{}/ImageSets/Main/{}.txt'.format(
+            year, split))
+        if not osp.isfile(filelist):
+            print('filelist does not exist: {}, skip voc{} {}'.format(
+                filelist, year, split))
+            return
+        img_names = mmcv.list_from_file(filelist)
+        xml_paths = [
+            osp.join(devkit_path, 'VOC{}/Annotations/{}.xml'.format(
+                year, img_name)) for img_name in img_names
+        ]
+        img_paths = [
+            'VOC{}/JPEGImages/{}.jpg'.format(year, img_name)
+            for img_name in img_names
+        ]
+        part_annotations = mmcv.track_progress(parse_xml,
+                                               list(zip(xml_paths, img_paths)))
+        annotations.extend(part_annotations)
+    mmcv.dump(annotations, out_file)
+    return annotations
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert PASCAL VOC annotations to mmdetection format')
+    parser.add_argument('devkit_path', help='pascal voc devkit path')
+    parser.add_argument('-o', '--out-dir', help='output path')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    devkit_path = args.devkit_path
+    out_dir = args.out_dir if args.out_dir else devkit_path
+    mmcv.mkdir_or_exist(out_dir)
+
+    years = []
+    if osp.isdir(osp.join(devkit_path, 'VOC2007')):
+        years.append('2007')
+    if osp.isdir(osp.join(devkit_path, 'VOC2012')):
+        years.append('2012')
+    if '2007' in years and '2012' in years:
+        years.append(['2007', '2012'])
+    if not years:
+        raise IOError('The devkit path {} contains neither "VOC2007" nor '
+                      '"VOC2012" subfolder'.format(devkit_path))
+    for year in years:
+        if year == '2007':
+            prefix = 'voc07'
+        elif year == '2012':
+            prefix = 'voc12'
+        elif year == ['2007', '2012']:
+            prefix = 'voc0712'
+        for split in ['train', 'val', 'trainval']:
+            dataset_name = prefix + '_' + split
+            print('processing {} ...'.format(dataset_name))
+            cvt_annotations(devkit_path, year, split,
+                            osp.join(out_dir, dataset_name + '.pkl'))
+        if not isinstance(year, list):
+            dataset_name = prefix + '_test'
+            print('processing {} ...'.format(dataset_name))
+            cvt_annotations(devkit_path, year, 'test',
+                            osp.join(out_dir, dataset_name + '.pkl'))
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
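
Usage sketch (illustrative, not part of the patch): the converter writes pickle files that CustomDataset reads back via mmcv.load(). The paths, img_norm_cfg numbers, size_divisor and flip_ratio below are assumed placeholder values, not prescribed by this diff; only the constructor signature, the annotation format and the returned keys come from the code above.

    # Step 1 (shell): convert a VOC devkit, assumed to live at data/VOCdevkit:
    #   python tools/convert_datasets/pascal_voc.py data/VOCdevkit -o data/voc
    # This writes e.g. data/voc/voc07_trainval.pkl, a list of dicts like:
    #   {'filename': 'VOC2007/JPEGImages/000005.jpg', 'width': 500,
    #    'height': 375, 'ann': {'bboxes': ..., 'labels': ...,
    #                           'bboxes_ignore': ..., 'labels_ignore': ...}}

    from mmdet.datasets import CustomDataset

    dataset = CustomDataset(
        ann_file='data/voc/voc07_trainval.pkl',  # output of the converter
        img_prefix='data/VOCdevkit/',  # 'filename' fields are relative to this
        img_scale=(1000, 600),  # a single (long_edge, short_edge) tuple
        img_norm_cfg=dict(  # assumed values, forwarded to ImageTransform
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            to_rgb=True),
        size_divisor=32,  # pad so both image sides are divisible by 32 (FPN)
        flip_ratio=0.5,
        with_mask=False,  # the VOC pickles carry no 'masks' field
        with_crowd=True,  # 'bboxes_ignore' holds the difficult boxes
        with_label=True)

    # Each training item is a dict of DataContainers with keys
    # 'img', 'img_meta', 'gt_bboxes', 'gt_labels' and 'gt_bboxes_ignore'.
    data = dataset[0]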