diff --git a/README.md b/README.md index db83b3a680518bb0040bc843652ee7407e1ac440..79f2316859e84a3cb0e86bc0d55e1466cdeccfc7 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Results and models are available in the [Model zoo](MODEL_ZOO.md). | SSD | ✗ | ✗ | ✗ | ✓ | | RetinaNet | ✓ | ✓ | ☠| ✗ | | Hybrid Task Cascade| ✓ | ✓ | ☠| ✗ | +| FCOS | ✓ | ✓ | ☠| ✗ | Other features - [x] DCNv2 diff --git a/configs/fcos/README.md b/configs/fcos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3e31b22af978c21216638ee4dd37288887e78c2c --- /dev/null +++ b/configs/fcos/README.md @@ -0,0 +1,25 @@ +# FCOS: Fully Convolutional One-Stage Object Detection + +## Introduction + +``` +@article{tian2019fcos, + title={FCOS: Fully Convolutional One-Stage Object Detection}, + author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong}, + journal={arXiv preprint arXiv:1904.01355}, + year={2019} +} +``` + +## Results and Models + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | +|:---------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:| +| R-50-FPN | caffe | 1x | 6.9 | 0.396 | 13.6 | 36.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_1x-9f253a93.pth) | +| R-50-FPN | caffe | 2x | - | - | - | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_2x-f7329d80.pth) | +| R-101-FPN | caffe | 1x | 10.4 | 0.558 | 11.6 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_1x-e4889733.pth) | +| R-101-FPN | caffe | 2x | - | - | - | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_2x-42e6f62d.pth) | +| X-101-64x4d-FPN | caffe | 2x | 9.7 | 0.892 | 7.0 | 42.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_x101_64x4d_fpn_2x-a36c0872.pth) | + +**Notes:** +- To be consistent with the author's implementation, we use 4 GPUs with 4 images/GPU for R-50 and R-101 models, and 8 GPUs with 2 images/GPU for X-101 models.
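As a quick sanity check on the table above, here is a minimal inference sketch against one of the released checkpoints. It assumes the `init_detector`/`inference_detector` helpers exported from `mmdet.apis` in this codebase; the checkpoint path is a placeholder for wherever you save the download:

```python
# Minimal sketch, not part of this diff; adjust paths to your setup.
from mmdet.apis import init_detector, inference_detector

config_file = 'configs/fcos/fcos_r50_caffe_fpn_gn_1x_4gpu.py'
# Placeholder path: the R-50-FPN 1x checkpoint from the table above.
checkpoint_file = 'checkpoints/fcos_r50_fpn_1x-9f253a93.pth'

model = init_detector(config_file, checkpoint_file, device='cuda:0')
# result is a list with one (n, 5) array [x1, y1, x2, y2, score] per class
result = inference_detector(model, 'demo.jpg')
```
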
diff --git a/configs/fcos/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu.py b/configs/fcos/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..41297fc0aa4dd0939839d7c9e6798d36f07d4b11 --- /dev/null +++ b/configs/fcos/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='FCOS', + pretrained='open-mmlab://resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + extra_convs_on_inputs=False, # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=81, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128])) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + smoothl1_beta=0.11, + gamma=2.0, + alpha=0.25, + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='constant', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(4) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/fcos/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x.py b/configs/fcos/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..4f9352c2c9df4514f5b43074ef5d956b2d5a309c --- /dev/null +++ 
b/configs/fcos/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x.py @@ -0,0 +1,125 @@ +# model settings +model = dict( + type='FCOS', + pretrained='open-mmlab://resnext101_64x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + extra_convs_on_inputs=False, # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=81, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128])) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + smoothl1_beta=0.11, + gamma=2.0, + alpha=0.25, + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=[(1333, 640), (1333, 800)], + multiscale_mode='value', + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='constant', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn_1x_4gpu.py b/configs/fcos/fcos_r50_caffe_fpn_gn_1x_4gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..dd63ccfb2296d7d077aa8a35548f382eba71a560 --- /dev/null +++ b/configs/fcos/fcos_r50_caffe_fpn_gn_1x_4gpu.py @@ -0,0 +1,123 @@ +# model settings +model = dict( + type='FCOS', + pretrained='open-mmlab://resnet50_caffe', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), 
+ style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + extra_convs_on_inputs=False, # use P5 + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSHead', + num_classes=81, + in_channels=256, + stacked_convs=4, + feat_channels=256, + strides=[8, 16, 32, 64, 128])) +# training and testing settings +train_cfg = dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + smoothl1_beta=0.11, + gamma=2.0, + alpha=0.25, + allowed_border=-1, + pos_weight=-1, + debug=False) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) +data = dict( + imgs_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=False, + with_crowd=False, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_crowd=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict( + type='SGD', + lr=0.01, + momentum=0.9, + weight_decay=0.0001, + paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.)) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='constant', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(4) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/fcos_r50_caffe_fpn_gn_1x_4gpu' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmdet/apis/train.py b/mmdet/apis/train.py index 34b632624ca5e20cccd5394f967ca1cc1c1ad5d2..6732664b3b049ee10f4337d6bda946b947c4e789 100644 --- a/mmdet/apis/train.py +++ b/mmdet/apis/train.py @@ -39,9 +39,8 @@ def batch_processor(model, data, train_mode): losses = model(**data) loss, log_vars = parse_losses(losses) - outputs = dict(loss=loss, - log_vars=log_vars, - num_samples=len(data['img'].data)) + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) return outputs @@ -135,10 +134,11 @@ def build_optimizer(model, optimizer_cfg): def _dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ - build_dataloader(dataset, - cfg.data.imgs_per_gpu, - cfg.data.workers_per_gpu, - dist=True) + build_dataloader( + dataset, + cfg.data.imgs_per_gpu, + cfg.data.workers_per_gpu, + dist=True) ] # put model on gpus model = 
MMDistributedDataParallel(model.cuda()) @@ -174,11 +174,12 @@ def _dist_train(model, dataset, cfg, validate=False): def _non_dist_train(model, dataset, cfg, validate=False): # prepare data loaders data_loaders = [ - build_dataloader(dataset, - cfg.data.imgs_per_gpu, - cfg.data.workers_per_gpu, - cfg.gpus, - dist=False) + build_dataloader( + dataset, + cfg.data.imgs_per_gpu, + cfg.data.workers_per_gpu, + cfg.gpus, + dist=False) ] # put model on gpus model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda() diff --git a/mmdet/core/bbox/__init__.py b/mmdet/core/bbox/__init__.py index 496bd7a26ad0d4278059e07510fffa945d8cd116..bcf6efda4772a44c1e63b45e443c87c19da20cd1 100644 --- a/mmdet/core/bbox/__init__.py +++ b/mmdet/core/bbox/__init__.py @@ -5,7 +5,8 @@ from .samplers import (BaseSampler, PseudoSampler, RandomSampler, CombinedSampler, SamplingResult) from .assign_sampling import build_assigner, build_sampler, assign_and_sample from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping, - bbox_mapping_back, bbox2roi, roi2bbox, bbox2result) + bbox_mapping_back, bbox2roi, roi2bbox, bbox2result, + distance2bbox) from .bbox_target import bbox_target __all__ = [ @@ -14,5 +15,6 @@ __all__ = [ 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult', 'build_assigner', 'build_sampler', 'assign_and_sample', 'bbox2delta', 'delta2bbox', 'bbox_flip', 'bbox_mapping', - 'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', 'bbox_target' + 'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', + 'distance2bbox', 'bbox_target' ] diff --git a/mmdet/core/bbox/transforms.py b/mmdet/core/bbox/transforms.py index 0d8f6f44f20df5c019dc8ed9ea46c2eb6c411c66..580b9bdfb24d43e80600ad44a70cde6b8ccc58e3 100644 --- a/mmdet/core/bbox/transforms.py +++ b/mmdet/core/bbox/transforms.py @@ -154,3 +154,27 @@ def bbox2result(bboxes, labels, num_classes): bboxes = bboxes.cpu().numpy() labels = labels.cpu().numpy() return [bboxes[labels == i, :] for i in range(num_classes - 1)] + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). + max_shape (tuple): Shape of the image. + + Returns: + Tensor: Decoded bboxes. 
+ """ + x1 = points[:, 0] - distance[:, 0] + y1 = points[:, 1] - distance[:, 1] + x2 = points[:, 0] + distance[:, 2] + y2 = points[:, 1] + distance[:, 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + return torch.stack([x1, y1, x2, y2], -1) diff --git a/mmdet/core/loss/__init__.py b/mmdet/core/loss/__init__.py index 477906e5449cdda8d22c948626593c3232a5b0da..888051875f16073c1c46282ae4e08b7487fb2501 100644 --- a/mmdet/core/loss/__init__.py +++ b/mmdet/core/loss/__init__.py @@ -1,11 +1,12 @@ from .losses import ( weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy, sigmoid_focal_loss, py_sigmoid_focal_loss, weighted_sigmoid_focal_loss, - mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy) + mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy, iou_loss) __all__ = [ 'weighted_nll_loss', 'weighted_cross_entropy', 'weighted_binary_cross_entropy', 'sigmoid_focal_loss', 'py_sigmoid_focal_loss', 'weighted_sigmoid_focal_loss', - 'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy' + 'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy', + 'iou_loss' ] diff --git a/mmdet/core/loss/losses.py b/mmdet/core/loss/losses.py index 1c5bf70051b16d86c3b0027f3277e9167931ca02..e541ec47b43ea5ca7f13c106820b5d64b87bc259 100644 --- a/mmdet/core/loss/losses.py +++ b/mmdet/core/loss/losses.py @@ -2,6 +2,7 @@ import torch import torch.nn.functional as F +from ..bbox import bbox_overlaps from ...ops import sigmoid_focal_loss @@ -127,3 +128,16 @@ def _expand_binary_labels(labels, label_weights, label_channels): bin_label_weights = label_weights.view(-1, 1).expand( label_weights.size(0), label_channels) return bin_labels, bin_label_weights + + +def iou_loss(pred_bboxes, target_bboxes, reduction='mean'): + ious = bbox_overlaps(pred_bboxes, target_bboxes, is_aligned=True) + loss = -ious.log() + + reduction_enum = F._Reduction.get_enum(reduction) + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py index 1f7c6f17e36af180c7dd78a48bf431a4ad85e226..01beecd43abc7641d43773b192f988a55f4295d9 100644 --- a/mmdet/core/post_processing/bbox_nms.py +++ b/mmdet/core/post_processing/bbox_nms.py @@ -3,7 +3,12 @@ import torch from mmdet.ops.nms import nms_wrapper -def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1): +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + nms_cfg, + max_num=-1, + score_factors=None): """NMS for multi-class bboxes. Args: @@ -14,6 +19,8 @@ def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1): nms_thr (float): NMS IoU threshold max_num (int): if there are more than max_num bboxes after NMS, only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS Returns: tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). 
Labels @@ -34,10 +41,13 @@ def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1): else: _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4] _scores = multi_scores[cls_inds, i] + if score_factors is not None: + _scores *= score_factors[cls_inds] cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1) cls_dets, _ = nms_op(cls_dets, **nms_cfg_) - cls_labels = multi_bboxes.new_full( - (cls_dets.shape[0], ), i - 1, dtype=torch.long) + cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ), + i - 1, + dtype=torch.long) bboxes.append(cls_dets) labels.append(cls_labels) if bboxes: diff --git a/mmdet/models/anchor_heads/__init__.py b/mmdet/models/anchor_heads/__init__.py index 25f12087e06126d4169dac9f085776ad55224b2b..86877a2425f56f49a00c680f6d271992ee89dfd3 100644 --- a/mmdet/models/anchor_heads/__init__.py +++ b/mmdet/models/anchor_heads/__init__.py @@ -1,6 +1,7 @@ from .anchor_head import AnchorHead -from .rpn_head import RPNHead +from .fcos_head import FCOSHead from .retina_head import RetinaHead +from .rpn_head import RPNHead from .ssd_head import SSDHead -__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead'] +__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead', 'FCOSHead'] diff --git a/mmdet/models/anchor_heads/fcos_head.py b/mmdet/models/anchor_heads/fcos_head.py new file mode 100644 index 0000000000000000000000000000000000000000..979177cb86ccfaa3f07b926d0cac5908323a3678 --- /dev/null +++ b/mmdet/models/anchor_heads/fcos_head.py @@ -0,0 +1,371 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import normal_init + +from mmdet.core import (sigmoid_focal_loss, iou_loss, multi_apply, + multiclass_nms, distance2bbox) +from ..registry import HEADS +from ..utils import bias_init_with_prob, Scale, ConvModule + +INF = 1e8 + + +@HEADS.register_module +class FCOSHead(nn.Module): + + def __init__(self, + num_classes, + in_channels, + feat_channels=256, + stacked_convs=4, + strides=(4, 8, 16, 32, 64), + regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, INF)), + conv_cfg=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)): + super(FCOSHead, self).__init__() + + self.num_classes = num_classes + self.cls_out_channels = num_classes - 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.regress_ranges = regress_ranges + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self._init_layers() + + def _init_layers(self): + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.fcos_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.fcos_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.fcos_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def init_weights(self): + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = 
bias_init_with_prob(0.01) + normal_init(self.fcos_cls, std=0.01, bias=bias_cls) + normal_init(self.fcos_reg, std=0.01) + normal_init(self.fcos_centerness, std=0.01) + + def forward(self, feats): + return multi_apply(self.forward_single, feats, self.scales) + + def forward_single(self, x, scale): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.fcos_cls(cls_feat) + centerness = self.fcos_centerness(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + # scale the bbox_pred for each level + bbox_pred = scale(self.fcos_reg(reg_feat)).exp() + return cls_score, bbox_pred, centerness + + def loss(self, + cls_scores, + bbox_preds, + centernesses, + gt_bboxes, + gt_labels, + img_metas, + cfg, + gt_bboxes_ignore=None): + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels, bbox_targets = self.fcos_target(all_level_points, gt_bboxes, + gt_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + pos_inds = flatten_labels.nonzero().reshape(-1) + num_pos = len(pos_inds) + loss_cls = sigmoid_focal_loss( + flatten_cls_scores, flatten_labels, cfg.gamma, cfg.alpha, + 'none').sum()[None] / (num_pos + num_imgs) # avoid num_pos being 0 + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + + if num_pos > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds) + pos_decoded_target_preds = distance2bbox(pos_points, + pos_bbox_targets) + # centerness-weighted IoU loss + loss_reg = ((iou_loss( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + reduction='none') * pos_centerness_targets).sum() / + pos_centerness_targets.sum())[None] + loss_centerness = F.binary_cross_entropy_with_logits( + pos_centerness, pos_centerness_targets, reduction='mean')[None] + else: + loss_reg = pos_bbox_preds.sum()[None] + loss_centerness = pos_centerness.sum()[None] + + return dict( + loss_cls=loss_cls, + loss_reg=loss_reg, + loss_centerness=loss_centerness) + + def get_bboxes(self, + cls_scores, + bbox_preds, + centernesses, + img_metas, + cfg, + rescale=None): + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + det_bboxes = self.get_bboxes_single( + cls_score_list, bbox_pred_list, centerness_pred_list, + mlvl_points, img_shape, scale_factor, cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + centernesses, + mlvl_points, + img_shape, + scale_factor, + cfg, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_centerness = [] + for cls_score, bbox_pred, centerness, points in zip( + cls_scores, bbox_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * centerness[:, None]).max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + centerness = centerness[topk_inds] + bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_centerness.append(centerness) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([padding, mlvl_scores], dim=1) + mlvl_centerness = torch.cat(mlvl_centerness) + det_bboxes, det_labels = multiclass_nms( + mlvl_bboxes, + mlvl_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=mlvl_centerness) + return det_bboxes, det_labels + + def get_points(self, featmap_sizes, dtype, device): + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + dtype (torch.dtype): Type of points. + device (torch.device): Device of points. + + Returns: + list[Tensor]: points of each level.
+ """ + mlvl_points = [] + for i in range(len(featmap_sizes)): + mlvl_points.append( + self.get_points_single(featmap_sizes[i], self.strides[i], + dtype, device)) + return mlvl_points + + def get_points_single(self, featmap_size, stride, dtype, device): + h, w = featmap_size + x_range = torch.arange( + 0, w * stride, stride, dtype=dtype, device=device) + y_range = torch.arange( + 0, h * stride, stride, dtype=dtype, device=device) + y, x = torch.meshgrid(y_range, x_range) + points = torch.stack( + (x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2 + return points + + def fcos_target(self, points, gt_bboxes_list, gt_labels_list): + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + # get labels and bbox_targets of each image + labels_list, bbox_targets_list = multi_apply( + self.fcos_target_single, + gt_bboxes_list, + gt_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges) + + # split to per img, per level + num_points = [center.size(0) for center in points] + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + concat_lvl_bbox_targets.append( + torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list])) + return concat_lvl_labels, concat_lvl_bbox_targets + + def fcos_target_single(self, gt_bboxes, gt_labels, points, regress_ranges): + num_points = points.size(0) + num_gts = gt_labels.size(0) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + max_regress_distance >= regress_ranges[..., 0]) & ( + max_regress_distance <= regress_ranges[..., 1]) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = 0 + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + return labels, bbox_targets + + def centerness_target(self, pos_bbox_targets): + # only calculate pos centerness 
targets, otherwise the result may contain NaN + left_right = pos_bbox_targets[:, [0, 2]] + top_bottom = pos_bbox_targets[:, [1, 3]] + centerness_targets = ( + left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness_targets) diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py index d3653ec903cdc4b615678ba1391da3a1ae7755a6..824833d8823d924c75ebe68c3ffca4c80270a981 100644 --- a/mmdet/models/detectors/__init__.py +++ b/mmdet/models/detectors/__init__.py @@ -8,9 +8,10 @@ from .mask_rcnn import MaskRCNN from .cascade_rcnn import CascadeRCNN from .htc import HybridTaskCascade from .retinanet import RetinaNet +from .fcos import FCOS __all__ = [ 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', - 'RetinaNet' + 'RetinaNet', 'FCOS' ] diff --git a/mmdet/models/detectors/fcos.py b/mmdet/models/detectors/fcos.py new file mode 100644 index 0000000000000000000000000000000000000000..4c3dce1b164c750f58f9fb2fc0e836a1dcc0e8d1 --- /dev/null +++ b/mmdet/models/detectors/fcos.py @@ -0,0 +1,16 @@ +from .single_stage import SingleStageDetector +from ..registry import DETECTORS + + +@DETECTORS.register_module +class FCOS(SingleStageDetector): + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(FCOS, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py index 7b33b69de7de8432c7b1e40ec737b1cdc63a28e8..6b8c862a223144566a748b8fd584a64e9274b85d 100644 --- a/mmdet/models/necks/fpn.py +++ b/mmdet/models/necks/fpn.py @@ -2,8 +2,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init -from ..utils import ConvModule from ..registry import NECKS +from ..utils import ConvModule @NECKS.register_module @@ -17,6 +17,7 @@ class FPN(nn.Module): end_level=-1, add_extra_convs=False, extra_convs_on_inputs=True, + relu_before_extra_convs=False, conv_cfg=None, norm_cfg=None, activation=None): @@ -27,6 +28,7 @@ class FPN(nn.Module): self.num_ins = len(in_channels) self.num_outs = num_outs self.activation = activation + self.relu_before_extra_convs = relu_before_extra_convs if end_level == -1: self.backbone_end_level = self.num_ins @@ -127,6 +129,8 @@ class FPN(nn.Module): else: outs.append(self.fpn_convs[used_backbone_levels](outs[-1])) for i in range(used_backbone_levels + 1, self.num_outs): - # BUG: we should add relu before each extra conv - outs.append(self.fpn_convs[i](outs[-1])) + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) return tuple(outs) diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py index c517b07aef65c74b9f2853cf96c388aac1531d79..f9215c5ade9757b812697d85704188d5a593e0d9 100644 --- a/mmdet/models/utils/__init__.py +++ b/mmdet/models/utils/__init__.py @@ -1,11 +1,12 @@ from .conv_ws import conv_ws_2d, ConvWS2d from .conv_module import build_conv_layer, ConvModule from .norm import build_norm_layer +from .scale import Scale from .weight_init import (xavier_init, normal_init, uniform_init, kaiming_init, bias_init_with_prob) __all__ = [ 'conv_ws_2d', 'ConvWS2d', 'build_conv_layer', 'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init', 'uniform_init', - 'kaiming_init',
'bias_init_with_prob', 'Scale' ] diff --git a/mmdet/models/utils/scale.py b/mmdet/models/utils/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..68c37cd4e99c50341869f8fc1efa85f94f016adb --- /dev/null +++ b/mmdet/models/utils/scale.py @@ -0,0 +1,13 @@ +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """A learnable scale parameter.""" + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale
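To make the target geometry in `fcos_head.py` concrete, the following standalone sketch re-states `distance2bbox` and the centerness formula from this diff and runs them on one hand-picked point; the names and numbers are illustrative only:

```python
import torch

def distance2bbox(points, distance):
    # Decode (left, top, right, bottom) distances at each point back to a box,
    # mirroring mmdet.core.bbox.transforms.distance2bbox (without clamping).
    x1 = points[:, 0] - distance[:, 0]
    y1 = points[:, 1] - distance[:, 1]
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    return torch.stack([x1, y1, x2, y2], -1)

def centerness(bbox_targets):
    # sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))),
    # as in FCOSHead.centerness_target above.
    left_right = bbox_targets[:, [0, 2]]
    top_bottom = bbox_targets[:, [1, 3]]
    return torch.sqrt(
        (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) *
        (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]))

# A point at (100, 100) lying 20 px from the left/top edges and
# 60 px from the right/bottom edges of its ground-truth box.
points = torch.tensor([[100., 100.]])
dists = torch.tensor([[20., 20., 60., 60.]])
print(distance2bbox(points, dists))  # tensor([[ 80.,  80., 160., 160.]])
print(centerness(dists))             # sqrt((20/60) * (20/60)) = 1/3
```

The off-center point gets centerness 1/3 rather than 1, which both down-weights its IoU loss during training and, via the `score_factors` argument of `multiclass_nms`, suppresses its classification score at test time.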