Commit 8d38fd8c authored by Cao Yuhang, committed by Kai Chen

Reimplement "FCOS: Fully Convolutional One-Stage Object Detection" (#586)

* add fcos

* use P5 instead of C5

* add relu before extra convs in FPN

* add singleclass_nms, use caffe2 lr

* fix log interval

* use caffe2-style init and relu in extra layers

* fix scale layer, use p5 instead of c5

* fix fcos target

* refactor code

* delete useless file

* clean

* refactor code

* change num_classes to cls_out_channels

* fix bug in get_bboxes

* fix bug in test

* add r101 2x cfg

* use value mode for multi-scale training, add x101-64x4d cfg

* add more comments and rename some variables

* rename centers to points, modify docstring of distance2bbox

* add fcos detector, replace frozen with requires_grad

* add README.md

* add r101-1x performance, rename cfg, add detector FCOS

* update fcos r50 2x performance, remove fpn caffe2 initialize

* fix flake8 error

* rename cfg

* fix grammar errors in some comments

* minor comment fixes

* change work_dir to be consistent with config name

* add FCOS support in README
parent 981ee2f8
Showing changed files with 882 additions and 26 deletions
@@ -92,6 +92,7 @@ Results and models are available in the [Model zoo](MODEL_ZOO.md).
| SSD | ✗ | ✗ | ✗ | ✓ |
| RetinaNet | ✓ | ✓ | ☐ | ✗ |
| Hybrid Task Cascade | ✓ | ✓ | ☐ | ✗ |
+| FCOS | ✓ | ✓ | ☐ | ✗ |
Other features
- [x] DCNv2
......
# FCOS: Fully Convolutional One-Stage Object Detection
## Introduction
```
@article{tian2019fcos,
title={FCOS: Fully Convolutional One-Stage Object Detection},
author={Tian, Zhi and Shen, Chunhua and Chen, Hao and He, Tong},
journal={arXiv preprint arXiv:1904.01355},
year={2019}
}
```
## Results and Models
| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download |
|:---------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:--------:|
| R-50-FPN | caffe | 1x | 6.9 | 0.396 | 13.6 | 36.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_1x-9f253a93.pth) |
| R-50-FPN | caffe | 2x | - | - | - | 38.7 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r50_fpn_2x-f7329d80.pth) |
| R-101-FPN | caffe | 1x | 10.4 | 0.558 | 11.6 | 39.1 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_1x-e4889733.pth) |
| R-101-FPN | caffe | 2x | - | - | - | 40.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_r101_fpn_2x-42e6f62d.pth) |
| X-101-64x4d-FPN | caffe |2x | 9.7 | 0.892 | 7.0 | 42.8 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/fcos/fcos_x101_64x4d_fpn_2x-a36c0872.pth) |
**Notes:**
- To be consistent with the author's implementation, we use 4 GPUs with 4 images/GPU for the R-50 and R-101 models, and 8 GPUs with 2 images/GPU for the X-101 models.
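
Each config below sets the per-GPU batch size via `imgs_per_gpu`, so both setups train with an effective batch size of 16 (4 GPUs × 4 imgs/GPU, or 8 GPUs × 2 imgs/GPU) at lr=0.01. A minimal sketch for sanity-checking this with mmcv; the config filename is an assumption inferred from the `work_dir` values in the configs below:

```python
# Sketch: load one of the new configs and check the batch/LR settings.
from mmcv import Config

cfg = Config.fromfile('configs/fcos_r50_caffe_fpn_gn_1x_4gpu.py')  # assumed path
print(cfg.data.imgs_per_gpu)     # 4 -> 4 GPUs x 4 imgs/GPU = batch size 16
print(cfg.optimizer.lr)          # 0.01, the caffe2-style LR for batch size 16
print(cfg.model.bbox_head.type)  # 'FCOSHead'
```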
# model settings
model = dict(
type='FCOS',
pretrained='open-mmlab://resnet101_caffe',
backbone=dict(
type='ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
style='caffe'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
add_extra_convs=True,
extra_convs_on_inputs=False, # use P5
num_outs=5,
relu_before_extra_convs=True),
bbox_head=dict(
type='FCOSHead',
num_classes=81,
in_channels=256,
stacked_convs=4,
feat_channels=256,
strides=[8, 16, 32, 64, 128]))
# training and testing settings
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.4,
min_pos_iou=0,
ignore_iof_thr=-1),
smoothl1_beta=0.11,
gamma=2.0,
alpha=0.25,
allowed_border=-1,
pos_weight=-1,
debug=False)
test_cfg = dict(
nms_pre=1000,
min_bbox_size=0,
score_thr=0.05,
nms=dict(type='nms', iou_thr=0.5),
max_per_img=100)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
data = dict(
imgs_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=[(1333, 640), (1333, 800)],
multiscale_mode='value',
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD',
lr=0.01,
momentum=0.9,
weight_decay=0.0001,
paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='constant',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[16, 22])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
device_ids = range(4)
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/fcos_mstrain_640_800_r101_caffe_fpn_gn_2x_4gpu'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='FCOS',
pretrained='open-mmlab://resnext101_64x4d',
backbone=dict(
type='ResNeXt',
depth=101,
groups=64,
base_width=4,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
add_extra_convs=True,
extra_convs_on_inputs=False, # use P5
num_outs=5,
relu_before_extra_convs=True),
bbox_head=dict(
type='FCOSHead',
num_classes=81,
in_channels=256,
stacked_convs=4,
feat_channels=256,
strides=[8, 16, 32, 64, 128]))
# training and testing settings
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.4,
min_pos_iou=0,
ignore_iof_thr=-1),
smoothl1_beta=0.11,
gamma=2.0,
alpha=0.25,
allowed_border=-1,
pos_weight=-1,
debug=False)
test_cfg = dict(
nms_pre=1000,
min_bbox_size=0,
score_thr=0.05,
nms=dict(type='nms', iou_thr=0.5),
max_per_img=100)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
data = dict(
imgs_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=[(1333, 640), (1333, 800)],
multiscale_mode='value',
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD',
lr=0.01,
momentum=0.9,
weight_decay=0.0001,
paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='constant',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[16, 22])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 24
device_ids = range(8)
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/fcos_mstrain_640_800_x101_64x4d_fpn_gn_2x'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='FCOS',
pretrained='open-mmlab://resnet50_caffe',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
style='caffe'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
start_level=1,
add_extra_convs=True,
extra_convs_on_inputs=False, # use P5
num_outs=5,
relu_before_extra_convs=True),
bbox_head=dict(
type='FCOSHead',
num_classes=81,
in_channels=256,
stacked_convs=4,
feat_channels=256,
strides=[8, 16, 32, 64, 128]))
# training and testing settings
train_cfg = dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.4,
min_pos_iou=0,
ignore_iof_thr=-1),
smoothl1_beta=0.11,
gamma=2.0,
alpha=0.25,
allowed_border=-1,
pos_weight=-1,
debug=False)
test_cfg = dict(
nms_pre=1000,
min_bbox_size=0,
score_thr=0.05,
nms=dict(type='nms', iou_thr=0.5),
max_per_img=100)
# dataset settings
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
data = dict(
imgs_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0.5,
with_mask=False,
with_crowd=False,
with_label=True),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=True),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
img_scale=(1333, 800),
img_norm_cfg=img_norm_cfg,
size_divisor=32,
flip_ratio=0,
with_mask=False,
with_crowd=False,
with_label=False,
test_mode=True))
# optimizer
optimizer = dict(
type='SGD',
lr=0.01,
momentum=0.9,
weight_decay=0.0001,
paramwise_options=dict(bias_lr_mult=2., bias_decay_mult=0.))
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='constant',
warmup_iters=500,
warmup_ratio=1.0 / 3,
step=[8, 11])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 12
device_ids = range(4)
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/fcos_r50_caffe_fpn_gn_1x_4gpu'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -39,9 +39,8 @@ def batch_processor(model, data, train_mode):
losses = model(**data)
loss, log_vars = parse_losses(losses)
-    outputs = dict(loss=loss,
-                   log_vars=log_vars,
-                   num_samples=len(data['img'].data))
+    outputs = dict(
+        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
return outputs
@@ -135,10 +134,11 @@ def build_optimizer(model, optimizer_cfg):
def _dist_train(model, dataset, cfg, validate=False):
# prepare data loaders
data_loaders = [
-        build_dataloader(dataset,
-                         cfg.data.imgs_per_gpu,
-                         cfg.data.workers_per_gpu,
-                         dist=True)
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            dist=True)
]
# put model on gpus
model = MMDistributedDataParallel(model.cuda())
@@ -174,11 +174,12 @@ def _dist_train(model, dataset, cfg, validate=False):
def _non_dist_train(model, dataset, cfg, validate=False):
# prepare data loaders
data_loaders = [
-        build_dataloader(dataset,
-                         cfg.data.imgs_per_gpu,
-                         cfg.data.workers_per_gpu,
-                         cfg.gpus,
-                         dist=False)
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            cfg.gpus,
+            dist=False)
]
# put model on gpus
model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
......
@@ -5,7 +5,8 @@ from .samplers import (BaseSampler, PseudoSampler, RandomSampler,
CombinedSampler, SamplingResult)
from .assign_sampling import build_assigner, build_sampler, assign_and_sample
from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping,
-                          bbox_mapping_back, bbox2roi, roi2bbox, bbox2result)
+                          bbox_mapping_back, bbox2roi, roi2bbox, bbox2result,
+                          distance2bbox)
from .bbox_target import bbox_target
__all__ = [
@@ -14,5 +15,6 @@ __all__ = [
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'SamplingResult', 'build_assigner', 'build_sampler', 'assign_and_sample',
'bbox2delta', 'delta2bbox', 'bbox_flip', 'bbox_mapping',
-    'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result', 'bbox_target'
+    'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result',
+    'distance2bbox', 'bbox_target'
]
@@ -154,3 +154,27 @@ def bbox2result(bboxes, labels, num_classes):
bboxes = bboxes.cpu().numpy()
labels = labels.cpu().numpy()
return [bboxes[labels == i, :] for i in range(num_classes - 1)]
def distance2bbox(points, distance, max_shape=None):
"""Decode distance prediction to bounding box.
Args:
points (Tensor): Shape (n, 2), [x, y].
distance (Tensor): Distance from the given point to 4
boundaries (left, top, right, bottom).
max_shape (tuple): Shape of the image.
Returns:
Tensor: Decoded bboxes.
"""
x1 = points[:, 0] - distance[:, 0]
y1 = points[:, 1] - distance[:, 1]
x2 = points[:, 0] + distance[:, 2]
y2 = points[:, 1] + distance[:, 3]
if max_shape is not None:
x1 = x1.clamp(min=0, max=max_shape[1] - 1)
y1 = y1.clamp(min=0, max=max_shape[0] - 1)
x2 = x2.clamp(min=0, max=max_shape[1] - 1)
y2 = y2.clamp(min=0, max=max_shape[0] - 1)
return torch.stack([x1, y1, x2, y2], -1)
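
The decoding is simply `[x - l, y - t, x + r, y + b]`, with optional clamping to the image bounds. A standalone usage sketch, assuming only the function above:

```python
# Usage sketch for distance2bbox.
import torch

points = torch.tensor([[50., 60.]])              # (n, 2), [x, y]
distance = torch.tensor([[10., 20., 30., 40.]])  # (n, 4), (left, top, right, bottom)

print(distance2bbox(points, distance))
# tensor([[ 40.,  40.,  80., 100.]])

# With max_shape=(100, 100) as (h, w), coordinates are clamped to the image:
print(distance2bbox(points, distance, max_shape=(100, 100)))
# tensor([[40., 40., 80., 99.]])
```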
from .losses import (
weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy,
sigmoid_focal_loss, py_sigmoid_focal_loss, weighted_sigmoid_focal_loss,
-    mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy)
+    mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy, iou_loss)
__all__ = [
'weighted_nll_loss', 'weighted_cross_entropy',
'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
'py_sigmoid_focal_loss', 'weighted_sigmoid_focal_loss',
-    'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy'
+    'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy',
+    'iou_loss'
]
@@ -2,6 +2,7 @@
import torch
import torch.nn.functional as F
from ..bbox import bbox_overlaps
from ...ops import sigmoid_focal_loss
@@ -127,3 +128,16 @@ def _expand_binary_labels(labels, label_weights, label_channels):
bin_label_weights = label_weights.view(-1, 1).expand(
label_weights.size(0), label_channels)
return bin_labels, bin_label_weights
def iou_loss(pred_bboxes, target_bboxes, reduction='mean'):
ious = bbox_overlaps(pred_bboxes, target_bboxes, is_aligned=True)
loss = -ious.log()
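    # F._Reduction.get_enum maps 'none' / 'mean' / 'sum' to 0 / 1 / 2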
reduction_enum = F._Reduction.get_enum(reduction)
if reduction_enum == 0:
return loss
elif reduction_enum == 1:
return loss.mean()
elif reduction_enum == 2:
return loss.sum()
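
Since `iou_loss` is `-log(IoU)` over aligned box pairs, identical boxes give exactly zero loss whatever box-area convention `bbox_overlaps` uses. A small sanity-check sketch, assuming the definition above:

```python
# Usage sketch for iou_loss.
import torch

pred = torch.tensor([[0., 0., 10., 10.]])
target = torch.tensor([[0., 0., 10., 10.]])

# Identical boxes: IoU = 1, so loss = -log(1) = 0.
print(iou_loss(pred, target))                    # tensor(0.), 'mean' reduction
print(iou_loss(pred, target, reduction='none'))  # tensor([0.]), per-pair losses
```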
@@ -3,7 +3,12 @@ import torch
from mmdet.ops.nms import nms_wrapper
-def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1):
+def multiclass_nms(multi_bboxes,
+                   multi_scores,
+                   score_thr,
+                   nms_cfg,
+                   max_num=-1,
+                   score_factors=None):
"""NMS for multi-class bboxes.
Args:
@@ -14,6 +19,8 @@ def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1):
nms_thr (float): NMS IoU threshold
max_num (int): if there are more than max_num bboxes after NMS,
only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to the scores before
            applying NMS.
Returns:
tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels
@@ -34,10 +41,13 @@ def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_cfg, max_num=-1):
else:
_bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4]
_scores = multi_scores[cls_inds, i]
if score_factors is not None:
_scores *= score_factors[cls_inds]
cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1)
cls_dets, _ = nms_op(cls_dets, **nms_cfg_)
-        cls_labels = multi_bboxes.new_full(
-            (cls_dets.shape[0], ), i - 1, dtype=torch.long)
+        cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ),
+                                           i - 1,
+                                           dtype=torch.long)
bboxes.append(cls_dets)
labels.append(cls_labels)
if bboxes:
......
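
FCOS passes its centerness predictions as `score_factors`, so classification scores of off-center locations are down-weighted before NMS. A plain-tensor sketch of just the weighting step (the NMS op itself is omitted):

```python
# Sketch of the score_factors weighting done per class inside multiclass_nms:
#     _scores *= score_factors[cls_inds]
import torch

multi_scores = torch.tensor([[0.1, 0.9, 0.3],
                             [0.8, 0.2, 0.4]])  # (n, num_classes)
score_factors = torch.tensor([0.5, 1.0])        # (n,), e.g. centerness

print(multi_scores * score_factors[:, None])
# tensor([[0.0500, 0.4500, 0.1500],
#         [0.8000, 0.2000, 0.4000]])
```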
from .anchor_head import AnchorHead
-from .rpn_head import RPNHead
+from .fcos_head import FCOSHead
from .retina_head import RetinaHead
+from .rpn_head import RPNHead
from .ssd_head import SSDHead
-__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead']
+__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead', 'FCOSHead']
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import normal_init
from mmdet.core import (sigmoid_focal_loss, iou_loss, multi_apply,
multiclass_nms, distance2bbox)
from ..registry import HEADS
from ..utils import bias_init_with_prob, Scale, ConvModule
INF = 1e8
@HEADS.register_module
class FCOSHead(nn.Module):
def __init__(self,
num_classes,
in_channels,
feat_channels=256,
stacked_convs=4,
strides=(4, 8, 16, 32, 64),
regress_ranges=((-1, 64), (64, 128), (128, 256), (256, 512),
(512, INF)),
conv_cfg=None,
norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)):
super(FCOSHead, self).__init__()
self.num_classes = num_classes
self.cls_out_channels = num_classes - 1
self.in_channels = in_channels
self.feat_channels = feat_channels
self.stacked_convs = stacked_convs
self.strides = strides
self.regress_ranges = regress_ranges
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self._init_layers()
def _init_layers(self):
self.cls_convs = nn.ModuleList()
self.reg_convs = nn.ModuleList()
for i in range(self.stacked_convs):
chn = self.in_channels if i == 0 else self.feat_channels
self.cls_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.reg_convs.append(
ConvModule(
chn,
self.feat_channels,
3,
stride=1,
padding=1,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
bias=self.norm_cfg is None))
self.fcos_cls = nn.Conv2d(
self.feat_channels, self.cls_out_channels, 3, padding=1)
self.fcos_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
self.fcos_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1)
self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides])
def init_weights(self):
for m in self.cls_convs:
normal_init(m.conv, std=0.01)
for m in self.reg_convs:
normal_init(m.conv, std=0.01)
bias_cls = bias_init_with_prob(0.01)
normal_init(self.fcos_cls, std=0.01, bias=bias_cls)
normal_init(self.fcos_reg, std=0.01)
normal_init(self.fcos_centerness, std=0.01)
def forward(self, feats):
return multi_apply(self.forward_single, feats, self.scales)
def forward_single(self, x, scale):
cls_feat = x
reg_feat = x
for cls_layer in self.cls_convs:
cls_feat = cls_layer(cls_feat)
cls_score = self.fcos_cls(cls_feat)
centerness = self.fcos_centerness(cls_feat)
for reg_layer in self.reg_convs:
reg_feat = reg_layer(reg_feat)
        # scale the bbox_pred of each level with a learnable Scale
bbox_pred = scale(self.fcos_reg(reg_feat)).exp()
return cls_score, bbox_pred, centerness
def loss(self,
cls_scores,
bbox_preds,
centernesses,
gt_bboxes,
gt_labels,
img_metas,
cfg,
gt_bboxes_ignore=None):
assert len(cls_scores) == len(bbox_preds) == len(centernesses)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
labels, bbox_targets = self.fcos_target(all_level_points, gt_bboxes,
gt_labels)
num_imgs = cls_scores[0].size(0)
# flatten cls_scores, bbox_preds and centerness
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels)
for cls_score in cls_scores
]
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
for bbox_pred in bbox_preds
]
flatten_centerness = [
centerness.permute(0, 2, 3, 1).reshape(-1)
for centerness in centernesses
]
flatten_cls_scores = torch.cat(flatten_cls_scores)
flatten_bbox_preds = torch.cat(flatten_bbox_preds)
flatten_centerness = torch.cat(flatten_centerness)
flatten_labels = torch.cat(labels)
flatten_bbox_targets = torch.cat(bbox_targets)
# repeat points to align with bbox_preds
flatten_points = torch.cat(
[points.repeat(num_imgs, 1) for points in all_level_points])
pos_inds = flatten_labels.nonzero().reshape(-1)
num_pos = len(pos_inds)
        # divide by (num_pos + num_imgs) to avoid division by zero when
        # there are no positive samples
        loss_cls = sigmoid_focal_loss(
            flatten_cls_scores, flatten_labels, cfg.gamma, cfg.alpha,
            'none').sum()[None] / (num_pos + num_imgs)
pos_bbox_preds = flatten_bbox_preds[pos_inds]
pos_bbox_targets = flatten_bbox_targets[pos_inds]
pos_centerness = flatten_centerness[pos_inds]
pos_centerness_targets = self.centerness_target(pos_bbox_targets)
if num_pos > 0:
pos_points = flatten_points[pos_inds]
pos_decoded_bbox_preds = distance2bbox(pos_points, pos_bbox_preds)
pos_decoded_target_preds = distance2bbox(pos_points,
pos_bbox_targets)
# centerness weighted iou loss
loss_reg = ((iou_loss(
pos_decoded_bbox_preds,
pos_decoded_target_preds,
reduction='none') * pos_centerness_targets).sum() /
pos_centerness_targets.sum())[None]
loss_centerness = F.binary_cross_entropy_with_logits(
pos_centerness, pos_centerness_targets, reduction='mean')[None]
else:
loss_reg = pos_bbox_preds.sum()[None]
loss_centerness = pos_centerness.sum()[None]
return dict(
loss_cls=loss_cls,
loss_reg=loss_reg,
loss_centerness=loss_centerness)
def get_bboxes(self,
cls_scores,
bbox_preds,
centernesses,
img_metas,
cfg,
rescale=None):
assert len(cls_scores) == len(bbox_preds)
num_levels = len(cls_scores)
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype,
bbox_preds[0].device)
result_list = []
for img_id in range(len(img_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
centerness_pred_list = [
centernesses[i][img_id].detach() for i in range(num_levels)
]
img_shape = img_metas[img_id]['img_shape']
scale_factor = img_metas[img_id]['scale_factor']
det_bboxes = self.get_bboxes_single(
cls_score_list, bbox_pred_list, centerness_pred_list,
mlvl_points, img_shape, scale_factor, cfg, rescale)
result_list.append(det_bboxes)
return result_list
def get_bboxes_single(self,
cls_scores,
bbox_preds,
centernesses,
mlvl_points,
img_shape,
scale_factor,
cfg,
rescale=False):
assert len(cls_scores) == len(bbox_preds) == len(mlvl_points)
mlvl_bboxes = []
mlvl_scores = []
mlvl_centerness = []
for cls_score, bbox_pred, centerness, points in zip(
cls_scores, bbox_preds, centernesses, mlvl_points):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
scores = cls_score.permute(1, 2, 0).reshape(
-1, self.cls_out_channels).sigmoid()
centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid()
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
max_scores, _ = (scores * centerness[:, None]).max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
points = points[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
centerness = centerness[topk_inds]
bboxes = distance2bbox(points, bbox_pred, max_shape=img_shape)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_centerness.append(centerness)
mlvl_bboxes = torch.cat(mlvl_bboxes)
if rescale:
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
mlvl_scores = torch.cat(mlvl_scores)
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
mlvl_centerness = torch.cat(mlvl_centerness)
det_bboxes, det_labels = multiclass_nms(
mlvl_bboxes,
mlvl_scores,
cfg.score_thr,
cfg.nms,
cfg.max_per_img,
            score_factors=mlvl_centerness)
return det_bboxes, det_labels
def get_points(self, featmap_sizes, dtype, device):
"""Get points according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
dtype (torch.dtype): Type of points.
device (torch.device): Device of points.
Returns:
            list[Tensor]: points of each feature level.
"""
mlvl_points = []
for i in range(len(featmap_sizes)):
mlvl_points.append(
self.get_points_single(featmap_sizes[i], self.strides[i],
dtype, device))
return mlvl_points
def get_points_single(self, featmap_size, stride, dtype, device):
h, w = featmap_size
x_range = torch.arange(
0, w * stride, stride, dtype=dtype, device=device)
y_range = torch.arange(
0, h * stride, stride, dtype=dtype, device=device)
y, x = torch.meshgrid(y_range, x_range)
points = torch.stack(
(x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2
return points
def fcos_target(self, points, gt_bboxes_list, gt_labels_list):
assert len(points) == len(self.regress_ranges)
num_levels = len(points)
# expand regress ranges to align with points
expanded_regress_ranges = [
points[i].new_tensor(self.regress_ranges[i])[None].expand_as(
points[i]) for i in range(num_levels)
]
# concat all levels points and regress ranges
concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0)
concat_points = torch.cat(points, dim=0)
# get labels and bbox_targets of each image
labels_list, bbox_targets_list = multi_apply(
self.fcos_target_single,
gt_bboxes_list,
gt_labels_list,
points=concat_points,
regress_ranges=concat_regress_ranges)
# split to per img, per level
num_points = [center.size(0) for center in points]
labels_list = [labels.split(num_points, 0) for labels in labels_list]
bbox_targets_list = [
bbox_targets.split(num_points, 0)
for bbox_targets in bbox_targets_list
]
        # concat the per-level targets of all images
concat_lvl_labels = []
concat_lvl_bbox_targets = []
for i in range(num_levels):
concat_lvl_labels.append(
torch.cat([labels[i] for labels in labels_list]))
concat_lvl_bbox_targets.append(
torch.cat(
[bbox_targets[i] for bbox_targets in bbox_targets_list]))
return concat_lvl_labels, concat_lvl_bbox_targets
def fcos_target_single(self, gt_bboxes, gt_labels, points, regress_ranges):
num_points = points.size(0)
num_gts = gt_labels.size(0)
areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)
# TODO: figure out why these two are different
# areas = areas[None].expand(num_points, num_gts)
areas = areas[None].repeat(num_points, 1)
regress_ranges = regress_ranges[:, None, :].expand(
num_points, num_gts, 2)
gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
xs, ys = points[:, 0], points[:, 1]
xs = xs[:, None].expand(num_points, num_gts)
ys = ys[:, None].expand(num_points, num_gts)
left = xs - gt_bboxes[..., 0]
right = gt_bboxes[..., 2] - xs
top = ys - gt_bboxes[..., 1]
bottom = gt_bboxes[..., 3] - ys
bbox_targets = torch.stack((left, top, right, bottom), -1)
# condition1: inside a gt bbox
inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
# condition2: limit the regression range for each location
max_regress_distance = bbox_targets.max(-1)[0]
inside_regress_range = (
max_regress_distance >= regress_ranges[..., 0]) & (
max_regress_distance <= regress_ranges[..., 1])
        # if there is still more than one object for a location,
        # we choose the one with minimal area
areas[inside_gt_bbox_mask == 0] = INF
areas[inside_regress_range == 0] = INF
min_area, min_area_inds = areas.min(dim=1)
labels = gt_labels[min_area_inds]
labels[min_area == INF] = 0
bbox_targets = bbox_targets[range(num_points), min_area_inds]
return labels, bbox_targets
def centerness_target(self, pos_bbox_targets):
# only calculate pos centerness targets, otherwise there may be nan
left_right = pos_bbox_targets[:, [0, 2]]
top_bottom = pos_bbox_targets[:, [1, 3]]
centerness_targets = (
left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
return torch.sqrt(centerness_targets)
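
Two numeric sketches of the head internals above, written as standalone tensor code so they run without constructing a `FCOSHead`: `get_points_single` places one point per stride-sized cell, offset by `stride // 2`, and `centerness_target` is `sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))`:

```python
# Standalone sketches mirroring get_points_single and centerness_target.
import torch

# Points for a 2x3 feature map at stride 8: one point near each cell center.
h, w, stride = 2, 3, 8
x_range = torch.arange(0, w * stride, stride, dtype=torch.float32)
y_range = torch.arange(0, h * stride, stride, dtype=torch.float32)
y, x = torch.meshgrid(y_range, x_range)
points = torch.stack((x.reshape(-1), y.reshape(-1)), dim=-1) + stride // 2
print(points)
# tensor([[ 4.,  4.], [12.,  4.], [20.,  4.],
#         [ 4., 12.], [12., 12.], [20., 12.]])

# Centerness for a location with (l, t, r, b) = (4, 2, 8, 6):
# sqrt((4 / 8) * (2 / 6)) ~= 0.408; a perfectly centered point gives 1.0.
lrtb = torch.tensor([[4., 2., 8., 6.]])
lr, tb = lrtb[:, [0, 2]], lrtb[:, [1, 3]]
print(torch.sqrt(
    (lr.min(-1)[0] / lr.max(-1)[0]) * (tb.min(-1)[0] / tb.max(-1)[0])))
# tensor([0.4082])
```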
@@ -8,9 +8,10 @@ from .mask_rcnn import MaskRCNN
from .cascade_rcnn import CascadeRCNN
from .htc import HybridTaskCascade
from .retinanet import RetinaNet
from .fcos import FCOS
__all__ = [
'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade',
-    'RetinaNet'
+    'RetinaNet', 'FCOS'
]
from .single_stage import SingleStageDetector
from ..registry import DETECTORS
@DETECTORS.register_module
class FCOS(SingleStageDetector):
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(FCOS, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
@@ -2,8 +2,8 @@ import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
-from ..utils import ConvModule
from ..registry import NECKS
+from ..utils import ConvModule
@NECKS.register_module
@@ -17,6 +17,7 @@ class FPN(nn.Module):
end_level=-1,
add_extra_convs=False,
extra_convs_on_inputs=True,
relu_before_extra_convs=False,
conv_cfg=None,
norm_cfg=None,
activation=None):
@@ -27,6 +28,7 @@
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.activation = activation
self.relu_before_extra_convs = relu_before_extra_convs
if end_level == -1:
self.backbone_end_level = self.num_ins
@@ -127,6 +129,8 @@
else:
outs.append(self.fpn_convs[used_backbone_levels](outs[-1]))
for i in range(used_backbone_levels + 1, self.num_outs):
-                    # BUG: we should add relu before each extra conv
-                    outs.append(self.fpn_convs[i](outs[-1]))
+                    if self.relu_before_extra_convs:
+                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+                    else:
+                        outs.append(self.fpn_convs[i](outs[-1]))
return tuple(outs)
from .conv_ws import conv_ws_2d, ConvWS2d
from .conv_module import build_conv_layer, ConvModule
from .norm import build_norm_layer
from .scale import Scale
from .weight_init import (xavier_init, normal_init, uniform_init, kaiming_init,
bias_init_with_prob)
__all__ = [
'conv_ws_2d', 'ConvWS2d', 'build_conv_layer', 'ConvModule',
'build_norm_layer', 'xavier_init', 'normal_init', 'uniform_init',
-    'kaiming_init', 'bias_init_with_prob'
+    'kaiming_init', 'bias_init_with_prob', 'Scale'
]
import torch
import torch.nn as nn
class Scale(nn.Module):
def __init__(self, scale=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
def forward(self, x):
return x * self.scale
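
`Scale` is just a single learnable scalar multiplier; `FCOSHead` allocates one per FPN level so the shared regression branch can adapt its output range to each stride. A usage sketch, assuming the class above:

```python
# Usage sketch for Scale.
import torch

scales = [Scale(1.0) for _ in (8, 16, 32, 64, 128)]  # one per FPN stride
bbox_pred = torch.randn(1, 4, 5, 5)                  # fake regression map
out = scales[0](bbox_pred)                           # elementwise x * scale
print(out.shape)  # torch.Size([1, 4, 5, 5])
```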