diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md index 6677abbad556d45df2d682b3b71f3075e502e69c..ca84069e404a1d443e99f13844830fd28605d609 100644 --- a/MODEL_ZOO.md +++ b/MODEL_ZOO.md @@ -154,6 +154,10 @@ We released RPN, Faster R-CNN and Mask R-CNN models in the first version. More m - The `20e` schedule in Cascade (Mask) R-CNN indicates decreasing the lr at 16 and 19 epochs, with a total of 20 epochs. - Cascade Mask R-CNN with X-101-64x4d-FPN was trained using 16 GPU with a batch size of 16 (1 images per GPU). +### Hybrid Task Cascade (HTC) + +Please refer to [HTC](configs/htc/README.md) for details. + ### SSD | Backbone | Size | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | Download | diff --git a/README.md b/README.md index aeb674d1d4741b1d2e4683b24e6aaf3e6b04675b..a6b40e4999871a77c3725ae620a55e306d51683a 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ Results and models are available in the [Model zoo](MODEL_ZOO.md). | Cascade Mask R-CNN | ✓ | ✓ | ☠| ✗ | | SSD | ✗ | ✗ | ✗ | ✓ | | RetinaNet | ✓ | ✓ | ☠| ✗ | +| Hybrid Task Cascade| ✓ | ✓ | ☠| ✗ | Other features - [x] DCNv2 diff --git a/configs/htc/README.md b/configs/htc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3a03d8164d44ac175c23cfdf7697fd6d8868884d --- /dev/null +++ b/configs/htc/README.md @@ -0,0 +1,55 @@ +# Hybrid Task Cascade for Instance Segmentation + +## Introduction + +We provide config files to reproduce the results in the CVPR 2019 paper for [Hybrid Task Cascade](https://arxiv.org/abs/1901.07518). + +``` +@inproceedings{chen2019hybrid, + title={Hybrid task cascade for instance segmentation}, + author={Chen, Kai and Pang, Jiangmiao and Wang, Jiaqi and Xiong, Yu and Li, Xiaoxiao and Sun, Shuyang and Feng, Wansen and Liu, Ziwei and Shi, Jianping and Ouyang, Wanli and Chen Change Loy and Dahua Lin}, + booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, + year={2019} +} +``` + +## Dataset + +HTC requires COCO and COCO-stuff dataset for training. You need to download and extract it in the COCO dataset path. +The directory should be like this. + +``` +mmdetection +├── mmdet +├── tools +├── configs +├── data +│ ├── coco +│ │ ├── annotations +│ │ ├── train2017 +│ │ ├── val2017 +│ │ ├── test2017 +| | ├── stuffthingmaps +``` + +## Results and Models + +The results on COCO 2017val is shown in the below table. (results on test-dev are usually slightly higher than val) + +| Backbone | Style | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +|:---------:|:-------:|:-------:|:--------:|:-------------------:|:--------------:|:------:|:-------:|:--------:| +| R-50-FPN | pytorch | 1x | | | | 42.2 | 37.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_1x_20190408-878c1712.pth) | +| R-50-FPN | pytorch | 20e | | | | 43.2 | 38.0 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r50_fpn_20e_20190408-c03b7015.pth) | +| R-101-FPN | pytorch | 20e | | | | 44.9 | 39.4 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_r101_fpn_20e_20190408-a2e586db.pth) | +| X-101-32x4d-FPN | pytorch |20e| | | | 46.1 | 40.3 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_32x4d_fpn_20e_20190408-9eae4d0b.pth) | +| X-101-64x4d-FPN | pytorch |20e| | | | 47.0 | 40.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_x101_64x4d_fpn_20e_20190408-497f2561.pth) | + +- In the HTC paper and COCO 2018 Challenge, `score_thr` is set to 0.001 for both baselines and HTC. +- We use 8 GPUs with 2 images/GPU for R-50 and R-101 models, and 16 GPUs with 1 image/GPU for X-101 models. +If you would like to train X-101 HTC with 8 GPUs, you need to change the lr from 0.02 to 0.01. + +We also provide a powerful HTC with DCN and multi-scale training model. No testing augmentation is used. + +| Backbone | Style | DCN | training scales | Lr schd | box AP | mask AP | Download | +|:----------------:|:-------:|:-----:|:---------------:|:-------:|:------:|:-------:|:--------:| +| X-101-64x4d-FPN | pytorch | c3-c5 | 400~1400 | 20e | 50.7 | 43.9 | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e_20190408-0e50669c.pth) | \ No newline at end of file diff --git a/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py b/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..2e322af60c110c99717133772cc63e6011ad5c27 --- /dev/null +++ b/configs/htc/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e.py @@ -0,0 +1,256 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='open-mmlab://resnext101_64x4d', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + dcn=dict( + modulated=False, + groups=64, + deformable_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=[(1600, 400), (1600, 1400)], + multiscale_mode='range', + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 19]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_dconv_c3-c5_mstrain_400_1400_x101_64x4d_fpn_20e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_r101_fpn_20e.py b/configs/htc/htc_r101_fpn_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..29ba42f795b55cbc5493468f4cab3d03508df959 --- /dev/null +++ b/configs/htc/htc_r101_fpn_20e.py @@ -0,0 +1,247 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='modelzoo://resnet101', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 19]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_r101_fpn_20e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_r50_fpn_1x.py b/configs/htc/htc_r50_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..ca612b413a2b31f7df8d4588a7f0ab9344d220c5 --- /dev/null +++ b/configs/htc/htc_r50_fpn_1x.py @@ -0,0 +1,247 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='modelzoo://resnet50', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_r50_fpn_20e.py b/configs/htc/htc_r50_fpn_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..4087d0b12912315a582745f9bf84ba691d957c8f --- /dev/null +++ b/configs/htc/htc_r50_fpn_20e.py @@ -0,0 +1,247 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='modelzoo://resnet50', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 19]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_r50_fpn_20e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_without_semantic_r50_fpn_1x.py b/configs/htc/htc_without_semantic_r50_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..94421cfbbd5b3a6bb222005b45389ebc01bcd5ce --- /dev/null +++ b/configs/htc/htc_without_semantic_r50_fpn_1x.py @@ -0,0 +1,229 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='modelzoo://resnet50', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_without_semantic_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py b/configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..97bd96c53934455779bc42575ef2106905f503ef --- /dev/null +++ b/configs/htc/htc_x101_32x4d_fpn_20e_16gpu.py @@ -0,0 +1,249 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='open-mmlab://resnext101_32x4d', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 19]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_x101_32x4d_fpn_20e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/htc/htc_x101_64x4d_fpn_20e_16gpu.py b/configs/htc/htc_x101_64x4d_fpn_20e_16gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..df502876b1afe3068d7b4d4c0827aa2834ed4947 --- /dev/null +++ b/configs/htc/htc_x101_64x4d_fpn_20e_16gpu.py @@ -0,0 +1,249 @@ +# model settings +model = dict( + type='HybridTaskCascade', + num_stages=3, + pretrained='open-mmlab://resnext101_64x4d', + interleaved=True, + mask_info_flow=True, + backbone=dict( + type='ResNeXt', + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + use_sigmoid_cls=True), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1], + reg_class_agnostic=True), + dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067], + reg_class_agnostic=True) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81), + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2)) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + smoothl1_beta=1 / 9.0, + debug=False), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ], + stage_loss_weights=[1, 0.5, 0.25]) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5), + keep_all_stages=False) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + seg_prefix=data_root + 'stuffthingmaps/train2017/', + seg_scale_factor=1 / 8, + with_mask=True, + with_crowd=True, + with_label=True, + with_semantic_seg=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 19]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 20 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/htc_x101_64x4d_fpn_20e' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmdet/datasets/custom.py b/mmdet/datasets/custom.py index 38abe05889cc73ba8eaa9ee9e9e5c51167581f69..9bf4731184b221bb6a20984236311ac158d04676 100644 --- a/mmdet/datasets/custom.py +++ b/mmdet/datasets/custom.py @@ -6,7 +6,7 @@ from mmcv.parallel import DataContainer as DC from torch.utils.data import Dataset from .transforms import (ImageTransform, BboxTransform, MaskTransform, - Numpy2Tensor) + SegMapTransform, Numpy2Tensor) from .utils import to_tensor, random_scale from .extra_aug import ExtraAugmentation @@ -48,6 +48,9 @@ class CustomDataset(Dataset): with_mask=True, with_crowd=True, with_label=True, + with_semantic_seg=False, + seg_prefix=None, + seg_scale_factor=1, extra_aug=None, resize_keep_ratio=True, test_mode=False): @@ -94,6 +97,12 @@ class CustomDataset(Dataset): self.with_crowd = with_crowd # with label is False for RPN self.with_label = with_label + # with semantic segmentation (stuff) annotation or not + self.with_seg = with_semantic_seg + # prefix of semantic segmentation map path + self.seg_prefix = seg_prefix + # rescale factor for segmentation maps + self.seg_scale_factor = seg_scale_factor # in test mode or not self.test_mode = test_mode @@ -105,6 +114,7 @@ class CustomDataset(Dataset): size_divisor=self.size_divisor, **self.img_norm_cfg) self.bbox_transform = BboxTransform() self.mask_transform = MaskTransform() + self.seg_transform = SegMapTransform(self.size_divisor) self.numpy2tensor = Numpy2Tensor() # if use extra augmentation @@ -206,6 +216,15 @@ class CustomDataset(Dataset): img, img_shape, pad_shape, scale_factor = self.img_transform( img, img_scale, flip, keep_ratio=self.resize_keep_ratio) img = img.copy() + if self.with_seg: + gt_seg = mmcv.imread( + osp.join(self.seg_prefix, img_info['file_name'].replace( + 'jpg', 'png')), + flag='unchanged') + gt_seg = self.seg_transform(gt_seg.squeeze(), img_scale, flip) + gt_seg = mmcv.imrescale( + gt_seg, self.seg_scale_factor, interpolation='nearest') + gt_seg = gt_seg[None, ...] if self.proposals is not None: proposals = self.bbox_transform(proposals, img_shape, scale_factor, flip) @@ -240,6 +259,8 @@ class CustomDataset(Dataset): data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore)) if self.with_mask: data['gt_masks'] = DC(gt_masks, cpu_only=True) + if self.with_seg: + data['gt_semantic_seg'] = DC(to_tensor(gt_seg), stack=True) return data def prepare_test_img(self, idx): diff --git a/mmdet/datasets/transforms.py b/mmdet/datasets/transforms.py index f94c0725ba0ccc1a94b392555703a3af3239f8ed..df36590020c98c10cce118a72df60977cb4109aa 100644 --- a/mmdet/datasets/transforms.py +++ b/mmdet/datasets/transforms.py @@ -2,7 +2,10 @@ import mmcv import numpy as np import torch -__all__ = ['ImageTransform', 'BboxTransform', 'MaskTransform', 'Numpy2Tensor'] +__all__ = [ + 'ImageTransform', 'BboxTransform', 'MaskTransform', 'SegMapTransform', + 'Numpy2Tensor' +] class ImageTransform(object): @@ -109,6 +112,29 @@ class MaskTransform(object): return padded_masks +class SegMapTransform(object): + """Preprocess semantic segmentation maps. + + 1. rescale the segmentation map to expected size + 3. flip the image (if needed) + 4. pad the image (if needed) + """ + + def __init__(self, size_divisor=None): + self.size_divisor = size_divisor + + def __call__(self, img, scale, flip=False, keep_ratio=True): + if keep_ratio: + img = mmcv.imrescale(img, scale, interpolation='nearest') + else: + img = mmcv.imresize(img, scale, interpolation='nearest') + if flip: + img = mmcv.imflip(img) + if self.size_divisor is not None: + img = mmcv.impad_to_multiple(img, self.size_divisor) + return img + + class Numpy2Tensor(object): def __init__(self): diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py index 882f6a5f453ec331bb09b940fce02201fa0c70d6..d3653ec903cdc4b615678ba1391da3a1ae7755a6 100644 --- a/mmdet/models/detectors/__init__.py +++ b/mmdet/models/detectors/__init__.py @@ -6,9 +6,11 @@ from .fast_rcnn import FastRCNN from .faster_rcnn import FasterRCNN from .mask_rcnn import MaskRCNN from .cascade_rcnn import CascadeRCNN +from .htc import HybridTaskCascade from .retinanet import RetinaNet __all__ = [ 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', - 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'RetinaNet' + 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', + 'RetinaNet' ] diff --git a/mmdet/models/detectors/htc.py b/mmdet/models/detectors/htc.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f9f538a9bde5bea5f5593d1e6dfaec1cc47463 --- /dev/null +++ b/mmdet/models/detectors/htc.py @@ -0,0 +1,392 @@ +import torch +import torch.nn.functional as F + +from .cascade_rcnn import CascadeRCNN +from .. import builder +from ..registry import DETECTORS +from mmdet.core import (bbox2roi, bbox2result, build_assigner, build_sampler, + merge_aug_masks) + + +@DETECTORS.register_module +class HybridTaskCascade(CascadeRCNN): + + def __init__(self, + num_stages, + backbone, + semantic_roi_extractor=None, + semantic_head=None, + semantic_fusion=('bbox', 'mask'), + interleaved=True, + mask_info_flow=True, + **kwargs): + super(HybridTaskCascade, self).__init__(num_stages, backbone, **kwargs) + assert self.with_bbox and self.with_mask + assert not self.with_shared_head # shared head not supported + if semantic_head is not None: + self.semantic_roi_extractor = builder.build_roi_extractor( + semantic_roi_extractor) + self.semantic_head = builder.build_head(semantic_head) + + self.semantic_fusion = semantic_fusion + self.interleaved = interleaved + self.mask_info_flow = mask_info_flow + + @property + def with_semantic(self): + if hasattr(self, 'semantic_head') and self.semantic_head is not None: + return True + else: + return False + + def _bbox_forward_train(self, + stage, + x, + sampling_results, + gt_bboxes, + gt_labels, + rcnn_train_cfg, + semantic_feat=None): + rois = bbox2roi([res.bboxes for res in sampling_results]) + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # semantic feature fusion + # element-wise sum for original features and pooled semantic features + if self.with_semantic and 'bbox' in self.semantic_fusion: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = F.adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_targets = bbox_head.get_target(sampling_results, gt_bboxes, + gt_labels, rcnn_train_cfg) + loss_bbox = bbox_head.loss(cls_score, bbox_pred, *bbox_targets) + return loss_bbox, rois, bbox_targets, bbox_pred + + def _mask_forward_train(self, + stage, + x, + sampling_results, + gt_masks, + rcnn_train_cfg, + semantic_feat=None): + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + pos_rois) + + # semantic feature fusion + # element-wise sum for original features and pooled semantic features + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + pos_rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + + # mask information flow + # forward all previous mask heads to obtain last_feat, and fuse it + # with the normal mask feature + if self.mask_info_flow: + last_feat = None + for i in range(stage): + last_feat = self.mask_head[i]( + mask_feats, last_feat, return_logits=False) + mask_pred = mask_head(mask_feats, last_feat, return_feat=False) + else: + mask_pred = mask_head(mask_feats) + + mask_targets = mask_head.get_target(sampling_results, gt_masks, + rcnn_train_cfg) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + loss_mask = mask_head.loss(mask_pred, mask_targets, pos_labels) + return loss_mask + + def _bbox_forward_test(self, stage, x, rois, semantic_feat=None): + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor( + x[:len(bbox_roi_extractor.featmap_strides)], rois) + if self.with_semantic and 'bbox' in self.semantic_fusion: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = F.adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + cls_score, bbox_pred = bbox_head(bbox_feats) + return cls_score, bbox_pred + + def _mask_forward_test(self, stage, x, bboxes, semantic_feat=None): + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_rois = bbox2roi([bboxes]) + mask_feats = mask_roi_extractor( + x[:len(mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + mask_rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + if self.mask_info_flow: + last_feat = None + last_pred = None + for i in range(stage): + mask_pred, last_feat = self.mask_head[i](mask_feats, last_feat) + if last_pred is not None: + mask_pred = mask_pred + last_pred + last_pred = mask_pred + mask_pred = mask_head(mask_feats, last_feat, return_feat=False) + if last_pred is not None: + mask_pred = mask_pred + last_pred + else: + mask_pred = mask_head(mask_feats) + return mask_pred + + def forward_train(self, + img, + img_meta, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + gt_masks=None, + gt_semantic_seg=None, + proposals=None): + x = self.extract_feat(img) + + losses = dict() + + # RPN part, the same as normal two-stage detectors + if self.with_rpn: + rpn_outs = self.rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, + self.train_cfg.rpn) + rpn_losses = self.rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # semantic segmentation part + # 2 outputs: segmentation prediction and embedded features + if self.with_semantic: + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_seg) + losses['loss_semantic_seg'] = loss_seg + else: + semantic_feat = None + + for i in range(self.num_stages): + self.current_stage = i + rcnn_train_cfg = self.train_cfg.rcnn[i] + lw = self.train_cfg.stage_loss_weights[i] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = build_assigner(rcnn_train_cfg.assigner) + bbox_sampler = build_sampler(rcnn_train_cfg.sampler, context=self) + num_imgs = img.size(0) + if gt_bboxes_ignore is None: + gt_bboxes_ignore = [None for _ in range(num_imgs)] + + for j in range(num_imgs): + assign_result = bbox_assigner.assign( + proposal_list[j], gt_bboxes[j], gt_bboxes_ignore[j], + gt_labels[j]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[j], + gt_bboxes[j], + gt_labels[j], + feats=[lvl_feat[j][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + loss_bbox, rois, bbox_targets, bbox_pred = \ + self._bbox_forward_train( + i, x, sampling_results, gt_bboxes, gt_labels, + rcnn_train_cfg, semantic_feat) + roi_labels = bbox_targets[0] + + for name, value in loss_bbox.items(): + losses['s{}.{}'.format( + i, name)] = (value * lw if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + # interleaved execution: use regressed bboxes by the box branch + # to train the mask branch + if self.interleaved: + pos_is_gts = [res.pos_is_gt for res in sampling_results] + with torch.no_grad(): + proposal_list = self.bbox_head[i].refine_bboxes( + rois, roi_labels, bbox_pred, pos_is_gts, img_meta) + # re-assign and sample 512 RoIs from 512 RoIs + sampling_results = [] + for j in range(num_imgs): + assign_result = bbox_assigner.assign( + proposal_list[j], gt_bboxes[j], + gt_bboxes_ignore[j], gt_labels[j]) + sampling_result = bbox_sampler.sample( + assign_result, + proposal_list[j], + gt_bboxes[j], + gt_labels[j], + feats=[lvl_feat[j][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + loss_mask = self._mask_forward_train(i, x, sampling_results, + gt_masks, rcnn_train_cfg, + semantic_feat) + for name, value in loss_mask.items(): + losses['s{}.{}'.format( + i, name)] = (value * lw if 'loss' in name else value) + + # refine bboxes (same as Cascade R-CNN) + if i < self.num_stages - 1 and not self.interleaved: + pos_is_gts = [res.pos_is_gt for res in sampling_results] + with torch.no_grad(): + proposal_list = self.bbox_head[i].refine_bboxes( + rois, roi_labels, bbox_pred, pos_is_gts, img_meta) + + return losses + + def simple_test(self, img, img_meta, proposals=None, rescale=False): + x = self.extract_feat(img) + proposal_list = self.simple_test_rpn( + x, img_meta, self.test_cfg.rpn) if proposals is None else proposals + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + img_shape = img_meta[0]['img_shape'] + ori_shape = img_meta[0]['ori_shape'] + scale_factor = img_meta[0]['scale_factor'] + + # "ms" in variable names means multi-stage + ms_bbox_result = {} + ms_segm_result = {} + ms_scores = [] + rcnn_test_cfg = self.test_cfg.rcnn + + rois = bbox2roi(proposal_list) + for i in range(self.num_stages): + bbox_head = self.bbox_head[i] + cls_score, bbox_pred = self._bbox_forward_test( + i, x, rois, semantic_feat=semantic_feat) + ms_scores.append(cls_score) + + if self.test_cfg.keep_all_stages: + det_bboxes, det_labels = bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + nms_cfg=rcnn_test_cfg) + bbox_result = bbox2result(det_bboxes, det_labels, + bbox_head.num_classes) + ms_bbox_result['stage{}'.format(i)] = bbox_result + + if self.with_mask: + mask_head = self.mask_head[i] + if det_bboxes.shape[0] == 0: + segm_result = [ + [] for _ in range(mask_head.num_classes - 1) + ] + else: + _bboxes = (det_bboxes[:, :4] * scale_factor + if rescale else det_bboxes) + mask_pred = self._mask_forward_test( + i, x, _bboxes, semantic_feat=semantic_feat) + segm_result = mask_head.get_seg_masks( + mask_pred, _bboxes, det_labels, rcnn_test_cfg, + ori_shape, scale_factor, rescale) + ms_segm_result['stage{}'.format(i)] = segm_result + + if i < self.num_stages - 1: + bbox_label = cls_score.argmax(dim=1) + rois = bbox_head.regress_by_class(rois, bbox_label, bbox_pred, + img_meta[0]) + + cls_score = sum(ms_scores) / float(len(ms_scores)) + det_bboxes, det_labels = self.bbox_head[-1].get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + bbox_result = bbox2result(det_bboxes, det_labels, + self.bbox_head[-1].num_classes) + ms_bbox_result['ensemble'] = bbox_result + + if self.with_mask: + if det_bboxes.shape[0] == 0: + segm_result = [ + [] for _ in range(self.mask_head[-1].num_classes - 1) + ] + else: + _bboxes = (det_bboxes[:, :4] * scale_factor + if rescale else det_bboxes) + + mask_rois = bbox2roi([_bboxes]) + aug_masks = [] + mask_roi_extractor = self.mask_roi_extractor[-1] + mask_feats = mask_roi_extractor( + x[:len(mask_roi_extractor.featmap_strides)], mask_rois) + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor( + [semantic_feat], mask_rois) + mask_feats += mask_semantic_feat + last_feat = None + for i in range(self.num_stages): + mask_head = self.mask_head[i] + if self.mask_info_flow: + mask_pred, last_feat = mask_head(mask_feats, last_feat) + else: + mask_pred = mask_head(mask_feats) + aug_masks.append(mask_pred.sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, + [img_meta] * self.num_stages, + self.test_cfg.rcnn) + segm_result = self.mask_head[-1].get_seg_masks( + merged_masks, _bboxes, det_labels, rcnn_test_cfg, + ori_shape, scale_factor, rescale) + ms_segm_result['ensemble'] = segm_result + + if not self.test_cfg.keep_all_stages: + if self.with_mask: + results = (ms_bbox_result['ensemble'], + ms_segm_result['ensemble']) + else: + results = ms_bbox_result['ensemble'] + else: + if self.with_mask: + results = { + stage: (ms_bbox_result[stage], ms_segm_result[stage]) + for stage in ms_bbox_result + } + else: + results = ms_bbox_result + + return results + + def aug_test(self, img, img_meta, proposals=None, rescale=False): + raise NotImplementedError diff --git a/mmdet/models/mask_heads/__init__.py b/mmdet/models/mask_heads/__init__.py index a21ae9add5a78d23781bf36a696b28606e19b0ce..899e4645a190d9912722305b86f9b166ed946d0f 100644 --- a/mmdet/models/mask_heads/__init__.py +++ b/mmdet/models/mask_heads/__init__.py @@ -1,3 +1,5 @@ from .fcn_mask_head import FCNMaskHead +from .htc_mask_head import HTCMaskHead +from .fused_semantic_head import FusedSemanticHead -__all__ = ['FCNMaskHead'] +__all__ = ['FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead'] diff --git a/mmdet/models/mask_heads/fused_semantic_head.py b/mmdet/models/mask_heads/fused_semantic_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8795f66ebbef93aef6b101736e151c010f2c94cc --- /dev/null +++ b/mmdet/models/mask_heads/fused_semantic_head.py @@ -0,0 +1,104 @@ +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import kaiming_init + +from ..registry import HEADS +from ..utils import ConvModule + + +@HEADS.register_module +class FusedSemanticHead(nn.Module): + """Multi-level fused semantic segmentation head. + + in_1 -> 1x1 conv --- + | + in_2 -> 1x1 conv -- | + || + in_3 -> 1x1 conv - || + ||| /-> 1x1 conv (mask prediction) + in_4 -> 1x1 conv -----> 3x3 convs (*4) + | \-> 1x1 conv (feature) + in_5 -> 1x1 conv --- + """ # noqa: W605 + + def __init__(self, + num_ins, + fusion_level, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2, + normalize=None): + super(FusedSemanticHead, self).__init__() + self.num_ins = num_ins + self.fusion_level = fusion_level + self.num_convs = num_convs + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.num_classes = num_classes + self.ignore_label = ignore_label + self.loss_weight = loss_weight + self.normalize = normalize + self.with_bias = normalize is None + + self.lateral_convs = nn.ModuleList() + for i in range(self.num_ins): + self.lateral_convs.append( + ConvModule( + self.in_channels, + self.in_channels, + 1, + normalize=self.normalize, + bias=self.with_bias, + inplace=False)) + + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = self.in_channels if i == 0 else conv_out_channels + self.convs.append( + ConvModule( + in_channels, + conv_out_channels, + 3, + padding=1, + normalize=self.normalize, + bias=self.with_bias)) + self.conv_embedding = ConvModule( + conv_out_channels, + conv_out_channels, + 1, + normalize=self.normalize, + bias=self.with_bias) + self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1) + + self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_label) + + def init_weights(self): + kaiming_init(self.conv_logits) + + def forward(self, feats): + x = self.lateral_convs[self.fusion_level](feats[self.fusion_level]) + fused_size = tuple(x.shape[-2:]) + for i, feat in enumerate(feats): + if i != self.fusion_level: + feat = F.interpolate( + feat, + size=fused_size, + mode='bilinear', + align_corners=True) + x += self.lateral_convs[i](feat) + + for i in range(self.num_convs): + x = self.convs[i](x) + + mask_pred = self.conv_logits(x) + x = self.conv_embedding(x) + return mask_pred, x + + def loss(self, mask_pred, labels): + labels = labels.squeeze(1).long() + loss_semantic_seg = self.criterion(mask_pred, labels) + loss_semantic_seg *= self.loss_weight + return loss_semantic_seg diff --git a/mmdet/models/mask_heads/htc_mask_head.py b/mmdet/models/mask_heads/htc_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1ccf00368fa0ce8857c8e6a68847bb96842946 --- /dev/null +++ b/mmdet/models/mask_heads/htc_mask_head.py @@ -0,0 +1,38 @@ +from .fcn_mask_head import FCNMaskHead +from ..registry import HEADS +from ..utils import ConvModule + + +@HEADS.register_module +class HTCMaskHead(FCNMaskHead): + + def __init__(self, *args, **kwargs): + super(HTCMaskHead, self).__init__(*args, **kwargs) + self.conv_res = ConvModule( + self.conv_out_channels, + self.conv_out_channels, + 1, + normalize=self.normalize, + bias=self.with_bias) + + def init_weights(self): + super(HTCMaskHead, self).init_weights() + self.conv_res.init_weights() + + def forward(self, x, res_feat=None, return_logits=True, return_feat=True): + if res_feat is not None: + res_feat = self.conv_res(res_feat) + x = x + res_feat + for conv in self.convs: + x = conv(x) + res_feat = x + outs = [] + if return_logits: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_pred = self.conv_logits(x) + outs.append(mask_pred) + if return_feat: + outs.append(res_feat) + return outs if len(outs) > 1 else outs[0]