From 58c415a069ceab37bab76c47da72824f4181cff6 Mon Sep 17 00:00:00 2001 From: Jerry XU <xvjiarui0826@gmail.com> Date: Mon, 3 Jun 2019 17:23:53 +0800 Subject: [PATCH] Code of "GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond" (#598) * add pytorch 1.1.0 SyncBN support * change BatchNorm2d to _BatchNorm and call freeze after train * add freeze back to init function * fixed indentation typo in adding freeze * convert SyncBN to BN when eval is called * add gcb and configs * fixed line too long * fixed space and newline * ignore too long line in config files * two space before inline comment * refactor var name * fixed line too long * changed SyncBN to backbone only and add non-Sync BN configs * reformat config to adapt master branch * reformat change method->type * remoe some configs * clean up readme.md * add benchmark * fix typo in README * change sbn to syncbn * change work dir * refactor gcb * fixed line too long * Code formatting --- configs/gcnet/README.md | 58 ++++++ .../mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x.py | 185 +++++++++++++++++ ...sk_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x.py | 189 ++++++++++++++++++ .../mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x.py | 185 +++++++++++++++++ ...ask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x.py | 189 ++++++++++++++++++ configs/gcnet/mask_rcnn_r50_fpn_sbn_1x.py | 185 +++++++++++++++++ mmdet/models/backbones/resnet.py | 40 +++- mmdet/models/backbones/resnext.py | 13 +- mmdet/ops/__init__.py | 3 +- mmdet/ops/gcb/__init__.py | 5 + mmdet/ops/gcb/context_block.py | 104 ++++++++++ 11 files changed, 1144 insertions(+), 12 deletions(-) create mode 100644 configs/gcnet/README.md create mode 100644 configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x.py create mode 100644 configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x.py create mode 100644 configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x.py create mode 100644 configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x.py create mode 100644 configs/gcnet/mask_rcnn_r50_fpn_sbn_1x.py create 
mode 100644 mmdet/ops/gcb/__init__.py create mode 100644 mmdet/ops/gcb/context_block.py diff --git a/configs/gcnet/README.md b/configs/gcnet/README.md new file mode 100644 index 0000000..367ee43 --- /dev/null +++ b/configs/gcnet/README.md @@ -0,0 +1,58 @@ +# GCNet for Object Detection + +By [Yue Cao](http://yue-cao.me), [Jiarui Xu](http://jerryxu.net), [Stephen Lin](https://scholar.google.com/citations?user=c3PYmxUAAAAJ&hl=en), Fangyun Wei, [Han Hu](https://sites.google.com/site/hanhushomepage/). + +We provide config files to reproduce the results in the paper for +["GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond"](https://arxiv.org/abs/1904.11492) on COCO object detection. + +## Introduction + +**GCNet** was initially described on [arXiv](https://arxiv.org/abs/1904.11492). By absorbing the advantages of Non-Local Networks (NLNet) and Squeeze-Excitation Networks (SENet), GCNet provides a simple, fast and effective approach for global context modeling, which generally outperforms both NLNet and SENet on major benchmarks for various recognition tasks. + +## Citing GCNet + +``` +@article{cao2019GCNet, + title={GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond}, + author={Cao, Yue and Xu, Jiarui and Lin, Stephen and Wei, Fangyun and Hu, Han}, + journal={arXiv preprint arXiv:1904.11492}, + year={2019} +} +``` + +## Results and models +The results on COCO 2017val are shown in the tables below. 
+ +| Backbone | Model | Context | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------: | :--------------: | :------------: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-------: | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 4.5 | 0.533 | 10.1 | 38.5 | 35.1 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x_20190602-c550c707.pth) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 4.6 | 0.533 | 9.9 | 38.9 | 35.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x_20190602-18ae2dfd.pth) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.0 | 0.731 | 8.6 | 40.8 | 37.0 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r16_gcb_c3-c5_r101_fpn_1x_20190602-f4456442.pth) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.1 | 0.747 | 8.6 | 40.8 | 36.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r4_gcb_c3-c5_r101_fpn_1x_20190602-1ee20d5f.pth) | + +| Backbone | Model | Context | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP | mask AP | Download | +| :-------: | :--------------: | :------------: | :-----: | :------: | :-----------------: | :------------: | :----: | :-----: | :-------: | +| R-50-FPN | Mask | - | 1x | 3.9 | 0.543 | 10.2 | 37.2 | 33.8 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r50_fpn_syncbn_1x_20190602-bccc62fa.pth) | +| R-50-FPN | Mask | GC(c3-c5, r16) | 1x | 4.5 | 0.547 | 9.9 | 39.4 | 35.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x_20190602-a0169c20.pth) | +| R-50-FPN | Mask | GC(c3-c5, r4) | 1x | 4.6 | 0.603 | 9.4 | 39.9 | 36.2 | 
[model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x_20190602-ace08792.pth) | +| R-101-FPN | Mask | - | 1x | 5.8 | 0.665 | 9.2 | 39.8 | 36.0 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r101_fpn_syncbn_1x_20190602-b2a0e2b7.pth) | +| R-101-FPN | Mask | GC(c3-c5, r16) | 1x | 7.0 | 0.778 | 9.0 | 41.1 | 37.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r16_gcb_c3-c5_r101_fpn_syncbn_1x_20190602-717e6dbd.pth) | +| R-101-FPN | Mask | GC(c3-c5, r4) | 1x | 7.1 | 0.786 | 8.9 | 41.7 | 37.6 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r4_gcb_c3-c5_r101_fpn_syncbn_1x_20190602-a893c718.pth) | +| X-101-FPN | Mask | - | 1x | 7.1 | 0.912 | 8.5 | 41.2 | 37.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_x101_32x4d_fpn_syncbn_1x_20190602-bb8ae7e5.pth) | +| X-101-FPN | Mask | GC(c3-c5, r16) | 1x | 8.2 | 1.055 | 7.7 | 42.4 | 38.0 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r16_gcb_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-c28edb53.pth) | +| X-101-FPN | Mask | GC(c3-c5, r4) | 1x | 8.3 | 1.037 | 7.6 | 42.9 | 38.5 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/mask_rcnn_r4_gcb_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-930b3d51.pth) | +| X-101-FPN | Cascade Mask | - | 1x | - | - | - | 44.7 | 38.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_x101_32x4d_fpn_syncbn_1x_20190602-63a800fb.pth) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r16) | 1x | - | - | - | 45.9 | 39.3 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_r16_gcb_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-3e168d88.pth) | +| X-101-FPN | Cascade Mask | GC(c3-c5, r4) | 1x 
| - | - | - | 46.5 | 39.7 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_r4_gcb_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-b579157f.pth) | +| X-101-FPN | DCN Cascade Mask | - | 1x | - | - | - | 47.1 | 40.4 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_dconv_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-9aa8c394.pth) | +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r16) | 1x | - | - | - | 47.9 | 40.9 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_r16_gcb_dconv_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-b86027a6.pth) | +| X-101-FPN | DCN Cascade Mask | GC(c3-c5, r4) | 1x | - | - | - | 47.9 | 40.8 | [model](https://open-mmlab.s3.ap-northeast-2.amazonaws.com/mmdetection/models/gcnet/cascade_mask_rcnn_r4_gcb_dconv_c3-c5_x101_32x4d_fpn_syncbn_1x_20190602-b4164f6b.pth) | + + +**Notes:** + +- `SyncBN` is added in the backbone for all models in the second table (**Table 2**). +- `GC` denotes that a Global Context (GC) block is inserted after the 1x1 conv of the backbone. +- `DCN` denotes replacing the 3x3 conv with a 3x3 Deformable Convolution in the `c3-c5` stages of the backbone. +- `r4` and `r16` denote ratio 4 and ratio 16 in the GC block, respectively. 
+ diff --git a/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x.py b/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x.py new file mode 100644 index 0000000..580c6de --- /dev/null +++ b/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x.py @@ -0,0 +1,185 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='modelzoo://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + gcb=dict( + ratio=1./16., + ), + stage_with_gcb=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))) +# model training and testing settings 
+train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + 
size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/mask_rcnn_r16_gcb_c3-c5_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x.py b/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x.py new file mode 100644 index 0000000..0706c23 --- /dev/null +++ b/configs/gcnet/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x.py @@ -0,0 +1,189 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) + +model = dict( + type='MaskRCNN', + pretrained='modelzoo://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + gcb=dict( + ratio=1./16., + ), + stage_with_gcb=(False, True, True, True), + norm_eval=False, + norm_cfg=norm_cfg), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + 
roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' 
+img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/mask_rcnn_r16_gcb_c3-c5_r50_fpn_syncbn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x.py b/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x.py new file mode 100644 index 0000000..ec91f91 --- /dev/null +++ b/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x.py @@ -0,0 +1,185 @@ +# model settings +model = dict( + type='MaskRCNN', 
+ pretrained='modelzoo://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + gcb=dict( + ratio=1./4., + ), + stage_with_gcb=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, 
+ debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + 
warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/mask_rcnn_r4_gcb_c3-c5_r50_fpn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x.py b/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x.py new file mode 100644 index 0000000..da0d06a --- /dev/null +++ b/configs/gcnet/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x.py @@ -0,0 +1,189 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) + +model = dict( + type='MaskRCNN', + pretrained='modelzoo://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + gcb=dict( + ratio=1./4., + ), + stage_with_gcb=(False, True, True, True), + norm_eval=False, + norm_cfg=norm_cfg), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + 
reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + 
img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/mask_rcnn_r4_gcb_c3-c5_r50_fpn_syncbn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/gcnet/mask_rcnn_r50_fpn_sbn_1x.py b/configs/gcnet/mask_rcnn_r50_fpn_sbn_1x.py new file mode 100644 index 0000000..8301f51 --- /dev/null +++ b/configs/gcnet/mask_rcnn_r50_fpn_sbn_1x.py @@ -0,0 +1,185 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) + +model = dict( + type='MaskRCNN', + pretrained='modelzoo://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + norm_eval=False, + norm_cfg=norm_cfg), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, 
+ num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_scales=[8], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='SharedFCBBoxHead', + num_fcs=2, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=81, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))) +# model training and testing settings +train_cfg = dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + 
num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)) +test_cfg = dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100, + mask_thr_binary=0.5)) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0.5, + with_mask=True, + with_crowd=True, + with_label=True), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + img_scale=(1333, 800), + img_norm_cfg=img_norm_cfg, + size_divisor=32, + flip_ratio=0, + with_mask=False, + with_label=False, + test_mode=True)) +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +dist_params = dict(backend='nccl') 
+log_level = 'INFO' +work_dir = './work_dirs/mask_rcnn_r50_fpn_sbn_1x' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py index d1db1dd..c018ed6 100644 --- a/mmdet/models/backbones/resnet.py +++ b/mmdet/models/backbones/resnet.py @@ -7,7 +7,7 @@ from torch.nn.modules.batchnorm import _BatchNorm from mmcv.cnn import constant_init, kaiming_init from mmcv.runner import load_checkpoint -from mmdet.ops import DeformConv, ModulatedDeformConv +from mmdet.ops import DeformConv, ModulatedDeformConv, ContextBlock from ..registry import BACKBONES from ..utils import build_conv_layer, build_norm_layer @@ -25,9 +25,11 @@ class BasicBlock(nn.Module): with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - dcn=None): + dcn=None, + gcb=None): super(BasicBlock, self).__init__() assert dcn is None, "Not implemented yet." + assert gcb is None, "Not implemented yet." self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) @@ -92,7 +94,8 @@ class Bottleneck(nn.Module): with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - dcn=None): + dcn=None, + gcb=None): """Bottleneck block for ResNet. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
@@ -100,6 +103,7 @@ class Bottleneck(nn.Module): super(Bottleneck, self).__init__() assert style in ['pytorch', 'caffe'] assert dcn is None or isinstance(dcn, dict) + assert gcb is None or isinstance(gcb, dict) self.inplanes = inplanes self.planes = planes self.stride = stride @@ -110,6 +114,8 @@ class Bottleneck(nn.Module): self.norm_cfg = norm_cfg self.dcn = dcn self.with_dcn = dcn is not None + self.gcb = gcb + self.with_gcb = gcb is not None if self.style == 'pytorch': self.conv1_stride = 1 self.conv2_stride = stride @@ -181,6 +187,12 @@ class Bottleneck(nn.Module): self.relu = nn.ReLU(inplace=True) self.downsample = downsample + if self.with_gcb: + gcb_inplanes = planes * self.expansion + self.context_block = ContextBlock( + inplanes=gcb_inplanes, + **gcb + ) @property def norm1(self): @@ -219,6 +231,9 @@ class Bottleneck(nn.Module): out = self.conv3(out) out = self.norm3(out) + if self.with_gcb: + out = self.context_block(out) + if self.downsample is not None: identity = self.downsample(x) @@ -246,7 +261,8 @@ def make_res_layer(block, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - dcn=None): + dcn=None, + gcb=None): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( @@ -272,7 +288,8 @@ def make_res_layer(block, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, - dcn=dcn)) + dcn=dcn, + gcb=gcb)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( @@ -285,7 +302,8 @@ def make_res_layer(block, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, - dcn=dcn)) + dcn=dcn, + gcb=gcb)) return nn.Sequential(*layers) @@ -336,6 +354,8 @@ class ResNet(nn.Module): norm_eval=True, dcn=None, stage_with_dcn=(False, False, False, False), + gcb=None, + stage_with_gcb=(False, False, False, False), with_cp=False, zero_init_residual=True): super(ResNet, self).__init__() @@ -359,6 +379,10 @@ class ResNet(nn.Module): self.stage_with_dcn = stage_with_dcn if dcn is not None: assert 
len(stage_with_dcn) == num_stages + self.gcb = gcb + self.stage_with_gcb = stage_with_gcb + if gcb is not None: + assert len(stage_with_gcb) == num_stages self.zero_init_residual = zero_init_residual self.block, stage_blocks = self.arch_settings[depth] self.stage_blocks = stage_blocks[:num_stages] @@ -371,6 +395,7 @@ class ResNet(nn.Module): stride = strides[i] dilation = dilations[i] dcn = self.dcn if self.stage_with_dcn[i] else None + gcb = self.gcb if self.stage_with_gcb[i] else None planes = 64 * 2**i res_layer = make_res_layer( self.block, @@ -383,7 +408,8 @@ class ResNet(nn.Module): with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, - dcn=dcn) + dcn=dcn, + gcb=gcb) self.inplanes = planes * self.block.expansion layer_name = 'layer{}'.format(i + 1) self.add_module(layer_name, res_layer) diff --git a/mmdet/models/backbones/resnext.py b/mmdet/models/backbones/resnext.py index c869a02..ced5c53 100644 --- a/mmdet/models/backbones/resnext.py +++ b/mmdet/models/backbones/resnext.py @@ -103,7 +103,8 @@ def make_res_layer(block, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - dcn=None): + dcn=None, + gcb=None): downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( @@ -131,7 +132,8 @@ def make_res_layer(block, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, - dcn=dcn)) + dcn=dcn, + gcb=gcb)) inplanes = planes * block.expansion for i in range(1, blocks): layers.append( @@ -146,7 +148,8 @@ def make_res_layer(block, with_cp=with_cp, conv_cfg=conv_cfg, norm_cfg=norm_cfg, - dcn=dcn)) + dcn=dcn, + gcb=gcb)) return nn.Sequential(*layers) @@ -195,6 +198,7 @@ class ResNeXt(ResNet): stride = self.strides[i] dilation = self.dilations[i] dcn = self.dcn if self.stage_with_dcn[i] else None + gcb = self.gcb if self.stage_with_gcb[i] else None planes = 64 * 2**i res_layer = make_res_layer( self.block, @@ -209,7 +213,8 @@ class ResNeXt(ResNet): with_cp=self.with_cp, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, - 
dcn=dcn) + dcn=dcn, + gcb=gcb) self.inplanes = planes * self.block.expansion layer_name = 'layer{}'.format(i + 1) self.add_module(layer_name, res_layer) diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py index 34467bf..5f6ad09 100644 --- a/mmdet/ops/__init__.py +++ b/mmdet/ops/__init__.py @@ -2,6 +2,7 @@ from .dcn import (DeformConv, DeformConvPack, ModulatedDeformConv, ModulatedDeformConvPack, DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack, deform_conv, modulated_deform_conv, deform_roi_pooling) +from .gcb import ContextBlock from .nms import nms, soft_nms from .roi_align import RoIAlign, roi_align from .roi_pool import RoIPool, roi_pool @@ -14,5 +15,5 @@ __all__ = [ 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', 'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv', 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss', - 'MaskedConv2d' + 'MaskedConv2d', 'ContextBlock' ] diff --git a/mmdet/ops/gcb/__init__.py b/mmdet/ops/gcb/__init__.py new file mode 100644 index 0000000..05dd625 --- /dev/null +++ b/mmdet/ops/gcb/__init__.py @@ -0,0 +1,5 @@ +from .context_block import ContextBlock + +__all__ = [ + 'ContextBlock', +] diff --git a/mmdet/ops/gcb/context_block.py b/mmdet/ops/gcb/context_block.py new file mode 100644 index 0000000..be9092c --- /dev/null +++ b/mmdet/ops/gcb/context_block.py @@ -0,0 +1,104 @@ +import torch +from mmcv.cnn import constant_init, kaiming_init +from torch import nn + + +def last_zero_init(m): + if isinstance(m, nn.Sequential): + constant_init(m[-1], val=0) + else: + constant_init(m, val=0) + + +class ContextBlock(nn.Module): + + def __init__(self, + inplanes, + ratio, + pooling_type='att', + fusion_types=('channel_add', )): + super(ContextBlock, self).__init__() + assert pooling_type in ['avg', 'att'] + assert isinstance(fusion_types, (list, tuple)) + valid_fusion_types = ['channel_add', 'channel_mul'] + assert all([f in valid_fusion_types for f in fusion_types]) + assert 
len(fusion_types) > 0, 'at least one fusion should be used' + self.inplanes = inplanes + self.ratio = ratio + self.planes = int(inplanes * ratio) + self.pooling_type = pooling_type + self.fusion_types = fusion_types + if pooling_type == 'att': + self.conv_mask = nn.Conv2d(inplanes, 1, kernel_size=1) + self.softmax = nn.Softmax(dim=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + if 'channel_add' in fusion_types: + self.channel_add_conv = nn.Sequential( + nn.Conv2d(self.inplanes, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.inplanes, kernel_size=1)) + else: + self.channel_add_conv = None + if 'channel_mul' in fusion_types: + self.channel_mul_conv = nn.Sequential( + nn.Conv2d(self.inplanes, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.inplanes, kernel_size=1)) + else: + self.channel_mul_conv = None + self.reset_parameters() + + def reset_parameters(self): + if self.pooling_type == 'att': + kaiming_init(self.conv_mask, mode='fan_in') + self.conv_mask.inited = True + + if self.channel_add_conv is not None: + last_zero_init(self.channel_add_conv) + if self.channel_mul_conv is not None: + last_zero_init(self.channel_mul_conv) + + def spatial_pool(self, x): + batch, channel, height, width = x.size() + if self.pooling_type == 'att': + input_x = x + # [N, C, H * W] + input_x = input_x.view(batch, channel, height * width) + # [N, 1, C, H * W] + input_x = input_x.unsqueeze(1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = context_mask.view(batch, 1, height * width) + # [N, 1, H * W] + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = context_mask.unsqueeze(-1) + # [N, 1, C, 1] + context = torch.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = context.view(batch, channel, 1, 1) + else: + # [N, C, 1, 1] + context 
= self.avg_pool(x) + + return context + + def forward(self, x): + # [N, C, 1, 1] + context = self.spatial_pool(x) + + out = x + if self.channel_mul_conv is not None: + # [N, C, 1, 1] + channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) + out = out * channel_mul_term + if self.channel_add_conv is not None: + # [N, C, 1, 1] + channel_add_term = self.channel_add_conv(context) + out = out + channel_add_term + + return out -- GitLab