diff --git a/configs/reppoints/bbox_r50_grid_center_fpn_1x.py b/configs/reppoints/bbox_r50_grid_center_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ab61d0a2d89554aaaf5d43ce216e31d4bcd8ab --- /dev/null +++ b/configs/reppoints/bbox_r50_grid_center_fpn_1x.py @@ -0,0 +1,143 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='minmax', + use_grid_points=True)) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/bbox_r50_grid_center_fpn_1x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/bbox_r50_grid_fpn_1x.py b/configs/reppoints/bbox_r50_grid_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..79e3c76ff4b927eca34ed3489d17f14bbe11f708 --- /dev/null +++ b/configs/reppoints/bbox_r50_grid_fpn_1x.py @@ -0,0 +1,148 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='minmax', + use_grid_points=True)) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/bbox_r50_grid_fpn_1x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_minmax_r50_fpn_1x.py b/configs/reppoints/reppoints_minmax_r50_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..0103beb937ab80b9b5a6b875f050c475a13f1b36 --- /dev/null +++ b/configs/reppoints/reppoints_minmax_r50_fpn_1x.py @@ -0,0 +1,142 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='minmax')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_minmax_r50_fpn_1x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x.py b/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..864cec03d1ebf8c748f8d2d8d43c2e396f49c209 --- /dev/null +++ b/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x.py @@ -0,0 +1,145 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet101', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + dcn=dict( + modulated=False, deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r101_dcn_fpn_2x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x_mt.py b/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x_mt.py new file mode 100644 index 0000000000000000000000000000000000000000..ac6d93a9cb00b5f1ddd51dcaaac58c7fea432719 --- /dev/null +++ b/configs/reppoints/reppoints_moment_r101_dcn_fpn_2x_mt.py @@ -0,0 +1,149 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet101', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + dcn=dict( + modulated=False, deformable_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + keep_ratio=True, + multiscale_mode='range'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r101_dcn_fpn_2x_mt' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r101_fpn_2x.py b/configs/reppoints/reppoints_moment_r101_fpn_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..a4732a279bd43fe0f158aaf753ff1e1d910c65ec --- /dev/null +++ b/configs/reppoints/reppoints_moment_r101_fpn_2x.py @@ -0,0 +1,142 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet101', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r101_fpn_2x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r101_fpn_2x_mt.py b/configs/reppoints/reppoints_moment_r101_fpn_2x_mt.py new file mode 100644 index 0000000000000000000000000000000000000000..2f481e7ac66197a886a97cb1c81fadc5608e828c --- /dev/null +++ b/configs/reppoints/reppoints_moment_r101_fpn_2x_mt.py @@ -0,0 +1,146 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet101', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + keep_ratio=True, + multiscale_mode='range'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r101_fpn_2x_mt' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r50_fpn_1x.py b/configs/reppoints/reppoints_moment_r50_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..671b9e2655b5ca886febd2b09937e4206b272533 --- /dev/null +++ b/configs/reppoints/reppoints_moment_r50_fpn_1x.py @@ -0,0 +1,142 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r50_fpn_1x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r50_fpn_2x.py b/configs/reppoints/reppoints_moment_r50_fpn_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..53824301418db1e46a467fe2547b0f2a5b420c32 --- /dev/null +++ b/configs/reppoints/reppoints_moment_r50_fpn_2x.py @@ -0,0 +1,142 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r50_fpn_2x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_r50_fpn_2x_mt.py b/configs/reppoints/reppoints_moment_r50_fpn_2x_mt.py new file mode 100644 index 0000000000000000000000000000000000000000..ad86d74c458000784ee1d2c832a23500cafe8169 --- /dev/null +++ b/configs/reppoints/reppoints_moment_r50_fpn_2x_mt.py @@ -0,0 +1,146 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + keep_ratio=True, + multiscale_mode='range'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_r50_fpn_2x_mt' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x.py b/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0bd6636d37d5c5cd95a4c7af9fe0cbc6c7d3c2 --- /dev/null +++ b/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x.py @@ -0,0 +1,150 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='open-mmlab://resnext101_32x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + dcn=dict( + modulated=False, + groups=32, + deformable_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_x101_dcn_fpn_2x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x_mt.py b/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x_mt.py new file mode 100644 index 0000000000000000000000000000000000000000..93b5ac83abde9dc530f4105fd0ca71d9c42d329c --- /dev/null +++ b/configs/reppoints/reppoints_moment_x101_dcn_fpn_2x_mt.py @@ -0,0 +1,154 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='open-mmlab://resnext101_32x4d', + backbone=dict( + type='ResNeXt', + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch', + dcn=dict( + modulated=False, + groups=32, + deformable_groups=1, + fallback_on_stride=False), + stage_with_dcn=(False, True, True, True)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='moment')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Resize', + img_scale=[(1333, 480), (1333, 960)], + keep_ratio=True, + multiscale_mode='range'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[16, 22]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 24 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_moment_x101_dcn_fpn_2x_mt' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/configs/reppoints/reppoints_partial_minmax_r50_fpn_1x.py b/configs/reppoints/reppoints_partial_minmax_r50_fpn_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..2296163c8a254defb65e0340bb796f3d45c127d8 --- /dev/null +++ b/configs/reppoints/reppoints_partial_minmax_r50_fpn_1x.py @@ -0,0 +1,142 @@ +# model settings +norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + +model = dict( + type='RepPointsDetector', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + num_outs=5, + norm_cfg=norm_cfg), + bbox_head=dict( + type='RepPointsHead', + num_classes=81, + in_channels=256, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + norm_cfg=norm_cfg, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict(type='SmoothL1Loss', beta=0.11, loss_weight=0.5), + loss_bbox_refine=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0), + transform_method='partial_minmax')) +# training and testing settings +train_cfg = dict( + init=dict( + assigner=dict(type='PointAssigner', scale=4, pos_num=1), + allowed_border=-1, + pos_weight=-1, + debug=False), + refine=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + allowed_border=-1, + pos_weight=-1, + debug=False)) +test_cfg = dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_thr=0.5), + max_per_img=100) +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + imgs_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) +checkpoint_config = dict(interval=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +# runtime settings +total_epochs = 12 +device_ids = range(8) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/reppoints_partial_minmax_r50_fpn_1x' +load_from = None +resume_from = None +auto_resume = True +workflow = [('train', 1)] diff --git a/mmdet/models/anchor_heads/__init__.py b/mmdet/models/anchor_heads/__init__.py index f5a54ce4cfe1aadf1cf61d2bda714b5413bf013d..5df25d04e16975e71067028f7622d8174eb7a7b7 100644 --- a/mmdet/models/anchor_heads/__init__.py +++ b/mmdet/models/anchor_heads/__init__.py @@ -3,11 +3,13 @@ from .fcos_head import FCOSHead from .ga_retina_head import GARetinaHead from .ga_rpn_head import GARPNHead from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead +from .reppoints_head import RepPointsHead from .retina_head import RetinaHead from .rpn_head import RPNHead from .ssd_head import SSDHead __all__ = [ 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', 'RPNHead', - 'GARPNHead', 'RetinaHead', 'GARetinaHead', 'SSDHead', 'FCOSHead' + 'GARPNHead', 'RetinaHead', 'GARetinaHead', 'SSDHead', 'FCOSHead', + 'RepPointsHead' ] diff --git a/mmdet/models/anchor_heads/reppoints_head.py b/mmdet/models/anchor_heads/reppoints_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1ce7abd16f917c1ca07dcf8eb78bc3633eb75704 --- /dev/null +++ b/mmdet/models/anchor_heads/reppoints_head.py @@ -0,0 +1,596 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmdet.core import (PointGenerator, multi_apply, multiclass_nms, + point_target) +from mmdet.ops import DeformConv +from ..builder import build_loss +from ..registry import HEADS +from ..utils import ConvModule, bias_init_with_prob + + +@HEADS.register_module +class RepPointsHead(nn.Module): + """RepPoint head. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of channels of the feature map. + point_feat_channels (int): Number of channels of points features. + stacked_convs (int): How many conv layers are used. + gradient_mul (float): The multiplier to gradients from + points refinement and recognition. + point_strides (Iterable): points strides. + point_base_scale (int): bbox scale for assigning labels. + loss_cls (dict): Config of classification loss. + loss_bbox_init (dict): Config of initial points loss. + loss_bbox_refine (dict): Config of points loss in refinement. + use_grid_points (bool): If we use bounding box representation, the + reppoints is represented as grid points on the bounding box. + center_init (bool): Whether to use center point assignment. + transform_method (str): The methods to transform RepPoints to bbox. + """ # noqa: W605 + + def __init__(self, + num_classes, + in_channels, + feat_channels=256, + point_feat_channels=256, + stacked_convs=3, + num_points=9, + gradient_mul=0.1, + point_strides=[8, 16, 32, 64, 128], + point_base_scale=4, + conv_cfg=None, + norm_cfg=None, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5), + loss_bbox_refine=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + use_grid_points=False, + center_init=True, + transform_method='moment', + moment_mul=0.01): + super(RepPointsHead, self).__init__() + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.point_feat_channels = point_feat_channels + self.stacked_convs = stacked_convs + self.num_points = num_points + self.gradient_mul = gradient_mul + self.point_base_scale = point_base_scale + self.point_strides = point_strides + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.sampling = loss_cls['type'] not in ['FocalLoss'] + self.loss_cls = build_loss(loss_cls) + self.loss_bbox_init = build_loss(loss_bbox_init) + self.loss_bbox_refine = build_loss(loss_bbox_refine) + self.use_grid_points = use_grid_points + self.center_init = center_init + self.transform_method = transform_method + if self.transform_method == 'moment': + self.moment_transfer = nn.Parameter( + data=torch.zeros(2), requires_grad=True) + self.moment_mul = moment_mul + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes - 1 + else: + self.cls_out_channels = self.num_classes + self.point_generators = [PointGenerator() for _ in self.point_strides] + # we use deformable conv to extract points features + self.dcn_kernel = int(np.sqrt(num_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + assert self.dcn_kernel * self.dcn_kernel == num_points, \ + "The points number should be a square number." + assert self.dcn_kernel % 2 == 1, \ + "The points number should be an odd square number." + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + self._init_layers() + + def _init_layers(self): + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points + self.reppoints_cls_conv = DeformConv(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, self.dcn_pad) + self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels, + self.cls_out_channels, 1, 1, 0) + self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels, + self.point_feat_channels, 3, + 1, 1) + self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + self.reppoints_pts_refine_conv = DeformConv(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) + self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels, + pts_out_dim, 1, 1, 0) + + def init_weights(self): + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.reppoints_cls_conv, std=0.01) + normal_init(self.reppoints_cls_out, std=0.01, bias=bias_cls) + normal_init(self.reppoints_pts_init_conv, std=0.01) + normal_init(self.reppoints_pts_init_out, std=0.01) + normal_init(self.reppoints_pts_refine_conv, std=0.01) + normal_init(self.reppoints_pts_refine_out, std=0.01) + + def points2bbox(self, pts, y_first=True): + """ + Converting the points set into bounding box. + :param pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + :param y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + :return: each points set is converting to a bbox [x1, y1, x2, y2]. + """ + pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:]) + pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1, + ...] + pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0, + ...] + if self.transform_method == 'minmax': + bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'partial_minmax': + pts_y = pts_y[:, :4, ...] + pts_x = pts_x[:, :4, ...] + bbox_left = pts_x.min(dim=1, keepdim=True)[0] + bbox_right = pts_x.max(dim=1, keepdim=True)[0] + bbox_up = pts_y.min(dim=1, keepdim=True)[0] + bbox_bottom = pts_y.max(dim=1, keepdim=True)[0] + bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom], + dim=1) + elif self.transform_method == 'moment': + pts_y_mean = pts_y.mean(dim=1, keepdim=True) + pts_x_mean = pts_x.mean(dim=1, keepdim=True) + pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True) + pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True) + moment_transfer = (self.moment_transfer * self.moment_mul) + ( + self.moment_transfer.detach() * (1 - self.moment_mul)) + moment_width_transfer = moment_transfer[0] + moment_height_transfer = moment_transfer[1] + half_width = pts_x_std * torch.exp(moment_width_transfer) + half_height = pts_y_std * torch.exp(moment_height_transfer) + bbox = torch.cat([ + pts_x_mean - half_width, pts_y_mean - half_height, + pts_x_mean + half_width, pts_y_mean + half_height + ], + dim=1) + else: + raise NotImplementedError + return bbox + + def gen_grid_from_reg(self, reg, previous_boxes): + """ + Base on the previous bboxes and regression values, we compute the + regressed bboxes and generate the grids on the bboxes. + :param reg: the regression value to previous bboxes. + :param previous_boxes: previous bboxes. + :return: generate grids on the regressed bboxes. + """ + b, _, h, w = reg.shape + bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2. + bwh = (previous_boxes[:, 2:, ...] - + previous_boxes[:, :2, ...]).clamp(min=1e-6) + grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp( + reg[:, 2:, ...]) + grid_wh = bwh * torch.exp(reg[:, 2:, ...]) + grid_left = grid_topleft[:, [0], ...] + grid_top = grid_topleft[:, [1], ...] + grid_width = grid_wh[:, [0], ...] + grid_height = grid_wh[:, [1], ...] + intervel = torch.linspace(0., 1., self.dcn_kernel).view( + 1, self.dcn_kernel, 1, 1).type_as(reg) + grid_x = grid_left + grid_width * intervel + grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1) + grid_x = grid_x.view(b, -1, h, w) + grid_y = grid_top + grid_height * intervel + grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1) + grid_y = grid_y.view(b, -1, h, w) + grid_yx = torch.stack([grid_y, grid_x], dim=2) + grid_yx = grid_yx.view(b, -1, h, w) + regressed_bbox = torch.cat([ + grid_left, grid_top, grid_left + grid_width, grid_top + grid_height + ], 1) + return grid_yx, regressed_bbox + + def forward_single(self, x): + dcn_base_offset = self.dcn_base_offset.type_as(x) + # If we use center_init, the initial reppoints is from center points. + # If we use bounding bbox representation, the initial reppoints is + # from regular grid placed on a pre-defined bbox. + if self.use_grid_points or not self.center_init: + scale = self.point_base_scale / 2 + points_init = dcn_base_offset / dcn_base_offset.max() * scale + bbox_init = x.new_tensor([-scale, -scale, scale, + scale]).view(1, 4, 1, 1) + else: + points_init = 0 + cls_feat = x + pts_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + pts_feat = reg_conv(pts_feat) + # initialize reppoints + pts_out_init = self.reppoints_pts_init_out( + self.relu(self.reppoints_pts_init_conv(pts_feat))) + if self.use_grid_points: + pts_out_init, bbox_out_init = self.gen_grid_from_reg( + pts_out_init, bbox_init.detach()) + else: + pts_out_init = pts_out_init + points_init + # refine and classify reppoints + pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach( + ) + self.gradient_mul * pts_out_init + dcn_offset = pts_out_init_grad_mul - dcn_base_offset + cls_out = self.reppoints_cls_out( + self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset))) + pts_out_refine = self.reppoints_pts_refine_out( + self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset))) + if self.use_grid_points: + pts_out_refine, bbox_out_refine = self.gen_grid_from_reg( + pts_out_refine, bbox_out_init.detach()) + else: + pts_out_refine = pts_out_refine + pts_out_init.detach() + return cls_out, pts_out_init, pts_out_refine + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_points(self, featmap_sizes, img_metas): + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. + + Returns: + tuple: points of each image, valid flags of each image + """ + num_imgs = len(img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # points center for one time + multi_level_points = [] + for i in range(num_levels): + points = self.point_generators[i].grid_points( + featmap_sizes[i], self.point_strides[i]) + multi_level_points.append(points) + points_list = [[point.clone() for point in multi_level_points] + for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level grids + valid_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = [] + for i in range(num_levels): + point_stride = self.point_strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w, _ = img_meta['pad_shape'] + valid_feat_h = min(int(np.ceil(h / point_stride)), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride)), feat_w) + flags = self.point_generators[i].valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w)) + multi_level_flags.append(flags) + valid_flag_list.append(multi_level_flags) + + return points_list, valid_flag_list + + def centers_to_bboxes(self, point_list): + """Get bboxes according to center points. Only used in MaxIOUAssigner. + """ + bbox_list = [] + for i_img, point in enumerate(point_list): + bbox = [] + for i_lvl in range(len(self.point_strides)): + scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5 + bbox_shift = torch.Tensor([-scale, -scale, scale, + scale]).view(1, 4).type_as(point[0]) + bbox_center = torch.cat( + [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + bbox_shift) + bbox_list.append(bbox) + return bbox_list + + def offset_to_pts(self, center_list, pred_list): + """Change from point offset to point coordinate. + """ + pts_list = [] + for i_lvl in range(len(self.point_strides)): + pts_lvl = [] + for i_img in range(len(center_list)): + pts_center = center_list[i_img][i_lvl][:, :2].repeat( + 1, self.num_points) + pts_shift = pred_list[i_lvl][i_img] + yx_pts_shift = pts_shift.permute(1, 2, 0).view( + -1, 2 * self.num_points) + y_pts_shift = yx_pts_shift[..., 0::2] + x_pts_shift = yx_pts_shift[..., 1::2] + xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1) + xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1) + pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center + pts_lvl.append(pts) + pts_lvl = torch.stack(pts_lvl, 0) + pts_list.append(pts_lvl) + return pts_list + + def loss_single(self, cls_score, pts_pred_init, pts_pred_refine, labels, + label_weights, bbox_gt_init, bbox_weights_init, + bbox_gt_refine, bbox_weights_refine, stride, + num_total_samples_init, num_total_samples_refine): + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=num_total_samples_refine) + + # points loss + bbox_gt_init = bbox_gt_init.reshape(-1, 4) + bbox_weights_init = bbox_weights_init.reshape(-1, 4) + bbox_pred_init = self.points2bbox( + pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False) + bbox_gt_refine = bbox_gt_refine.reshape(-1, 4) + bbox_weights_refine = bbox_weights_refine.reshape(-1, 4) + bbox_pred_refine = self.points2bbox( + pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False) + normalize_term = self.point_base_scale * stride + loss_pts_init = self.loss_bbox_init( + bbox_pred_init / normalize_term, + bbox_gt_init / normalize_term, + bbox_weights_init, + avg_factor=num_total_samples_init) + loss_pts_refine = self.loss_bbox_refine( + bbox_pred_refine / normalize_term, + bbox_gt_refine / normalize_term, + bbox_weights_refine, + avg_factor=num_total_samples_refine) + return loss_cls, loss_pts_init, loss_pts_refine + + def loss(self, + cls_scores, + pts_preds_init, + pts_preds_refine, + gt_bboxes, + gt_labels, + img_metas, + cfg, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == len(self.point_generators) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + + # target for initial stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + img_metas) + pts_coordinate_preds_init = self.offset_to_pts(center_list, + pts_preds_init) + if cfg.init.assigner['type'] == 'PointAssigner': + # Assign target for center list + candidate_list = center_list + else: + # transform center list to bbox list and + # assign target for bbox list + bbox_list = self.centers_to_bboxes(center_list) + candidate_list = bbox_list + cls_reg_targets_init = point_target( + candidate_list, + valid_flag_list, + gt_bboxes, + img_metas, + cfg.init, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + sampling=self.sampling) + (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init, + num_total_pos_init, num_total_neg_init) = cls_reg_targets_init + num_total_samples_init = ( + num_total_pos_init + + num_total_neg_init if self.sampling else num_total_pos_init) + + # target for refinement stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + img_metas) + pts_coordinate_preds_refine = self.offset_to_pts( + center_list, pts_preds_refine) + bbox_list = [] + for i_img, center in enumerate(center_list): + bbox = [] + for i_lvl in range(len(pts_preds_refine)): + bbox_preds_init = self.points2bbox( + pts_preds_init[i_lvl].detach()) + bbox_shift = bbox_preds_init * self.point_strides[i_lvl] + bbox_center = torch.cat( + [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + + bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4)) + bbox_list.append(bbox) + cls_reg_targets_refine = point_target( + bbox_list, + valid_flag_list, + gt_bboxes, + img_metas, + cfg.refine, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels, + sampling=self.sampling) + (labels_list, label_weights_list, bbox_gt_list_refine, + candidate_list_refine, bbox_weights_list_refine, num_total_pos_refine, + num_total_neg_refine) = cls_reg_targets_refine + num_total_samples_refine = ( + num_total_pos_refine + + num_total_neg_refine if self.sampling else num_total_pos_refine) + + # compute loss + losses_cls, losses_pts_init, losses_pts_refine = multi_apply( + self.loss_single, + cls_scores, + pts_coordinate_preds_init, + pts_coordinate_preds_refine, + labels_list, + label_weights_list, + bbox_gt_list_init, + bbox_weights_list_init, + bbox_gt_list_refine, + bbox_weights_list_refine, + self.point_strides, + num_total_samples_init=num_total_samples_init, + num_total_samples_refine=num_total_samples_refine) + loss_dict_all = { + 'loss_cls': losses_cls, + 'loss_pts_init': losses_pts_init, + 'loss_pts_refine': losses_pts_refine + } + return loss_dict_all + + def get_bboxes(self, + cls_scores, + pts_preds_init, + pts_preds_refine, + img_metas, + cfg, + rescale=False, + nms=True): + assert len(cls_scores) == len(pts_preds_refine) + bbox_preds_refine = [ + self.points2bbox(pts_pred_refine) + for pts_pred_refine in pts_preds_refine + ] + num_levels = len(cls_scores) + mlvl_points = [ + self.point_generators[i].grid_points(cls_scores[i].size()[-2:], + self.point_strides[i]) + for i in range(num_levels) + ] + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds_refine[i][img_id].detach() + for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + mlvl_points, img_shape, + scale_factor, cfg, rescale, nms) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_points, + img_shape, + scale_factor, + cfg, + rescale=False, + nms=True): + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_bboxes = [] + mlvl_scores = [] + for i_lvl, (cls_score, bbox_pred, points) in enumerate( + zip(cls_scores, bbox_preds, mlvl_points)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, 1:].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1) + bboxes = bbox_pred * self.point_strides[i_lvl] + bbox_pos_center + x1 = bboxes[:, 0].clamp(min=0, max=img_shape[1]) + y1 = bboxes[:, 1].clamp(min=0, max=img_shape[0]) + x2 = bboxes[:, 2].clamp(min=0, max=img_shape[1]) + y2 = bboxes[:, 3].clamp(min=0, max=img_shape[0]) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([padding, mlvl_scores], dim=1) + if nms: + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py index d613a3bf7bdb3e95fa165a37c3036ecad36b4b34..189c823bdec5f0cbf0d3e157702c53c5ed75c934 100644 --- a/mmdet/models/detectors/__init__.py +++ b/mmdet/models/detectors/__init__.py @@ -8,6 +8,7 @@ from .grid_rcnn import GridRCNN from .htc import HybridTaskCascade from .mask_rcnn import MaskRCNN from .mask_scoring_rcnn import MaskScoringRCNN +from .reppoints_detector import RepPointsDetector from .retinanet import RetinaNet from .rpn import RPN from .single_stage import SingleStageDetector @@ -16,5 +17,6 @@ from .two_stage import TwoStageDetector __all__ = [ 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', - 'DoubleHeadRCNN', 'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN' + 'DoubleHeadRCNN', 'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN', + 'RepPointsDetector' ] diff --git a/mmdet/models/detectors/reppoints_detector.py b/mmdet/models/detectors/reppoints_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..53d698f1f69a9aeae3cb139820efc3e5c033142a --- /dev/null +++ b/mmdet/models/detectors/reppoints_detector.py @@ -0,0 +1,81 @@ +import torch + +from mmdet.core import bbox2result, bbox_mapping_back, multiclass_nms +from ..registry import DETECTORS +from .single_stage import SingleStageDetector + + +@DETECTORS.register_module +class RepPointsDetector(SingleStageDetector): + """RepPoints: Point Set Representation for Object Detection. + + This detector is the implementation of: + - RepPoints detector (https://arxiv.org/pdf/1904.11490) + """ + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(RepPointsDetector, + self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, + pretrained) + + def merge_aug_results(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores + + def aug_test(self, imgs, img_metas, rescale=False): + # recompute feats to save memory + feats = self.extract_feats(imgs) + + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.bbox_head(x) + bbox_inputs = outs + (img_meta, self.test_cfg, False, False) + det_bboxes, det_scores = self.bbox_head.get_bboxes(*bbox_inputs)[0] + aug_bboxes.append(det_bboxes) + aug_scores.append(det_scores) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_results( + aug_bboxes, aug_scores, img_metas) + det_bboxes, det_labels = multiclass_nms(merged_bboxes, merged_scores, + self.test_cfg.score_thr, + self.test_cfg.nms, + self.test_cfg.max_per_img) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor'] + bbox_results = bbox2result(_det_bboxes, det_labels, + self.bbox_head.num_classes) + return bbox_results