diff --git a/.gitignore b/.gitignore
index ffbae97a51e885187c5fc0c0485e58bf6067e310..01c47d6e277dba0d7b880dff88f9695f9a8eec50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,4 +104,6 @@ venv.bak/
 .mypy_cache/
 
 # cython generated cpp
-mmdet/ops/nms/*.cpp
\ No newline at end of file
+mmdet/ops/nms/*.cpp
+mmdet/version.py
+data
diff --git a/TDL.md b/TDL.md
deleted file mode 100644
index 1679338c04733a1a23e5dc2e8ac96069c6b3c41e..0000000000000000000000000000000000000000
--- a/TDL.md
+++ /dev/null
@@ -1,29 +0,0 @@
-### MMCV
-- [ ] Implement the attr 'get' of 'Config'
-- [ ] Config bugs: None type to '{}' with addict
-- [ ] Default logger should be only with gpu0
-- [ ] Unit Test: mmcv and mmcv.torchpack
-
-
-### MMDetection
-
-#### Basic
-- [ ] Implement training function without distributed
-- [ ] Verify nccl/nccl2/gloo
-- [ ] Replace UGLY code: params plug in 'args' to reach a global flow
-- [ ] Replace 'print' by 'logger'
-
-
-#### Testing
-- [ ] Implement distributed testing
-- [ ] Implement single gpu testing
-
-
-#### Refactor
-- [ ] Re-consider params names
-- [ ] Refactor functions in 'core'
-- [ ] Merge single test & aug test as one function, so as other redundancy
-
-#### New features
-- [ ] Plug loss params into Config
-- [ ] Multi-head communication
diff --git a/tools/examples/r50_fpn_frcnn_1x.py b/configs/faster_rcnn_r50_fpn_1x.py
similarity index 53%
rename from tools/examples/r50_fpn_frcnn_1x.py
rename to configs/faster_rcnn_r50_fpn_1x.py
index 6814445f8e1ba10a5fad24502ac8aff535f60f21..f4803f0b045e3801d2a09b652d6869625fb589f0 100644
--- a/tools/examples/r50_fpn_frcnn_1x.py
+++ b/configs/faster_rcnn_r50_fpn_1x.py
@@ -1,14 +1,14 @@
 # model settings
 model = dict(
-    pretrained=
-    '/mnt/lustre/pangjiangmiao/initmodel/pytorch/resnet50-19c8e357.pth',
+    type='FasterRCNN',
+    pretrained='modelzoo://resnet50',
     backbone=dict(
         type='resnet',
         depth=50,
         num_stages=4,
         out_indices=(0, 1, 2, 3),
         frozen_stages=1,
-        style='fb'),
+        style='pytorch'),
     neck=dict(
         type='FPN',
         in_channels=[256, 512, 1024, 2048],
@@ -18,15 +18,14 @@ model = dict(
         type='RPNHead',
         in_channels=256,
         feat_channels=256,
-        coarsest_stride=32,
         anchor_scales=[8],
         anchor_ratios=[0.5, 1.0, 2.0],
         anchor_strides=[4, 8, 16, 32, 64],
         target_means=[.0, .0, .0, .0],
         target_stds=[1.0, 1.0, 1.0, 1.0],
         use_sigmoid_cls=True),
-    roi_block=dict(
-        type='SingleLevelRoI',
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
         roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
         out_channels=256,
         featmap_strides=[4, 8, 16, 32]),
@@ -40,28 +39,23 @@ model = dict(
         target_means=[0., 0., 0., 0.],
         target_stds=[0.1, 0.1, 0.2, 0.2],
         reg_class_agnostic=False))
-meta_params = dict(
-    rpn_train_cfg = dict(
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
         pos_fraction=0.5,
         pos_balance_sampling=False,
         neg_pos_ub=256,
         allowed_border=0,
+        crowd_thr=1.1,
         anchor_batch_size=256,
         pos_iou_thr=0.7,
         neg_iou_thr=0.3,
         neg_balance_thr=0,
-        min_pos_iou=1e-3,
+        min_pos_iou=0.3,
         pos_weight=-1,
         smoothl1_beta=1 / 9.0,
         debug=False),
-    rpn_test_cfg = dict(
-        nms_across_levels=False,
-        nms_pre=2000,
-        nms_post=2000,
-        max_num=2000,
-        nms_thr=0.7,
-        min_bbox_size=0),
-    rcnn_train_cfg = dict(
+    rcnn=dict(
         pos_iou_thr=0.5,
         neg_iou_thr=0.5,
         crowd_thr=1.1,
@@ -71,55 +65,84 @@ meta_params = dict(
         pos_balance_sampling=False,
         neg_pos_ub=512,
         neg_balance_thr=0,
+        min_pos_iou=1.1,
         pos_weight=-1,
-        debug=False),
-    rcnn_test_cfg = dict(score_thr=1e-3, max_per_img=100, nms_thr=0.5)
-)
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(score_thr=0.05, max_per_img=100, nms_thr=0.5))
 # dataset settings
-data_root = '/mnt/lustre/pangjiangmiao/dataset/coco/'
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
 img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53],
-    std=[58.395, 57.12, 57.375],
-    to_rgb=True)
-img_per_gpu = 1
-data_workers = 2
-train_dataset = dict(
-    ann_file=data_root + 'annotations/instances_train2017.json',
-    img_prefix=data_root + 'train2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32,
-    flip_ratio=0.5)
-test_dataset = dict(
-    ann_file=data_root + 'annotations/instances_val2017.json',
-    img_prefix=data_root + 'val2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32)
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
 # optimizer
 optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-grad_clip_config = dict(grad_clip=True, max_norm=35, norm_type=2)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
 # learning policy
-lr_policy = dict(
+lr_config = dict(
     policy='step',
     warmup='linear',
     warmup_iters=500,
-    warmup_ratio=0.333,
+    warmup_ratio=1.0 / 3,
     step=[8, 11])
-max_epoch = 12
 checkpoint_config = dict(interval=1)
-dist_params = dict(backend='nccl', port='29500', master_ip='127.0.0.1')
-# logging settings
-log_level = 'INFO'
 # yapf:disable
 log_config = dict(
     interval=50,
     hooks=[
         dict(type='TextLoggerHook'),
-        # ('TensorboardLoggerHook', dict(log_dir=work_dir + '/log')),
+        # dict(type='TensorboardLoggerHook')
     ])
 # yapf:enable
-work_dir = './model/r50_fpn_frcnn_1x'
+# runtime settings
+total_epochs = 12
+device_ids = range(8)
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
 load_from = None
 resume_from = None
 workflow = [('train', 1)]
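For context (not part of the patch): a minimal sketch of how a config in this flattened style is typically loaded and inspected, assuming mmcv's `Config.fromfile` API and that the file sits at the renamed path above.

from mmcv import Config

# Illustrative only: load the renamed config and read a few of the new fields.
cfg = Config.fromfile('configs/faster_rcnn_r50_fpn_1x.py')

# Nested dicts are exposed with attribute access.
print(cfg.model.type)                 # 'FasterRCNN'
print(cfg.train_cfg.rpn.pos_iou_thr)  # 0.7
print(cfg.test_cfg.rcnn.score_thr)    # 0.05
print(cfg.data.imgs_per_gpu)          # 2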
diff --git a/tools/examples/r50_fpn_maskrcnn_1x.py b/configs/mask_rcnn_r50_fpn_1x.py
similarity index 57%
rename from tools/examples/r50_fpn_maskrcnn_1x.py
rename to configs/mask_rcnn_r50_fpn_1x.py
index 49b32037ec5139ee64d21bc6e9c607dcd69da018..4760821e24464b2e21d5ac0b0b0418f4163e9494 100644
--- a/tools/examples/r50_fpn_maskrcnn_1x.py
+++ b/configs/mask_rcnn_r50_fpn_1x.py
@@ -1,14 +1,14 @@
 # model settings
 model = dict(
-    pretrained=
-    '/mnt/lustre/pangjiangmiao/initmodel/pytorch/resnet50-19c8e357.pth',
+    type='MaskRCNN',
+    pretrained='modelzoo://resnet50',
     backbone=dict(
         type='resnet',
         depth=50,
         num_stages=4,
         out_indices=(0, 1, 2, 3),
         frozen_stages=1,
-        style='fb'),
+        style='pytorch'),
     neck=dict(
         type='FPN',
         in_channels=[256, 512, 1024, 2048],
@@ -18,15 +18,14 @@ model = dict(
         type='RPNHead',
         in_channels=256,
         feat_channels=256,
-        coarsest_stride=32,
         anchor_scales=[8],
         anchor_ratios=[0.5, 1.0, 2.0],
         anchor_strides=[4, 8, 16, 32, 64],
         target_means=[.0, .0, .0, .0],
         target_stds=[1.0, 1.0, 1.0, 1.0],
         use_sigmoid_cls=True),
-    roi_block=dict(
-        type='SingleLevelRoI',
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
         roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
         out_channels=256,
         featmap_strides=[4, 8, 16, 32]),
@@ -40,8 +39,8 @@ model = dict(
         target_means=[0., 0., 0., 0.],
         target_stds=[0.1, 0.1, 0.2, 0.2],
         reg_class_agnostic=False),
-    mask_block=dict(
-        type='SingleLevelRoI',
+    mask_roi_extractor=dict(
+        type='SingleRoIExtractor',
         roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
         out_channels=256,
         featmap_strides=[4, 8, 16, 32]),
@@ -51,28 +50,23 @@ model = dict(
         in_channels=256,
         conv_out_channels=256,
         num_classes=81))
-meta_params = dict(
-    rpn_train_cfg=dict(
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
         pos_fraction=0.5,
         pos_balance_sampling=False,
         neg_pos_ub=256,
         allowed_border=0,
+        crowd_thr=1.1,
         anchor_batch_size=256,
         pos_iou_thr=0.7,
         neg_iou_thr=0.3,
         neg_balance_thr=0,
-        min_pos_iou=1e-3,
+        min_pos_iou=0.3,
         pos_weight=-1,
         smoothl1_beta=1 / 9.0,
         debug=False),
-    rpn_test_cfg=dict(
-        nms_across_levels=False,
-        nms_pre=2000,
-        nms_post=2000,
-        max_num=2000,
-        nms_thr=0.7,
-        min_bbox_size=0),
-    rcnn_train_cfg=dict(
+    rcnn=dict(
         mask_size=28,
         pos_iou_thr=0.5,
         neg_iou_thr=0.5,
@@ -83,54 +77,85 @@ meta_params = dict(
         pos_balance_sampling=False,
         neg_pos_ub=512,
         neg_balance_thr=0,
+        min_pos_iou=1.1,
         pos_weight=-1,
-        debug=False),
-    rcnn_test_cfg=dict(
-        score_thr=1e-3, max_per_img=100, nms_thr=0.5, mask_thr_binary=0.5))
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        score_thr=0.05, max_per_img=100, nms_thr=0.5, mask_thr_binary=0.5))
 # dataset settings
-data_root = '/mnt/lustre/pangjiangmiao/dataset/coco/'
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
 img_norm_cfg = dict(
     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-img_per_gpu = 1
-data_workers = 2
-train_dataset = dict(
-    with_mask=True,
-    ann_file=data_root + 'annotations/instances_train2017.json',
-    img_prefix=data_root + 'train2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32,
-    flip_ratio=0.5)
-test_dataset = dict(
-    ann_file=data_root + 'annotations/instances_val2017.json',
-    img_prefix=data_root + 'val2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
 # optimizer
 optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-grad_clip_config = dict(grad_clip=True, max_norm=35, norm_type=2)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
 # learning policy
-lr_policy = dict(
+lr_config = dict(
     policy='step',
     warmup='linear',
     warmup_iters=500,
-    warmup_ratio=0.333,
+    warmup_ratio=1.0 / 3,
     step=[8, 11])
-max_epoch = 12
 checkpoint_config = dict(interval=1)
-dist_params = dict(backend='nccl', port='29500', master_ip='127.0.0.1')
-# logging settings
-log_level = 'INFO'
 # yapf:disable
 log_config = dict(
     interval=50,
     hooks=[
         dict(type='TextLoggerHook'),
-        # ('TensorboardLoggerHook', dict(log_dir=work_dir + '/log')),
+        # dict(type='TensorboardLoggerHook')
     ])
 # yapf:enable
-work_dir = './model/r50_fpn_mask_rcnn_1x'
+# runtime settings
+total_epochs = 12
+device_ids = range(8)
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/mask_rcnn_r50_fpn_1x'
 load_from = None
 resume_from = None
 workflow = [('train', 1)]
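As a quick sanity check on the schedule above (illustrative arithmetic only; the linear-scaling reference is a common rule of thumb, not something this patch states):

# imgs_per_gpu=2 with device_ids=range(8) gives 16 images per iteration.
imgs_per_gpu = 2
num_gpus = 8
batch_size = imgs_per_gpu * num_gpus      # 16

# Rule-of-thumb linear scaling (assumption): lr = 0.02 * batch_size / 16.
base_lr = 0.02 * batch_size / 16          # 0.02 for this default setup

# Linear warmup starts around base_lr * warmup_ratio and ramps up over
# warmup_iters=500 iterations (per lr_config above).
warmup_start_lr = base_lr * (1.0 / 3)     # ~0.0067
print(batch_size, base_lr, warmup_start_lr)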
diff --git a/configs/rpn_r50_fpn_1x.py b/configs/rpn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e45eb9e41b8b727256b2abfe974e12802b73560
--- /dev/null
+++ b/configs/rpn_r50_fpn_1x.py
@@ -0,0 +1,118 @@
+# model settings
+model = dict(
+    type='RPN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='resnet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_scales=[8],
+        anchor_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        target_means=[.0, .0, .0, .0],
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        use_sigmoid_cls=True))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        pos_fraction=0.5,
+        pos_balance_sampling=False,
+        neg_pos_ub=256,
+        allowed_border=0,
+        crowd_thr=1.1,
+        anchor_batch_size=256,
+        pos_iou_thr=0.7,
+        neg_iou_thr=0.3,
+        neg_balance_thr=0,
+        min_pos_iou=0.3,
+        pos_weight=-1,
+        smoothl1_beta=1 / 9.0,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+# runner configs
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/rpn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
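One worked detail that applies to all three configs (illustrative; the AnchorGenerator internals are not part of this diff): with anchor_scales=[8] and anchor_strides=[4, 8, 16, 32, 64], the usual size = scale * stride convention gives one base anchor edge length per FPN level.

# Illustrative only: base anchor edge lengths implied by the RPN head settings,
# assuming the common size = scale * stride convention (before aspect ratios).
scales = [8]
strides = [4, 8, 16, 32, 64]
base_sizes = [s * stride for stride in strides for s in scales]
print(base_sizes)  # [32, 64, 128, 256, 512]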
diff --git a/mmdet/__init__.py b/mmdet/__init__.py
index 58f3ace6c03d093337c9fa417ccbe8bc267b6c69..1c4f7e8fcc54041e383b72d48860ccbdc3afc41c 100644
--- a/mmdet/__init__.py
+++ b/mmdet/__init__.py
@@ -1 +1,3 @@
-from .version import __version__
+from .version import __version__, short_version
+
+__all__ = ['__version__', 'short_version']
diff --git a/mmdet/core/__init__.py b/mmdet/core/__init__.py
index 52ed690e6689abdd1dcc4af6ccb237f1d3fbdad9..645d5be29c039aeb2173525163b681675741d7ea 100644
--- a/mmdet/core/__init__.py
+++ b/mmdet/core/__init__.py
@@ -1,9 +1,7 @@
-from .train_engine import *
-from .test_engine import *
-from .rpn_ops import *
-from .bbox_ops import *
-from .mask_ops import *
-from .losses import *
-from .eval import *
-from .post_processing import *
-from .utils import *
+from .anchor import *  # noqa: F401, F403
+from .bbox import *  # noqa: F401, F403
+from .mask import *  # noqa: F401, F403
+from .loss import *  # noqa: F401, F403
+from .evaluation import *  # noqa: F401, F403
+from .post_processing import *  # noqa: F401, F403
+from .utils import *  # noqa: F401, F403
diff --git a/mmdet/core/anchor/__init__.py b/mmdet/core/anchor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ff430a4be1825fbbaa3cb31d54de8790aa2fb90
--- /dev/null
+++ b/mmdet/core/anchor/__init__.py
@@ -0,0 +1,4 @@
+from .anchor_generator import AnchorGenerator
+from .anchor_target import anchor_target
+
+__all__ = ['AnchorGenerator', 'anchor_target']
diff --git a/mmdet/core/rpn_ops/anchor_generator.py b/mmdet/core/anchor/anchor_generator.py
similarity index 98%
rename from mmdet/core/rpn_ops/anchor_generator.py
rename to mmdet/core/anchor/anchor_generator.py
index e7a1fa256fb6d4df69be77a341728ed194b54b7e..84600be331e52d9a64f70e2cb43696b82801bf0e 100644
--- a/mmdet/core/rpn_ops/anchor_generator.py
+++ b/mmdet/core/anchor/anchor_generator.py
@@ -50,15 +50,18 @@ class AnchorGenerator(object):
             return yy, xx
 
     def grid_anchors(self, featmap_size, stride=16, device='cuda'):
+        base_anchors = self.base_anchors.to(device)
+
         feat_h, feat_w = featmap_size
         shift_x = torch.arange(0, feat_w, device=device) * stride
         shift_y = torch.arange(0, feat_h, device=device) * stride
         shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
         shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+        shifts = shifts.type_as(base_anchors)
         # first feat_w elements correspond to the first row of shifts
         # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
         # shifted anchors (K, A, 4), reshape to (K*A, 4)
-        base_anchors = self.base_anchors.to(device)
+
         all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
         all_anchors = all_anchors.view(-1, 4)
         # first A rows correspond to A anchors of (0, 0) in feature map,
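A tiny self-contained sketch of the broadcasting pattern grid_anchors() relies on (toy shapes only, not the module's actual anchors):

import torch

A, K = 3, 4                              # 3 base anchors, 4 grid locations
base_anchors = torch.zeros(A, 4)         # (A, 4) anchors centred at (0, 0)
shifts = torch.arange(K, dtype=torch.float32).view(K, 1).repeat(1, 4)  # (K, 4)

# (1, A, 4) + (K, 1, 4) broadcasts to (K, A, 4): every base anchor is copied
# to every grid location, then flattened to (K*A, 4) just like in grid_anchors.
all_anchors = (base_anchors[None, :, :] + shifts[:, None, :]).view(-1, 4)
print(all_anchors.shape)  # torch.Size([12, 4])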
diff --git a/mmdet/core/anchor/anchor_target.py b/mmdet/core/anchor/anchor_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad81e390e6dcb2a064862818a34ea99adbe462e0
--- /dev/null
+++ b/mmdet/core/anchor/anchor_target.py
@@ -0,0 +1,149 @@
+import torch
+
+from ..bbox import bbox_assign, bbox2delta, bbox_sampling
+from ..utils import multi_apply
+
+
+def anchor_target(anchor_list, valid_flag_list, gt_bboxes_list, img_metas,
+                  target_means, target_stds, cfg):
+    """Compute regression and classification targets for anchors.
+
+    Args:
+        anchor_list (list[list]): Multi level anchors of each image.
+        valid_flag_list (list[list]): Multi level valid flags of each image.
+        gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+        img_metas (list[dict]): Meta info of each image.
+        target_means (Iterable): Mean value of regression targets.
+        target_stds (Iterable): Std value of regression targets.
+        cfg (dict): RPN train configs.
+
+    Returns:
+        tuple
+    """
+    num_imgs = len(img_metas)
+    assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+    # anchor number of multi levels
+    num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+    # concat all level anchors and flags to a single tensor
+    for i in range(num_imgs):
+        assert len(anchor_list[i]) == len(valid_flag_list[i])
+        anchor_list[i] = torch.cat(anchor_list[i])
+        valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+    # compute targets for each image
+    means_replicas = [target_means for _ in range(num_imgs)]
+    stds_replicas = [target_stds for _ in range(num_imgs)]
+    cfg_replicas = [cfg for _ in range(num_imgs)]
+    (all_labels, all_label_weights, all_bbox_targets,
+     all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
+         anchor_target_single, anchor_list, valid_flag_list, gt_bboxes_list,
+         img_metas, means_replicas, stds_replicas, cfg_replicas)
+    # no valid anchors
+    if any([labels is None for labels in all_labels]):
+        return None
+    # sampled anchors of all images
+    num_total_samples = sum([
+        max(pos_inds.numel() + neg_inds.numel(), 1)
+        for pos_inds, neg_inds in zip(pos_inds_list, neg_inds_list)
+    ])
+    # split targets to a list w.r.t. multiple levels
+    labels_list = images_to_levels(all_labels, num_level_anchors)
+    label_weights_list = images_to_levels(all_label_weights, num_level_anchors)
+    bbox_targets_list = images_to_levels(all_bbox_targets, num_level_anchors)
+    bbox_weights_list = images_to_levels(all_bbox_weights, num_level_anchors)
+    return (labels_list, label_weights_list, bbox_targets_list,
+            bbox_weights_list, num_total_samples)
+
+
+def images_to_levels(target, num_level_anchors):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = torch.stack(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_level_anchors:
+        end = start + n
+        level_targets.append(target[:, start:end].squeeze(0))
+        start = end
+    return level_targets
+
+
+def anchor_target_single(flat_anchors, valid_flags, gt_bboxes, img_meta,
+                         target_means, target_stds, cfg):
+    inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                       img_meta['img_shape'][:2],
+                                       cfg.allowed_border)
+    if not inside_flags.any():
+        return (None, ) * 6
+    # assign gt and sample anchors
+    anchors = flat_anchors[inside_flags, :]
+    assigned_gt_inds, argmax_overlaps, max_overlaps = bbox_assign(
+        anchors,
+        gt_bboxes,
+        pos_iou_thr=cfg.pos_iou_thr,
+        neg_iou_thr=cfg.neg_iou_thr,
+        min_pos_iou=cfg.min_pos_iou)
+    pos_inds, neg_inds = bbox_sampling(assigned_gt_inds, cfg.anchor_batch_size,
+                                       cfg.pos_fraction, cfg.neg_pos_ub,
+                                       cfg.pos_balance_sampling, max_overlaps,
+                                       cfg.neg_balance_thr)
+
+    bbox_targets = torch.zeros_like(anchors)
+    bbox_weights = torch.zeros_like(anchors)
+    labels = torch.zeros_like(assigned_gt_inds)
+    label_weights = torch.zeros_like(assigned_gt_inds, dtype=anchors.dtype)
+
+    if len(pos_inds) > 0:
+        pos_anchors = anchors[pos_inds, :]
+        pos_gt_bbox = gt_bboxes[assigned_gt_inds[pos_inds] - 1, :]
+        pos_bbox_targets = bbox2delta(pos_anchors, pos_gt_bbox, target_means,
+                                      target_stds)
+        bbox_targets[pos_inds, :] = pos_bbox_targets
+        bbox_weights[pos_inds, :] = 1.0
+        labels[pos_inds] = 1
+        if cfg.pos_weight <= 0:
+            label_weights[pos_inds] = 1.0
+        else:
+            label_weights[pos_inds] = cfg.pos_weight
+    if len(neg_inds) > 0:
+        label_weights[neg_inds] = 1.0
+
+    # map up to original set of anchors
+    num_total_anchors = flat_anchors.size(0)
+    labels = unmap(labels, num_total_anchors, inside_flags)
+    label_weights = unmap(label_weights, num_total_anchors, inside_flags)
+    bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+    bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+    return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+            neg_inds)
+
+
+def anchor_inside_flags(flat_anchors, valid_flags, img_shape,
+                        allowed_border=0):
+    img_h, img_w = img_shape[:2]
+    if allowed_border >= 0:
+        inside_flags = valid_flags & \
+            (flat_anchors[:, 0] >= -allowed_border) & \
+            (flat_anchors[:, 1] >= -allowed_border) & \
+            (flat_anchors[:, 2] < img_w + allowed_border) & \
+            (flat_anchors[:, 3] < img_h + allowed_border)
+    else:
+        inside_flags = valid_flags
+    return inside_flags
+
+
+def unmap(data, count, inds, fill=0):
+    """ Unmap a subset of item (data) back to the original set of items (of
+    size count) """
+    if data.dim() == 1:
+        ret = data.new_full((count, ), fill)
+        ret[inds] = data
+    else:
+        new_size = (count, ) + data.size()[1:]
+        ret = data.new_full(new_size, fill)
+        ret[inds, :] = data
+    return ret
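A small sketch of what the two helpers above do with toy tensors (assuming the package is importable from the new module path; shapes and values are made up):

import torch
from mmdet.core.anchor.anchor_target import images_to_levels, unmap

# unmap(): scatter values computed for the inside anchors back to the full
# anchor set, filling the rest with `fill`.
inside_flags = torch.tensor([True, False, True, False])
labels_inside = torch.tensor([1, 0])            # one label per inside anchor
print(unmap(labels_inside, 4, inside_flags, fill=-1))  # tensor([ 1, -1,  0, -1])

# images_to_levels(): regroup per-image targets into per-level targets.
num_level_anchors = [3, 2]                      # e.g. 3 anchors on P2, 2 on P3
per_image = [torch.arange(5), torch.arange(5, 10)]
per_level = images_to_levels(per_image, num_level_anchors)
print([t.shape for t in per_level])             # [torch.Size([2, 3]), torch.Size([2, 2])]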
diff --git a/mmdet/core/bbox/__init__.py b/mmdet/core/bbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5c21dce52f25781e2e4e3e760a837d4d36eec5c
--- /dev/null
+++ b/mmdet/core/bbox/__init__.py
@@ -0,0 +1,15 @@
+from .geometry import bbox_overlaps
+from .sampling import (random_choice, bbox_assign, bbox_assign_wrt_overlaps,
+                       bbox_sampling, bbox_sampling_pos, bbox_sampling_neg,
+                       sample_bboxes)
+from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping,
+                         bbox_mapping_back, bbox2roi, roi2bbox, bbox2result)
+from .bbox_target import bbox_target
+
+__all__ = [
+    'bbox_overlaps', 'random_choice', 'bbox_assign',
+    'bbox_assign_wrt_overlaps', 'bbox_sampling', 'bbox_sampling_pos',
+    'bbox_sampling_neg', 'sample_bboxes', 'bbox2delta', 'delta2bbox',
+    'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', 'bbox2roi', 'roi2bbox',
+    'bbox2result', 'bbox_target'
+]
diff --git a/mmdet/core/bbox_ops/bbox_target.py b/mmdet/core/bbox/bbox_target.py
similarity index 60%
rename from mmdet/core/bbox_ops/bbox_target.py
rename to mmdet/core/bbox/bbox_target.py
index ce1f885e184a37779c7636f8c6053248e8cd3330..2e205c3850c9bc232b99826a23e79f416a3dbcfb 100644
--- a/mmdet/core/bbox_ops/bbox_target.py
+++ b/mmdet/core/bbox/bbox_target.py
@@ -1,8 +1,7 @@
-import mmcv
 import torch
 
-from .geometry import bbox_overlaps
-from .transforms import bbox_transform, bbox_transform_inv
+from .transforms import bbox2delta
+from ..utils import multi_apply
 
 
 def bbox_target(pos_proposals_list,
@@ -13,33 +12,23 @@ def bbox_target(pos_proposals_list,
                 reg_num_classes=1,
                 target_means=[.0, .0, .0, .0],
                 target_stds=[1.0, 1.0, 1.0, 1.0],
-                return_list=False):
-    img_per_gpu = len(pos_proposals_list)
-    all_labels = []
-    all_label_weights = []
-    all_bbox_targets = []
-    all_bbox_weights = []
-    for img_id in range(img_per_gpu):
-        pos_proposals = pos_proposals_list[img_id]
-        neg_proposals = neg_proposals_list[img_id]
-        pos_gt_bboxes = pos_gt_bboxes_list[img_id]
-        pos_gt_labels = pos_gt_labels_list[img_id]
-        debug_img = debug_imgs[img_id] if cfg.debug else None
-        labels, label_weights, bbox_targets, bbox_weights = proposal_target_single(
-            pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
-            reg_num_classes, cfg, target_means, target_stds)
-        all_labels.append(labels)
-        all_label_weights.append(label_weights)
-        all_bbox_targets.append(bbox_targets)
-        all_bbox_weights.append(bbox_weights)
+                concat=True):
+    labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+        proposal_target_single,
+        pos_proposals_list,
+        neg_proposals_list,
+        pos_gt_bboxes_list,
+        pos_gt_labels_list,
+        cfg=cfg,
+        reg_num_classes=reg_num_classes,
+        target_means=target_means,
+        target_stds=target_stds)
 
-    if return_list:
-        return all_labels, all_label_weights, all_bbox_targets, all_bbox_weights
-
-    labels = torch.cat(all_labels, 0)
-    label_weights = torch.cat(all_label_weights, 0)
-    bbox_targets = torch.cat(all_bbox_targets, 0)
-    bbox_weights = torch.cat(all_bbox_weights, 0)
+    if concat:
+        labels = torch.cat(labels, 0)
+        label_weights = torch.cat(label_weights, 0)
+        bbox_targets = torch.cat(bbox_targets, 0)
+        bbox_weights = torch.cat(bbox_weights, 0)
     return labels, label_weights, bbox_targets, bbox_weights
 
 
@@ -47,8 +36,8 @@ def proposal_target_single(pos_proposals,
                            neg_proposals,
                            pos_gt_bboxes,
                            pos_gt_labels,
-                           reg_num_classes,
                            cfg,
+                           reg_num_classes=1,
                            target_means=[.0, .0, .0, .0],
                            target_stds=[1.0, 1.0, 1.0, 1.0]):
     num_pos = pos_proposals.size(0)
@@ -62,8 +51,8 @@ def proposal_target_single(pos_proposals,
         labels[:num_pos] = pos_gt_labels
         pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
         label_weights[:num_pos] = pos_weight
-        pos_bbox_targets = bbox_transform(pos_proposals, pos_gt_bboxes,
-                                          target_means, target_stds)
+        pos_bbox_targets = bbox2delta(pos_proposals, pos_gt_bboxes,
+                                      target_means, target_stds)
         bbox_targets[:num_pos, :] = pos_bbox_targets
         bbox_weights[:num_pos, :] = 1
     if num_neg > 0:
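For readers unfamiliar with `multi_apply` (imported from `..utils` above but not shown in this patch): it maps a function over several lists in parallel and transposes the per-call result tuples into a tuple of lists. A minimal stand-in, under that assumption:

from functools import partial

def multi_apply_sketch(func, *args, **kwargs):
    # Hypothetical stand-in for mmdet.core.utils.multi_apply (not shown here).
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))

def add_and_mul(a, b, scale=1):
    return a + b, a * b * scale

sums, prods = multi_apply_sketch(add_and_mul, [1, 2], [3, 4], scale=10)
print(sums, prods)   # [4, 6] [30, 80]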
diff --git a/mmdet/core/bbox_ops/geometry.py b/mmdet/core/bbox/geometry.py
similarity index 100%
rename from mmdet/core/bbox_ops/geometry.py
rename to mmdet/core/bbox/geometry.py
diff --git a/mmdet/core/bbox_ops/sampling.py b/mmdet/core/bbox/sampling.py
similarity index 61%
rename from mmdet/core/bbox_ops/sampling.py
rename to mmdet/core/bbox/sampling.py
index eed820496409f1f8265f73e81bd4667e6b1558f8..976cd9507f2279b663d3f5e09ed1180da5b457c1 100644
--- a/mmdet/core/bbox_ops/sampling.py
+++ b/mmdet/core/bbox/sampling.py
@@ -5,6 +5,11 @@ from .geometry import bbox_overlaps
 
 
 def random_choice(gallery, num):
+    """Random select some elements from the gallery.
+
+    It seems that PyTorch's implementation is slower than numpy, so we use
+    numpy to randperm the indices.
+    """
     assert len(gallery) >= num
     if isinstance(gallery, list):
         gallery = np.array(gallery)
@@ -12,38 +17,42 @@ def random_choice(gallery, num):
     np.random.shuffle(cands)
     rand_inds = cands[:num]
     if not isinstance(gallery, np.ndarray):
-        rand_inds = torch.from_numpy(rand_inds).long()
-        if gallery.is_cuda:
-            rand_inds = rand_inds.cuda(gallery.get_device())
+        rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
     return gallery[rand_inds]
 
 
 def bbox_assign(proposals,
                 gt_bboxes,
-                gt_crowd_bboxes=None,
+                gt_bboxes_ignore=None,
                 gt_labels=None,
                 pos_iou_thr=0.5,
                 neg_iou_thr=0.5,
                 min_pos_iou=.0,
                 crowd_thr=-1):
-    """Assign a corresponding gt bbox or background to each proposal/anchor
-    This function assign a gt bbox to every proposal, each proposals will be
-    assigned with -1, 0, or a positive number. -1 means don't care, 0 means
-    negative sample, positive number is the index (1-based) of assigned gt.
-    If gt_crowd_bboxes is not None, proposals which have iof(intersection over foreground)
-    with crowd bboxes over crowd_thr will be ignored
+    """Assign a corresponding gt bbox or background to each proposal/anchor.
+
+    Each proposal will be assigned with `-1`, `0`, or a positive integer.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    If `gt_bboxes_ignore` is specified, bboxes which have iof (intersection
+    over foreground) with `gt_bboxes_ignore` above `crowd_thr` will be ignored.
+
     Args:
-        proposals(Tensor): proposals or RPN anchors, shape (n, 4)
-        gt_bboxes(Tensor): shape (k, 4)
-        gt_crowd_bboxes(Tensor): shape(m, 4)
-        gt_labels(Tensor, optional): shape (k, )
-        pos_iou_thr(float): iou threshold for positive bboxes
-        neg_iou_thr(float or tuple): iou threshold for negative bboxes
-        min_pos_iou(float): minimum iou for a bbox to be considered as a positive bbox,
-                            for RPN, it is usually set as 0, for Fast R-CNN,
-                            it is usually set as pos_iou_thr
-        crowd_thr: ignore proposals which have iof(intersection over foreground) with
-        crowd bboxes over crowd_thr
+        proposals (Tensor): Proposals or RPN anchors, shape (n, 4).
+        gt_bboxes (Tensor): Ground truth bboxes, shape (k, 4).
+        gt_bboxes_ignore (Tensor, optional): shape(m, 4).
+        gt_labels (Tensor, optional): shape (k, ).
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum IoU for a bbox to be considered as a
+            positive bbox. For RPN, it is usually set to 0.3; for Fast R-CNN,
+            it is usually set to pos_iou_thr.
+        crowd_thr (float): IoF threshold for ignoring bboxes. Negative value
+            for not ignoring any bboxes.
+
     Returns:
         tuple: (assigned_gt_inds, argmax_overlaps, max_overlaps), shape (n, )
     """
@@ -54,45 +63,50 @@ def bbox_assign(proposals,
         raise ValueError('No gt bbox or proposals')
 
     # ignore proposals according to crowd bboxes
-    if (crowd_thr > 0) and (gt_crowd_bboxes is
-                            not None) and (gt_crowd_bboxes.numel() > 0):
-        crowd_overlaps = bbox_overlaps(proposals, gt_crowd_bboxes, mode='iof')
+    if (crowd_thr > 0) and (gt_bboxes_ignore is
+                            not None) and (gt_bboxes_ignore.numel() > 0):
+        crowd_overlaps = bbox_overlaps(proposals, gt_bboxes_ignore, mode='iof')
         crowd_max_overlaps, _ = crowd_overlaps.max(dim=1)
         crowd_bboxes_inds = torch.nonzero(
             crowd_max_overlaps > crowd_thr).long()
         if crowd_bboxes_inds.numel() > 0:
             overlaps[crowd_bboxes_inds, :] = -1
 
-    return bbox_assign_via_overlaps(overlaps, gt_labels, pos_iou_thr,
+    return bbox_assign_wrt_overlaps(overlaps, gt_labels, pos_iou_thr,
                                     neg_iou_thr, min_pos_iou)
 
 
-def bbox_assign_via_overlaps(overlaps,
+def bbox_assign_wrt_overlaps(overlaps,
                              gt_labels=None,
                              pos_iou_thr=0.5,
                              neg_iou_thr=0.5,
                              min_pos_iou=.0):
-    """Assign a corresponding gt bbox or background to each proposal/anchor
-    This function assign a gt bbox to every proposal, each proposals will be
+    """Assign a corresponding gt bbox or background to each proposal/anchor.
+
+    This method assigns a gt bbox to every proposal. Each proposal will be
     assigned with -1, 0, or a positive number. -1 means don't care, 0 means
     negative sample, positive number is the index (1-based) of assigned gt.
     The assignment is done in following steps, the order matters:
+
     1. assign every anchor to -1
     2. assign proposals whose iou with all gts < neg_iou_thr to 0
     3. for each anchor, if the iou with its nearest gt >= pos_iou_thr,
     assign it to that bbox
     4. for each gt bbox, assign its nearest proposals(may be more than one)
     to itself
+
     Args:
-        overlaps(Tensor): overlaps between n proposals and k gt_bboxes, shape(n, k)
-        gt_labels(Tensor, optional): shape (k, )
-        pos_iou_thr(float): iou threshold for positive bboxes
-        neg_iou_thr(float or tuple): iou threshold for negative bboxes
-        min_pos_iou(float): minimum iou for a bbox to be considered as a positive bbox,
-                            for RPN, it is usually set as 0, for Fast R-CNN,
-                            it is usually set as pos_iou_thr
+        overlaps (Tensor): Overlaps between n proposals and k gt_bboxes,
+            shape(n, k).
+        gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum IoU for a bbox to be considered as a
+            positive bbox. This argument only affects the 4th step.
+
     Returns:
-        tuple: (assigned_gt_inds, argmax_overlaps, max_overlaps), shape (n, )
+        tuple: (assigned_gt_inds, [assigned_labels], argmax_overlaps,
+            max_overlaps), shape (n, )
     """
     num_bboxes, num_gts = overlaps.size(0), overlaps.size(1)
     # 1. assign -1 by default
@@ -138,8 +152,9 @@ def bbox_assign_via_overlaps(overlaps,
         return assigned_gt_inds, assigned_labels, argmax_overlaps, max_overlaps
 
 
-def sample_positives(assigned_gt_inds, num_expected, balance_sampling=True):
-    """Balance sampling for positive bboxes/anchors
+def bbox_sampling_pos(assigned_gt_inds, num_expected, balance_sampling=True):
+    """Balance sampling for positive bboxes/anchors.
+
     1. calculate average positive num for each gt: num_per_gt
     2. sample at most num_per_gt positives for each gt
     3. random sampling from rest anchors if not enough fg
@@ -180,15 +195,16 @@ def sample_positives(assigned_gt_inds, num_expected, balance_sampling=True):
         return sampled_inds
 
 
-def sample_negatives(assigned_gt_inds,
-                     num_expected,
-                     max_overlaps=None,
-                     balance_thr=0,
-                     hard_fraction=0.5):
-    """Balance sampling for negative bboxes/anchors
-    negative samples are split into 2 set: hard(balance_thr <= iou < neg_iou_thr)
-    and easy(iou < balance_thr), around equal number of bg are sampled
-    from each set.
+def bbox_sampling_neg(assigned_gt_inds,
+                      num_expected,
+                      max_overlaps=None,
+                      balance_thr=0,
+                      hard_fraction=0.5):
+    """Balance sampling for negative bboxes/anchors.
+
+    Negative samples are split into 2 sets: hard (balance_thr <= iou <
+    neg_iou_thr) and easy (iou < balance_thr). The sampling ratio is controlled
+    by `hard_fraction`.
     """
     neg_inds = torch.nonzero(assigned_gt_inds == 0)
     if neg_inds.numel() != 0:
@@ -241,55 +257,87 @@ def bbox_sampling(assigned_gt_inds,
                   max_overlaps=None,
                   neg_balance_thr=0,
                   neg_hard_fraction=0.5):
+    """Sample positive and negative bboxes given assigned results.
+
+    Args:
+        assigned_gt_inds (Tensor): Assigned gt indices for each bbox.
+        num_expected (int): Expected total samples (pos and neg).
+        pos_fraction (float): Positive sample fraction.
+        neg_pos_ub (float): Negative/Positive upper bound.
+        pos_balance_sampling (bool): Whether to sample positive samples around
+            each gt bbox evenly.
+        max_overlaps (Tensor, optional): For each bbox, the max IoU of all gts.
+            Used for negative balance sampling only.
+        neg_balance_thr (float, optional): IoU threshold for simple/hard
+            negative balance sampling.
+        neg_hard_fraction (float, optional): Fraction of hard negative samples
+            for negative balance sampling.
+
+    Returns:
+        tuple[Tensor]: positive bbox indices, negative bbox indices.
+    """
     num_expected_pos = int(num_expected * pos_fraction)
-    pos_inds = sample_positives(assigned_gt_inds, num_expected_pos,
-                                pos_balance_sampling)
+    pos_inds = bbox_sampling_pos(assigned_gt_inds, num_expected_pos,
+                                 pos_balance_sampling)
+    # We found that sampled indices have duplicated items occasionally.
+    # (may be a bug of PyTorch)
+    pos_inds = pos_inds.unique()
     num_sampled_pos = pos_inds.numel()
     num_neg_max = int(
         neg_pos_ub *
         num_sampled_pos) if num_sampled_pos > 0 else int(neg_pos_ub)
     num_expected_neg = min(num_neg_max, num_expected - num_sampled_pos)
-    neg_inds = sample_negatives(assigned_gt_inds, num_expected_neg,
-                                max_overlaps, neg_balance_thr,
-                                neg_hard_fraction)
+    neg_inds = bbox_sampling_neg(assigned_gt_inds, num_expected_neg,
+                                 max_overlaps, neg_balance_thr,
+                                 neg_hard_fraction)
+    neg_inds = neg_inds.unique()
     return pos_inds, neg_inds
 
 
+def sample_bboxes(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg):
+    """Sample positive and negative bboxes.
 
-def sample_proposals(proposals_list, gt_bboxes_list, gt_crowds_list,
-                     gt_labels_list, cfg):
-    cfg_list = [cfg for _ in range(len(proposals_list))]
-    results = map(sample_proposals_single, proposals_list, gt_bboxes_list,
-                  gt_crowds_list, gt_labels_list, cfg_list)
-    # list of tuple to tuple of list
-    return tuple(map(list, zip(*results)))
+    This is a simple implementation of bbox sampling given candidates and
+    ground truth bboxes, which includes 3 steps.
 
+    1. Assign gt to each bbox.
+    2. Add gt bboxes to the sampling pool (optional).
+    3. Perform positive and negative sampling.
 
-def sample_proposals_single(proposals,
-                            gt_bboxes,
-                            gt_crowds,
-                            gt_labels,
-                            cfg):
-    proposals = proposals[:, :4]
+    Args:
+        bboxes (Tensor): Boxes to be sampled from.
+        gt_bboxes (Tensor): Ground truth bboxes.
+        gt_bboxes_ignore (Tensor): Ignored ground truth bboxes. In MS COCO,
+            `crowd` bboxes are considered as ignored.
+        gt_labels (Tensor): Class labels of ground truth bboxes.
+        cfg (dict): Sampling configs.
+
+    Returns:
+        tuple[Tensor]: pos_bboxes, neg_bboxes, pos_assigned_gt_inds,
+            pos_gt_bboxes, pos_gt_labels
+    """
+    bboxes = bboxes[:, :4]
     assigned_gt_inds, assigned_labels, argmax_overlaps, max_overlaps = \
-        bbox_assign(
-            proposals, gt_bboxes, gt_crowds, gt_labels, cfg.pos_iou_thr,
-            cfg.neg_iou_thr, cfg.pos_iou_thr, cfg.crowd_thr)
+        bbox_assign(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels,
+                    cfg.pos_iou_thr, cfg.neg_iou_thr, cfg.min_pos_iou,
+                    cfg.crowd_thr)
+
     if cfg.add_gt_as_proposals:
-        proposals = torch.cat([gt_bboxes, proposals], dim=0)
+        bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
         gt_assign_self = torch.arange(
-            1, len(gt_labels) + 1, dtype=torch.long, device=proposals.device)
+            1, len(gt_labels) + 1, dtype=torch.long, device=bboxes.device)
         assigned_gt_inds = torch.cat([gt_assign_self, assigned_gt_inds])
         assigned_labels = torch.cat([gt_labels, assigned_labels])
 
     pos_inds, neg_inds = bbox_sampling(
         assigned_gt_inds, cfg.roi_batch_size, cfg.pos_fraction, cfg.neg_pos_ub,
         cfg.pos_balance_sampling, max_overlaps, cfg.neg_balance_thr)
-    pos_proposals = proposals[pos_inds]
-    neg_proposals = proposals[neg_inds]
+
+    pos_bboxes = bboxes[pos_inds]
+    neg_bboxes = bboxes[neg_inds]
     pos_assigned_gt_inds = assigned_gt_inds[pos_inds] - 1
     pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
     pos_gt_labels = assigned_labels[pos_inds]
 
-    return (pos_inds, neg_inds, pos_proposals, neg_proposals,
-            pos_assigned_gt_inds, pos_gt_bboxes, pos_gt_labels)
+    return (pos_bboxes, neg_bboxes, pos_assigned_gt_inds, pos_gt_bboxes,
+            pos_gt_labels)
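To make the assignment rule in the docstrings above concrete, a toy run against a 3 x 2 overlaps matrix (illustrative thresholds; with gt_labels=None the function returns the three-value tuple described in its docstring):

import torch
from mmdet.core.bbox import bbox_assign_wrt_overlaps

overlaps = torch.tensor([[0.80, 0.10],   # proposal 0: well above pos_iou_thr for gt 1
                         [0.20, 0.45],   # proposal 1: best overlap is gt 2, below pos_iou_thr
                         [0.05, 0.02]])  # proposal 2: below neg_iou_thr -> background
assigned_gt_inds, argmax_overlaps, max_overlaps = bbox_assign_wrt_overlaps(
    overlaps, pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3)
# assigned_gt_inds uses the 1-based convention: positive integer for an
# assigned gt, 0 for negatives, -1 for don't-care.
print(assigned_gt_inds, argmax_overlaps, max_overlaps)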
diff --git a/mmdet/core/bbox_ops/transforms.py b/mmdet/core/bbox/transforms.py
similarity index 84%
rename from mmdet/core/bbox_ops/transforms.py
rename to mmdet/core/bbox/transforms.py
index a9f1e2a45fab42652189e84f42aadc2e5f7a8994..0d8f6f44f20df5c019dc8ed9ea46c2eb6c411c66 100644
--- a/mmdet/core/bbox_ops/transforms.py
+++ b/mmdet/core/bbox/transforms.py
@@ -3,7 +3,7 @@ import numpy as np
 import torch
 
 
-def bbox_transform(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
+def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
     assert proposals.size() == gt.size()
 
     proposals = proposals.float()
@@ -31,12 +31,12 @@ def bbox_transform(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
     return deltas
 
 
-def bbox_transform_inv(rois,
-                       deltas,
-                       means=[0, 0, 0, 0],
-                       stds=[1, 1, 1, 1],
-                       max_shape=None,
-                       wh_ratio_clip=16 / 1000):
+def delta2bbox(rois,
+               deltas,
+               means=[0, 0, 0, 0],
+               stds=[1, 1, 1, 1],
+               max_shape=None,
+               wh_ratio_clip=16 / 1000):
     means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
     stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
     denorm_deltas = deltas * stds + means
@@ -69,10 +69,14 @@ def bbox_transform_inv(rois,
 
 
 def bbox_flip(bboxes, img_shape):
-    """Flip bboxes horizontally
+    """Flip bboxes horizontally.
+
     Args:
-        bboxes(Tensor): shape (..., 4*k)
-        img_shape(Tensor): image shape
+        bboxes (Tensor or ndarray): Shape (..., 4*k).
+        img_shape (tuple): Image shape.
+
+    Returns:
+        Same type as `bboxes`: Flipped bboxes.
     """
     if isinstance(bboxes, torch.Tensor):
         assert bboxes.shape[-1] % 4 == 0
@@ -84,25 +88,28 @@ def bbox_flip(bboxes, img_shape):
         return mmcv.bbox_flip(bboxes, img_shape)
 
 
-def bbox_mapping(bboxes, img_shape, flip):
+def bbox_mapping(bboxes, img_shape, scale_factor, flip):
     """Map bboxes from the original image scale to testing scale"""
-    new_bboxes = bboxes * img_shape[-1]
+    new_bboxes = bboxes * scale_factor
     if flip:
         new_bboxes = bbox_flip(new_bboxes, img_shape)
     return new_bboxes
 
 
-def bbox_mapping_back(bboxes, img_shape, flip):
+def bbox_mapping_back(bboxes, img_shape, scale_factor, flip):
     """Map bboxes from testing scale to original image scale"""
     new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes
-    new_bboxes = new_bboxes / img_shape[-1]
+    new_bboxes = new_bboxes / scale_factor
     return new_bboxes
 
 
 def bbox2roi(bbox_list):
     """Convert a list of bboxes to roi format.
+
     Args:
-        bbox_list (Tensor): a list of bboxes corresponding to a list of images
+        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+            of images.
+
     Returns:
         Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
     """
@@ -129,11 +136,13 @@ def roi2bbox(rois):
 
 
 def bbox2result(bboxes, labels, num_classes):
-    """Convert detection results to a list of numpy arrays
+    """Convert detection results to a list of numpy arrays.
+
     Args:
         bboxes (Tensor): shape (n, 5)
         labels (Tensor): shape (n, )
         num_classes (int): class number, including background class
+
     Returns:
         list(ndarray): bbox results of each class
     """
diff --git a/mmdet/core/bbox_ops/__init__.py b/mmdet/core/bbox_ops/__init__.py
deleted file mode 100644
index dbdbb970648bcac1ced61096b436ef9966266c1f..0000000000000000000000000000000000000000
--- a/mmdet/core/bbox_ops/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from .geometry import bbox_overlaps
-from .sampling import (random_choice, bbox_assign, bbox_assign_via_overlaps,
-                       bbox_sampling, sample_positives, sample_negatives,
-                       sample_proposals)
-from .transforms import (bbox_transform, bbox_transform_inv, bbox_flip,
-                         bbox_mapping, bbox_mapping_back, bbox2roi, roi2bbox,
-                         bbox2result)
-from .bbox_target import bbox_target
-
-__all__ = [
-    'bbox_overlaps', 'random_choice', 'bbox_assign',
-    'bbox_assign_via_overlaps', 'bbox_sampling', 'sample_positives',
-    'sample_negatives', 'bbox_transform', 'bbox_transform_inv', 'bbox_flip',
-    'bbox_mapping', 'bbox_mapping_back', 'bbox2roi', 'roi2bbox', 'bbox2result',
-    'bbox_target', 'sample_proposals'
-]
diff --git a/mmdet/core/eval/__init__.py b/mmdet/core/evaluation/__init__.py
similarity index 60%
rename from mmdet/core/eval/__init__.py
rename to mmdet/core/evaluation/__init__.py
index fe4893a0af68ffff2633fcd702f7cf73cce93e76..026234fce3198fe410143d9e1578cc384005c0d4 100644
--- a/mmdet/core/eval/__init__.py
+++ b/mmdet/core/evaluation/__init__.py
@@ -1,13 +1,18 @@
 from .class_names import (voc_classes, imagenet_det_classes,
                           imagenet_vid_classes, coco_classes, dataset_aliases,
                           get_classes)
+from .coco_utils import coco_eval, fast_eval_recall, results2json
+from .eval_hooks import (DistEvalHook, CocoDistEvalRecallHook,
+                         CocoDistEvalmAPHook)
 from .mean_ap import average_precision, eval_map, print_map_summary
 from .recall import (eval_recalls, print_recall_summary, plot_num_recall,
                      plot_iou_recall)
 
 __all__ = [
     'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes',
-    'coco_classes', 'dataset_aliases', 'get_classes', 'average_precision',
+    'coco_classes', 'dataset_aliases', 'get_classes', 'coco_eval',
+    'fast_eval_recall', 'results2json', 'DistEvalHook',
+    'CocoDistEvalRecallHook', 'CocoDistEvalmAPHook', 'average_precision',
     'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary',
     'plot_num_recall', 'plot_iou_recall'
 ]
diff --git a/mmdet/core/eval/bbox_overlaps.py b/mmdet/core/evaluation/bbox_overlaps.py
similarity index 100%
rename from mmdet/core/eval/bbox_overlaps.py
rename to mmdet/core/evaluation/bbox_overlaps.py
diff --git a/mmdet/core/eval/class_names.py b/mmdet/core/evaluation/class_names.py
similarity index 98%
rename from mmdet/core/eval/class_names.py
rename to mmdet/core/evaluation/class_names.py
index b68e9135dca366e93217e0c06959bea990ffda5e..04f806315b7c6ef47419efa61e38d2f7ec3ebd2a 100644
--- a/mmdet/core/eval/class_names.py
+++ b/mmdet/core/evaluation/class_names.py
@@ -95,7 +95,7 @@ def get_classes(dataset):
 
     if mmcv.is_str(dataset):
         if dataset in alias2name:
-            labels = eval(alias2name[dataset] + '_labels()')
+            labels = eval(alias2name[dataset] + '_classes()')
         else:
             raise ValueError('Unrecognized dataset: {}'.format(dataset))
     else:
diff --git a/mmdet/core/evaluation/coco_utils.py b/mmdet/core/evaluation/coco_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fdb41649c39e83719ae7c8626d4bb8a58c2c28
--- /dev/null
+++ b/mmdet/core/evaluation/coco_utils.py
@@ -0,0 +1,149 @@
+import mmcv
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from .recall import eval_recalls
+
+
+def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)):
+    for res_type in result_types:
+        assert res_type in [
+            'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'
+        ]
+
+    if mmcv.is_str(coco):
+        coco = COCO(coco)
+    assert isinstance(coco, COCO)
+
+    if res_type == 'proposal_fast':
+        ar = fast_eval_recall(result_file, coco, max_dets)
+        for i, num in enumerate(max_dets):
+            print('AR@{}\t= {:.4f}'.format(num, ar[i]))
+        return
+
+    assert result_file.endswith('.json')
+    coco_dets = coco.loadRes(result_file)
+
+    img_ids = coco.getImgIds()
+    for res_type in result_types:
+        iou_type = 'bbox' if res_type == 'proposal' else res_type
+        cocoEval = COCOeval(coco, coco_dets, iou_type)
+        cocoEval.params.imgIds = img_ids
+        if res_type == 'proposal':
+            cocoEval.params.useCats = 0
+            cocoEval.params.maxDets = list(max_dets)
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        cocoEval.summarize()
+
+
+def fast_eval_recall(results,
+                     coco,
+                     max_dets,
+                     iou_thrs=np.arange(0.5, 0.96, 0.05)):
+    if mmcv.is_str(results):
+        assert results.endswith('.pkl')
+        results = mmcv.load(results)
+    elif not isinstance(results, list):
+        raise TypeError(
+            'results must be a list of numpy arrays or a filename, not {}'.
+            format(type(results)))
+
+    gt_bboxes = []
+    img_ids = coco.getImgIds()
+    for i in range(len(img_ids)):
+        ann_ids = coco.getAnnIds(imgIds=img_ids[i])
+        ann_info = coco.loadAnns(ann_ids)
+        if len(ann_info) == 0:
+            gt_bboxes.append(np.zeros((0, 4)))
+            continue
+        bboxes = []
+        for ann in ann_info:
+            if ann.get('ignore', False) or ann['iscrowd']:
+                continue
+            x1, y1, w, h = ann['bbox']
+            bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1])
+        bboxes = np.array(bboxes, dtype=np.float32)
+        if bboxes.shape[0] == 0:
+            bboxes = np.zeros((0, 4))
+        gt_bboxes.append(bboxes)
+
+    recalls = eval_recalls(
+        gt_bboxes, results, max_dets, iou_thrs, print_summary=False)
+    ar = recalls.mean(axis=1)
+    return ar
+
+
+def xyxy2xywh(bbox):
+    _bbox = bbox.tolist()
+    return [
+        _bbox[0],
+        _bbox[1],
+        _bbox[2] - _bbox[0] + 1,
+        _bbox[3] - _bbox[1] + 1,
+    ]
+
+
+def proposal2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        bboxes = results[idx]
+        for i in range(bboxes.shape[0]):
+            data = dict()
+            data['image_id'] = img_id
+            data['bbox'] = xyxy2xywh(bboxes[i])
+            data['score'] = float(bboxes[i][4])
+            data['category_id'] = 1
+            json_results.append(data)
+    return json_results
+
+
+def det2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        result = results[idx]
+        for label in range(len(result)):
+            bboxes = result[label]
+            for i in range(bboxes.shape[0]):
+                data = dict()
+                data['image_id'] = img_id
+                data['bbox'] = xyxy2xywh(bboxes[i])
+                data['score'] = float(bboxes[i][4])
+                data['category_id'] = dataset.cat_ids[label]
+                json_results.append(data)
+    return json_results
+
+
+def segm2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        det, seg = results[idx]
+        for label in range(len(det)):
+            bboxes = det[label]
+            segms = seg[label]
+            for i in range(bboxes.shape[0]):
+                data = dict()
+                data['image_id'] = img_id
+                data['bbox'] = xyxy2xywh(bboxes[i])
+                data['score'] = float(bboxes[i][4])
+                data['category_id'] = dataset.cat_ids[label]
+                segms[i]['counts'] = segms[i]['counts'].decode()
+                data['segmentation'] = segms[i]
+                json_results.append(data)
+    return json_results
+
+
+def results2json(dataset, results, out_file):
+    if isinstance(results[0], list):
+        json_results = det2json(dataset, results)
+    elif isinstance(results[0], tuple):
+        json_results = segm2json(dataset, results)
+    elif isinstance(results[0], np.ndarray):
+        json_results = proposal2json(dataset, results)
+    else:
+        raise TypeError('invalid type of results')
+    mmcv.dump(json_results, out_file)
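
As a rough usage sketch (not part of the diff): the converters above pair with pycocotools for offline evaluation. The `dataset` object is assumed to expose the `img_ids`/`cat_ids` attributes used by `det2json`, and the file paths are placeholders.

    import mmcv
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    results = mmcv.load('work_dir/results.pkl')        # list of per-image dets
    results2json(dataset, results, 'work_dir/results.json')

    coco = COCO('annotations/instances_val2017.json')   # placeholder path
    coco_dets = coco.loadRes('work_dir/results.json')
    coco_eval = COCOeval(coco, coco_dets, 'bbox')
    coco_eval.params.imgIds = coco.getImgIds()
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
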
diff --git a/mmdet/core/evaluation/eval_hooks.py b/mmdet/core/evaluation/eval_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a83b80dbfe7081fa6dbfc13f818339c565076000
--- /dev/null
+++ b/mmdet/core/evaluation/eval_hooks.py
@@ -0,0 +1,142 @@
+import os
+import os.path as osp
+import shutil
+import time
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.runner import Hook, obj_from_dict
+from mmcv.parallel import scatter, collate
+from pycocotools.cocoeval import COCOeval
+from torch.utils.data import Dataset
+
+from .coco_utils import results2json, fast_eval_recall
+from mmdet import datasets
+
+
+class DistEvalHook(Hook):
+
+    def __init__(self, dataset, interval=1):
+        if isinstance(dataset, Dataset):
+            self.dataset = dataset
+        elif isinstance(dataset, dict):
+            self.dataset = obj_from_dict(dataset, datasets,
+                                         {'test_mode': True})
+        else:
+            raise TypeError(
+                'dataset must be a Dataset object or a dict, not {}'.format(
+                    type(dataset)))
+        self.interval = interval
+        self.lock_dir = None
+
+    def _barrier(self, rank, world_size):
+        """Due to some issues with `torch.distributed.barrier()`, we have to
+        implement this ugly barrier function.
+        """
+        if rank == 0:
+            for i in range(1, world_size):
+                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
+                while not (osp.exists(tmp)):
+                    time.sleep(1)
+            for i in range(1, world_size):
+                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
+                os.remove(tmp)
+        else:
+            tmp = osp.join(self.lock_dir, '{}.pkl'.format(rank))
+            mmcv.dump([], tmp)
+            while osp.exists(tmp):
+                time.sleep(1)
+
+    def before_run(self, runner):
+        self.lock_dir = osp.join(runner.work_dir, '.lock_map_hook')
+        if runner.rank == 0:
+            if osp.exists(self.lock_dir):
+                shutil.rmtree(self.lock_dir)
+            mmcv.mkdir_or_exist(self.lock_dir)
+
+    def after_train_epoch(self, runner):
+        if not self.every_n_epochs(runner, self.interval):
+            return
+        runner.model.eval()
+        results = [None for _ in range(len(self.dataset))]
+        prog_bar = mmcv.ProgressBar(len(self.dataset))
+        for idx in range(runner.rank, len(self.dataset), runner.world_size):
+            data = self.dataset[idx]
+            data_gpu = scatter(
+                collate([data], samples_per_gpu=1),
+                [torch.cuda.current_device()])[0]
+
+            # compute output
+            with torch.no_grad():
+                result = runner.model(
+                    **data_gpu, return_loss=False, rescale=True)
+            results[idx] = result
+
+            batch_size = runner.world_size
+            for _ in range(batch_size):
+                prog_bar.update()
+
+        if runner.rank == 0:
+            print('\n')
+            self._barrier(runner.rank, runner.world_size)
+            for i in range(1, runner.world_size):
+                tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
+                tmp_results = mmcv.load(tmp_file)
+                for idx in range(i, len(results), runner.world_size):
+                    results[idx] = tmp_results[idx]
+                os.remove(tmp_file)
+            self.evaluate(runner, results)
+        else:
+            tmp_file = osp.join(runner.work_dir,
+                                'temp_{}.pkl'.format(runner.rank))
+            mmcv.dump(results, tmp_file)
+            self._barrier(runner.rank, runner.world_size)
+        self._barrier(runner.rank, runner.world_size)
+
+    def evaluate(self, runner, results):
+        raise NotImplementedError
+
+
+class CocoDistEvalRecallHook(DistEvalHook):
+
+    def __init__(self,
+                 dataset,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
+        super(CocoDistEvalRecallHook, self).__init__(dataset)
+        self.proposal_nums = np.array(proposal_nums, dtype=np.int32)
+        self.iou_thrs = np.array(iou_thrs, dtype=np.float32)
+
+    def evaluate(self, runner, results):
+        # the official coco evaluation is too slow; here we use our own
+        # implementation instead, which may give slightly different results
+        ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums,
+                              self.iou_thrs)
+        for i, num in enumerate(self.proposal_nums):
+            runner.log_buffer.output['AR@{}'.format(num)] = ar[i]
+        runner.log_buffer.ready = True
+
+
+class CocoDistEvalmAPHook(DistEvalHook):
+
+    def evaluate(self, runner, results):
+        tmp_file = osp.join(runner.work_dir, 'temp_0.json')
+        results2json(self.dataset, results, tmp_file)
+
+        res_types = ['bbox',
+                     'segm'] if runner.model.module.with_mask else ['bbox']
+        cocoGt = self.dataset.coco
+        cocoDt = cocoGt.loadRes(tmp_file)
+        imgIds = cocoGt.getImgIds()
+        for res_type in res_types:
+            iou_type = res_type
+            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
+            cocoEval.params.imgIds = imgIds
+            cocoEval.evaluate()
+            cocoEval.accumulate()
+            cocoEval.summarize()
+            field = '{}_mAP'.format(res_type)
+            runner.log_buffer.output[field] = cocoEval.stats[0]
+        runner.log_buffer.ready = True
+        os.remove(tmp_file)
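
A minimal sketch of how these hooks are wired up, assuming an already-constructed mmcv `Runner` named `runner` and an already-built validation dataset `val_dataset` (anything accepted by `DistEvalHook`, e.g. a CocoDataset instance):

    runner.register_hook(CocoDistEvalRecallHook(val_dataset))           # proposal AR
    runner.register_hook(CocoDistEvalmAPHook(val_dataset, interval=1))  # bbox/segm mAP
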
diff --git a/mmdet/core/eval/mean_ap.py b/mmdet/core/evaluation/mean_ap.py
similarity index 89%
rename from mmdet/core/eval/mean_ap.py
rename to mmdet/core/evaluation/mean_ap.py
index 9a33f7640409993db3e11cedd587f1cd14c38aa5..5f47c1368af0e3385bc8e49cc5d35b99726ce722 100644
--- a/mmdet/core/eval/mean_ap.py
+++ b/mmdet/core/evaluation/mean_ap.py
@@ -9,9 +9,9 @@ def average_precision(recalls, precisions, mode='area'):
     """Calculate average precision (for single or multiple scales).
 
     Args:
-        recalls(ndarray): shape (num_scales, num_dets) or (num_dets, )
-        precisions(ndarray): shape (num_scales, num_dets) or (num_dets, )
-        mode(str): 'area' or '11points', 'area' means calculating the area
+        recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        mode (str): 'area' or '11points', 'area' means calculating the area
             under precision-recall curve, '11points' means calculating
             the average precision of recalls at [0, 0.1, ..., 1]
 
@@ -60,11 +60,11 @@ def tpfp_imagenet(det_bboxes,
     """Check if detected bboxes are true positive or false positive.
 
     Args:
-        det_bbox(ndarray): the detected bbox
-        gt_bboxes(ndarray): ground truth bboxes of this image
-        gt_ignore(ndarray): indicate if gts are ignored for evaluation or not
-        default_iou_thr(float): the iou thresholds for medium and large bboxes
-        area_ranges(list or None): gt bbox area ranges
+        det_bbox (ndarray): the detected bbox
+        gt_bboxes (ndarray): ground truth bboxes of this image
+        gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
+        default_iou_thr (float): the iou thresholds for medium and large bboxes
+        area_ranges (list or None): gt bbox area ranges
 
     Returns:
         tuple: two arrays (tp, fp) whose elements are 0 and 1
@@ -115,10 +115,10 @@ def tpfp_imagenet(det_bboxes,
                     max_iou = ious[i, j]
                     matched_gt = j
             # there are 4 cases for a det bbox:
-            # 1. this det bbox matches a gt, tp = 1, fp = 0
-            # 2. this det bbox matches an ignored gt, tp = 0, fp = 0
-            # 3. this det bbox matches no gt and within area range, tp = 0, fp = 1
-            # 4. this det bbox matches no gt but is beyond area range, tp = 0, fp = 0
+            # 1. it matches a gt, tp = 1, fp = 0
+            # 2. it matches an ignored gt, tp = 0, fp = 0
+            # 3. it matches no gt and within area range, tp = 0, fp = 1
+            # 4. it matches no gt but is beyond area range, tp = 0, fp = 0
             if matched_gt >= 0:
                 gt_covered[matched_gt] = 1
                 if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
@@ -137,10 +137,10 @@ def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None):
     """Check if detected bboxes are true positive or false positive.
 
     Args:
-        det_bbox(ndarray): the detected bbox
-        gt_bboxes(ndarray): ground truth bboxes of this image
-        gt_ignore(ndarray): indicate if gts are ignored for evaluation or not
-        iou_thr(float): the iou thresholds
+        det_bbox (ndarray): the detected bbox
+        gt_bboxes (ndarray): ground truth bboxes of this image
+        gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
+        iou_thr (float): the iou thresholds
 
     Returns:
         tuple: (tp, fp), two arrays whose elements are 0 and 1
@@ -227,15 +227,16 @@ def eval_map(det_results,
     """Evaluate mAP of a dataset.
 
     Args:
-        det_results(list): a list of list, [[cls1_det, cls2_det, ...], ...]
-        gt_bboxes(list): ground truth bboxes of each image, a list of K*4 array
-        gt_labels(list): ground truth labels of each image, a list of K array
-        gt_ignore(list): gt ignore indicators of each image, a list of K array
-        scale_ranges(list, optional): [(min1, max1), (min2, max2), ...]
-        iou_thr(float): IoU threshold
-        dataset(None or str): dataset name, there are minor differences in
+        det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...]
+        gt_bboxes (list): ground truth bboxes of each image, a list of K*4
+            array.
+        gt_labels (list): ground truth labels of each image, a list of K array
+        gt_ignore (list): gt ignore indicators of each image, a list of K array
+        scale_ranges (list, optional): [(min1, max1), (min2, max2), ...]
+        iou_thr (float): IoU threshold
+        dataset (None or str): dataset name, there are minor differences in
             metrics for different datsets, e.g. "voc07", "imagenet_det", etc.
-        print_summary(bool): whether to print the mAP summary
+        print_summary (bool): whether to print the mAP summary
 
     Returns:
         tuple: (mAP, [dict, dict, ...])
@@ -265,7 +266,8 @@ def eval_map(det_results,
                       area_ranges) for j in range(len(cls_dets))
         ]
         tp, fp = tuple(zip(*tpfp))
-        # calculate gt number of each scale, gts ignored or beyond scale are not counted
+        # calculate gt number of each scale, gts ignored or beyond scale
+        # are not counted
         num_gts = np.zeros(num_scales, dtype=int)
         for j, bbox in enumerate(cls_gts):
             if area_ranges is None:
diff --git a/mmdet/core/eval/recall.py b/mmdet/core/evaluation/recall.py
similarity index 100%
rename from mmdet/core/eval/recall.py
rename to mmdet/core/evaluation/recall.py
diff --git a/mmdet/core/loss/__init__.py b/mmdet/core/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..661f0d6426602b5bed7dc3367e1322374922ae1c
--- /dev/null
+++ b/mmdet/core/loss/__init__.py
@@ -0,0 +1,11 @@
+from .losses import (weighted_nll_loss, weighted_cross_entropy,
+                     weighted_binary_cross_entropy, sigmoid_focal_loss,
+                     weighted_sigmoid_focal_loss, mask_cross_entropy,
+                     smooth_l1_loss, weighted_smoothl1, accuracy)
+
+__all__ = [
+    'weighted_nll_loss', 'weighted_cross_entropy',
+    'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
+    'weighted_sigmoid_focal_loss', 'mask_cross_entropy', 'smooth_l1_loss',
+    'weighted_smoothl1', 'accuracy'
+]
diff --git a/mmdet/core/loss/losses.py b/mmdet/core/loss/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..14b49f5cb90ccc29240622a0c2a6764ae4c68520
--- /dev/null
+++ b/mmdet/core/loss/losses.py
@@ -0,0 +1,101 @@
+# TODO merge naive and weighted loss.
+import torch
+import torch.nn.functional as F
+
+
+def weighted_nll_loss(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    raw = F.nll_loss(pred, label, reduction='none')
+    return torch.sum(raw * weight)[None] / avg_factor
+
+
+def weighted_cross_entropy(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    raw = F.cross_entropy(pred, label, reduction='none')
+    return torch.sum(raw * weight)[None] / avg_factor
+
+
+def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    return F.binary_cross_entropy_with_logits(
+        pred, label.float(), weight.float(),
+        reduction='sum')[None] / avg_factor
+
+
+def sigmoid_focal_loss(pred,
+                       target,
+                       weight,
+                       gamma=2.0,
+                       alpha=0.25,
+                       reduction='elementwise_mean'):
+    pred_sigmoid = pred.sigmoid()
+    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
+    weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
+    weight = weight * pt.pow(gamma)
+    return F.binary_cross_entropy_with_logits(
+        pred, target, weight, reduction=reduction)
+
+
+def weighted_sigmoid_focal_loss(pred,
+                                target,
+                                weight,
+                                gamma=2.0,
+                                alpha=0.25,
+                                avg_factor=None,
+                                num_classes=80):
+    if avg_factor is None:
+        avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6
+    return sigmoid_focal_loss(
+        pred, target, weight, gamma=gamma, alpha=alpha,
+        reduction='sum')[None] / avg_factor
+
+
+def mask_cross_entropy(pred, target, label):
+    num_rois = pred.size()[0]
+    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
+    pred_slice = pred[inds, label].squeeze(1)
+    return F.binary_cross_entropy_with_logits(
+        pred_slice, target, reduction='elementwise_mean')[None]
+
+
+def smooth_l1_loss(pred, target, beta=1.0, reduction='elementwise_mean'):
+    assert beta > 0
+    assert pred.size() == target.size() and target.numel() > 0
+    diff = torch.abs(pred - target)
+    loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+                       diff - 0.5 * beta)
+    reduction = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction == 0:
+        return loss
+    elif reduction == 1:
+        return loss.sum() / pred.numel()
+    elif reduction == 2:
+        return loss.sum()
+
+
+def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6
+    loss = smooth_l1_loss(pred, target, beta, reduction='none')
+    return torch.sum(loss * weight)[None] / avg_factor
+
+
+def accuracy(pred, target, topk=1):
+    return_single = isinstance(topk, int)
+    if return_single:
+        topk = (topk, )
+
+    maxk = max(topk)
+    _, pred_label = pred.topk(maxk, 1, True, True)
+    pred_label = pred_label.t()
+    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / pred.size(0)))
+    return res[0] if return_single else res
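
A small, self-contained sketch of the weighting convention (values are made up): `weight` zeroes out ignored samples and `avg_factor` defaults to the number of weighted entries.

    import torch

    pred = torch.randn(4, 3)                  # 4 samples, 3 classes
    label = torch.tensor([0, 2, 1, 1])
    weight = torch.tensor([1., 1., 0., 1.])   # the third sample is ignored
    loss_cls = weighted_cross_entropy(pred, label, weight)
    # avg_factor defaults to the number of entries with weight > 0, i.e. 3

    bbox_pred = torch.randn(4, 4)
    bbox_target = torch.randn(4, 4)
    bbox_weight = torch.ones(4, 4)            # per-element weights
    loss_bbox = weighted_smoothl1(
        bbox_pred, bbox_target, bbox_weight, beta=1.0 / 9.0)
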
diff --git a/mmdet/core/losses/__init__.py b/mmdet/core/losses/__init__.py
deleted file mode 100644
index 3e4447ff0a6c708e9407bc47698a6281e8c81216..0000000000000000000000000000000000000000
--- a/mmdet/core/losses/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from .losses import (
-    weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy,
-    sigmoid_focal_loss, weighted_sigmoid_focal_loss, mask_cross_entropy,
-    weighted_mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy)
-
-__all__ = [
-    'weighted_nll_loss', 'weighted_cross_entropy',
-    'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
-    'weighted_sigmoid_focal_loss', 'mask_cross_entropy',
-    'weighted_mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1',
-    'accuracy'
-]
diff --git a/mmdet/core/losses/losses.py b/mmdet/core/losses/losses.py
deleted file mode 100644
index 575c91d053650acbde927f49e0c474e5fd325e77..0000000000000000000000000000000000000000
--- a/mmdet/core/losses/losses.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# TODO merge naive and weighted loss to one function.
-import torch
-import torch.nn.functional as F
-
-from ..bbox_ops import bbox_transform_inv, bbox_overlaps
-
-
-def weighted_nll_loss(pred, label, weight, ave_factor=None):
-    if ave_factor is None:
-        ave_factor = max(torch.sum(weight > 0).float().item(), 1.)
-    raw = F.nll_loss(pred, label, size_average=False, reduce=False)
-    return torch.sum(raw * weight)[None] / ave_factor
-
-
-def weighted_cross_entropy(pred, label, weight, ave_factor=None):
-    if ave_factor is None:
-        ave_factor = max(torch.sum(weight > 0).float().item(), 1.)
-    raw = F.cross_entropy(pred, label, size_average=False, reduce=False)
-    return torch.sum(raw * weight)[None] / ave_factor
-
-
-def weighted_binary_cross_entropy(pred, label, weight, ave_factor=None):
-    if ave_factor is None:
-        ave_factor = max(torch.sum(weight > 0).float().item(), 1.)
-    return F.binary_cross_entropy_with_logits(
-        pred, label.float(), weight.float(),
-        size_average=False)[None] / ave_factor
-
-
-def sigmoid_focal_loss(pred,
-                       target,
-                       weight,
-                       gamma=2.0,
-                       alpha=0.25,
-                       size_average=True):
-    pred_sigmoid = pred.sigmoid()
-    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
-    weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
-    weight = weight * pt.pow(gamma)
-    return F.binary_cross_entropy_with_logits(
-        pred, target, weight, size_average=size_average)
-
-
-def weighted_sigmoid_focal_loss(pred,
-                                target,
-                                weight,
-                                gamma=2.0,
-                                alpha=0.25,
-                                ave_factor=None,
-                                num_classes=80):
-    if ave_factor is None:
-        ave_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6
-    return sigmoid_focal_loss(
-        pred, target, weight, gamma=gamma, alpha=alpha,
-        size_average=False)[None] / ave_factor
-
-
-def mask_cross_entropy(pred, target, label):
-    num_rois = pred.size()[0]
-    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
-    pred_slice = pred[inds, label].squeeze(1)
-    return F.binary_cross_entropy_with_logits(
-        pred_slice, target, size_average=True)[None]
-
-
-def weighted_mask_cross_entropy(pred, target, weight, label):
-    num_rois = pred.size()[0]
-    num_samples = torch.sum(weight > 0).float().item() + 1e-6
-    assert num_samples >= 1
-    inds = torch.arange(0, num_rois).long().cuda()
-    pred_slice = pred[inds, label].squeeze(1)
-    return F.binary_cross_entropy_with_logits(
-        pred_slice, target, weight, size_average=False)[None] / num_samples
-
-
-def smooth_l1_loss(pred, target, beta=1.0, size_average=True, reduce=True):
-    assert beta > 0
-    assert pred.size() == target.size() and target.numel() > 0
-    diff = torch.abs(pred - target)
-    loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
-                       diff - 0.5 * beta)
-    if size_average:
-        loss /= pred.numel()
-    if reduce:
-        loss = loss.sum()
-    return loss
-
-
-def weighted_smoothl1(pred, target, weight, beta=1.0, ave_factor=None):
-    if ave_factor is None:
-        ave_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6
-    loss = smooth_l1_loss(pred, target, beta, size_average=False, reduce=False)
-    return torch.sum(loss * weight)[None] / ave_factor
-
-
-def accuracy(pred, target, topk=1):
-    if isinstance(topk, int):
-        topk = (topk, )
-        return_single = True
-
-    maxk = max(topk)
-    _, pred_label = pred.topk(maxk, 1, True, True)
-    pred_label = pred_label.t()
-    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
-
-    res = []
-    for k in topk:
-        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
-        res.append(correct_k.mul_(100.0 / pred.size(0)))
-    return res[0] if return_single else res
diff --git a/mmdet/core/mask/__init__.py b/mmdet/core/mask/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b703b55d3eb92821c28ef38579fcbebeb1fa12cf
--- /dev/null
+++ b/mmdet/core/mask/__init__.py
@@ -0,0 +1,4 @@
+from .utils import split_combined_polys
+from .mask_target import mask_target
+
+__all__ = ['split_combined_polys', 'mask_target']
diff --git a/mmdet/core/mask/mask_target.py b/mmdet/core/mask/mask_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..be93dfc28934052a7497b3c42aa3e9dd1b3b3fe6
--- /dev/null
+++ b/mmdet/core/mask/mask_target.py
@@ -0,0 +1,36 @@
+import torch
+import numpy as np
+import mmcv
+
+
+def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
+                cfg):
+    cfg_list = [cfg for _ in range(len(pos_proposals_list))]
+    mask_targets = map(mask_target_single, pos_proposals_list,
+                       pos_assigned_gt_inds_list, gt_masks_list, cfg_list)
+    mask_targets = torch.cat(list(mask_targets))
+    return mask_targets
+
+
+def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+    mask_size = cfg.mask_size
+    num_pos = pos_proposals.size(0)
+    mask_targets = []
+    if num_pos > 0:
+        proposals_np = pos_proposals.cpu().numpy()
+        pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+        for i in range(num_pos):
+            gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+            bbox = proposals_np[i, :].astype(np.int32)
+            x1, y1, x2, y2 = bbox
+            w = np.maximum(x2 - x1 + 1, 1)
+            h = np.maximum(y2 - y1 + 1, 1)
+            # mask is uint8 both before and after resizing
+            target = mmcv.imresize(gt_mask[y1:y1 + h, x1:x1 + w],
+                                   (mask_size, mask_size))
+            mask_targets.append(target)
+        mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to(
+            pos_proposals.device)
+    else:
+        mask_targets = pos_proposals.new_zeros((0, mask_size, mask_size))
+    return mask_targets
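
A toy example of what `mask_target_single` computes, with made-up shapes: a 28x28 target is cropped and resized from the ground-truth mask of each positive proposal. `mmcv.Config` is used here only to provide attribute-style access to `mask_size`.

    import numpy as np
    import torch
    import mmcv

    pos_proposals = torch.tensor([[10., 20., 50., 60.], [0., 0., 30., 30.]])
    pos_assigned_gt_inds = torch.tensor([0, 1])
    gt_masks = np.random.randint(0, 2, (2, 100, 100)).astype(np.uint8)
    cfg = mmcv.Config(dict(mask_size=28))

    targets = mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg)
    # -> float tensor of shape (2, 28, 28), one resized mask crop per proposal
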
diff --git a/mmdet/core/mask/utils.py b/mmdet/core/mask/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a68312b179e56cb0e93e967ecfeeb602d48ca866
--- /dev/null
+++ b/mmdet/core/mask/utils.py
@@ -0,0 +1,30 @@
+import mmcv
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+    """Split the combined 1-D polys into masks.
+
+    A mask is represented as a list of polys, and a poly is represented as
+a 1-D array. In the dataset, all masks are concatenated into a single 1-D
+tensor. Here we split the tensor back into the original representations.
+
+    Args:
+        polys (list): a list (length = image num) of 1-D tensors
+        poly_lens (list): a list (length = image num) of poly length
+        polys_per_mask (list): a list (length = image num) of poly number
+            of each mask
+
+    Returns:
+        list: a list (length = image num) of list (length = mask num) of
+            list (length = poly num) of numpy array
+    """
+    mask_polys_list = []
+    for img_id in range(len(polys)):
+        polys_single = polys[img_id]
+        polys_lens_single = poly_lens[img_id].tolist()
+        polys_per_mask_single = polys_per_mask[img_id].tolist()
+
+        split_polys = mmcv.slice_list(polys_single, polys_lens_single)
+        mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single)
+        mask_polys_list.append(mask_polys)
+    return mask_polys_list
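
For illustration (plain Python lists are used instead of tensors, which `mmcv.slice_list` handles as well): one image whose two masks have two polygons and one polygon respectively.

    import numpy as np

    polys = [list(range(20))]            # 6 + 8 + 6 coordinates, flattened
    poly_lens = [np.array([6, 8, 6])]    # length of each polygon
    polys_per_mask = [np.array([2, 1])]  # polygons belonging to each mask

    mask_polys = split_combined_polys(polys, poly_lens, polys_per_mask)
    # mask_polys[0] == [[polys 0-1 of mask 0], [poly 2 of mask 1]]
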
diff --git a/mmdet/core/mask_ops/__init__.py b/mmdet/core/mask_ops/__init__.py
deleted file mode 100644
index 4669ba1f9102cbcabe20c48ea193408c1e12e4aa..0000000000000000000000000000000000000000
--- a/mmdet/core/mask_ops/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from .segms import (flip_segms, polys_to_mask, mask_to_bbox,
-                    polys_to_mask_wrt_box, polys_to_boxes, rle_mask_voting,
-                    rle_mask_nms, rle_masks_to_boxes)
-from .utils import split_combined_gt_polys
-from .mask_target import mask_target
-
-__all__ = [
-    'flip_segms', 'polys_to_mask', 'mask_to_bbox', 'polys_to_mask_wrt_box',
-    'polys_to_boxes', 'rle_mask_voting', 'rle_mask_nms', 'rle_masks_to_boxes',
-    'split_combined_gt_polys', 'mask_target'
-]
diff --git a/mmdet/core/mask_ops/mask_target.py b/mmdet/core/mask_ops/mask_target.py
deleted file mode 100644
index 3fb65e3587473b60c4fd25b075072b9a3bb4670c..0000000000000000000000000000000000000000
--- a/mmdet/core/mask_ops/mask_target.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import torch
-import numpy as np
-
-from .segms import polys_to_mask_wrt_box
-
-
-def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_polys_list,
-                img_meta, cfg):
-    cfg_list = [cfg for _ in range(len(pos_proposals_list))]
-    img_metas = [img_meta for _ in range(len(pos_proposals_list))]
-    mask_targets = map(mask_target_single, pos_proposals_list,
-                       pos_assigned_gt_inds_list, gt_polys_list, img_metas,
-                       cfg_list)
-    mask_targets = torch.cat(tuple(mask_targets), dim=0)
-    return mask_targets
-
-
-def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_polys,
-                       img_meta, cfg):
-
-    mask_size = cfg.mask_size
-    num_pos = pos_proposals.size(0)
-    mask_targets = pos_proposals.new_zeros((num_pos, mask_size, mask_size))
-    if num_pos > 0:
-        pos_proposals = pos_proposals.cpu().numpy()
-        pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
-        scale_factor = img_meta['scale_factor'][0].cpu().numpy()
-        for i in range(num_pos):
-            bbox = pos_proposals[i, :] / scale_factor
-            polys = gt_polys[pos_assigned_gt_inds[i]]
-            mask = polys_to_mask_wrt_box(polys, bbox, mask_size)
-            mask = np.array(mask > 0, dtype=np.float32)
-            mask_targets[i, ...] = torch.from_numpy(mask).to(
-                mask_targets.device)
-    return mask_targets
diff --git a/mmdet/core/mask_ops/segms.py b/mmdet/core/mask_ops/segms.py
deleted file mode 100644
index b2ae6b69a1ff206b085799fa82527e1d17be0a4f..0000000000000000000000000000000000000000
--- a/mmdet/core/mask_ops/segms.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# This file is copied from Detectron.
-
-# Copyright (c) 2017-present, Facebook, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-##############################################################################
-"""Functions for interacting with segmentation masks in the COCO format.
-The following terms are used in this module
-    mask: a binary mask encoded as a 2D numpy array
-    segm: a segmentation mask in one of the two COCO formats (polygon or RLE)
-    polygon: COCO's polygon format
-    RLE: COCO's run length encoding format
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import numpy as np
-import pycocotools.mask as mask_util
-
-
-def flip_segms(segms, height, width):
-    """Left/right flip each mask in a list of masks."""
-
-    def _flip_poly(poly, width):
-        flipped_poly = np.array(poly)
-        flipped_poly[0::2] = width - np.array(poly[0::2]) - 1
-        return flipped_poly.tolist()
-
-    def _flip_rle(rle, height, width):
-        if 'counts' in rle and type(rle['counts']) == list:
-            # Magic RLE format handling painfully discovered by looking at the
-            # COCO API showAnns function.
-            rle = mask_util.frPyObjects([rle], height, width)
-        mask = mask_util.decode(rle)
-        mask = mask[:, ::-1, :]
-        rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8))
-        return rle
-
-    flipped_segms = []
-    for segm in segms:
-        if type(segm) == list:
-            # Polygon format
-            flipped_segms.append([_flip_poly(poly, width) for poly in segm])
-        else:
-            # RLE format
-            assert type(segm) == dict
-            flipped_segms.append(_flip_rle(segm, height, width))
-    return flipped_segms
-
-
-def polys_to_mask(polygons, height, width):
-    """Convert from the COCO polygon segmentation format to a binary mask
-    encoded as a 2D array of data type numpy.float32. The polygon segmentation
-    is understood to be enclosed inside a height x width image. The resulting
-    mask is therefore of shape (height, width).
-    """
-    rle = mask_util.frPyObjects(polygons, height, width)
-    mask = np.array(mask_util.decode(rle), dtype=np.float32)
-    # Flatten in case polygons was a list
-    mask = np.sum(mask, axis=2)
-    mask = np.array(mask > 0, dtype=np.float32)
-    return mask
-
-
-def mask_to_bbox(mask):
-    """Compute the tight bounding box of a binary mask."""
-    xs = np.where(np.sum(mask, axis=0) > 0)[0]
-    ys = np.where(np.sum(mask, axis=1) > 0)[0]
-
-    if len(xs) == 0 or len(ys) == 0:
-        return None
-
-    x0 = xs[0]
-    x1 = xs[-1]
-    y0 = ys[0]
-    y1 = ys[-1]
-    return np.array((x0, y0, x1, y1), dtype=np.float32)
-
-
-def polys_to_mask_wrt_box(polygons, box, M):
-    """Convert from the COCO polygon segmentation format to a binary mask
-    encoded as a 2D array of data type numpy.float32. The polygon segmentation
-    is understood to be enclosed in the given box and rasterized to an M x M
-    mask. The resulting mask is therefore of shape (M, M).
-    """
-    w = box[2] - box[0]
-    h = box[3] - box[1]
-
-    w = np.maximum(w, 1)
-    h = np.maximum(h, 1)
-
-    polygons_norm = []
-    for poly in polygons:
-        p = np.array(poly, dtype=np.float32)
-        p[0::2] = (p[0::2] - box[0]) * M / w
-        p[1::2] = (p[1::2] - box[1]) * M / h
-        polygons_norm.append(p)
-
-    rle = mask_util.frPyObjects(polygons_norm, M, M)
-    mask = np.array(mask_util.decode(rle), dtype=np.float32)
-    # Flatten in case polygons was a list
-    mask = np.sum(mask, axis=2)
-    mask = np.array(mask > 0, dtype=np.float32)
-    return mask
-
-
-def polys_to_boxes(polys):
-    """Convert a list of polygons into an array of tight bounding boxes."""
-    boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32)
-    for i in range(len(polys)):
-        poly = polys[i]
-        x0 = min(min(p[::2]) for p in poly)
-        x1 = max(max(p[::2]) for p in poly)
-        y0 = min(min(p[1::2]) for p in poly)
-        y1 = max(max(p[1::2]) for p in poly)
-        boxes_from_polys[i, :] = [x0, y0, x1, y1]
-
-    return boxes_from_polys
-
-
-def rle_mask_voting(top_masks,
-                    all_masks,
-                    all_dets,
-                    iou_thresh,
-                    binarize_thresh,
-                    method='AVG'):
-    """Returns new masks (in correspondence with `top_masks`) by combining
-    multiple overlapping masks coming from the pool of `all_masks`. Two methods
-    for combining masks are supported: 'AVG' uses a weighted average of
-    overlapping mask pixels; 'UNION' takes the union of all mask pixels.
-    """
-    if len(top_masks) == 0:
-        return
-
-    all_not_crowd = [False] * len(all_masks)
-    top_to_all_overlaps = mask_util.iou(top_masks, all_masks, all_not_crowd)
-    decoded_all_masks = [
-        np.array(mask_util.decode(rle), dtype=np.float32) for rle in all_masks
-    ]
-    decoded_top_masks = [
-        np.array(mask_util.decode(rle), dtype=np.float32) for rle in top_masks
-    ]
-    all_boxes = all_dets[:, :4].astype(np.int32)
-    all_scores = all_dets[:, 4]
-
-    # Fill box support with weights
-    mask_shape = decoded_all_masks[0].shape
-    mask_weights = np.zeros((len(all_masks), mask_shape[0], mask_shape[1]))
-    for k in range(len(all_masks)):
-        ref_box = all_boxes[k]
-        x_0 = max(ref_box[0], 0)
-        x_1 = min(ref_box[2] + 1, mask_shape[1])
-        y_0 = max(ref_box[1], 0)
-        y_1 = min(ref_box[3] + 1, mask_shape[0])
-        mask_weights[k, y_0:y_1, x_0:x_1] = all_scores[k]
-    mask_weights = np.maximum(mask_weights, 1e-5)
-
-    top_segms_out = []
-    for k in range(len(top_masks)):
-        # Corner case of empty mask
-        if decoded_top_masks[k].sum() == 0:
-            top_segms_out.append(top_masks[k])
-            continue
-
-        inds_to_vote = np.where(top_to_all_overlaps[k] >= iou_thresh)[0]
-        # Only matches itself
-        if len(inds_to_vote) == 1:
-            top_segms_out.append(top_masks[k])
-            continue
-
-        masks_to_vote = [decoded_all_masks[i] for i in inds_to_vote]
-        if method == 'AVG':
-            ws = mask_weights[inds_to_vote]
-            soft_mask = np.average(masks_to_vote, axis=0, weights=ws)
-            mask = np.array(soft_mask > binarize_thresh, dtype=np.uint8)
-        elif method == 'UNION':
-            # Any pixel that's on joins the mask
-            soft_mask = np.sum(masks_to_vote, axis=0)
-            mask = np.array(soft_mask > 1e-5, dtype=np.uint8)
-        else:
-            raise NotImplementedError('Method {} is unknown'.format(method))
-        rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0]
-        top_segms_out.append(rle)
-
-    return top_segms_out
-
-
-def rle_mask_nms(masks, dets, thresh, mode='IOU'):
-    """Performs greedy non-maximum suppression based on an overlap measurement
-    between masks. The type of measurement is determined by `mode` and can be
-    either 'IOU' (standard intersection over union) or 'IOMA' (intersection over
-    mininum area).
-    """
-    if len(masks) == 0:
-        return []
-    if len(masks) == 1:
-        return [0]
-
-    if mode == 'IOU':
-        # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(union(m1, m2))
-        all_not_crowds = [False] * len(masks)
-        ious = mask_util.iou(masks, masks, all_not_crowds)
-    elif mode == 'IOMA':
-        # Computes ious[m1, m2] = area(intersect(m1, m2)) / min(area(m1), area(m2))
-        all_crowds = [True] * len(masks)
-        # ious[m1, m2] = area(intersect(m1, m2)) / area(m2)
-        ious = mask_util.iou(masks, masks, all_crowds)
-        # ... = max(area(intersect(m1, m2)) / area(m2),
-        #           area(intersect(m2, m1)) / area(m1))
-        ious = np.maximum(ious, ious.transpose())
-    elif mode == 'CONTAINMENT':
-        # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(m2)
-        # Which measures how much m2 is contained inside m1
-        all_crowds = [True] * len(masks)
-        ious = mask_util.iou(masks, masks, all_crowds)
-    else:
-        raise NotImplementedError('Mode {} is unknown'.format(mode))
-
-    scores = dets[:, 4]
-    order = np.argsort(-scores)
-
-    keep = []
-    while order.size > 0:
-        i = order[0]
-        keep.append(i)
-        ovr = ious[i, order[1:]]
-        inds_to_keep = np.where(ovr <= thresh)[0]
-        order = order[inds_to_keep + 1]
-
-    return keep
-
-
-def rle_masks_to_boxes(masks):
-    """Computes the bounding box of each mask in a list of RLE encoded masks."""
-    if len(masks) == 0:
-        return []
-
-    decoded_masks = [
-        np.array(mask_util.decode(rle), dtype=np.float32) for rle in masks
-    ]
-
-    def get_bounds(flat_mask):
-        inds = np.where(flat_mask > 0)[0]
-        return inds.min(), inds.max()
-
-    boxes = np.zeros((len(decoded_masks), 4))
-    keep = [True] * len(decoded_masks)
-    for i, mask in enumerate(decoded_masks):
-        if mask.sum() == 0:
-            keep[i] = False
-            continue
-        flat_mask = mask.sum(axis=0)
-        x0, x1 = get_bounds(flat_mask)
-        flat_mask = mask.sum(axis=1)
-        y0, y1 = get_bounds(flat_mask)
-        boxes[i, :] = (x0, y0, x1, y1)
-
-    return boxes, np.where(keep)[0]
diff --git a/mmdet/core/mask_ops/utils.py b/mmdet/core/mask_ops/utils.py
deleted file mode 100644
index 2802430007e7b239bcb18ba20a26c0609c62245c..0000000000000000000000000000000000000000
--- a/mmdet/core/mask_ops/utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import cvbase as cvb
-import numpy as np
-import pycocotools.mask as mask_utils
-
-import mmcv
-
-
-def split_combined_gt_polys(gt_polys, gt_poly_lens, num_polys_per_mask):
-    """Split the combined 1-D polys into masks.
-
-    A mask is represented as a list of polys, and a poly is represented as
-    a 1-D array. In dataset, all masks are concatenated into a single 1-D
-    tensor. Here we need to split the tensor into original representations.
-
-    Args:
-        gt_polys (list): a list (length = image num) of 1-D tensors
-        gt_poly_lens (list): a list (length = image num) of poly length
-        num_polys_per_mask (list): a list (length = image num) of poly number
-            of each mask
-
-    Returns:
-        list: a list (length = image num) of list (length = mask num) of
-            list (length = poly num) of numpy array
-    """
-    mask_polys_list = []
-    for img_id in range(len(gt_polys)):
-        gt_polys_single = gt_polys[img_id].cpu().numpy()
-        gt_polys_lens_single = gt_poly_lens[img_id].cpu().numpy().tolist()
-        num_polys_per_mask_single = num_polys_per_mask[
-            img_id].cpu().numpy().tolist()
-
-        split_gt_polys = mmcv.slice_list(gt_polys_single, gt_polys_lens_single)
-        mask_polys = mmcv.slice_list(split_gt_polys, num_polys_per_mask_single)
-        mask_polys_list.append(mask_polys)
-    return mask_polys_list
diff --git a/mmdet/core/post_processing/merge_augs.py b/mmdet/core/post_processing/merge_augs.py
index 35dfce24f91b4a6260476a3f77b67471c88e4bc7..00f65b049ccf2b00a0fee73cc64ac257415425ea 100644
--- a/mmdet/core/post_processing/merge_augs.py
+++ b/mmdet/core/post_processing/merge_augs.py
@@ -1,9 +1,9 @@
 import torch
 
-from mmdet.ops import nms
 import numpy as np
 
-from ..bbox_ops import bbox_mapping_back
+from mmdet.ops import nms
+from ..bbox import bbox_mapping_back
 
 
 def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
@@ -21,11 +21,12 @@ def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
     """
     recovered_proposals = []
     for proposals, img_info in zip(aug_proposals, img_metas):
-        shape_scale = img_info['shape_scale'][0]
-        flip = img_info['flip'][0]
+        img_shape = img_info['img_shape']
+        scale_factor = img_info['scale_factor']
+        flip = img_info['flip']
         _proposals = proposals.clone()
-        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], shape_scale,
-                                              flip)
+        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+                                              scale_factor, flip)
         recovered_proposals.append(_proposals)
     aug_proposals = torch.cat(recovered_proposals, dim=0)
     nms_keep = nms(aug_proposals, rpn_test_cfg.nms_thr,
@@ -53,9 +54,10 @@ def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
     """
     recovered_bboxes = []
     for bboxes, img_info in zip(aug_bboxes, img_metas):
-        shape_scale = img_info['shape_scale'][0]
-        flip = img_info['flip'][0]
-        bboxes = bbox_mapping_back(bboxes, shape_scale, flip)
+        img_shape = img_info[0]['img_shape']
+        scale_factor = img_info[0]['scale_factor']
+        flip = img_info[0]['flip']
+        bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
         recovered_bboxes.append(bboxes)
     bboxes = torch.stack(recovered_bboxes).mean(dim=0)
     if aug_scores is None:
@@ -73,7 +75,7 @@ def merge_aug_scores(aug_scores):
         return np.mean(aug_scores, axis=0)
 
 
-def merge_aug_masks(aug_masks, bboxes, img_metas, rcnn_test_cfg, weights=None):
+def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
     """Merge augmented mask prediction.
 
     Args:
@@ -85,7 +87,7 @@ def merge_aug_masks(aug_masks, bboxes, img_metas, rcnn_test_cfg, weights=None):
         tuple: (bboxes, scores)
     """
     recovered_masks = [
-        mask if not img_info['flip'][0] else mask[..., ::-1]
+        mask if not img_info[0]['flip'] else mask[..., ::-1]
         for mask, img_info in zip(aug_masks, img_metas)
     ]
     if weights is None:
diff --git a/mmdet/core/rpn_ops/__init__.py b/mmdet/core/rpn_ops/__init__.py
deleted file mode 100644
index 4d5f9244dde2b244bbe42d54640e8a648277c506..0000000000000000000000000000000000000000
--- a/mmdet/core/rpn_ops/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .anchor_generator import *
-from .anchor_target import *
diff --git a/mmdet/core/rpn_ops/anchor_target.py b/mmdet/core/rpn_ops/anchor_target.py
deleted file mode 100644
index a6bba8ed221db022fb95590c6b10a56c8b6d4553..0000000000000000000000000000000000000000
--- a/mmdet/core/rpn_ops/anchor_target.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import torch
-import numpy as np
-from ..bbox_ops import (bbox_assign, bbox_transform, bbox_sampling)
-
-
-def anchor_target(anchor_list, valid_flag_list, featmap_sizes, gt_bboxes_list,
-                  img_shapes, target_means, target_stds, cfg):
-    """Compute anchor regression and classification targets
-
-    Args:
-        anchor_list(list): anchors of each feature map level
-        featuremap_sizes(list): feature map sizes
-        gt_bboxes_list(list): ground truth bbox of images in a mini-batch
-        img_shapes(list): shape of each image in a mini-batch
-        cfg(dict): configs
-
-    Returns:
-        tuple
-    """
-    if len(featmap_sizes) == len(anchor_list):
-        all_anchors = torch.cat(anchor_list, 0)
-        anchor_nums = [anchors.size(0) for anchors in anchor_list]
-        use_isomerism_anchors = False
-    elif len(img_shapes) == len(anchor_list):
-        # using different anchors for different images
-        all_anchors_list = [
-            torch.cat(anchor_list[img_id], 0)
-            for img_id in range(len(img_shapes))
-        ]
-        anchor_nums = [anchors.size(0) for anchors in anchor_list[0]]
-        use_isomerism_anchors = True
-    else:
-        raise ValueError('length of anchor_list should be equal to number of '
-                         'feature lvls or number of images in a batch')
-    all_labels = []
-    all_label_weights = []
-    all_bbox_targets = []
-    all_bbox_weights = []
-    num_total_sampled = 0
-    for img_id in range(len(img_shapes)):
-        if isinstance(valid_flag_list[img_id], list):
-            valid_flags = torch.cat(valid_flag_list[img_id], 0)
-        else:
-            valid_flags = valid_flag_list[img_id]
-        if use_isomerism_anchors:
-            all_anchors = all_anchors_list[img_id]
-        inside_flags = anchor_inside_flags(all_anchors, valid_flags,
-                                           img_shapes[img_id][:2],
-                                           cfg.allowed_border)
-        if not inside_flags.any():
-            return None
-        gt_bboxes = gt_bboxes_list[img_id]
-        anchor_targets = anchor_target_single(all_anchors, inside_flags,
-                                              gt_bboxes, target_means,
-                                              target_stds, cfg)
-        (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
-         neg_inds) = anchor_targets
-        all_labels.append(labels)
-        all_label_weights.append(label_weights)
-        all_bbox_targets.append(bbox_targets)
-        all_bbox_weights.append(bbox_weights)
-        num_total_sampled += max(pos_inds.numel() + neg_inds.numel(), 1)
-    all_labels = torch.stack(all_labels, 0)
-    all_label_weights = torch.stack(all_label_weights, 0)
-    all_bbox_targets = torch.stack(all_bbox_targets, 0)
-    all_bbox_weights = torch.stack(all_bbox_weights, 0)
-    # split into different feature levels
-    labels_list = []
-    label_weights_list = []
-    bbox_targets_list = []
-    bbox_weights_list = []
-    start = 0
-    for anchor_num in anchor_nums:
-        end = start + anchor_num
-        labels_list.append(all_labels[:, start:end].squeeze(0))
-        label_weights_list.append(all_label_weights[:, start:end].squeeze(0))
-        bbox_targets_list.append(all_bbox_targets[:, start:end].squeeze(0))
-        bbox_weights_list.append(all_bbox_weights[:, start:end].squeeze(0))
-        start = end
-    return (labels_list, label_weights_list, bbox_targets_list,
-            bbox_weights_list, num_total_sampled)
-
-
-def anchor_target_single(all_anchors, inside_flags, gt_bboxes, target_means,
-                         target_stds, cfg):
-    num_total_anchors = all_anchors.size(0)
-    anchors = all_anchors[inside_flags, :]
-    assigned_gt_inds, argmax_overlaps, max_overlaps = bbox_assign(
-        anchors,
-        gt_bboxes,
-        pos_iou_thr=cfg.pos_iou_thr,
-        neg_iou_thr=cfg.neg_iou_thr,
-        min_pos_iou=cfg.min_pos_iou)
-    pos_inds, neg_inds = bbox_sampling(assigned_gt_inds, cfg.anchor_batch_size,
-                                       cfg.pos_fraction, cfg.neg_pos_ub,
-                                       cfg.pos_balance_sampling, max_overlaps,
-                                       cfg.neg_balance_thr)
-
-    bbox_targets = torch.zeros_like(anchors)
-    bbox_weights = torch.zeros_like(anchors)
-    labels = torch.zeros_like(assigned_gt_inds)
-    label_weights = torch.zeros_like(assigned_gt_inds, dtype=torch.float)
-
-    if len(pos_inds) > 0:
-        pos_inds = unique(pos_inds)
-        pos_anchors = anchors[pos_inds, :]
-        pos_gt_bbox = gt_bboxes[assigned_gt_inds[pos_inds] - 1, :]
-        pos_bbox_targets = bbox_transform(pos_anchors, pos_gt_bbox,
-                                          target_means, target_stds)
-        bbox_targets[pos_inds, :] = pos_bbox_targets
-        bbox_weights[pos_inds, :] = 1.0
-        labels[pos_inds] = 1
-        if cfg.pos_weight <= 0:
-            label_weights[pos_inds] = 1.0
-        else:
-            label_weights[pos_inds] = cfg.pos_weight
-    if len(neg_inds) > 0:
-        neg_inds = unique(neg_inds)
-        label_weights[neg_inds] = 1.0
-
-    # map up to original set of anchors
-    labels = unmap(labels, num_total_anchors, inside_flags)
-    label_weights = unmap(label_weights, num_total_anchors, inside_flags)
-    bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
-    bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
-
-    return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
-            neg_inds)
-
-def anchor_inside_flags(all_anchors, valid_flags, img_shape, allowed_border=0):
-    img_h, img_w = img_shape.float()
-    if allowed_border >= 0:
-        inside_flags = valid_flags & \
-            (all_anchors[:, 0] >= -allowed_border) & \
-            (all_anchors[:, 1] >= -allowed_border) & \
-            (all_anchors[:, 2] < img_w + allowed_border) & \
-            (all_anchors[:, 3] < img_h + allowed_border)
-    else:
-        inside_flags = valid_flags
-    return inside_flags
-
-def unique(tensor):
-    if tensor.is_cuda:
-        u_tensor = np.unique(tensor.cpu().numpy())
-        return tensor.new_tensor(u_tensor)
-    else:
-        return torch.unique(tensor)
-
-def unmap(data, count, inds, fill=0):
-    """ Unmap a subset of item (data) back to the original set of items (of
-    size count) """
-    if data.dim() == 1:
-        ret = data.new_full((count, ), fill)
-        ret[inds] = data
-    else:
-        new_size = (count, ) + data.size()[1:]
-        ret = data.new_full(new_size, fill)
-        ret[inds, :] = data
-    return ret
diff --git a/mmdet/core/test_engine.py b/mmdet/core/test_engine.py
deleted file mode 100644
index 4825beda640c443b5d8aab0daf5c30838be4364b..0000000000000000000000000000000000000000
--- a/mmdet/core/test_engine.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from mmdet.datasets import collate
-from mmdet.nn.parallel import scatter
-
-__all__ = ['_data_func']
-
-def _data_func(data, gpu_id):
-    imgs, img_metas = tuple(
-        scatter(collate([data], samples_per_gpu=1), [gpu_id])[0])
-    return dict(
-        img=imgs,
-        img_meta=img_metas,
-        return_loss=False,
-        return_bboxes=True,
-        rescale=True)
diff --git a/mmdet/core/train_engine.py b/mmdet/core/train_engine.py
deleted file mode 100644
index cc745faad87cb2a97272934902822666be55d71f..0000000000000000000000000000000000000000
--- a/mmdet/core/train_engine.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-import torch
-from collections import OrderedDict
-from mmdet.nn.parallel import scatter
-
-
-def parse_losses(losses):
-    log_vars = OrderedDict()
-    for loss_key, loss_value in losses.items():
-        if isinstance(loss_value, dict):
-            for _key, _value in loss_value.items():
-                if isinstance(_value, list):
-                    _value = sum([_loss.mean() for _loss in _value])
-                else:
-                    _value = _value.mean()
-                log_vars[_keys] = _value
-        elif isinstance(loss_value, list):
-            log_vars[loss_key] = sum(_loss.mean() for _loss in loss_value)
-        else:
-            log_vars[loss_key] = loss_value.mean()
-
-    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
-    log_vars['loss'] = loss
-    for _key, _value in log_vars.items():
-        log_vars[_key] = _value.item()
-
-    return loss, log_vars
-
-
-def batch_processor(model, data, train_mode, args=None):
-    data = scatter(data, [torch.cuda.current_device()])[0]
-    losses = model(**data)
-    loss, log_vars = parse_losses(losses)
-
-    outputs = dict(
-        loss=loss / args.world_size,
-        log_vars=log_vars,
-        num_samples=len(data['img'].data))
-
-    return outputs
diff --git a/mmdet/core/utils/__init__.py b/mmdet/core/utils/__init__.py
index 2b6e79d62e60b5e1efaac985e039b36840f86397..981dab7fb0db3841a3bea05a1c96bdd91cfff4ca 100644
--- a/mmdet/core/utils/__init__.py
+++ b/mmdet/core/utils/__init__.py
@@ -1,3 +1,7 @@
-from .dist_utils import *
-from .hooks import *
-from .misc import *
+from .dist_utils import init_dist, allreduce_grads, DistOptimizerHook
+from .misc import tensor2imgs, unmap, multi_apply
+
+__all__ = [
+    'init_dist', 'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs',
+    'unmap', 'multi_apply'
+]
diff --git a/mmdet/core/utils/dist_utils.py b/mmdet/core/utils/dist_utils.py
index 47279c7bf8fd3b0ed66c3099f465b0130c864a23..c7748db661f4467fac0a2081350a0c06264fc593 100644
--- a/mmdet/core/utils/dist_utils.py
+++ b/mmdet/core/utils/dist_utils.py
@@ -1,60 +1,89 @@
 import os
+from collections import OrderedDict
+
 import torch
 import torch.multiprocessing as mp
 import torch.distributed as dist
-from torch.nn.utils import clip_grad
-from mmcv.torchpack import Hook, OptimizerStepperHook
-
-__all__ = [
-    'init_dist', 'average_gradients', 'broadcast_params',
-    'DistOptimizerStepperHook', 'DistSamplerSeedHook'
-]
+from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
+                          _take_tensors)
+from mmcv.runner import OptimizerHook
 
 
-def init_dist(world_size,
-              rank,
-              backend='gloo',
-              master_ip='127.0.0.1',
-              port=29500):
+def init_dist(launcher, backend='nccl', **kwargs):
     if mp.get_start_method(allow_none=True) is None:
         mp.set_start_method('spawn')
+    if launcher == 'pytorch':
+        _init_dist_pytorch(backend, **kwargs)
+    elif launcher == 'mpi':
+        _init_dist_mpi(backend, **kwargs)
+    elif launcher == 'slurm':
+        _init_dist_slurm(backend, **kwargs)
+    else:
+        raise ValueError('Invalid launcher type: {}'.format(launcher))
+
+
+def _init_dist_pytorch(backend, **kwargs):
+    # TODO: use local_rank instead of rank % num_gpus
+    rank = int(os.environ['RANK'])
     num_gpus = torch.cuda.device_count()
     torch.cuda.set_device(rank % num_gpus)
-    os.environ['MASTER_ADDR'] = master_ip
-    os.environ['MASTER_PORT'] = str(port)
-    if backend == 'nccl':
-        dist.init_process_group(backend='nccl')
+    dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_mpi(backend, **kwargs):
+    raise NotImplementedError
+
+
+def _init_dist_slurm(backend, **kwargs):
+    raise NotImplementedError
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
+    if bucket_size_mb > 0:
+        bucket_size_bytes = bucket_size_mb * 1024 * 1024
+        buckets = _take_tensors(tensors, bucket_size_bytes)
     else:
-        dist.init_process_group(
-            backend='gloo', rank=rank, world_size=world_size)
+        buckets = OrderedDict()
+        for tensor in tensors:
+            tp = tensor.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(tensor)
+        buckets = buckets.values()
 
+    for bucket in buckets:
+        flat_tensors = _flatten_dense_tensors(bucket)
+        dist.all_reduce(flat_tensors)
+        flat_tensors.div_(world_size)
+        for tensor, synced in zip(
+                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+            tensor.copy_(synced)
 
-def average_gradients(model):
-    for param in model.parameters():
-        if param.requires_grad and not (param.grad is None):
-            dist.all_reduce(param.grad.data)
 
+def allreduce_grads(model, coalesce=True, bucket_size_mb=-1):
+    grads = [
+        param.grad.data for param in model.parameters()
+        if param.requires_grad and param.grad is not None
+    ]
+    world_size = dist.get_world_size()
+    if coalesce:
+        _allreduce_coalesced(grads, world_size, bucket_size_mb)
+    else:
+        for tensor in grads:
+            dist.all_reduce(tensor.div_(world_size))
 
-def broadcast_params(model):
-    for p in model.state_dict().values():
-        dist.broadcast(p, 0)
 
+class DistOptimizerHook(OptimizerHook):
 
-class DistOptimizerStepperHook(OptimizerStepperHook):
+    def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
+        self.grad_clip = grad_clip
+        self.coalesce = coalesce
+        self.bucket_size_mb = bucket_size_mb
 
     def after_train_iter(self, runner):
         runner.optimizer.zero_grad()
         runner.outputs['loss'].backward()
-        average_gradients(runner.model)
-        if self.grad_clip:
-            clip_grad.clip_grad_norm_(
-                filter(lambda p: p.requires_grad, runner.model.parameters()),
-                max_norm=self.max_norm,
-                norm_type=self.norm_type)
+        allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
+        if self.grad_clip is not None:
+            self.clip_grads(runner.model.parameters())
         runner.optimizer.step()
-
-
-class DistSamplerSeedHook(Hook):
-
-    def before_epoch(self, runner):
-        runner.data_loader.sampler.set_epoch(runner.epoch)
diff --git a/mmdet/core/utils/hooks.py b/mmdet/core/utils/hooks.py
deleted file mode 100644
index f97e1fb29061ad5f07fa06907fbec72ede7a3bf3..0000000000000000000000000000000000000000
--- a/mmdet/core/utils/hooks.py
+++ /dev/null
@@ -1,245 +0,0 @@
-import os
-import os.path as osp
-import shutil
-import time
-
-import mmcv
-import numpy as np
-import torch
-from mmcv.torchpack import Hook
-from mmdet.datasets import collate
-from mmdet.nn.parallel import scatter
-from pycocotools.cocoeval import COCOeval
-
-from ..eval import eval_recalls
-
-__all__ = [
-    'EmptyCacheHook', 'DistEvalHook', 'DistEvalRecallHook',
-    'CocoDistEvalmAPHook'
-]
-
-
-class EmptyCacheHook(Hook):
-
-    def before_epoch(self, runner):
-        torch.cuda.empty_cache()
-
-    def after_epoch(self, runner):
-        torch.cuda.empty_cache()
-
-
-class DistEvalHook(Hook):
-
-    def __init__(self, dataset, interval=1):
-        self.dataset = dataset
-        self.interval = interval
-        self.lock_dir = None
-
-    def _barrier(self, rank, world_size):
-        """Due to some issues with `torch.distributed.barrier()`, we have to
-        implement this ugly barrier function.
-        """
-        if rank == 0:
-            for i in range(1, world_size):
-                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
-                while not (osp.exists(tmp)):
-                    time.sleep(1)
-            for i in range(1, world_size):
-                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
-                os.remove(tmp)
-        else:
-            tmp = osp.join(self.lock_dir, '{}.pkl'.format(rank))
-            mmcv.dump([], tmp)
-            while osp.exists(tmp):
-                time.sleep(1)
-
-    def before_run(self, runner):
-        self.lock_dir = osp.join(runner.work_dir, '.lock_map_hook')
-        if runner.rank == 0:
-            if osp.exists(self.lock_dir):
-                shutil.rmtree(self.lock_dir)
-            mmcv.mkdir_or_exist(self.lock_dir)
-
-    def after_train_epoch(self, runner):
-        if not self.every_n_epochs(runner, self.interval):
-            return
-        runner.model.eval()
-        results = [None for _ in range(len(self.dataset))]
-        prog_bar = mmcv.ProgressBar(len(self.dataset))
-        for idx in range(runner.rank, len(self.dataset), runner.world_size):
-            data = self.dataset[idx]
-            device_id = torch.cuda.current_device()
-            imgs_data = tuple(
-                scatter(collate([data], samples_per_gpu=1), [device_id])[0])
-
-            # compute output
-            with torch.no_grad():
-                result = runner.model(
-                    *imgs_data,
-                    return_loss=False,
-                    return_bboxes=True,
-                    rescale=True)
-            results[idx] = result
-
-            batch_size = runner.world_size
-            for _ in range(batch_size):
-                prog_bar.update()
-
-        if runner.rank == 0:
-            print('\n')
-            self._barrier(runner.rank, runner.world_size)
-            for i in range(1, runner.world_size):
-                tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
-                tmp_results = mmcv.load(tmp_file)
-                for idx in range(i, len(results), runner.world_size):
-                    results[idx] = tmp_results[idx]
-                os.remove(tmp_file)
-            self.evaluate(runner, results)
-        else:
-            tmp_file = osp.join(runner.work_dir,
-                                'temp_{}.pkl'.format(runner.rank))
-            mmcv.dump(results, tmp_file)
-            self._barrier(runner.rank, runner.world_size)
-        self._barrier(runner.rank, runner.world_size)
-
-    def evaluate(self):
-        raise NotImplementedError
-
-
-class CocoEvalMixin(object):
-
-    def _xyxy2xywh(self, bbox):
-        _bbox = bbox.tolist()
-        return [
-            _bbox[0],
-            _bbox[1],
-            _bbox[2] - _bbox[0] + 1,
-            _bbox[3] - _bbox[1] + 1,
-        ]
-
-    def det2json(self, dataset, results):
-        json_results = []
-        for idx in range(len(dataset)):
-            img_id = dataset.img_ids[idx]
-            result = results[idx]
-            for label in range(len(result)):
-                bboxes = result[label]
-                for i in range(bboxes.shape[0]):
-                    data = dict()
-                    data['image_id'] = img_id
-                    data['bbox'] = self._xyxy2xywh(bboxes[i])
-                    data['score'] = float(bboxes[i][4])
-                    data['category_id'] = dataset.cat_ids[label]
-                    json_results.append(data)
-        return json_results
-
-    def segm2json(self, dataset, results):
-        json_results = []
-        for idx in range(len(dataset)):
-            img_id = dataset.img_ids[idx]
-            det, seg = results[idx]
-            for label in range(len(det)):
-                bboxes = det[label]
-                segms = seg[label]
-                for i in range(bboxes.shape[0]):
-                    data = dict()
-                    data['image_id'] = img_id
-                    data['bbox'] = self._xyxy2xywh(bboxes[i])
-                    data['score'] = float(bboxes[i][4])
-                    data['category_id'] = dataset.cat_ids[label]
-                    segms[i]['counts'] = segms[i]['counts'].decode()
-                    data['segmentation'] = segms[i]
-                    json_results.append(data)
-        return json_results
-
-    def proposal2json(self, dataset, results):
-        json_results = []
-        for idx in range(len(dataset)):
-            img_id = dataset.img_ids[idx]
-            bboxes = results[idx]
-            for i in range(bboxes.shape[0]):
-                data = dict()
-                data['image_id'] = img_id
-                data['bbox'] = self._xyxy2xywh(bboxes[i])
-                data['score'] = float(bboxes[i][4])
-                data['category_id'] = 1
-                json_results.append(data)
-        return json_results
-
-    def results2json(self, dataset, results, out_file):
-        if isinstance(results[0], list):
-            json_results = self.det2json(dataset, results)
-        elif isinstance(results[0], tuple):
-            json_results = self.segm2json(dataset, results)
-        elif isinstance(results[0], np.ndarray):
-            json_results = self.proposal2json(dataset, results)
-        else:
-            raise TypeError('invalid type of results')
-        mmcv.dump(json_results, out_file, file_format='json')
-
-
-class DistEvalRecallHook(DistEvalHook):
-
-    def __init__(self,
-                 dataset,
-                 proposal_nums=(100, 300, 1000),
-                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
-        super(DistEvalRecallHook, self).__init__(dataset)
-        self.proposal_nums = np.array(proposal_nums, dtype=np.int32)
-        self.iou_thrs = np.array(iou_thrs, dtype=np.float32)
-
-    def evaluate(self, runner, results):
-        # official coco evaluation is too slow, here we use our own
-        # implementation, which may get slightly different results
-        gt_bboxes = []
-        for i in range(len(self.dataset)):
-            img_id = self.dataset.img_ids[i]
-            ann_ids = self.dataset.coco.getAnnIds(imgIds=img_id)
-            ann_info = self.dataset.coco.loadAnns(ann_ids)
-            if len(ann_info) == 0:
-                gt_bboxes.append(np.zeros((0, 4)))
-                continue
-            bboxes = []
-            for ann in ann_info:
-                if ann.get('ignore', False) or ann['iscrowd']:
-                    continue
-                x1, y1, w, h = ann['bbox']
-                bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1])
-            bboxes = np.array(bboxes, dtype=np.float32)
-            if bboxes.shape[0] == 0:
-                bboxes = np.zeros((0, 4))
-            gt_bboxes.append(bboxes)
-
-        recalls = eval_recalls(
-            gt_bboxes,
-            results,
-            self.proposal_nums,
-            self.iou_thrs,
-            print_summary=False)
-        ar = recalls.mean(axis=1)
-        for i, num in enumerate(self.proposal_nums):
-            runner.log_buffer.output['AR@{}'.format(num)] = ar[i]
-        runner.log_buffer.ready = True
-
-
-class CocoDistEvalmAPHook(DistEvalHook, CocoEvalMixin):
-
-    def evaluate(self, runner, results):
-        tmp_file = osp.join(runner.work_dir, 'temp_0.json')
-        self.results2json(self.dataset, results, tmp_file)
-
-        res_types = ['bbox', 'segm'] if runner.model.with_mask else ['bbox']
-        cocoGt = self.dataset.coco
-        cocoDt = cocoGt.loadRes(tmp_file)
-        imgIds = cocoGt.getImgIds()
-        for res_type in res_types:
-            iou_type = res_type
-            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
-            cocoEval.params.imgIds = imgIds
-            cocoEval.evaluate()
-            cocoEval.accumulate()
-            cocoEval.summarize()
-            field = '{}_mAP'.format(res_type)
-            runner.log_buffer.output[field] = cocoEval.stats[0]
-        runner.log_buffer.ready = True
-        os.remove(tmp_file)
diff --git a/mmdet/core/utils/misc.py b/mmdet/core/utils/misc.py
index 0f9c05e4577f23125fad0f0714a8f1089e82dbee..262f168e646089a535a9ad393947d57198873d93 100644
--- a/mmdet/core/utils/misc.py
+++ b/mmdet/core/utils/misc.py
@@ -1,36 +1,27 @@
-import subprocess
+from functools import partial
 
 import mmcv
 import numpy as np
-import torch
+from six.moves import map, zip
 
-__all__ = ['tensor2imgs', 'unique', 'unmap', 'results2json']
 
-
-def tensor2imgs(tensor,
-                color_order='RGB',
-                color_mean=(0.485, 0.456, 0.406),
-                color_std=(0.229, 0.224, 0.225)):
-    assert color_order in ['RGB', 'BGR']
-    img_per_gpu = tensor.size(0)
-    color_mean = np.array(color_mean, dtype=np.float32)
-    color_std = np.array(color_std, dtype=np.float32)
+def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
+    num_imgs = tensor.size(0)
+    mean = np.array(mean, dtype=np.float32)
+    std = np.array(std, dtype=np.float32)
     imgs = []
-    for img_id in range(img_per_gpu):
+    for img_id in range(num_imgs):
         img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
-        if color_order == 'RGB':
-            img = mmcv.rgb2bgr(img)
-        img = img * color_std + color_mean
+        img = mmcv.imdenormalize(
+            img, mean, std, to_bgr=to_rgb).astype(np.uint8)
         imgs.append(np.ascontiguousarray(img))
     return imgs
 
 
-def unique(tensor):
-    if tensor.is_cuda:
-        u_tensor = np.unique(tensor.cpu().numpy())
-        return tensor.new_tensor(u_tensor)
-    else:
-        return torch.unique(tensor)
+def multi_apply(func, *args, **kwargs):
+    pfunc = partial(func, **kwargs) if kwargs else func
+    map_results = map(pfunc, *args)
+    return tuple(map(list, zip(*map_results)))
 
 
 def unmap(data, count, inds, fill=0):
@@ -44,75 +35,3 @@ def unmap(data, count, inds, fill=0):
         ret = data.new_full(new_size, fill)
         ret[inds, :] = data
     return ret
-
-def xyxy2xywh(bbox):
-    _bbox = bbox.tolist()
-    return [
-        _bbox[0],
-        _bbox[1],
-        _bbox[2] - _bbox[0] + 1,
-        _bbox[3] - _bbox[1] + 1,
-    ]
-
-def det2json(dataset, results):
-    json_results = []
-    for idx in range(len(dataset)):
-        img_id = dataset.img_ids[idx]
-        result = results[idx]
-        for label in range(len(result)):
-            bboxes = result[label]
-            for i in range(bboxes.shape[0]):
-                data = dict()
-                data['image_id'] = img_id
-                data['bbox'] = xyxy2xywh(bboxes[i])
-                data['score'] = float(bboxes[i][4])
-                data['category_id'] = dataset.cat_ids[label]
-                json_results.append(data)
-    return json_results
-
-
-def segm2json(dataset, results):
-    json_results = []
-    for idx in range(len(dataset)):
-        img_id = dataset.img_ids[idx]
-        det, seg = results[idx]
-        for label in range(len(det)):
-            bboxes = det[label]
-            segms = seg[label]
-            for i in range(bboxes.shape[0]):
-                data = dict()
-                data['image_id'] = img_id
-                data['bbox'] = xyxy2xywh(bboxes[i])
-                data['score'] = float(bboxes[i][4])
-                data['category_id'] = dataset.cat_ids[label]
-                segms[i]['counts'] = segms[i]['counts'].decode()
-                data['segmentation'] = segms[i]
-                json_results.append(data)
-    return json_results
-
-
-def proposal2json(dataset, results):
-    json_results = []
-    for idx in range(len(dataset)):
-        img_id = dataset.img_ids[idx]
-        bboxes = results[idx]
-        for i in range(bboxes.shape[0]):
-            data = dict()
-            data['image_id'] = img_id
-            data['bbox'] = xyxy2xywh(bboxes[i])
-            data['score'] = float(bboxes[i][4])
-            data['category_id'] = 1
-            json_results.append(data)
-    return json_results
-
-
-def results2json(dataset, results, out_file):
-    if isinstance(results[0], list):
-        json_results = det2json(dataset, results)
-    elif isinstance(results[0], tuple):
-        json_results = segm2json(dataset, results)
-    elif isinstance(results[0], np.ndarray):
-        json_results = proposal2json(dataset, results)
-    else:
-        raise TypeError('invalid type of results')
-    mmcv.dump(json_results, out_file)
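
The new `multi_apply` helper maps a function over several parallel lists (e.g. per-image or per-level inputs) and transposes the per-item result tuples into per-field lists. A self-contained sketch, with a made-up `scale_box` function:

```python
from functools import partial


def multi_apply(func, *args, **kwargs):
    # Same logic as the helper added above (which pulls map/zip from
    # six.moves for Python 2 compatibility; the builtins behave the same).
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))


def scale_box(box, label, factor=2):
    # Made-up per-item function that returns a tuple of results.
    return [c * factor for c in box], label


boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
labels = [0, 1]
scaled, labels_out = multi_apply(scale_box, boxes, labels, factor=10)
print(scaled)      # [[10, 20, 30, 40], [50, 60, 70, 80]]
print(labels_out)  # [0, 1]
```
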
diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
index 6045c2b0923993243a999f0008b79443126d0e26..425ea72535a144544f44ebe8b5d63dd31336a54c 100644
--- a/mmdet/datasets/__init__.py
+++ b/mmdet/datasets/__init__.py
@@ -1,4 +1,8 @@
 from .coco import CocoDataset
-from .collate import *
-from .sampler import *
-from .transforms import *
+from .loader import GroupSampler, DistributedGroupSampler, build_dataloader
+from .utils import to_tensor, random_scale, show_ann
+
+__all__ = [
+    'CocoDataset', 'GroupSampler', 'DistributedGroupSampler',
+    'build_dataloader', 'to_tensor', 'random_scale', 'show_ann'
+]
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
index a7eedca6a2cbab92e069415513def5ab363dc824..3cd0a6d5ca20dbeba11f96135b570635348c74d9 100644
--- a/mmdet/datasets/coco.py
+++ b/mmdet/datasets/coco.py
@@ -2,75 +2,17 @@ import os.path as osp
 
 import mmcv
 import numpy as np
+from mmcv.parallel import DataContainer as DC
 from pycocotools.coco import COCO
 from torch.utils.data import Dataset
 
-from .transforms import (ImageTransform, BboxTransform, PolyMaskTransform,
+from .transforms import (ImageTransform, BboxTransform, MaskTransform,
                          Numpy2Tensor)
-from .utils import show_ann, random_scale
-from .utils import DataContainer as DC
-
-
-def parse_ann_info(ann_info, cat2label, with_mask=True):
-    """Parse bbox and mask annotation.
-
-    Args:
-        ann_info (list[dict]): Annotation info of an image.
-        cat2label (dict): The mapping from category ids to labels.
-        with_mask (bool): Whether to parse mask annotations.
-
-    Returns:
-        tuple: gt_bboxes, gt_labels and gt_mask_info
-    """
-    gt_bboxes = []
-    gt_labels = []
-    gt_bboxes_ignore = []
-    # each mask consists of one or several polys, each poly is a list of float.
-    if with_mask:
-        gt_mask_polys = []
-        gt_poly_lens = []
-    for i, ann in enumerate(ann_info):
-        if ann.get('ignore', False):
-            continue
-        x1, y1, w, h = ann['bbox']
-        if ann['area'] <= 0 or w < 1 or h < 1:
-            continue
-        bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
-        if ann['iscrowd']:
-            gt_bboxes_ignore.append(bbox)
-        else:
-            gt_bboxes.append(bbox)
-            gt_labels.append(cat2label[ann['category_id']])
-            if with_mask:
-                # Note polys are not resized
-                mask_polys = [
-                    p for p in ann['segmentation'] if len(p) >= 6
-                ]  # valid polygons have >= 3 points (6 coordinates)
-                poly_lens = [len(p) for p in mask_polys]
-                gt_mask_polys.append(mask_polys)
-                gt_poly_lens.extend(poly_lens)
-    if gt_bboxes:
-        gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
-        gt_labels = np.array(gt_labels, dtype=np.int64)
-    else:
-        gt_bboxes = np.zeros((0, 4), dtype=np.float32)
-        gt_labels = np.array([], dtype=np.int64)
-
-    if gt_bboxes_ignore:
-        gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
-    else:
-        gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
-
-    ann = dict(
-        bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
-
-    if with_mask:
-        ann['mask_polys'] = gt_mask_polys
-        ann['poly_lens'] = gt_poly_lens
-    return ann
+from .utils import to_tensor, show_ann, random_scale
 
 
 class CocoDataset(Dataset):
+
     def __init__(self,
                  ann_file,
                  img_prefix,
@@ -137,7 +79,7 @@ class CocoDataset(Dataset):
         self.img_transform = ImageTransform(
             size_divisor=self.size_divisor, **self.img_norm_cfg)
         self.bbox_transform = BboxTransform()
-        self.mask_transform = PolyMaskTransform()
+        self.mask_transform = MaskTransform()
         self.numpy2tensor = Numpy2Tensor()
 
     def __len__(self):
@@ -161,6 +103,70 @@ class CocoDataset(Dataset):
         ann_info = self.coco.loadAnns(ann_ids)
         return ann_info
 
+    def _parse_ann_info(self, ann_info, with_mask=True):
+        """Parse bbox and mask annotation.
+
+        Args:
+            ann_info (list[dict]): Annotation info of an image.
+            with_mask (bool): Whether to parse mask annotations.
+
+        Returns:
+            dict: A dict containing the following keys: bboxes, bboxes_ignore,
+                labels, masks, mask_polys, poly_lens.
+        """
+        gt_bboxes = []
+        gt_labels = []
+        gt_bboxes_ignore = []
+        # Two formats are provided.
+        # 1. mask: a binary map of the same size of the image.
+        # 2. polys: each mask consists of one or several polys, each poly is a
+        # list of float.
+        if with_mask:
+            gt_masks = []
+            gt_mask_polys = []
+            gt_poly_lens = []
+        for i, ann in enumerate(ann_info):
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
+            if ann['iscrowd']:
+                gt_bboxes_ignore.append(bbox)
+            else:
+                gt_bboxes.append(bbox)
+                gt_labels.append(self.cat2label[ann['category_id']])
+            if with_mask:
+                gt_masks.append(self.coco.annToMask(ann))
+                mask_polys = [
+                    p for p in ann['segmentation'] if len(p) >= 6
+                ]  # valid polygons have >= 3 points (6 coordinates)
+                poly_lens = [len(p) for p in mask_polys]
+                gt_mask_polys.append(mask_polys)
+                gt_poly_lens.extend(poly_lens)
+        if gt_bboxes:
+            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+            gt_labels = np.array(gt_labels, dtype=np.int64)
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_labels = np.array([], dtype=np.int64)
+
+        if gt_bboxes_ignore:
+            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+        else:
+            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+        ann = dict(
+            bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
+
+        if with_mask:
+            ann['masks'] = gt_masks
+            # poly format is not used in the current implementation
+            ann['mask_polys'] = gt_mask_polys
+            ann['poly_lens'] = gt_poly_lens
+        return ann
+
     def _set_group_flag(self):
         """Set flag according to image aspect ratio.
 
@@ -199,7 +205,7 @@ class CocoDataset(Dataset):
                     idx = self._rand_another(idx)
                     continue
 
-            ann = parse_ann_info(ann_info, self.cat2label, self.with_mask)
+            ann = self._parse_ann_info(ann_info, self.with_mask)
             gt_bboxes = ann['bboxes']
             gt_labels = ann['labels']
             gt_bboxes_ignore = ann['bboxes_ignore']
@@ -211,7 +217,7 @@ class CocoDataset(Dataset):
             # apply transforms
             flip = True if np.random.rand() < self.flip_ratio else False
             img_scale = random_scale(self.img_scales)  # sample a scale
-            img, img_shape, scale_factor = self.img_transform(
+            img, img_shape, pad_shape, scale_factor = self.img_transform(
                 img, img_scale, flip)
             if self.proposals is not None:
                 proposals = self.bbox_transform(proposals, img_shape,
@@ -222,32 +228,29 @@ class CocoDataset(Dataset):
                                                    scale_factor, flip)
 
             if self.with_mask:
-                gt_mask_polys, gt_poly_lens, num_polys_per_mask = \
-                    self.mask_transform(
-                        ann['mask_polys'], ann['poly_lens'],
-                        img_info['height'], img_info['width'], flip)
+                gt_masks = self.mask_transform(ann['masks'], pad_shape,
+                                               scale_factor, flip)
 
-            ori_shape = (img_info['height'], img_info['width'])
+            ori_shape = (img_info['height'], img_info['width'], 3)
             img_meta = dict(
-                ori_shape=DC(ori_shape),
-                img_shape=DC(img_shape),
-                scale_factor=DC(scale_factor),
-                flip=DC(flip))
+                ori_shape=ori_shape,
+                img_shape=img_shape,
+                pad_shape=pad_shape,
+                scale_factor=scale_factor,
+                flip=flip)
 
             data = dict(
-                img=DC(img, stack=True),
-                img_meta=img_meta,
-                gt_bboxes=DC(gt_bboxes))
+                img=DC(to_tensor(img), stack=True),
+                img_meta=DC(img_meta, cpu_only=True),
+                gt_bboxes=DC(to_tensor(gt_bboxes)))
             if self.proposals is not None:
-                data['proposals'] = DC(proposals)
+                data['proposals'] = DC(to_tensor(proposals))
             if self.with_label:
-                data['gt_labels'] = DC(gt_labels)
+                data['gt_labels'] = DC(to_tensor(gt_labels))
             if self.with_crowd:
-                data['gt_bboxes_ignore'] = DC(gt_bboxes_ignore)
+                data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
             if self.with_mask:
-                data['gt_mask_polys'] = DC(gt_mask_polys)
-                data['gt_poly_lens'] = DC(gt_poly_lens)
-                data['num_polys_per_mask'] = DC(num_polys_per_mask)
+                data['gt_masks'] = DC(gt_masks, cpu_only=True)
             return data
 
     def prepare_test_img(self, idx):
@@ -258,37 +261,38 @@ class CocoDataset(Dataset):
                     if self.proposals is not None else None)
 
         def prepare_single(img, scale, flip, proposal=None):
-            _img, _img_shape, _scale_factor = self.img_transform(
+            _img, img_shape, pad_shape, scale_factor = self.img_transform(
                 img, scale, flip)
-            img, img_shape, scale_factor = self.numpy2tensor(
-                _img, _img_shape, _scale_factor)
-            ori_shape = (img_info['height'], img_info['width'])
-            img_meta = dict(
-                ori_shape=ori_shape,
+            _img = to_tensor(_img)
+            _img_meta = dict(
+                ori_shape=(img_info['height'], img_info['width'], 3),
                 img_shape=img_shape,
+                pad_shape=pad_shape,
                 scale_factor=scale_factor,
                 flip=flip)
             if proposal is not None:
-                proposal = self.bbox_transform(proposal, _scale_factor, flip)
-                proposal = self.numpy2tensor(proposal)
-            return img, img_meta, proposal
+                _proposal = self.bbox_transform(proposal, scale_factor, flip)
+                _proposal = to_tensor(_proposal)
+            else:
+                _proposal = None
+            return _img, _img_meta, _proposal
 
         imgs = []
         img_metas = []
         proposals = []
         for scale in self.img_scales:
-            img, img_meta, proposal = prepare_single(img, scale, False,
-                                                     proposal)
-            imgs.append(img)
-            img_metas.append(img_meta)
-            proposals.append(proposal)
+            _img, _img_meta, _proposal = prepare_single(
+                img, scale, False, proposal)
+            imgs.append(_img)
+            img_metas.append(DC(_img_meta, cpu_only=True))
+            proposals.append(_proposal)
             if self.flip_ratio > 0:
-                img, img_meta, prop = prepare_single(img, scale, True,
-                                                     proposal)
-                imgs.append(img)
-                img_metas.append(img_meta)
-                proposals.append(prop)
-        if self.proposals is None:
-            return imgs, img_metas
-        else:
-            return imgs, img_metas, proposals
+                _img, _img_meta, _proposal = prepare_single(
+                    img, scale, True, proposal)
+                imgs.append(_img)
+                img_metas.append(DC(_img_meta, cpu_only=True))
+                proposals.append(_proposal)
+        data = dict(img=imgs, img_meta=img_metas)
+        if self.proposals is not None:
+            data['proposals'] = proposals
+        return data
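
With this change `prepare_train_img` returns a dict of `mmcv.parallel.DataContainer` objects instead of raw arrays: `img` is marked `stack=True` so the collate function pads and stacks it, while `img_meta` and `gt_masks` are `cpu_only` and never moved to GPU. A rough sketch of the resulting layout with dummy values (shapes and numbers are invented for illustration):

```python
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC


def to_tensor(data):
    # Minimal stand-in for mmdet.datasets.to_tensor.
    return torch.from_numpy(data) if isinstance(data, np.ndarray) else data


img = np.zeros((3, 800, 1216), dtype=np.float32)        # padded CHW image
gt_bboxes = np.array([[10, 10, 110, 210]], dtype=np.float32)
gt_labels = np.array([1], dtype=np.int64)
gt_masks = np.zeros((1, 800, 1216), dtype=np.uint8)
img_meta = dict(
    ori_shape=(427, 640, 3),      # original image size
    img_shape=(800, 1200, 3),     # size after rescaling
    pad_shape=(800, 1216, 3),     # size after padding to a multiple of 32
    scale_factor=1.875,
    flip=False)

data = dict(
    img=DC(to_tensor(img), stack=True),     # padded & stacked by collate
    img_meta=DC(img_meta, cpu_only=True),   # plain dict, stays on CPU
    gt_bboxes=DC(to_tensor(gt_bboxes)),
    gt_labels=DC(to_tensor(gt_labels)),
    gt_masks=DC(gt_masks, cpu_only=True))
print({k: type(v).__name__ for k, v in data.items()})
```
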
diff --git a/mmdet/datasets/collate.py b/mmdet/datasets/collate.py
deleted file mode 100644
index 44117d6f2d01d3aaa4c06996c2d8bf657e4a1ce5..0000000000000000000000000000000000000000
--- a/mmdet/datasets/collate.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import collections
-
-import torch
-import torch.nn.functional as F
-from torch.utils.data.dataloader import default_collate
-
-from .utils import DataContainer
-
-# https://github.com/pytorch/pytorch/issues/973
-import resource
-rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
-resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
-
-__all__ = ['collate']
-
-
-def collate(batch, samples_per_gpu=1):
-
-    if not isinstance(batch, collections.Sequence):
-        raise TypeError("{} is not supported.".format(batch.dtype))
-
-    if isinstance(batch[0], DataContainer):
-        assert len(batch) % samples_per_gpu == 0
-        stacked = []
-        if batch[0].stack:
-            for i in range(0, len(batch), samples_per_gpu):
-                assert isinstance(batch[i].data, torch.Tensor)
-                # TODO: handle tensors other than 3d
-                assert batch[i].dim() == 3
-                c, h, w = batch[0].size()
-                for sample in batch[i:i + samples_per_gpu]:
-                    assert c == sample.size(0)
-                    h = max(h, sample.size(1))
-                    w = max(w, sample.size(2))
-                padded_samples = [
-                    F.pad(
-                        sample.data,
-                        (0, w - sample.size(2), 0, h - sample.size(1)),
-                        value=sample.padding_value)
-                    for sample in batch[i:i + samples_per_gpu]
-                ]
-                stacked.append(default_collate(padded_samples))
-        else:
-            for i in range(0, len(batch), samples_per_gpu):
-                stacked.append(
-                    [sample.data for sample in batch[i:i + samples_per_gpu]])
-        return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
-    elif isinstance(batch[0], collections.Sequence):
-        transposed = zip(*batch)
-        return [collate(samples, samples_per_gpu) for samples in transposed]
-    elif isinstance(batch[0], collections.Mapping):
-        return {
-            key: collate([d[key] for d in batch], samples_per_gpu)
-            for key in batch[0]
-        }
-    else:
-        return default_collate(batch)
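
The removed collate is superseded by `mmcv.parallel.collate`, which keeps the same core idea: within each group of `samples_per_gpu` containers marked `stack=True`, pad every tensor to the group's maximum height/width and stack the result. A minimal standalone sketch of that pad-and-stack step (not the mmcv implementation itself):

```python
import torch
import torch.nn.functional as F


def pad_and_stack(imgs, pad_value=0):
    # Pad a list of CHW tensors to the largest H and W in the group, then
    # stack them into a single (N, C, H, W) batch tensor.
    h = max(img.size(1) for img in imgs)
    w = max(img.size(2) for img in imgs)
    padded = [
        F.pad(img, (0, w - img.size(2), 0, h - img.size(1)), value=pad_value)
        for img in imgs
    ]
    return torch.stack(padded)


batch = [torch.ones(3, 600, 800), torch.ones(3, 640, 480)]
print(pad_and_stack(batch).shape)  # torch.Size([2, 3, 640, 800])
```
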
diff --git a/mmdet/datasets/data_engine.py b/mmdet/datasets/data_engine.py
deleted file mode 100644
index 0c89f21878a9f2fe2b21669ecfb2cd71cc9ae073..0000000000000000000000000000000000000000
--- a/mmdet/datasets/data_engine.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from functools import partial
-import torch
-from .coco import CocoDataset
-from .collate import collate
-from .sampler import GroupSampler, DistributedGroupSampler
-
-
-def build_data(cfg, args):
-    dataset = CocoDataset(**cfg)
-
-    if args.dist:
-        sampler = DistributedGroupSampler(dataset, args.img_per_gpu,
-                                     args.world_size, args.rank)
-        batch_size = args.img_per_gpu
-        num_workers = args.data_workers
-    else:
-        sampler = GroupSampler(dataset, args.img_per_gpu)
-        batch_size = args.world_size * args.img_per_gpu
-        num_workers = args.world_size * args.data_workers
-
-    loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=args.img_per_gpu,
-        sampler=sampler,
-        num_workers=num_workers,
-        collate_fn=partial(collate, samples_per_gpu=args.img_per_gpu),
-        pin_memory=False)
-
-    return loader
diff --git a/mmdet/datasets/loader/__init__.py b/mmdet/datasets/loader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d4fdd2cbbe85b26b4c5fa4898315accbe94c0a
--- /dev/null
+++ b/mmdet/datasets/loader/__init__.py
@@ -0,0 +1,6 @@
+from .build_loader import build_dataloader
+from .sampler import GroupSampler, DistributedGroupSampler
+
+__all__ = [
+    'GroupSampler', 'DistributedGroupSampler', 'build_dataloader'
+]
diff --git a/mmdet/datasets/loader/build_loader.py b/mmdet/datasets/loader/build_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b342b32b83b629110877de649923c3610ba4bb
--- /dev/null
+++ b/mmdet/datasets/loader/build_loader.py
@@ -0,0 +1,44 @@
+from functools import partial
+
+from mmcv.runner import get_dist_info
+from mmcv.parallel import collate
+from torch.utils.data import DataLoader
+
+from .sampler import GroupSampler, DistributedGroupSampler
+
+# https://github.com/pytorch/pytorch/issues/973
+import resource
+rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
+
+
+def build_dataloader(dataset,
+                     imgs_per_gpu,
+                     workers_per_gpu,
+                     num_gpus,
+                     dist=True,
+                     **kwargs):
+    if dist:
+        rank, world_size = get_dist_info()
+        sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size,
+                                          rank)
+        batch_size = imgs_per_gpu
+        num_workers = workers_per_gpu
+    else:
+        sampler = GroupSampler(dataset, imgs_per_gpu)
+        batch_size = num_gpus * imgs_per_gpu
+        num_workers = num_gpus * workers_per_gpu
+
+    if not kwargs.get('shuffle', True):
+        sampler = None
+
+    data_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
+        pin_memory=False,
+        **kwargs)
+
+    return data_loader
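
Schematic call sites for `build_dataloader`; `dataset` is assumed to be an already constructed `CocoDataset`, and the `dist=True` branch additionally assumes an initialized process group (e.g. via `init_dist` above):

```python
from mmdet.datasets import build_dataloader

# Distributed: one process per GPU, so the loader batch size equals
# imgs_per_gpu and a DistributedGroupSampler shards indices across ranks.
train_loader = build_dataloader(
    dataset, imgs_per_gpu=2, workers_per_gpu=2, num_gpus=1, dist=True)

# Non-distributed: a single process drives all GPUs, so batch size and
# worker count scale with num_gpus.
train_loader = build_dataloader(
    dataset, imgs_per_gpu=2, workers_per_gpu=2, num_gpus=8, dist=False)
```
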
diff --git a/mmdet/datasets/sampler.py b/mmdet/datasets/loader/sampler.py
similarity index 98%
rename from mmdet/datasets/sampler.py
rename to mmdet/datasets/loader/sampler.py
index 74089821bf17a7bdc6f1f728c0340e382adb3046..5c060cd926ea50d232d0f765b86933ca8fad0969 100644
--- a/mmdet/datasets/sampler.py
+++ b/mmdet/datasets/loader/sampler.py
@@ -7,8 +7,6 @@ import numpy as np
 from torch.distributed import get_world_size, get_rank
 from torch.utils.data.sampler import Sampler
 
-__all__ = ['GroupSampler', 'DistributedGroupSampler']
-
 
 class GroupSampler(Sampler):
 
diff --git a/mmdet/datasets/transforms.py b/mmdet/datasets/transforms.py
index 1532fe074f2968b225cc030dc3f868b3c7780194..ddb2fb2c2f483326e8703a108d086a919542b212 100644
--- a/mmdet/datasets/transforms.py
+++ b/mmdet/datasets/transforms.py
@@ -2,15 +2,12 @@ import mmcv
 import numpy as np
 import torch
 
-from mmdet.core.mask_ops import segms
-
-__all__ = [
-    'ImageTransform', 'BboxTransform', 'PolyMaskTransform', 'Numpy2Tensor'
-]
+__all__ = ['ImageTransform', 'BboxTransform', 'MaskTransform', 'Numpy2Tensor']
 
 
 class ImageTransform(object):
-    """Preprocess an image
+    """Preprocess an image.
+
     1. rescale the image to expected size
     2. normalize the image
     3. flip the image (if needed)
@@ -29,90 +26,38 @@ class ImageTransform(object):
         self.size_divisor = size_divisor
 
     def __call__(self, img, scale, flip=False):
-        img, scale_factor = mmcv.imrescale(img, scale, True)
+        img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
         img_shape = img.shape
-        img = mmcv.imnorm(img, self.mean, self.std, self.to_rgb)
+        img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
         if flip:
             img = mmcv.imflip(img)
         if self.size_divisor is not None:
             img = mmcv.impad_to_multiple(img, self.size_divisor)
+            pad_shape = img.shape
+        else:
+            pad_shape = img_shape
         img = img.transpose(2, 0, 1)
-        return img, img_shape, scale_factor
-
-        # img, scale = cvb.resize_keep_ar(img_or_path, max_long_edge,
-        #                                 max_short_edge, True)
-        # shape_scale = np.array(img.shape + (scale, ), dtype=np.float32)
-        # if flip:
-        #     img = img[:, ::-1, :].copy()
-        # if self.color_order == 'RGB':
-        #     img = cvb.bgr2rgb(img)
-        # img = img.astype(np.float32)
-        # img -= self.color_mean
-        # img /= self.color_std
-        # if self.size_divisor is None:
-        #     padded_img = img
-        # else:
-        #     pad_h = int(np.ceil(
-        #         img.shape[0] / self.size_divisor)) * self.size_divisor
-        #     pad_w = int(np.ceil(
-        #         img.shape[1] / self.size_divisor)) * self.size_divisor
-        #     padded_img = cvb.pad_img(img, (pad_h, pad_w), pad_val=0)
-        # padded_img = padded_img.transpose(2, 0, 1)
-        # return padded_img, shape_scale
-
-
-class ImageCrop(object):
-    """crop image patches and resize patches into fixed size
-    1. (read and) flip image (if needed)
-    2. crop image patches according to given bboxes
-    3. resize patches into fixed size (default 224x224)
-    4. normalize the image (if needed)
-    5. transpose to (c, h, w) (if needed)
-    """
+        return img, img_shape, pad_shape, scale_factor
 
-    def __init__(self,
-                 normalize=True,
-                 transpose=True,
-                 color_order='RGB',
-                 color_mean=(0, 0, 0),
-                 color_std=(1, 1, 1)):
-        self.normalize = normalize
-        self.transpose = transpose
-
-        assert color_order in ['RGB', 'BGR']
-        self.color_order = color_order
-        self.color_mean = np.array(color_mean, dtype=np.float32)
-        self.color_std = np.array(color_std, dtype=np.float32)
-
-    def __call__(self,
-                 img_or_path,
-                 bboxes,
-                 crop_size,
-                 scale_ratio=1.0,
-                 flip=False):
-        img = cvb.read_img(img_or_path)
-        if flip:
-            img = img[:, ::-1, :].copy()
-        crop_imgs = cvb.crop_img(
-            img,
-            bboxes[:, :4],
-            scale_ratio=scale_ratio,
-            pad_fill=self.color_mean)
-        processed_crop_imgs_list = []
-        for i in range(len(crop_imgs)):
-            crop_img = crop_imgs[i]
-            crop_img = cvb.resize(crop_img, crop_size)
-            crop_img = crop_img.astype(np.float32)
-            crop_img -= self.color_mean
-            crop_img /= self.color_std
-            processed_crop_imgs_list.append(crop_img)
-        processed_crop_imgs = np.stack(processed_crop_imgs_list, axis=0)
-        processed_crop_imgs = processed_crop_imgs.transpose(0, 3, 1, 2)
-        return processed_crop_imgs
+
+def bbox_flip(bboxes, img_shape):
+    """Flip bboxes horizontally.
+
+    Args:
+        bboxes(ndarray): shape (..., 4*k)
+        img_shape(tuple): (height, width)
+    """
+    assert bboxes.shape[-1] % 4 == 0
+    w = img_shape[1]
+    flipped = bboxes.copy()
+    flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
+    flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
+    return flipped
 
 
 class BboxTransform(object):
-    """Preprocess gt bboxes
+    """Preprocess gt bboxes.
+
     1. rescale bboxes according to image size
     2. flip bboxes (if needed)
     3. pad the first dimension to `max_num_gts`
@@ -124,7 +69,7 @@ class BboxTransform(object):
     def __call__(self, bboxes, img_shape, scale_factor, flip=False):
         gt_bboxes = bboxes * scale_factor
         if flip:
-            gt_bboxes = mmcv.bbox_flip(gt_bboxes, img_shape)
+            gt_bboxes = bbox_flip(gt_bboxes, img_shape)
         gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0, img_shape[1])
         gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0, img_shape[0])
         if self.max_num_gts is None:
@@ -136,64 +81,25 @@ class BboxTransform(object):
             return padded_bboxes
 
 
-class PolyMaskTransform(object):
-
-    def __init__(self):
-        pass
-
-    def __call__(self, gt_mask_polys, gt_poly_lens, img_h, img_w, flip=False):
-        """
-        Args:
-            gt_mask_polys(list): a list of masks, each mask is a list of polys,
-                each poly is a list of numbers
-            gt_poly_lens(list): a list of int, indicating the size of each poly
-        """
-        if flip:
-            gt_mask_polys = segms.flip_segms(gt_mask_polys, img_h, img_w)
-        num_polys_per_mask = np.array(
-            [len(mask_polys) for mask_polys in gt_mask_polys], dtype=np.int64)
-        gt_poly_lens = np.array(gt_poly_lens, dtype=np.int64)
-        gt_mask_polys = [
-            np.concatenate(mask_polys).astype(np.float32)
-            for mask_polys in gt_mask_polys
-        ]
-        gt_mask_polys = np.concatenate(gt_mask_polys)
-        return gt_mask_polys, gt_poly_lens, num_polys_per_mask
-
-
 class MaskTransform(object):
-    """Preprocess masks
+    """Preprocess masks.
+
     1. resize masks to expected size and stack to a single array
     2. flip the masks (if needed)
     3. pad the masks (if needed)
     """
 
-    def __init__(self, max_num_gts, pad_size=None):
-        self.max_num_gts = max_num_gts
-        self.pad_size = pad_size
-
-    def __call__(self, masks, img_size, flip=False):
-        max_long_edge = max(img_size)
-        max_short_edge = min(img_size)
+    def __call__(self, masks, pad_shape, scale_factor, flip=False):
         masks = [
-            cvb.resize_keep_ar(
-                mask,
-                max_long_edge,
-                max_short_edge,
-                interpolation=cvb.INTER_NEAREST) for mask in masks
+            mmcv.imrescale(mask, scale_factor, interpolation='nearest')
+            for mask in masks
         ]
-        masks = np.stack(masks, axis=0)
         if flip:
-            masks = masks[:, ::-1, :]
-        if self.pad_size is None:
-            pad_h = masks.shape[1]
-            pad_w = masks.shape[2]
-        else:
-            pad_size = self.pad_size if self.pad_size > 0 else max_long_edge
-            pad_h = pad_w = pad_size
-        padded_masks = np.zeros(
-            (self.max_num_gts, pad_h, pad_w), dtype=masks.dtype)
-        padded_masks[:masks.shape[0], :masks.shape[1], :masks.shape[2]] = masks
+            masks = [mask[:, ::-1] for mask in masks]
+        padded_masks = [
+            mmcv.impad(mask, pad_shape[:2], pad_val=0) for mask in masks
+        ]
+        padded_masks = np.stack(padded_masks, axis=0)
         return padded_masks
 
 
diff --git a/mmdet/datasets/utils/misc.py b/mmdet/datasets/utils.py
similarity index 64%
rename from mmdet/datasets/utils/misc.py
rename to mmdet/datasets/utils.py
index 419c11ad08462268b9dfe6b43182a9ec4725b00c..5a248ef6890ea348ea7ad98154cc163ae1e035c5 100644
--- a/mmdet/datasets/utils/misc.py
+++ b/mmdet/datasets/utils.py
@@ -1,8 +1,31 @@
+from collections import Sequence
+
 import mmcv
+import torch
 
 import matplotlib.pyplot as plt
 import numpy as np
-import pycocotools.mask as maskUtils
+
+
+def to_tensor(data):
+    """Convert objects of various python types to :obj:`torch.Tensor`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
+    """
+    if isinstance(data, torch.Tensor):
+        return data
+    elif isinstance(data, np.ndarray):
+        return torch.from_numpy(data)
+    elif isinstance(data, Sequence) and not mmcv.is_str(data):
+        return torch.tensor(data)
+    elif isinstance(data, int):
+        return torch.LongTensor([data])
+    elif isinstance(data, float):
+        return torch.FloatTensor([data])
+    else:
+        raise TypeError('type {} cannot be converted to tensor.'.format(
+            type(data)))
 
 
 def random_scale(img_scales, mode='range'):
@@ -44,19 +67,3 @@ def show_ann(coco, img, ann_info):
     plt.axis('off')
     coco.showAnns(ann_info)
     plt.show()
-
-
-def draw_bbox_and_segm(img, results, dataset, score_thr=0.5):
-    bbox_results, segm_results = results
-    hi_bboxes = []
-    for cls_bboxes, cls_segms in zip(bbox_results, segm_results):
-        if len(cls_bboxes) == 0:
-            hi_bboxes.append(cls_bboxes)
-            continue
-        inds = np.where(cls_bboxes[:, -1] > score_thr)[0]
-        hi_bboxes.append(cls_bboxes[inds, :])
-        color_mask = np.random.random((1, 3))
-        for i in inds:
-            mask = maskUtils.decode(cls_segms[i]).astype(np.bool)
-            img[mask] = img[mask] * 0.5 + color_mask * 0.5
-    mmcv.draw_bboxes_with_label(np.ascontiguousarray(img), hi_bboxes, dataset)
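
`to_tensor` is the single conversion point for the heterogeneous fields the dataset produces (image arrays, box arrays, integer labels, float scale factors). A few quick examples of its behavior:

```python
import numpy as np

from mmdet.datasets import to_tensor

print(to_tensor(np.zeros((2, 4), dtype=np.float32)).shape)  # torch.Size([2, 4])
print(to_tensor([1, 2, 3]))  # tensor([1, 2, 3])
print(to_tensor(5))          # tensor([5])      (LongTensor)
print(to_tensor(0.5))        # tensor([0.5000]) (FloatTensor)
# Strings are Sequences but are rejected explicitly:
# to_tensor('abc')  ->  TypeError
```
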
diff --git a/mmdet/datasets/utils/__init__.py b/mmdet/datasets/utils/__init__.py
deleted file mode 100644
index de3ea43bdf4e4cc526119054954fdd1acf811c38..0000000000000000000000000000000000000000
--- a/mmdet/datasets/utils/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .data_container import DataContainer
-from .misc import *
diff --git a/mmdet/datasets/utils/data_container.py b/mmdet/datasets/utils/data_container.py
deleted file mode 100644
index c27beab37bbd28aeb37c1231b8ff94a335702216..0000000000000000000000000000000000000000
--- a/mmdet/datasets/utils/data_container.py
+++ /dev/null
@@ -1,80 +0,0 @@
-import functools
-from collections import Sequence
-
-import mmcv
-import numpy as np
-import torch
-
-
-def to_tensor(data):
-    """Convert objects of various python types to :obj:`torch.Tensor`.
-
-    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
-    :class:`Sequence`, :class:`int` and :class:`float`.
-    """
-    if isinstance(data, np.ndarray):
-        return torch.from_numpy(data)
-    elif isinstance(data, torch.Tensor):
-        return data
-    elif isinstance(data, Sequence) and not mmcv.is_str(data):
-        return torch.tensor(data)
-    elif isinstance(data, int):
-        return torch.LongTensor([data])
-    elif isinstance(data, float):
-        return torch.FloatTensor([data])
-    else:
-        raise TypeError('type {} cannot be converted to tensor.'.format(
-            type(data)))
-
-
-def assert_tensor_type(func):
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        if not isinstance(args[0].data, torch.Tensor):
-            raise AttributeError('{} has no attribute {} for type {}'.format(
-                args[0].__class__.__name__, func.__name__, args[0].datatype))
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-class DataContainer(object):
-
-    def __init__(self, data, stack=False, padding_value=0):
-        if isinstance(data, list):
-            self._data = data
-        else:
-            self._data = to_tensor(data)
-        self._stack = stack
-        self._padding_value = padding_value
-
-    def __repr__(self):
-        return '{}({})'.format(self.__class__.__name__, repr(self.data))
-
-    @property
-    def data(self):
-        return self._data
-
-    @property
-    def datatype(self):
-        if isinstance(self.data, torch.Tensor):
-            return self.data.type()
-        else:
-            return type(self.data)
-
-    @property
-    def stack(self):
-        return self._stack
-
-    @property
-    def padding_value(self):
-        return self._padding_value
-
-    @assert_tensor_type
-    def size(self, *args, **kwargs):
-        return self.data.size(*args, **kwargs)
-
-    @assert_tensor_type
-    def dim(self):
-        return self.data.dim()
diff --git a/mmdet/models/__init__.py b/mmdet/models/__init__.py
index 2209550509f71a71a66b2582440986eebcf3926c..aca6399e45e3e21c40d8e2470b233ac0d992888e 100644
--- a/mmdet/models/__init__.py
+++ b/mmdet/models/__init__.py
@@ -1 +1,10 @@
-from .detectors import Detector
+from .detectors import BaseDetector, RPN, FasterRCNN, MaskRCNN
+from .builder import (build_backbone, build_neck, build_rpn_head,
+                      build_roi_extractor, build_bbox_head, build_mask_head,
+                      build_detector)
+
+__all__ = [
+    'BaseDetector', 'RPN', 'FasterRCNN', 'MaskRCNN', 'build_backbone',
+    'build_neck', 'build_rpn_head', 'build_roi_extractor', 'build_bbox_head',
+    'build_mask_head', 'build_detector'
+]
diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py
index f9e21e83d1469167d35de22c6511f6c09c260727..107507ceaf6d1a36cafe07197cefd9693a13a49b 100644
--- a/mmdet/models/backbones/__init__.py
+++ b/mmdet/models/backbones/__init__.py
@@ -1 +1,3 @@
 from .resnet import resnet
+
+__all__ = ['resnet']
diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py
index f8203accd4b335886b7ebffd59517bdc8568769e..371f4f59feca466eca0040faeb1ae7de5e78800f 100644
--- a/mmdet/models/backbones/resnet.py
+++ b/mmdet/models/backbones/resnet.py
@@ -1,7 +1,9 @@
+import logging
 import math
+
 import torch.nn as nn
 import torch.utils.checkpoint as cp
-from torchpack import load_checkpoint
+from mmcv.runner import load_checkpoint
 
 
 def conv3x3(in_planes, out_planes, stride=1, dilation=1):
@@ -25,7 +27,7 @@ class BasicBlock(nn.Module):
                  stride=1,
                  dilation=1,
                  downsample=None,
-                 style='fb'):
+                 style='pytorch'):
         super(BasicBlock, self).__init__()
         self.conv1 = conv3x3(inplanes, planes, stride, dilation)
         self.bn1 = nn.BatchNorm2d(planes)
@@ -64,15 +66,16 @@ class Bottleneck(nn.Module):
                  stride=1,
                  dilation=1,
                  downsample=None,
-                 style='fb',
+                 style='pytorch',
                  with_cp=False):
-        """Bottleneck block
-        if style is "fb", the stride-two layer is the 3x3 conv layer,
-        if style is "msra", the stride-two layer is the first 1x1 conv layer
+        """Bottleneck block.
+
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer,
+        if it is "caffe", the stride-two layer is the first 1x1 conv layer.
         """
         super(Bottleneck, self).__init__()
-        assert style in ['fb', 'msra']
-        if style == 'fb':
+        assert style in ['pytorch', 'caffe']
+        if style == 'pytorch':
             conv1_stride = 1
             conv2_stride = stride
         else:
@@ -139,7 +142,7 @@ def make_res_layer(block,
                    blocks,
                    stride=1,
                    dilation=1,
-                   style='fb',
+                   style='pytorch',
                    with_cp=False):
     downsample = None
     if stride != 1 or inplanes != planes * block.expansion:
@@ -173,7 +176,12 @@ def make_res_layer(block,
 
 class ResHead(nn.Module):
 
-    def __init__(self, block, num_blocks, stride=2, dilation=1, style='fb'):
+    def __init__(self,
+                 block,
+                 num_blocks,
+                 stride=2,
+                 dilation=1,
+                 style='pytorch'):
         self.layer4 = make_res_layer(
             block,
             1024,
@@ -196,9 +204,10 @@ class ResNet(nn.Module):
                  dilations=(1, 1, 1, 1),
                  out_indices=(0, 1, 2, 3),
                  frozen_stages=-1,
-                 style='fb',
+                 style='pytorch',
                  sync_bn=False,
-                 with_cp=False):
+                 with_cp=False,
+                 strict_frozen=False):
         super(ResNet, self).__init__()
         if not len(layers) == len(strides) == len(dilations):
             raise ValueError(
@@ -234,14 +243,17 @@ class ResNet(nn.Module):
                 style=self.style,
                 with_cp=with_cp)
             self.inplanes = planes * block.expansion
-            setattr(self, layer_name, res_layer)
+            self.add_module(layer_name, res_layer)
             self.res_layers.append(layer_name)
         self.feat_dim = block.expansion * 64 * 2**(len(layers) - 1)
         self.with_cp = with_cp
 
+        self.strict_frozen = strict_frozen
+
     def init_weights(self, pretrained=None):
         if isinstance(pretrained, str):
-            load_checkpoint(self, pretrained, strict=False)
+            logger = logging.getLogger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
         elif pretrained is None:
             for m in self.modules():
                 if isinstance(m, nn.Conv2d):
@@ -275,6 +287,9 @@ class ResNet(nn.Module):
             for m in self.modules():
                 if isinstance(m, nn.BatchNorm2d):
                     m.eval()
+                    if self.strict_frozen:
+                        for params in m.parameters():
+                            params.requires_grad = False
         if mode and self.frozen_stages >= 0:
             for param in self.conv1.parameters():
                 param.requires_grad = False
@@ -305,9 +320,10 @@ def resnet(depth,
            dilations=(1, 1, 1, 1),
            out_indices=(2, ),
            frozen_stages=-1,
-           style='fb',
+           style='pytorch',
            sync_bn=False,
-           with_cp=False):
+           with_cp=False,
+           strict_frozen=False):
     """Constructs a ResNet model.
 
     Args:
@@ -321,5 +337,5 @@ def resnet(depth,
         raise KeyError('invalid depth {} for resnet'.format(depth))
     block, layers = resnet_cfg[depth]
     model = ResNet(block, layers[:num_stages], strides, dilations, out_indices,
-                   frozen_stages, style, sync_bn, with_cp)
+                   frozen_stages, style, sync_bn, with_cp, strict_frozen)
     return model
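
For reference, a hypothetical direct use of the backbone factory with the renamed style values; the keyword arguments below are illustrative, not taken from any particular config:

```python
from mmdet.models.backbones import resnet

backbone = resnet(
    depth=50,
    num_stages=4,
    out_indices=(0, 1, 2, 3),   # return the feature maps of all four stages
    frozen_stages=1,            # freeze the stem and the first stage
    style='pytorch')            # stride-2 sits in each bottleneck's 3x3 conv
backbone.init_weights(pretrained=None)  # random init; a checkpoint path
                                        # would go through load_checkpoint
```
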
diff --git a/mmdet/models/bbox_heads/bbox_head.py b/mmdet/models/bbox_heads/bbox_head.py
index 5f6e1136eed45abe85a710170e76e04cba0e91cf..67dba03959231b5ed0f784ac97542911b56cc785 100644
--- a/mmdet/models/bbox_heads/bbox_head.py
+++ b/mmdet/models/bbox_heads/bbox_head.py
@@ -1,7 +1,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from mmdet.core import (bbox_transform_inv, multiclass_nms, bbox_target,
+from mmdet.core import (delta2bbox, multiclass_nms, bbox_target,
                         weighted_cross_entropy, weighted_smoothl1, accuracy)
 
 
@@ -60,7 +60,7 @@ class BBoxHead(nn.Module):
         return cls_score, bbox_pred
 
     def get_bbox_target(self, pos_proposals, neg_proposals, pos_gt_bboxes,
-                    pos_gt_labels, rcnn_train_cfg):
+                        pos_gt_labels, rcnn_train_cfg):
         reg_num_classes = 1 if self.reg_class_agnostic else self.num_classes
         cls_reg_targets = bbox_target(
             pos_proposals,
@@ -85,7 +85,7 @@ class BBoxHead(nn.Module):
                 bbox_pred,
                 bbox_targets,
                 bbox_weights,
-                ave_factor=bbox_targets.size(0))
+                avg_factor=bbox_targets.size(0))
         return losses
 
     def get_det_bboxes(self,
@@ -101,15 +101,14 @@ class BBoxHead(nn.Module):
         scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
 
         if bbox_pred is not None:
-            bboxes = bbox_transform_inv(rois[:, 1:], bbox_pred,
-                                        self.target_means, self.target_stds,
-                                        img_shape)
+            bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means,
+                                self.target_stds, img_shape)
         else:
             bboxes = rois[:, 1:]
             # TODO: add clip here
 
         if rescale:
-            bboxes /= scale_factor.float()
+            bboxes /= scale_factor
 
         if nms_cfg is None:
             return bboxes, scores
diff --git a/mmdet/models/bbox_heads/convfc_bbox_head.py b/mmdet/models/bbox_heads/convfc_bbox_head.py
index 02e2a6b6d859e728a47f98fe857f1e71c2a6754a..f7bd7f80a9fc00bd3fc020ccd7d834eb45905067 100644
--- a/mmdet/models/bbox_heads/convfc_bbox_head.py
+++ b/mmdet/models/bbox_heads/convfc_bbox_head.py
@@ -43,17 +43,21 @@ class ConvFCRoIHead(BBoxHead):
         self.fc_out_channels = fc_out_channels
 
         # add shared convs and fcs
-        self.shared_convs, self.shared_fcs, last_layer_dim = self._add_conv_fc_branch(
-            self.num_shared_convs, self.num_shared_fcs, self.in_channels, True)
+        self.shared_convs, self.shared_fcs, last_layer_dim = \
+            self._add_conv_fc_branch(
+                self.num_shared_convs, self.num_shared_fcs, self.in_channels,
+                True)
         self.shared_out_channels = last_layer_dim
 
         # add cls specific branch
-        self.cls_convs, self.cls_fcs, self.cls_last_dim = self._add_conv_fc_branch(
-            self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
+        self.cls_convs, self.cls_fcs, self.cls_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
 
         # add reg specific branch
-        self.reg_convs, self.reg_fcs, self.reg_last_dim = self._add_conv_fc_branch(
-            self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
+        self.reg_convs, self.reg_fcs, self.reg_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
 
         if self.num_shared_fcs == 0 and not self.with_avg_pool:
             if self.num_cls_fcs == 0:
diff --git a/mmdet/models/builder.py b/mmdet/models/builder.py
index c3b058507fcdc461a9d3b0271858522e4ba0f1ce..bdf0ac3d16f9aadb194f944b3f7c4dd1a741e8cd 100644
--- a/mmdet/models/builder.py
+++ b/mmdet/models/builder.py
@@ -1,27 +1,26 @@
-import mmcv
-from mmcv import torchpack
+from mmcv.runner import obj_from_dict
 from torch import nn
 
 from . import (backbones, necks, roi_extractors, rpn_heads, bbox_heads,
-               mask_heads)
+               mask_heads, detectors)
 
 __all__ = [
     'build_backbone', 'build_neck', 'build_rpn_head', 'build_roi_extractor',
-    'build_bbox_head', 'build_mask_head'
+    'build_bbox_head', 'build_mask_head', 'build_detector'
 ]
 
 
-def _build_module(cfg, parrent=None):
-    return cfg if isinstance(cfg, nn.Module) else torchpack.obj_from_dict(
-        cfg, parrent)
+def _build_module(cfg, parent=None, default_args=None):
+    return cfg if isinstance(cfg, nn.Module) else obj_from_dict(
+        cfg, parent, default_args)
 
 
-def build(cfg, parrent=None):
+def build(cfg, parent=None, default_args=None):
     if isinstance(cfg, list):
-        modules = [_build_module(cfg_, parrent) for cfg_ in cfg]
+        modules = [_build_module(cfg_, parrent, default_args) for cfg_ in cfg]
         return nn.Sequential(*modules)
     else:
-        return _build_module(cfg, parrent)
+        return _build_module(cfg, parrent, default_args)
 
 
 def build_backbone(cfg):
@@ -46,3 +45,7 @@ def build_bbox_head(cfg):
 
 def build_mask_head(cfg):
     return build(cfg, mask_heads)
+
+
+def build_detector(cfg, train_cfg=None, test_cfg=None):
+    return build(cfg, detectors, dict(train_cfg=train_cfg, test_cfg=test_cfg))
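A minimal usage sketch of the new `build_detector` entry point, assuming `mmcv.Config` and that `mmdet.models` re-exports the builder functions, and that the config file defines `model`, `train_cfg` and `test_cfg` at the top level (illustrative only, not part of the diff):

```python
from mmcv import Config
from mmdet.models import build_detector  # assumes the package re-exports it

cfg = Config.fromfile('configs/faster_rcnn_r50_fpn_1x.py')
# train_cfg / test_cfg are forwarded as default_args, so obj_from_dict passes
# them to the detector constructor together with the fields in cfg.model.
model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
```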
diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py
index 5b690f8d77d6d8eae1adc4bf8b04d3dd3db3462a..b8914c1e5d3c834a1373b2a2e8360183a41de4da 100644
--- a/mmdet/models/detectors/__init__.py
+++ b/mmdet/models/detectors/__init__.py
@@ -1 +1,6 @@
-from .detector import Detector
+from .base import BaseDetector
+from .rpn import RPN
+from .faster_rcnn import FasterRCNN
+from .mask_rcnn import MaskRCNN
+
+__all__ = ['BaseDetector', 'RPN', 'FasterRCNN', 'MaskRCNN']
diff --git a/mmdet/models/detectors/base.py b/mmdet/models/detectors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1b0fce1283b012072e7fb1f864313135eeac940
--- /dev/null
+++ b/mmdet/models/detectors/base.py
@@ -0,0 +1,119 @@
+import logging
+from abc import ABCMeta, abstractmethod
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+
+from mmdet.core import tensor2imgs, get_classes
+
+
+class BaseDetector(nn.Module, metaclass=ABCMeta):
+    """Base class for detectors."""
+
+    def __init__(self):
+        super(BaseDetector, self).__init__()
+
+    @property
+    def with_neck(self):
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_bbox(self):
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_mask(self):
+        return hasattr(self, 'mask_head') and self.mask_head is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        pass
+
+    def extract_feats(self, imgs):
+        if isinstance(imgs, torch.Tensor):
+            return self.extract_feat(imgs)
+        elif isinstance(imgs, list):
+            for img in imgs:
+                yield self.extract_feat(img)
+
+    @abstractmethod
+    def forward_train(self, imgs, img_metas, **kwargs):
+        pass
+
+    @abstractmethod
+    def simple_test(self, img, img_meta, **kwargs):
+        pass
+
+    @abstractmethod
+    def aug_test(self, imgs, img_metas, **kwargs):
+        pass
+
+    def init_weights(self, pretrained=None):
+        if pretrained is not None:
+            logger = logging.getLogger()
+            logger.info('load model from: {}'.format(pretrained))
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(
+                'num of augmentations ({}) != num of image meta ({})'.format(
+                    len(imgs), len(img_metas)))
+        # TODO: remove the restriction of imgs_per_gpu == 1 when prepared
+        imgs_per_gpu = imgs[0].size(0)
+        assert imgs_per_gpu == 1
+
+        if num_augs == 1:
+            return self.simple_test(imgs[0], img_metas[0], **kwargs)
+        else:
+            return self.aug_test(imgs, img_metas, **kwargs)
+
+    def forward(self, img, img_meta, return_loss=True, **kwargs):
+        if return_loss:
+            return self.forward_train(img, img_meta, **kwargs)
+        else:
+            return self.forward_test(img, img_meta, **kwargs)
+
+    def show_result(self,
+                    data,
+                    result,
+                    img_norm_cfg,
+                    dataset='coco',
+                    score_thr=0.3):
+        img_tensor = data['img'][0]
+        img_metas = data['img_meta'][0].data[0]
+        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        assert len(imgs) == len(img_metas)
+
+        if isinstance(dataset, str):
+            class_names = get_classes(dataset)
+        elif isinstance(dataset, list):
+            class_names = dataset
+        else:
+            raise TypeError('dataset must be a valid dataset name or a list'
+                            ' of class names, not {}'.format(type(dataset)))
+
+        for img, img_meta in zip(imgs, img_metas):
+            h, w, _ = img_meta['img_shape']
+            img_show = img[:h, :w, :]
+            labels = [
+                np.full(bbox.shape[0], i, dtype=np.int32)
+                for i, bbox in enumerate(result)
+            ]
+            labels = np.concatenate(labels)
+            bboxes = np.vstack(result)
+            mmcv.imshow_det_bboxes(
+                img_show,
+                bboxes,
+                labels,
+                class_names=class_names,
+                score_thr=score_thr)
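`BaseDetector` fixes the contract that concrete detectors implement: `forward()` dispatches to `forward_train` when `return_loss=True` and otherwise to `forward_test`, which picks `simple_test` or `aug_test` depending on the number of augmentations. A toy subclass, purely illustrative, showing what a concrete detector must provide:

```python
# Minimal sketch of the interface above (not part of the diff).
import torch
from mmdet.models.detectors.base import BaseDetector


class IdentityDetector(BaseDetector):
    """Toy detector that treats the input image as its own feature map."""

    def extract_feat(self, imgs):
        return imgs

    def forward_train(self, imgs, img_metas, **kwargs):
        # training mode must return a dict of losses
        return dict(loss_dummy=imgs.float().mean())

    def simple_test(self, img, img_meta, **kwargs):
        return []

    def aug_test(self, imgs, img_metas, **kwargs):
        return []


model = IdentityDetector()
losses = model(torch.zeros(1, 3, 32, 32), [dict()], return_loss=True)
```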
diff --git a/mmdet/models/detectors/detector.py b/mmdet/models/detectors/detector.py
deleted file mode 100644
index 80b7d4438cb59612dbff8a2bf71930eb6383a144..0000000000000000000000000000000000000000
--- a/mmdet/models/detectors/detector.py
+++ /dev/null
@@ -1,348 +0,0 @@
-import torch
-import torch.nn as nn
-
-from .. import builder
-from mmdet.core import (bbox2roi, bbox_mapping, split_combined_gt_polys,
-                        bbox2result, multiclass_nms, merge_aug_proposals,
-                        merge_aug_bboxes, merge_aug_masks, sample_proposals)
-
-
-class Detector(nn.Module):
-    def __init__(self,
-                 backbone,
-                 neck=None,
-                 rpn_head=None,
-                 roi_block=None,
-                 bbox_head=None,
-                 mask_block=None,
-                 mask_head=None,
-                 rpn_train_cfg=None,
-                 rpn_test_cfg=None,
-                 rcnn_train_cfg=None,
-                 rcnn_test_cfg=None,
-                 pretrained=None):
-        super(Detector, self).__init__()
-        self.backbone = builder.build_backbone(backbone)
-
-        self.with_neck = True if neck is not None else False
-        if self.with_neck:
-            self.neck = builder.build_neck(neck)
-
-        self.with_rpn = True if rpn_head is not None else False
-        if self.with_rpn:
-            self.rpn_head = builder.build_rpn_head(rpn_head)
-            self.rpn_train_cfg = rpn_train_cfg
-            self.rpn_test_cfg = rpn_test_cfg
-
-        self.with_bbox = True if bbox_head is not None else False
-        if self.with_bbox:
-            self.bbox_roi_extractor = builder.build_roi_extractor(roi_block)
-            self.bbox_head = builder.build_bbox_head(bbox_head)
-            self.rcnn_train_cfg = rcnn_train_cfg
-            self.rcnn_test_cfg = rcnn_test_cfg
-
-        self.with_mask = True if mask_head is not None else False
-        if self.with_mask:
-            self.mask_roi_extractor = builder.build_roi_extractor(mask_block)
-            self.mask_head = builder.build_mask_head(mask_head)
-
-        self.init_weights(pretrained=pretrained)
-
-    def init_weights(self, pretrained=None):
-        if pretrained is not None:
-            print('load model from: {}'.format(pretrained))
-        self.backbone.init_weights(pretrained=pretrained)
-        if self.with_neck:
-            if isinstance(self.neck, nn.Sequential):
-                for m in self.neck:
-                    m.init_weights()
-            else:
-                self.neck.init_weights()
-        if self.with_rpn:
-            self.rpn_head.init_weights()
-        if self.with_bbox:
-            self.bbox_roi_extractor.init_weights()
-            self.bbox_head.init_weights()
-        if self.with_mask:
-            self.mask_roi_extractor.init_weights()
-            self.mask_head.init_weights()
-
-    def forward(self,
-                img,
-                img_meta,
-                gt_bboxes=None,
-                proposals=None,
-                gt_labels=None,
-                gt_bboxes_ignore=None,
-                gt_mask_polys=None,
-                gt_poly_lens=None,
-                num_polys_per_mask=None,
-                return_loss=True,
-                return_bboxes=True,
-                rescale=False):
-        assert proposals is not None or self.with_rpn, "Only one of proposals file and RPN can exist."
-
-        if not return_loss:
-            return self.test(img, img_meta, proposals, rescale)
-        else:
-            losses = dict()
-
-        img_shapes = img_meta['img_shape']
-        x = self.backbone(img)
-
-        if self.with_neck:
-            x = self.neck(x)
-
-        if self.with_rpn:
-            rpn_outs = self.rpn_head(x)
-            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_shapes,
-                                          self.rpn_train_cfg)
-            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
-            losses.update(rpn_losses)
-
-        if self.with_bbox:
-            if self.with_rpn:
-                proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg)
-                proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
-            else:
-                proposal_list = proposals
-
-            (pos_inds, neg_inds, pos_proposals, neg_proposals,
-             pos_assigned_gt_inds,
-             pos_gt_bboxes, pos_gt_labels) = sample_proposals(
-                 proposal_list, gt_bboxes, gt_bboxes_ignore, gt_labels,
-                 self.rcnn_train_cfg)
-
-            labels, label_weights, bbox_targets, bbox_weights = \
-                self.bbox_head.get_bbox_target(
-                    pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
-                    self.rcnn_train_cfg)
-
-            rois = bbox2roi([
-                torch.cat([pos, neg], dim=0)
-                for pos, neg in zip(pos_proposals, neg_proposals)
-            ])
-            # TODO: a more flexible way to configurate feat maps
-            roi_feats = self.bbox_roi_extractor(
-                x[:self.bbox_roi_extractor.num_inputs], rois)
-            cls_score, bbox_pred = self.bbox_head(roi_feats)
-
-            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels,
-                                            label_weights, bbox_targets,
-                                            bbox_weights)
-            losses.update(loss_bbox)
-
-        if self.with_mask:
-            gt_polys = split_combined_gt_polys(gt_mask_polys, gt_poly_lens,
-                                               num_polys_per_mask)
-            mask_targets = self.mask_head.get_mask_target(
-                pos_proposals, pos_assigned_gt_inds, gt_polys, img_meta,
-                self.rcnn_train_cfg)
-            pos_rois = bbox2roi(pos_proposals)
-            mask_feats = self.mask_roi_extractor(
-                x[:self.mask_roi_extractor.num_inputs], pos_rois)
-            mask_pred = self.mask_head(mask_feats)
-            losses['loss_mask'] = self.mask_head.loss(mask_pred, mask_targets,
-                                                      torch.cat(pos_gt_labels))
-        return losses
-
-    def test(self, imgs, img_metas, proposals=None, rescale=False):
-        """Test w/ or w/o augmentations."""
-        assert isinstance(imgs, list) and isinstance(img_metas, list)
-        assert len(imgs) == len(img_metas)
-        img_per_gpu = imgs[0].size(0)
-        assert img_per_gpu == 1
-        if len(imgs) == 1:
-            return self.simple_test(imgs[0], img_metas[0], proposals, rescale)
-        else:
-            return self.aug_test(imgs, img_metas, proposals, rescale)
-
-    def simple_test_rpn(self, x, img_meta):
-        img_shapes = img_meta['img_shape']
-        scale_factor = img_meta['scale_factor']
-        rpn_outs = self.rpn_head(x)
-        proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg)
-        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)[0]
-        return proposal_list
-
-    def simple_test_bboxes(self, x, img_meta, proposals, rescale=False):
-        """Test only det bboxes without augmentation."""
-        rois = bbox2roi(proposals)
-        roi_feats = self.bbox_roi_extractor(
-            x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
-        cls_score, bbox_pred = self.bbox_head(roi_feats)
-        # image shape of the first image in the batch (only one)
-        img_shape = img_meta['img_shape'][0]
-        scale_factor = img_meta['scale_factor']
-        det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
-            rois,
-            cls_score,
-            bbox_pred,
-            img_shape,
-            scale_factor,
-            rescale=rescale,
-            nms_cfg=self.rcnn_test_cfg)
-        return det_bboxes, det_labels
-
-    def simple_test_mask(self,
-                         x,
-                         img_meta,
-                         det_bboxes,
-                         det_labels,
-                         rescale=False):
-        # image shape of the first image in the batch (only one)
-        img_shape = img_meta['img_shape'][0]
-        scale_factor = img_meta['scale_factor']
-        if det_bboxes.shape[0] == 0:
-            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
-        else:
-            # if det_bboxes is rescaled to the original image size, we need to
-            # rescale it back to the testing scale to obtain RoIs.
-            _bboxes = (det_bboxes[:, :4] * scale_factor.float()
-                       if rescale else det_bboxes)
-            mask_rois = bbox2roi([_bboxes])
-            mask_feats = self.mask_roi_extractor(
-                x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
-            mask_pred = self.mask_head(mask_feats)
-            segm_result = self.mask_head.get_seg_masks(
-                mask_pred,
-                det_bboxes,
-                det_labels,
-                self.rcnn_test_cfg,
-                ori_scale=img_meta['ori_shape'])
-        return segm_result
-
-    def simple_test(self, img, img_meta, proposals=None, rescale=False):
-        """Test without augmentation."""
-        # get feature maps
-        x = self.backbone(img)
-        if self.with_neck:
-            x = self.neck(x)
-        if self.with_rpn:
-            proposals = self.simple_test_rpn(x, img_meta)
-        if self.with_bbox:
-            # BUG proposals shape?
-            det_bboxes, det_labels = self.simple_test_bboxes(
-                x, img_meta, [proposals], rescale=rescale)
-            bbox_result = bbox2result(det_bboxes, det_labels,
-                                      self.bbox_head.num_classes)
-            if not self.with_mask:
-                return bbox_result
-
-            segm_result = self.simple_test_mask(
-                x, img_meta, det_bboxes, det_labels, rescale=rescale)
-            return bbox_result, segm_result
-        else:
-            proposals[:, :4] /= img_meta['scale_factor'].float()
-            return proposals.cpu().numpy()
-
-    # TODO aug test haven't been verified
-    def aug_test_bboxes(self, imgs, img_metas):
-        """Test with augmentations for det bboxes."""
-        # step 1: get RPN proposals for augmented images, apply NMS to the
-        # union of all proposals.
-        aug_proposals = []
-        for img, img_meta in zip(imgs, img_metas):
-            x = self.backbone(img)
-            if self.neck is not None:
-                x = self.neck(x)
-            rpn_outs = self.rpn_head(x)
-            proposal_inputs = rpn_outs + (img_meta['shape_scale'],
-                                          self.rpn_test_cfg)
-            proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
-            assert len(proposal_list) == 1
-            aug_proposals.append(proposal_list[0])  # len(proposal_list) = 1
-        # after merging, proposals will be rescaled to the original image size
-        merged_proposals = merge_aug_proposals(aug_proposals, img_metas,
-                                               self.rpn_test_cfg)
-        # step 2: Given merged proposals, predict bboxes for augmented images,
-        # output the union of these bboxes.
-        aug_bboxes = []
-        aug_scores = []
-        for img, img_meta in zip(imgs, img_metas):
-            # only one image in the batch
-            img_shape = img_meta['shape_scale'][0]
-            flip = img_meta['flip'][0]
-            proposals = bbox_mapping(merged_proposals[:, :4], img_shape, flip)
-            rois = bbox2roi([proposals])
-            # recompute feature maps to save GPU memory
-            x = self.backbone(img)
-            if self.neck is not None:
-                x = self.neck(x)
-            roi_feats = self.bbox_roi_extractor(
-                x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
-            cls_score, bbox_pred = self.bbox_head(roi_feats)
-            bboxes, scores = self.bbox_head.get_det_bboxes(
-                rois,
-                cls_score,
-                bbox_pred,
-                img_shape,
-                rescale=False,
-                nms_cfg=None)
-            aug_bboxes.append(bboxes)
-            aug_scores.append(scores)
-        # after merging, bboxes will be rescaled to the original image size
-        merged_bboxes, merged_scores = merge_aug_bboxes(
-            aug_bboxes, aug_scores, img_metas, self.rcnn_test_cfg)
-        det_bboxes, det_labels = multiclass_nms(
-            merged_bboxes, merged_scores, self.rcnn_test_cfg.score_thr,
-            self.rcnn_test_cfg.nms_thr, self.rcnn_test_cfg.max_per_img)
-        return det_bboxes, det_labels
-
-    def aug_test_mask(self,
-                      imgs,
-                      img_metas,
-                      det_bboxes,
-                      det_labels,
-                      rescale=False):
-        # step 3: Given merged bboxes, predict masks for augmented images,
-        # scores of masks are averaged across augmented images.
-        if rescale:
-            _det_bboxes = det_bboxes
-        else:
-            _det_bboxes = det_bboxes.clone()
-            _det_bboxes[:, :4] *= img_metas[0]['shape_scale'][0][-1]
-        if det_bboxes.shape[0] == 0:
-            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
-        else:
-            aug_masks = []
-            for img, img_meta in zip(imgs, img_metas):
-                img_shape = img_meta['shape_scale'][0]
-                flip = img_meta['flip'][0]
-                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, flip)
-                mask_rois = bbox2roi([_bboxes])
-                x = self.backbone(img)
-                if self.neck is not None:
-                    x = self.neck(x)
-                mask_feats = self.mask_roi_extractor(
-                    x[:len(self.mask_roi_extractor.featmap_strides)],
-                    mask_rois)
-                mask_pred = self.mask_head(mask_feats)
-                # convert to numpy array to save memory
-                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
-            merged_masks = merge_aug_masks(aug_masks, img_metas,
-                                           self.rcnn_test_cfg)
-            segm_result = self.mask_head.get_seg_masks(
-                merged_masks, _det_bboxes, det_labels,
-                img_metas[0]['shape_scale'][0], self.rcnn_test_cfg, rescale)
-        return segm_result
-
-    def aug_test(self, imgs, img_metas, rescale=False):
-        """Test with augmentations.
-        If rescale is False, then returned bboxes and masks will fit the scale
-        if imgs[0].
-        """
-        # aug test det bboxes
-        det_bboxes, det_labels = self.aug_test_bboxes(imgs, img_metas)
-        if rescale:
-            _det_bboxes = det_bboxes
-        else:
-            _det_bboxes = det_bboxes.clone()
-            _det_bboxes[:, :4] *= img_metas[0]['shape_scale'][0][-1]
-        bbox_result = bbox2result(_det_bboxes, det_labels,
-                                  self.bbox_head.num_classes)
-        if not self.with_mask:
-            return bbox_result
-        segm_result = self.aug_test_mask(
-            imgs, img_metas, det_bboxes, det_labels, rescale=rescale)
-        return bbox_result, segm_result
diff --git a/mmdet/models/detectors/faster_rcnn.py b/mmdet/models/detectors/faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd31f60c1d819b6c7ba47a67ecb3285a46e09636
--- /dev/null
+++ b/mmdet/models/detectors/faster_rcnn.py
@@ -0,0 +1,23 @@
+from .two_stage import TwoStageDetector
+
+
+class FasterRCNN(TwoStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 bbox_roi_extractor,
+                 bbox_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(FasterRCNN, self).__init__(
+                    backbone=backbone,
+                    neck=neck,
+                    rpn_head=rpn_head,
+                    bbox_roi_extractor=bbox_roi_extractor,
+                    bbox_head=bbox_head,
+                    train_cfg=train_cfg,
+                    test_cfg=test_cfg,
+                    pretrained=pretrained)
diff --git a/mmdet/models/detectors/mask_rcnn.py b/mmdet/models/detectors/mask_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..25a363e398f6c0d01e2f8bd53e05c9046a5275ac
--- /dev/null
+++ b/mmdet/models/detectors/mask_rcnn.py
@@ -0,0 +1,34 @@
+from .two_stage import TwoStageDetector
+
+
+class MaskRCNN(TwoStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 bbox_roi_extractor,
+                 bbox_head,
+                 mask_roi_extractor,
+                 mask_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(MaskRCNN, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            bbox_head=bbox_head,
+            mask_roi_extractor=mask_roi_extractor,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained)
+
+    def show_result(self, data, result, img_norm_cfg, **kwargs):
+        # TODO: show segmentation masks
+        assert isinstance(result, tuple)
+        assert len(result) == 2  # (bbox_results, segm_results)
+        super(MaskRCNN, self).show_result(data, result[0], img_norm_cfg,
+                                          **kwargs)
diff --git a/mmdet/models/detectors/rpn.py b/mmdet/models/detectors/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d700fe3e3c3af357256b36f1582c6a8c7249580
--- /dev/null
+++ b/mmdet/models/detectors/rpn.py
@@ -0,0 +1,85 @@
+import mmcv
+
+from mmdet.core import tensor2imgs, bbox_mapping
+from .base import BaseDetector
+from .test_mixins import RPNTestMixin
+from .. import builder
+
+
+class RPN(BaseDetector, RPNTestMixin):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(RPN, self).__init__()
+        self.backbone = builder.build_backbone(backbone)
+        self.neck = builder.build_neck(neck) if neck is not None else None
+        self.rpn_head = builder.build_rpn_head(rpn_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.init_weights(pretrained=pretrained)
+
+    def init_weights(self, pretrained=None):
+        super(RPN, self).init_weights(pretrained)
+        self.backbone.init_weights(pretrained=pretrained)
+        if self.with_neck:
+            self.neck.init_weights()
+        self.rpn_head.init_weights()
+
+    def extract_feat(self, img):
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def forward_train(self, img, img_meta, gt_bboxes=None):
+        if self.train_cfg.rpn.get('debug', False):
+            self.rpn_head.debug_imgs = tensor2imgs(img)
+
+        x = self.extract_feat(img)
+        rpn_outs = self.rpn_head(x)
+
+        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn)
+        losses = self.rpn_head.loss(*rpn_loss_inputs)
+        return losses
+
+    def simple_test(self, img, img_meta, rescale=False):
+        x = self.extract_feat(img)
+        proposal_list = self.simple_test_rpn(x, img_meta, self.test_cfg.rpn)
+        if rescale:
+            for proposals, meta in zip(proposal_list, img_meta):
+                proposals[:, :4] /= meta['scale_factor']
+        # TODO: remove this restriction
+        return proposal_list[0].cpu().numpy()
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        proposal_list = self.aug_test_rpn(
+            self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
+        if not rescale:
+            for proposals, img_meta in zip(proposal_list, img_metas[0]):
+                img_shape = img_meta['img_shape']
+                scale_factor = img_meta['scale_factor']
+                flip = img_meta['flip']
+                proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape,
+                                                scale_factor, flip)
+        # TODO: remove this restriction
+        return proposal_list[0].cpu().numpy()
+
+    def show_result(self, data, result, img_norm_cfg):
+        """Show RPN proposals on the image.
+
+        Although we assume batch size is 1, this method supports arbitrary
+        batch size.
+        """
+        img_tensor = data['img'][0]
+        img_metas = data['img_meta'][0].data[0]
+        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        assert len(imgs) == len(img_metas)
+        for img, img_meta in zip(imgs, img_metas):
+            h, w, _ = img_meta['img_shape']
+            img_show = img[:h, :w, :]
+            mmcv.imshow_bboxes(img_show, result, top_k=20)
diff --git a/mmdet/models/detectors/test_mixins.py b/mmdet/models/detectors/test_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..77ba244f1a3fa107bfb6828110eaa344f4a0ba8a
--- /dev/null
+++ b/mmdet/models/detectors/test_mixins.py
@@ -0,0 +1,140 @@
+from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_proposals,
+                        merge_aug_bboxes, merge_aug_masks, multiclass_nms)
+
+
+class RPNTestMixin(object):
+
+    def simple_test_rpn(self, x, img_meta, rpn_test_cfg):
+        rpn_outs = self.rpn_head(x)
+        proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg)
+        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
+        return proposal_list
+
+    def aug_test_rpn(self, feats, img_metas, rpn_test_cfg):
+        imgs_per_gpu = len(img_metas[0])
+        aug_proposals = [[] for _ in range(imgs_per_gpu)]
+        for x, img_meta in zip(feats, img_metas):
+            proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg)
+            for i, proposals in enumerate(proposal_list):
+                aug_proposals[i].append(proposals)
+        # after merging, proposals will be rescaled to the original image size
+        merged_proposals = [
+            merge_aug_proposals(proposals, img_meta, rpn_test_cfg)
+            for proposals, img_meta in zip(aug_proposals, img_metas)
+        ]
+        return merged_proposals
+
+
+class BBoxTestMixin(object):
+
+    def simple_test_bboxes(self,
+                           x,
+                           img_meta,
+                           proposals,
+                           rcnn_test_cfg,
+                           rescale=False):
+        """Test only det bboxes without augmentation."""
+        rois = bbox2roi(proposals)
+        roi_feats = self.bbox_roi_extractor(
+            x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+        cls_score, bbox_pred = self.bbox_head(roi_feats)
+        img_shape = img_meta[0]['img_shape']
+        scale_factor = img_meta[0]['scale_factor']
+        det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
+            rois,
+            cls_score,
+            bbox_pred,
+            img_shape,
+            scale_factor,
+            rescale=rescale,
+            nms_cfg=rcnn_test_cfg)
+        return det_bboxes, det_labels
+
+    def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            # TODO more flexible
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, flip)
+            rois = bbox2roi([proposals])
+            # recompute feature maps to save GPU memory
+            roi_feats = self.bbox_roi_extractor(
+                x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+            cls_score, bbox_pred = self.bbox_head(roi_feats)
+            bboxes, scores = self.bbox_head.get_det_bboxes(
+                rois,
+                cls_score,
+                bbox_pred,
+                img_shape,
+                scale_factor,
+                rescale=False,
+                nms_cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, self.test_cfg.rcnn)
+        det_bboxes, det_labels = multiclass_nms(
+            merged_bboxes, merged_scores, self.test_cfg.rcnn.score_thr,
+            self.test_cfg.rcnn.nms_thr, self.test_cfg.rcnn.max_per_img)
+        return det_bboxes, det_labels
+
+
+class MaskTestMixin(object):
+
+    def simple_test_mask(self,
+                         x,
+                         img_meta,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        # image shape of the first image in the batch (only one)
+        ori_shape = img_meta[0]['ori_shape']
+        scale_factor = img_meta[0]['scale_factor']
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
+        else:
+            # if det_bboxes is rescaled to the original image size, we need to
+            # rescale it back to the testing scale to obtain RoIs.
+            _bboxes = (det_bboxes[:, :4] * scale_factor
+                       if rescale else det_bboxes)
+            mask_rois = bbox2roi([_bboxes])
+            mask_feats = self.mask_roi_extractor(
+                x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
+            mask_pred = self.mask_head(mask_feats)
+            segm_result = self.mask_head.get_seg_masks(
+                mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, ori_shape,
+                scale_factor, rescale)
+        return segm_result
+
+    def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
+        else:
+            aug_masks = []
+            for x, img_meta in zip(feats, img_metas):
+                img_shape = img_meta[0]['img_shape']
+                scale_factor = img_meta[0]['scale_factor']
+                flip = img_meta[0]['flip']
+                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                       scale_factor, flip)
+                mask_rois = bbox2roi([_bboxes])
+                mask_feats = self.mask_roi_extractor(
+                    x[:len(self.mask_roi_extractor.featmap_strides)],
+                    mask_rois)
+                mask_pred = self.mask_head(mask_feats)
+                # convert to numpy array to save memory
+                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
+            merged_masks = merge_aug_masks(aug_masks, img_metas,
+                                           self.test_cfg.rcnn)
+
+            ori_shape = img_metas[0][0]['ori_shape']
+            segm_result = self.mask_head.get_seg_masks(
+                merged_masks, det_bboxes, det_labels, self.test_cfg.rcnn,
+                ori_shape)
+        return segm_result
diff --git a/mmdet/models/detectors/two_stage.py b/mmdet/models/detectors/two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..8573d83215f120ba392a2f6b45cb9b6b93ca0519
--- /dev/null
+++ b/mmdet/models/detectors/two_stage.py
@@ -0,0 +1,190 @@
+import torch
+import torch.nn as nn
+
+from .base import BaseDetector
+from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
+from .. import builder
+from mmdet.core import sample_bboxes, bbox2roi, bbox2result, multi_apply
+
+
+class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
+                       MaskTestMixin):
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 rpn_head=None,
+                 bbox_roi_extractor=None,
+                 bbox_head=None,
+                 mask_roi_extractor=None,
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None):
+        super(TwoStageDetector, self).__init__()
+        self.backbone = builder.build_backbone(backbone)
+
+        if neck is not None:
+            self.neck = builder.build_neck(neck)
+        else:
+            raise NotImplementedError
+
+        if rpn_head is not None:
+            self.rpn_head = builder.build_rpn_head(rpn_head)
+
+        if bbox_head is not None:
+            self.bbox_roi_extractor = builder.build_roi_extractor(
+                bbox_roi_extractor)
+            self.bbox_head = builder.build_bbox_head(bbox_head)
+
+        if mask_head is not None:
+            self.mask_roi_extractor = builder.build_roi_extractor(
+                mask_roi_extractor)
+            self.mask_head = builder.build_mask_head(mask_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        self.init_weights(pretrained=pretrained)
+
+    @property
+    def with_rpn(self):
+        return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+    def init_weights(self, pretrained=None):
+        super(TwoStageDetector, self).init_weights(pretrained)
+        self.backbone.init_weights(pretrained=pretrained)
+        if self.with_neck:
+            if isinstance(self.neck, nn.Sequential):
+                for m in self.neck:
+                    m.init_weights()
+            else:
+                self.neck.init_weights()
+        if self.with_rpn:
+            self.rpn_head.init_weights()
+        if self.with_bbox:
+            self.bbox_roi_extractor.init_weights()
+            self.bbox_head.init_weights()
+
+    def extract_feat(self, img):
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def forward_train(self,
+                      img,
+                      img_meta,
+                      gt_bboxes,
+                      gt_bboxes_ignore,
+                      gt_labels,
+                      gt_masks=None,
+                      proposals=None):
+        losses = dict()
+
+        x = self.extract_feat(img)
+
+        if self.with_rpn:
+            rpn_outs = self.rpn_head(x)
+            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
+                                          self.train_cfg.rpn)
+            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
+            losses.update(rpn_losses)
+
+            proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
+            proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
+        else:
+            proposal_list = proposals
+
+        if self.with_bbox:
+            (pos_proposals, neg_proposals, pos_assigned_gt_inds, pos_gt_bboxes,
+             pos_gt_labels) = multi_apply(
+                 sample_bboxes,
+                 proposal_list,
+                 gt_bboxes,
+                 gt_bboxes_ignore,
+                 gt_labels,
+                 cfg=self.train_cfg.rcnn)
+            (labels, label_weights, bbox_targets,
+             bbox_weights) = self.bbox_head.get_bbox_target(
+                 pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
+                 self.train_cfg.rcnn)
+
+            rois = bbox2roi([
+                torch.cat([pos, neg], dim=0)
+                for pos, neg in zip(pos_proposals, neg_proposals)
+            ])
+            # TODO: a more flexible way to configure feat maps
+            roi_feats = self.bbox_roi_extractor(
+                x[:self.bbox_roi_extractor.num_inputs], rois)
+            cls_score, bbox_pred = self.bbox_head(roi_feats)
+
+            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels,
+                                            label_weights, bbox_targets,
+                                            bbox_weights)
+            losses.update(loss_bbox)
+
+        if self.with_mask:
+            mask_targets = self.mask_head.get_mask_target(
+                pos_proposals, pos_assigned_gt_inds, gt_masks,
+                self.train_cfg.rcnn)
+            pos_rois = bbox2roi(pos_proposals)
+            mask_feats = self.mask_roi_extractor(
+                x[:self.mask_roi_extractor.num_inputs], pos_rois)
+            mask_pred = self.mask_head(mask_feats)
+            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
+                                            torch.cat(pos_gt_labels))
+            losses.update(loss_mask)
+
+        return losses
+
+    def simple_test(self, img, img_meta, proposals=None, rescale=False):
+        """Test without augmentation."""
+        assert proposals is None, "Fast RCNN hasn't been implemented."
+        assert self.with_bbox, "Bbox head must be implemented."
+
+        x = self.extract_feat(img)
+
+        proposal_list = self.simple_test_rpn(
+            x, img_meta, self.test_cfg.rpn) if proposals is None else proposals
+
+        det_bboxes, det_labels = self.simple_test_bboxes(
+            x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale)
+        bbox_results = bbox2result(det_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+
+        if not self.with_mask:
+            return bbox_results
+        else:
+            segm_results = self.simple_test_mask(
+                x, img_meta, det_bboxes, det_labels, rescale=rescale)
+            return bbox_results, segm_results
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0].
+        """
+        # recompute feats to save memory
+        proposal_list = self.aug_test_rpn(
+            self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
+        det_bboxes, det_labels = self.aug_test_bboxes(
+            self.extract_feats(imgs), img_metas, proposal_list,
+            self.test_cfg.rcnn)
+
+        if rescale:
+            _det_bboxes = det_bboxes
+        else:
+            _det_bboxes = det_bboxes.clone()
+            _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']
+        bbox_results = bbox2result(_det_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+
+        # det_bboxes always keep the original scale
+        if self.with_mask:
+            segm_results = self.aug_test_mask(
+                self.extract_feats(imgs), img_metas, det_bboxes, det_labels)
+            return bbox_results, segm_results
+        else:
+            return bbox_results
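`forward_train` above returns a single dict that mixes RPN losses (lists with one entry per feature level) and R-CNN losses. A hedged sketch of how a training loop might reduce that dict to one scalar; the key names and the summing policy are assumptions, and the actual loop is driven by mmcv's runner rather than this snippet:

```python
# Hedged sketch: reducing the losses dict from forward_train for backward().
import torch


def parse_losses(losses):
    log_vars = {}
    for name, value in losses.items():
        if isinstance(value, (list, tuple)):   # e.g. per-level RPN losses
            value = sum(v.mean() for v in value)
        else:
            value = value.mean()
        log_vars[name] = value
    total_loss = sum(v for k, v in log_vars.items() if 'loss' in k)
    return total_loss, {k: v.item() for k, v in log_vars.items()}
```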
diff --git a/mmdet/models/mask_heads/fcn_mask_head.py b/mmdet/models/mask_heads/fcn_mask_head.py
index 016c05204bdc4533f7cca438666aa011f5ceb56d..ba46bea77e16115378f5b8d36626e3097943bd75 100644
--- a/mmdet/models/mask_heads/fcn_mask_head.py
+++ b/mmdet/models/mask_heads/fcn_mask_head.py
@@ -87,18 +87,21 @@ class FCNMaskHead(nn.Module):
         return mask_pred
 
     def get_mask_target(self, pos_proposals, pos_assigned_gt_inds, gt_masks,
-                     img_meta, rcnn_train_cfg):
+                        rcnn_train_cfg):
         mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
-                                   gt_masks, img_meta, rcnn_train_cfg)
+                                   gt_masks, rcnn_train_cfg)
         return mask_targets
 
     def loss(self, mask_pred, mask_targets, labels):
+        loss = dict()
         loss_mask = mask_cross_entropy(mask_pred, mask_targets, labels)
-        return loss_mask
+        loss['loss_mask'] = loss_mask
+        return loss
 
     def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
-                      ori_scale):
-        """Get segmentation masks from mask_pred and bboxes
+                      ori_shape, scale_factor, rescale):
+        """Get segmentation masks from mask_pred and bboxes.
+
         Args:
             mask_pred (Tensor or ndarray): shape (n, #class+1, h, w).
                 For single-scale testing, mask_pred is the direct output of
@@ -108,40 +111,44 @@ class FCNMaskHead(nn.Module):
             det_labels (Tensor): shape (n, )
             img_shape (Tensor): shape (3, )
             rcnn_test_cfg (dict): rcnn testing config
-            rescale (bool): whether rescale masks to original image size
+            ori_shape (tuple): original image shape
+            scale_factor (float): image scale factor used in testing
+            rescale (bool): whether to rescale masks to the original image size
+
         Returns:
             list[list]: encoded masks
         """
         if isinstance(mask_pred, torch.Tensor):
             mask_pred = mask_pred.sigmoid().cpu().numpy()
         assert isinstance(mask_pred, np.ndarray)
+
         cls_segms = [[] for _ in range(self.num_classes - 1)]
         bboxes = det_bboxes.cpu().numpy()[:, :4]
         labels = det_labels.cpu().numpy() + 1
-        img_h = ori_scale[0]
-        img_w = ori_scale[1]
+
+        if rescale:
+            img_h, img_w = ori_shape[:2]
+        else:
+            img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
+            img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
+            scale_factor = 1.0
 
         for i in range(bboxes.shape[0]):
-            bbox = bboxes[i, :].astype(int)
+            bbox = (bboxes[i, :] / scale_factor).astype(np.int32)
             label = labels[i]
-            w = bbox[2] - bbox[0] + 1
-            h = bbox[3] - bbox[1] + 1
-            w = max(w, 1)
-            h = max(h, 1)
+            w = max(bbox[2] - bbox[0] + 1, 1)
+            h = max(bbox[3] - bbox[1] + 1, 1)
 
             if not self.class_agnostic:
                 mask_pred_ = mask_pred[i, label, :, :]
             else:
                 mask_pred_ = mask_pred[i, 0, :, :]
+            im_mask = np.zeros((img_h, img_w), dtype=np.uint8)
 
-            im_mask = np.zeros((img_h, img_w), dtype=np.float32)
-
-            im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = mmcv.imresize(
-                mask_pred_, (w, h))
-            # im_mask = cv2.resize(im_mask, (img_w, img_h))
-            im_mask = np.array(
-                im_mask > rcnn_test_cfg.mask_thr_binary, dtype=np.uint8)
+            bbox_mask = mmcv.imresize(mask_pred_, (w, h))
+            bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype(
+                np.uint8)
+            im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask
             rle = mask_util.encode(
                 np.array(im_mask[:, :, np.newaxis], order='F'))[0]
             cls_segms[label - 1].append(rle)
+
         return cls_segms
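The `rescale` branch above only decides the size of the canvas that masks are pasted onto and whether the boxes still need to be divided by `scale_factor`. A worked example with made-up numbers:

```python
# Worked example of the canvas-size branch (values are illustrative).
import numpy as np

ori_shape, scale_factor = (800, 600, 3), 1.5

# rescale=True: paste masks on the original 800x600 canvas,
# and bboxes are divided by scale_factor inside the loop.
img_h, img_w = ori_shape[:2]

# rescale=False: the canvas stays at the testing scale,
# and scale_factor is reset to 1.0 so bboxes are used as-is.
img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)   # 1200
img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)   # 900
```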
diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py
index 8b5b49826bad94ce00379e60bbafc905b0cba9af..6a256cae3647bcafa54ee2671cb7167f75fc9f95 100644
--- a/mmdet/models/necks/fpn.py
+++ b/mmdet/models/necks/fpn.py
@@ -101,7 +101,7 @@ class FPN(nn.Module):
         # build top-down path
         used_backbone_levels = len(laterals)
         for i in range(used_backbone_levels - 1, 0, -1):
-            laterals[i - 1] += F.upsample(
+            laterals[i - 1] += F.interpolate(
                 laterals[i], scale_factor=2, mode='nearest')
 
         # build outputs
@@ -111,7 +111,8 @@ class FPN(nn.Module):
         ]
         # part 2: add extra levels
         if self.num_outs > len(outs):
-            # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
             if not self.add_extra_convs:
                 for i in range(self.num_outs - used_backbone_levels):
                     outs.append(F.max_pool2d(outs[-1], 1, stride=2))
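`F.upsample` is deprecated in recent PyTorch releases; `F.interpolate` is the drop-in replacement used here, with identical arguments for nearest-neighbour upsampling:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 256, 16, 16)
y = F.interpolate(x, scale_factor=2, mode='nearest')  # shape (1, 256, 32, 32)
```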
diff --git a/mmdet/models/roi_extractors/__init__.py b/mmdet/models/roi_extractors/__init__.py
index e76e689753f10e87b3f6d9482e880b902f9b747e..9161708ce13fa4f0a6bb188e82a19a163b9b7e4f 100644
--- a/mmdet/models/roi_extractors/__init__.py
+++ b/mmdet/models/roi_extractors/__init__.py
@@ -1,3 +1,3 @@
-from .single_level import SingleLevelRoI
+from .single_level import SingleRoIExtractor
 
-__all__ = ['SingleLevelRoI']
+__all__ = ['SingleRoIExtractor']
diff --git a/mmdet/models/roi_extractors/single_level.py b/mmdet/models/roi_extractors/single_level.py
index 3e37ac83d6ffb7beab56926329f71311f7eef116..3f97a631f987104422f65110a2cb6b49e080de0e 100644
--- a/mmdet/models/roi_extractors/single_level.py
+++ b/mmdet/models/roi_extractors/single_level.py
@@ -6,16 +6,25 @@ import torch.nn as nn
 from mmdet import ops
 
 
-class SingleLevelRoI(nn.Module):
-    """Extract RoI features from a single level feature map. Each RoI is
-    mapped to a level according to its scale."""
+class SingleRoIExtractor(nn.Module):
+    """Extract RoI features from a single level feature map.
+
+    If there are multiple input feature levels, each RoI is mapped to a level
+    according to its scale.
+
+    Args:
+        roi_layer (dict): Specify RoI layer type and arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (list[int]): Strides of input feature maps.
+        finest_scale (int): Scale threshold of mapping to level 0.
+    """
 
     def __init__(self,
                  roi_layer,
                  out_channels,
                  featmap_strides,
                  finest_scale=56):
-        super(SingleLevelRoI, self).__init__()
+        super(SingleRoIExtractor, self).__init__()
         self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
         self.out_channels = out_channels
         self.featmap_strides = featmap_strides
@@ -23,6 +32,7 @@ class SingleLevelRoI(nn.Module):
 
     @property
     def num_inputs(self):
+        """int: Input feature map levels."""
         return len(self.featmap_strides)
 
     def init_weights(self):
@@ -38,12 +48,19 @@ class SingleLevelRoI(nn.Module):
         return roi_layers
 
     def map_roi_levels(self, rois, num_levels):
-        """Map rois to corresponding feature levels (0-based) by scales.
+        """Map rois to corresponding feature levels by scales.
+
+        - scale < finest_scale: level 0
+        - finest_scale <= scale < finest_scale * 2: level 1
+        - finest_scale * 2 <= scale < finest_scale * 4: level 2
+        - scale >= finest_scale * 4: level 3
 
-        scale < finest_scale: level 0
-        finest_scale <= scale < finest_scale * 2: level 1
-        finest_scale * 2 <= scale < finest_scale * 4: level 2
-        scale >= finest_scale * 4: level 3
+        Args:
+            rois (Tensor): Input RoIs, shape (k, 5).
+            num_levels (int): Total level number.
+
+        Returns:
+            Tensor: Level index (0-based) of each RoI, shape (k, )
         """
         scale = torch.sqrt(
             (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1))
@@ -52,10 +69,6 @@ class SingleLevelRoI(nn.Module):
         return target_lvls
 
     def forward(self, feats, rois):
-        """Extract roi features with the roi layer. If multiple feature levels
-        are used, then rois are mapped to corresponding levels according to
-        their scales.
-        """
         if len(feats) == 1:
             return self.roi_layers[0](feats[0], rois)
 
diff --git a/mmdet/models/rpn_heads/rpn_head.py b/mmdet/models/rpn_heads/rpn_head.py
index 7ffd441f694b5d6c37d3042bb25088f27b002ea9..e67d7ae973f05c60c8e226009cfb4234c0894f69 100644
--- a/mmdet/models/rpn_heads/rpn_head.py
+++ b/mmdet/models/rpn_heads/rpn_head.py
@@ -5,20 +5,36 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from mmdet.core import (AnchorGenerator, anchor_target, bbox_transform_inv,
-                        weighted_cross_entropy, weighted_smoothl1,
+from mmdet.core import (AnchorGenerator, anchor_target, delta2bbox,
+                        multi_apply, weighted_cross_entropy, weighted_smoothl1,
                         weighted_binary_cross_entropy)
 from mmdet.ops import nms
-from ..utils import multi_apply
 from ..utils import normal_init
 
 
 class RPNHead(nn.Module):
+    """Network head of RPN.
+
+                                  / - rpn_cls (1x1 conv)
+    input - rpn_conv (3x3 conv) -
+                                  \\ - rpn_reg (1x1 conv)
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for the RPN feature map.
+        anchor_scales (Iterable): Anchor scales.
+        anchor_ratios (Iterable): Anchor aspect ratios.
+        anchor_strides (Iterable): Anchor strides.
+        anchor_base_sizes (Iterable): Anchor base sizes.
+        target_means (Iterable): Mean values of regression targets.
+        target_stds (Iterable): Std values of regression targets.
+        use_sigmoid_cls (bool): Whether to use sigmoid loss for classification.
+            (softmax by default)
+    """
 
     def __init__(self,
                  in_channels,
-                 feat_channels=512,
-                 coarsest_stride=32,
+                 feat_channels=256,
                  anchor_scales=[8, 16, 32],
                  anchor_ratios=[0.5, 1.0, 2.0],
                  anchor_strides=[4, 8, 16, 32, 64],
@@ -29,7 +45,6 @@ class RPNHead(nn.Module):
         super(RPNHead, self).__init__()
         self.in_channels = in_channels
         self.feat_channels = feat_channels
-        self.coarsest_stride = coarsest_stride
         self.anchor_scales = anchor_scales
         self.anchor_ratios = anchor_ratios
         self.anchor_strides = anchor_strides
@@ -66,63 +81,63 @@ class RPNHead(nn.Module):
     def forward(self, feats):
         return multi_apply(self.forward_single, feats)
 
-    def get_anchors(self, featmap_sizes, img_shapes):
-        """Get anchors given a list of feature map sizes, and get valid flags
-        at the same time. (Extra padding regions should be marked as invalid)
+    def get_anchors(self, featmap_sizes, img_metas):
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+
+        Returns:
+            tuple: anchors of each image, valid flags of each image
         """
-        # calculate actual image shapes
-        padded_img_shapes = []
-        for img_shape in img_shapes:
-            h, w = img_shape[:2]
-            padded_h = int(
-                np.ceil(h / self.coarsest_stride) * self.coarsest_stride)
-            padded_w = int(
-                np.ceil(w / self.coarsest_stride) * self.coarsest_stride)
-            padded_img_shapes.append((padded_h, padded_w))
-        # generate anchors for different feature levels
-        # len = feature levels
-        anchor_list = []
-        # len = imgs per gpu
-        valid_flag_list = [[] for _ in range(len(img_shapes))]
-        for i in range(len(featmap_sizes)):
-            anchor_stride = self.anchor_strides[i]
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # anchors for one time
+        multi_level_anchors = []
+        for i in range(num_levels):
             anchors = self.anchor_generators[i].grid_anchors(
-                featmap_sizes[i], anchor_stride)
-            anchor_list.append(anchors)
-            # for each image in this feature level, get valid flags
-            featmap_size = featmap_sizes[i]
-            for img_id, (h, w) in enumerate(padded_img_shapes):
-                valid_feat_h = min(
-                    int(np.ceil(h / anchor_stride)), featmap_size[0])
-                valid_feat_w = min(
-                    int(np.ceil(w / anchor_stride)), featmap_size[1])
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_anchors.append(anchors)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = []
+            for i in range(num_levels):
+                anchor_stride = self.anchor_strides[i]
+                feat_h, feat_w = featmap_sizes[i]
+                h, w, _ = img_meta['pad_shape']
+                valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h)
+                valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w)
                 flags = self.anchor_generators[i].valid_flags(
-                    featmap_size, (valid_feat_h, valid_feat_w))
-                valid_flag_list[img_id].append(flags)
+                    (feat_h, feat_w), (valid_feat_h, valid_feat_w))
+                multi_level_flags.append(flags)
+            valid_flag_list.append(multi_level_flags)
+
         return anchor_list, valid_flag_list
 
     def loss_single(self, rpn_cls_score, rpn_bbox_pred, labels, label_weights,
                     bbox_targets, bbox_weights, num_total_samples, cfg):
+        # classification loss
         labels = labels.contiguous().view(-1)
         label_weights = label_weights.contiguous().view(-1)
-        bbox_targets = bbox_targets.contiguous().view(-1, 4)
-        bbox_weights = bbox_weights.contiguous().view(-1, 4)
         if self.use_sigmoid_cls:
             rpn_cls_score = rpn_cls_score.permute(0, 2, 3,
                                                   1).contiguous().view(-1)
-            loss_cls = weighted_binary_cross_entropy(
-                rpn_cls_score,
-                labels,
-                label_weights,
-                ave_factor=num_total_samples)
+            criterion = weighted_binary_cross_entropy
         else:
             rpn_cls_score = rpn_cls_score.permute(0, 2, 3,
                                                   1).contiguous().view(-1, 2)
-            loss_cls = weighted_cross_entropy(
-                rpn_cls_score,
-                labels,
-                label_weights,
-                ave_factor=num_total_samples)
+            criterion = weighted_cross_entropy
+        loss_cls = criterion(
+            rpn_cls_score, labels, label_weights, avg_factor=num_total_samples)
+        # regression loss
+        bbox_targets = bbox_targets.contiguous().view(-1, 4)
+        bbox_weights = bbox_weights.contiguous().view(-1, 4)
         rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(
             -1, 4)
         loss_reg = weighted_smoothl1(
@@ -130,7 +145,7 @@ class RPNHead(nn.Module):
             bbox_targets,
             bbox_weights,
             beta=cfg.smoothl1_beta,
-            ave_factor=num_total_samples)
+            avg_factor=num_total_samples)
         return loss_cls, loss_reg
 
     def loss(self, rpn_cls_scores, rpn_bbox_preds, gt_bboxes, img_shapes, cfg):
@@ -140,7 +155,7 @@ class RPNHead(nn.Module):
         anchor_list, valid_flag_list = self.get_anchors(
             featmap_sizes, img_shapes)
         cls_reg_targets = anchor_target(
-            anchor_list, valid_flag_list, featmap_sizes, gt_bboxes, img_shapes,
+            anchor_list, valid_flag_list, gt_bboxes, img_shapes,
             self.target_means, self.target_stds, cfg)
         if cls_reg_targets is None:
             return None
@@ -158,8 +173,8 @@ class RPNHead(nn.Module):
             cfg=cfg)
         return dict(loss_rpn_cls=losses_cls, loss_rpn_reg=losses_reg)
 
-    def get_proposals(self, rpn_cls_scores, rpn_bbox_preds, img_shapes, cfg):
-        img_per_gpu = len(img_shapes)
+    def get_proposals(self, rpn_cls_scores, rpn_bbox_preds, img_meta, cfg):
+        num_imgs = len(img_meta)
         featmap_sizes = [featmap.size()[-2:] for featmap in rpn_cls_scores]
         mlvl_anchors = [
             self.anchor_generators[idx].grid_anchors(featmap_sizes[idx],
@@ -167,7 +182,7 @@ class RPNHead(nn.Module):
             for idx in range(len(featmap_sizes))
         ]
         proposal_list = []
-        for img_id in range(img_per_gpu):
+        for img_id in range(num_imgs):
             rpn_cls_score_list = [
                 rpn_cls_scores[idx][img_id].detach()
                 for idx in range(len(rpn_cls_scores))
@@ -177,10 +192,9 @@ class RPNHead(nn.Module):
                 for idx in range(len(rpn_bbox_preds))
             ]
             assert len(rpn_cls_score_list) == len(rpn_bbox_pred_list)
-            img_shape = img_shapes[img_id]
             proposals = self._get_proposals_single(
                 rpn_cls_score_list, rpn_bbox_pred_list, mlvl_anchors,
-                img_shape, cfg)
+                img_meta[img_id]['img_shape'], cfg)
             proposal_list.append(proposals)
         return proposal_list
 
@@ -195,7 +209,7 @@ class RPNHead(nn.Module):
             if self.use_sigmoid_cls:
                 rpn_cls_score = rpn_cls_score.permute(1, 2,
                                                       0).contiguous().view(-1)
-                rpn_cls_prob = F.sigmoid(rpn_cls_score)
+                rpn_cls_prob = rpn_cls_score.sigmoid()
                 scores = rpn_cls_prob
             else:
                 rpn_cls_score = rpn_cls_score.permute(1, 2,
@@ -211,9 +225,8 @@ class RPNHead(nn.Module):
                 rpn_bbox_pred = rpn_bbox_pred[order, :]
                 anchors = anchors[order, :]
                 scores = scores[order]
-            proposals = bbox_transform_inv(anchors, rpn_bbox_pred,
-                                           self.target_means, self.target_stds,
-                                           img_shape)
+            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
+                                   self.target_stds, img_shape)
             w = proposals[:, 2] - proposals[:, 0] + 1
             h = proposals[:, 3] - proposals[:, 1] + 1
             valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
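For context on the hunks above: `ave_factor` is renamed to `avg_factor`, and both classification branches now share a single call through a locally selected `criterion`. The averaging convention divides the summed, weighted per-anchor loss by the number of sampled anchors rather than by the tensor size. A minimal sketch of that convention, assuming plain binary cross-entropy (the function name and signature below are illustrative, not the library's API):

```python
import torch.nn.functional as F

def weighted_bce_sketch(pred, label, weight, avg_factor):
    # Compute per-anchor losses, zero out ignored anchors via `weight`,
    # then normalize by the number of sampled anchors (avg_factor)
    # instead of by pred.numel().
    loss = F.binary_cross_entropy_with_logits(
        pred, label.float(), reduction='none')
    return (loss * weight).sum() / avg_factor
```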
diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py
index f11af964480456cce144172591bd0b94f3ed7ad7..c759ca9aba1a07d983ae3a0d0305faab910b17a5 100644
--- a/mmdet/models/utils/__init__.py
+++ b/mmdet/models/utils/__init__.py
@@ -1,6 +1,8 @@
 from .conv_module import ConvModule
 from .norm import build_norm_layer
-from .misc import *
-from .weight_init import *
+from .weight_init import xavier_init, normal_init, uniform_init, kaiming_init
 
-__all__ = ['ConvModule', 'build_norm_layer']
+__all__ = [
+    'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init',
+    'uniform_init', 'kaiming_init'
+]
diff --git a/mmdet/models/utils/misc.py b/mmdet/models/utils/misc.py
deleted file mode 100644
index ad52b587ac126ed2cfbf5e2ed5c98356e1499c5f..0000000000000000000000000000000000000000
--- a/mmdet/models/utils/misc.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from functools import partial
-
-from six.moves import map, zip
-
-
-def multi_apply(func, *args, **kwargs):
-    pfunc = partial(func, **kwargs) if kwargs else func
-    map_results = map(pfunc, *args)
-    return tuple(map(list, zip(*map_results)))
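The removed `multi_apply` helper maps a function over parallel lists of per-level arguments and regroups the per-call result tuples into a tuple of lists. A self-contained example of that behaviour, reusing the deleted implementation (`square_and_cube` is purely illustrative):

```python
from functools import partial

def multi_apply(func, *args, **kwargs):
    # Apply `func` element-wise over the zipped argument lists, then
    # transpose the list of result tuples into a tuple of result lists.
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))

def square_and_cube(x):
    return x ** 2, x ** 3

squares, cubes = multi_apply(square_and_cube, [1, 2, 3])
assert squares == [1, 4, 9] and cubes == [1, 8, 27]
```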
diff --git a/mmdet/nn/__init__.py b/mmdet/nn/__init__.py
deleted file mode 100644
index 1b627f5e7b807b1c6ae321c775c8fc8d03266238..0000000000000000000000000000000000000000
--- a/mmdet/nn/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .parallel import MMDataParallel, MMDistributedDataParallel
diff --git a/mmdet/nn/parallel/__init__.py b/mmdet/nn/parallel/__init__.py
deleted file mode 100644
index 0ea0a58e4a53737372b7995f3f9d570cba50dddb..0000000000000000000000000000000000000000
--- a/mmdet/nn/parallel/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .data_parallel import MMDataParallel
-from .distributed import MMDistributedDataParallel
-from .scatter_gather import scatter, scatter_kwargs
-
-__all__ = [
-    'MMDataParallel', 'MMDistributedDataParallel', 'scatter', 'scatter_kwargs'
-]
diff --git a/mmdet/nn/parallel/_functions.py b/mmdet/nn/parallel/_functions.py
deleted file mode 100644
index 75bb954dce440f7634c47d4a021360df53f3509e..0000000000000000000000000000000000000000
--- a/mmdet/nn/parallel/_functions.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import torch
-from torch.nn.parallel._functions import _get_stream
-
-
-def scatter(input, devices, streams=None):
-    """Scatters tensor across multiple GPUs.
-    """
-    if streams is None:
-        streams = [None] * len(devices)
-
-    if isinstance(input, list):
-        chunk_size = (len(input) - 1) // len(devices) + 1
-        outputs = [
-            scatter(input[i], [devices[i // chunk_size]],
-                    [streams[i // chunk_size]]) for i in range(len(input))
-        ]
-        return outputs
-    elif isinstance(input, torch.Tensor):
-        output = input.contiguous()
-        # TODO: copy to a pinned buffer first (if copying from CPU)
-        stream = streams[0] if output.numel() > 0 else None
-        with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
-            output = output.cuda(devices[0], non_blocking=True)
-        return output
-    else:
-        raise Exception('Unknown type {}.'.format(type(input)))
-
-
-def synchronize_stream(output, devices, streams):
-    if isinstance(output, list):
-        chunk_size = len(output) // len(devices)
-        for i in range(len(devices)):
-            for j in range(chunk_size):
-                synchronize_stream(output[i * chunk_size + j], [devices[i]],
-                                   [streams[i]])
-    elif isinstance(output, torch.Tensor):
-        if output.numel() != 0:
-            with torch.cuda.device(devices[0]):
-                main_stream = torch.cuda.current_stream()
-                main_stream.wait_stream(streams[0])
-                output.record_stream(main_stream)
-    else:
-        raise Exception('Unknown type {}.'.format(type(output)))
-
-
-def get_input_device(input):
-    if isinstance(input, list):
-        for item in input:
-            input_device = get_input_device(item)
-            if input_device != -1:
-                return input_device
-        return -1
-    elif isinstance(input, torch.Tensor):
-        return input.get_device() if input.is_cuda else -1
-    else:
-        raise Exception('Unknown type {}.'.format(type(input)))
-
-
-class Scatter(object):
-
-    @staticmethod
-    def forward(target_gpus, input):
-        input_device = get_input_device(input)
-        streams = None
-        if input_device == -1:
-            # Perform CPU to GPU copies in a background stream
-            streams = [_get_stream(device) for device in target_gpus]
-
-        outputs = scatter(input, target_gpus, streams)
-        # Synchronize with the copy stream
-        if streams is not None:
-            synchronize_stream(outputs, target_gpus, streams)
-
-        return tuple(outputs)
diff --git a/mmdet/nn/parallel/data_parallel.py b/mmdet/nn/parallel/data_parallel.py
deleted file mode 100644
index 6735cb4afb7b512c5e9f757e962612ad1073ae12..0000000000000000000000000000000000000000
--- a/mmdet/nn/parallel/data_parallel.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from torch.nn.parallel import DataParallel
-
-from .scatter_gather import scatter_kwargs
-
-
-class MMDataParallel(DataParallel):
-
-    def scatter(self, inputs, kwargs, device_ids):
-        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmdet/nn/parallel/distributed.py b/mmdet/nn/parallel/distributed.py
deleted file mode 100644
index 2809778ad93951650677a546b57190cb7659302d..0000000000000000000000000000000000000000
--- a/mmdet/nn/parallel/distributed.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from torch.nn.parallel import DistributedDataParallel
-
-from .scatter_gather import scatter_kwargs
-
-
-class MMDistributedDataParallel(DistributedDataParallel):
-
-    def scatter(self, inputs, kwargs, device_ids):
-        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmdet/nn/parallel/scatter_gather.py b/mmdet/nn/parallel/scatter_gather.py
deleted file mode 100644
index 47f794e8916956f9e8c494e50aff7e5b870889e7..0000000000000000000000000000000000000000
--- a/mmdet/nn/parallel/scatter_gather.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-from ._functions import Scatter
-from torch.nn.parallel._functions import Scatter as OrigScatter
-from mmdet.datasets.utils import DataContainer
-
-
-def scatter(inputs, target_gpus, dim=0):
-    """Scatter inputs to target gpus.
-
-    The only difference from original :func:`scatter` is to add support for
-    :type:`~mmdet.DataContainer`.
-    """
-
-    def scatter_map(obj):
-        if isinstance(obj, torch.Tensor):
-            return OrigScatter.apply(target_gpus, None, dim, obj)
-        if isinstance(obj, DataContainer) and isinstance(obj.data, list):
-            return Scatter.forward(target_gpus, obj.data)
-        if isinstance(obj, tuple) and len(obj) > 0:
-            return list(zip(*map(scatter_map, obj)))
-        if isinstance(obj, list) and len(obj) > 0:
-            return list(map(list, zip(*map(scatter_map, obj))))
-        if isinstance(obj, dict) and len(obj) > 0:
-            return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
-        return [obj for targets in target_gpus]
-
-    # After scatter_map is called, a scatter_map cell will exist. This cell
-    # has a reference to the actual function scatter_map, which has references
-    # to a closure that has a reference to the scatter_map cell (because the
-    # fn is recursive). To avoid this reference cycle, we set the function to
-    # None, clearing the cell
-    try:
-        return scatter_map(inputs)
-    finally:
-        scatter_map = None
-
-
-def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
-    """Scatter with support for kwargs dictionary"""
-    inputs = scatter(inputs, target_gpus, dim) if inputs else []
-    kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
-    if len(inputs) < len(kwargs):
-        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
-    elif len(kwargs) < len(inputs):
-        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
-    inputs = tuple(inputs)
-    kwargs = tuple(kwargs)
-    return inputs, kwargs
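For reference, the padding step in the removed `scatter_kwargs` guarantees that every target GPU receives an (inputs, kwargs) pair even when only one of the two was actually scattered. An isolated sketch of that step with made-up values:

```python
# Scattered positional args exist for two GPUs, kwargs for only one.
inputs = [(1,), (2,)]
kwargs = [{'flag': True}]

# Pad the shorter list with empty placeholders, as scatter_kwargs did.
if len(inputs) < len(kwargs):
    inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
elif len(kwargs) < len(inputs):
    kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])

assert tuple(kwargs) == ({'flag': True}, {})
```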
diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py
index 52e5808016cb94e63a7501cef7b1292805eb3491..5b63224c3476ad189445fe2f6ee2b7182aee661a 100644
--- a/mmdet/ops/__init__.py
+++ b/mmdet/ops/__init__.py
@@ -1,3 +1,5 @@
 from .nms import nms, soft_nms
 from .roi_align import RoIAlign, roi_align
 from .roi_pool import RoIPool, roi_pool
+
+__all__ = ['nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool']
diff --git a/mmdet/ops/nms/__init__.py b/mmdet/ops/nms/__init__.py
index 1cf8569b97b3a568458428776b1dbd6737882389..c4407041ad733d51eca3006b8aefa82e02bbfcde 100644
--- a/mmdet/ops/nms/__init__.py
+++ b/mmdet/ops/nms/__init__.py
@@ -1 +1,3 @@
 from .nms_wrapper import nms, soft_nms
+
+__all__ = ['nms', 'soft_nms']
diff --git a/mmdet/ops/roi_align/__init__.py b/mmdet/ops/roi_align/__init__.py
index ae27e21d6c78e9ffd8d13e8c71017ef6f365fb5e..4cb037904a24e613c4b15305cdf8ded6c0072a1b 100644
--- a/mmdet/ops/roi_align/__init__.py
+++ b/mmdet/ops/roi_align/__init__.py
@@ -1,2 +1,4 @@
 from .functions.roi_align import roi_align
 from .modules.roi_align import RoIAlign
+
+__all__ = ['roi_align', 'RoIAlign']
diff --git a/mmdet/ops/roi_align/gradcheck.py b/mmdet/ops/roi_align/gradcheck.py
index e2c51e64bb7b5eba9da3087d83cfa1083f965bbc..394cd69c5064e097becf12752755ee510045193b 100644
--- a/mmdet/ops/roi_align/gradcheck.py
+++ b/mmdet/ops/roi_align/gradcheck.py
@@ -5,7 +5,7 @@ from torch.autograd import gradcheck
 import os.path as osp
 import sys
 sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-from roi_align import RoIAlign
+from roi_align import RoIAlign  # noqa: E402
 
 feat_size = 15
 spatial_scale = 1.0 / 8
diff --git a/mmdet/ops/roi_align/src/roi_align_cuda.cpp b/mmdet/ops/roi_align/src/roi_align_cuda.cpp
index e4c28c142268d4caf3ff2800dcfe9b24e8e99c66..8551bc5188800e46baf4cf64c6076520fed38581 100644
--- a/mmdet/ops/roi_align/src/roi_align_cuda.cpp
+++ b/mmdet/ops/roi_align/src/roi_align_cuda.cpp
@@ -17,9 +17,9 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                             const int pooled_height, const int pooled_width,
                             at::Tensor bottom_grad);
 
-#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDAtensor ")
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_ASSERT(x.is_contiguous(), #x " must be contiguous ")
+  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
diff --git a/mmdet/ops/roi_align/src/roi_align_kernel.cu b/mmdet/ops/roi_align/src/roi_align_kernel.cu
index 31be093c038872ff0b48c79157e5048d25a416cf..341d858de52a0999f7d9598ddb3c2f52d529bf17 100644
--- a/mmdet/ops/roi_align/src/roi_align_kernel.cu
+++ b/mmdet/ops/roi_align/src/roi_align_kernel.cu
@@ -1,14 +1,10 @@
 #include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+using namespace at;  // temporary fix for pytorch<=0.4.1 (see #9848)
 
-#include <math.h>
-#include <stdio.h>
-#include <vector>
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;                   \
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
        i += blockDim.x * gridDim.x)
 
 #define THREADS_PER_BLOCK 1024
@@ -28,10 +24,8 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
     return 0;
   }
 
-  if (y <= 0)
-    y = 0;
-  if (x <= 0)
-    x = 0;
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
 
   int y_low = (int)y;
   int x_low = (int)x;
@@ -69,12 +63,13 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
 }
 
 template <typename scalar_t>
-__global__ void
-ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
-                const scalar_t *bottom_rois, const scalar_t spatial_scale,
-                const int sample_num, const int channels, const int height,
-                const int width, const int pooled_height,
-                const int pooled_width, scalar_t *top_data) {
+__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
+                                const scalar_t *bottom_rois,
+                                const scalar_t spatial_scale,
+                                const int sample_num, const int channels,
+                                const int height, const int width,
+                                const int pooled_height, const int pooled_width,
+                                scalar_t *top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the aligned output
     int pw = index % pooled_width;
@@ -101,7 +96,7 @@ ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
 
     int sample_num_h = (sample_num > 0)
                            ? sample_num
-                           : ceil(roi_height / pooled_height); // e.g., = 2
+                           : ceil(roi_height / pooled_height);  // e.g., = 2
     int sample_num_w =
         (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
 
@@ -137,17 +132,17 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                            const int pooled_height, const int pooled_width,
                            at::Tensor output) {
   const int output_size = num_rois * pooled_height * pooled_width * channels;
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       features.type(), "ROIAlignLaucherForward", ([&] {
         const scalar_t *bottom_data = features.data<scalar_t>();
         const scalar_t *rois_data = rois.data<scalar_t>();
         scalar_t *top_data = output.data<scalar_t>();
 
-        ROIAlignForward<
-            scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
-            output_size, bottom_data, rois_data, scalar_t(spatial_scale),
-            sample_num, channels, height, width, pooled_height, pooled_width,
-            top_data);
+        ROIAlignForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
+                sample_num, channels, height, width, pooled_height,
+                pooled_width, top_data);
       }));
   cudaError_t err = cudaGetLastError();
   if (cudaSuccess != err) {
@@ -159,11 +154,12 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
 }
 
 template <typename scalar_t>
-__device__ void
-bilinear_interpolate_gradient(const int height, const int width, scalar_t y,
-                              scalar_t x, scalar_t &w1, scalar_t &w2,
-                              scalar_t &w3, scalar_t &w4, int &x_low,
-                              int &x_high, int &y_low, int &y_high) {
+__device__ void bilinear_interpolate_gradient(const int height, const int width,
+                                              scalar_t y, scalar_t x,
+                                              scalar_t &w1, scalar_t &w2,
+                                              scalar_t &w3, scalar_t &w4,
+                                              int &x_low, int &x_high,
+                                              int &y_low, int &y_high) {
   // deal with cases that inverse elements are out of feature map boundary
   if (y < -1.0 || y > height || x < -1.0 || x > width) {
     w1 = w2 = w3 = w4 = 0.;
@@ -171,10 +167,8 @@ bilinear_interpolate_gradient(const int height, const int width, scalar_t y,
     return;
   }
 
-  if (y <= 0)
-    y = 0;
-  if (x <= 0)
-    x = 0;
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
 
   y_low = (int)y;
   x_low = (int)x;
@@ -204,12 +198,11 @@ bilinear_interpolate_gradient(const int height, const int width, scalar_t y,
 }
 
 template <typename scalar_t>
-__global__ void
-ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
-                 const scalar_t *bottom_rois, const scalar_t spatial_scale,
-                 const int sample_num, const int channels, const int height,
-                 const int width, const int pooled_height,
-                 const int pooled_width, scalar_t *bottom_diff) {
+__global__ void ROIAlignBackward(
+    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
+    const scalar_t spatial_scale, const int sample_num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, scalar_t *bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the aligned output
     int pw = index % pooled_width;
@@ -239,7 +232,7 @@ ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
 
     int sample_num_h = (sample_num > 0)
                            ? sample_num
-                           : ceil(roi_height / pooled_height); // e.g., = 2
+                           : ceil(roi_height / pooled_height);  // e.g., = 2
     int sample_num_w =
         (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
 
@@ -279,13 +272,6 @@ ROIAlignBackward(const int nthreads, const scalar_t *top_diff,
   }
 }
 
-template <>
-__global__ void ROIAlignBackward<double>(
-    const int nthreads, const double *top_diff, const double *bottom_rois,
-    const double spatial_scale, const int sample_num, const int channels,
-    const int height, const int width, const int pooled_height,
-    const int pooled_width, double *bottom_diff) {}
-
 int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                             const float spatial_scale, const int sample_num,
                             const int channels, const int height,
@@ -294,6 +280,7 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                             at::Tensor bottom_grad) {
   const int output_size = num_rois * pooled_height * pooled_width * channels;
 
+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF once half-precision atomicAdd is supported
   AT_DISPATCH_FLOATING_TYPES(
       top_grad.type(), "ROIAlignLaucherBackward", ([&] {
         const scalar_t *top_diff = top_grad.data<scalar_t>();
@@ -304,10 +291,11 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
           exit(-1);
         }
 
-        ROIAlignBackward<
-            scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
-            output_size, top_diff, rois_data, spatial_scale, sample_num,
-            channels, height, width, pooled_height, pooled_width, bottom_diff);
+        ROIAlignBackward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, top_diff, rois_data, spatial_scale, sample_num,
+                channels, height, width, pooled_height, pooled_width,
+                bottom_diff);
       }));
   cudaError_t err = cudaGetLastError();
   if (cudaSuccess != err) {
diff --git a/mmdet/ops/roi_pool/__init__.py b/mmdet/ops/roi_pool/__init__.py
index 9c8506d319d3c9c2300860a6c0d64259e43e7916..eb2c57eabd6fa002c970c1f8d199d80d0a9b689c 100644
--- a/mmdet/ops/roi_pool/__init__.py
+++ b/mmdet/ops/roi_pool/__init__.py
@@ -1,2 +1,4 @@
 from .functions.roi_pool import roi_pool
 from .modules.roi_pool import RoIPool
+
+__all__ = ['roi_pool', 'RoIPool']
diff --git a/mmdet/ops/roi_pool/gradcheck.py b/mmdet/ops/roi_pool/gradcheck.py
index dfc08b2e138855e913a2ac1f3c365a570aba661d..c39616086a240cf57cf115d4264eb32b9cc9f7c7 100644
--- a/mmdet/ops/roi_pool/gradcheck.py
+++ b/mmdet/ops/roi_pool/gradcheck.py
@@ -4,7 +4,7 @@ from torch.autograd import gradcheck
 import os.path as osp
 import sys
 sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-from roi_pooling import RoIPool
+from roi_pool import RoIPool  # noqa: E402
 
 feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
 rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
diff --git a/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
index 799c151d192911f03e446ea9c1ad7bb18fa3b1d1..b05e870600fa80ea4b236bd85c03122ed1f49aba 100644
--- a/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
+++ b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
@@ -16,9 +16,9 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const int num_rois, const int pooled_h,
                            const int pooled_w, at::Tensor bottom_grad);
 
-#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDAtensor ")
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_ASSERT(x.is_contiguous(), #x " must be contiguous ")
+  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
diff --git a/mmdet/ops/roi_pool/src/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
index c94a9cd78503c19995db88dd71f2b1ce5a36d629..d2cefa662f9ff9c961a261cef621f7f1d0e561fc 100644
--- a/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
+++ b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
@@ -1,14 +1,10 @@
 #include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
 
-#include <cuda.h>
-#include <cuda_runtime.h>
+using namespace at;  // temporary fix for pytorch<=0.4.1 (see #9848)
 
-#include <math.h>
-#include <stdio.h>
-#include <vector>
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;                   \
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
        i += blockDim.x * gridDim.x)
 
 #define THREADS_PER_BLOCK 1024
@@ -44,8 +40,7 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
     // force malformed rois to be 1x1
     scalar_t roi_w = roi_x2 - roi_x1;
     scalar_t roi_h = roi_y2 - roi_y1;
-    if (roi_w <= 0 || roi_h <= 0)
-      continue;
+    if (roi_w <= 0 || roi_h <= 0) continue;
 
     scalar_t bin_size_w = roi_w / static_cast<scalar_t>(pooled_w);
     scalar_t bin_size_h = roi_h / static_cast<scalar_t>(pooled_h);
@@ -68,7 +63,8 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
     bottom_data += (roi_batch_ind * channels + c) * height * width;
 
     // Define an empty pooling region to be zero
-    scalar_t max_val = is_empty ? 0 : bottom_data[bin_y1 * width + bin_x1] - 1;
+    scalar_t max_val = is_empty ? static_cast<scalar_t>(0)
+                                : bottom_data[bin_y1 * width + bin_x1] - 1;
 
     for (int h = bin_y1; h < bin_y2; ++h) {
       for (int w = bin_x1; w < bin_x2; ++w) {
@@ -80,8 +76,7 @@ __global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
       }
     }
     top_data[index] = max_val;
-    if (argmax_data != NULL)
-      argmax_data[index] = max_idx;
+    if (argmax_data != NULL) argmax_data[index] = max_idx;
   }
 }
 
@@ -92,17 +87,18 @@ int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
                           at::Tensor output, at::Tensor argmax) {
   const int output_size = num_rois * channels * pooled_h * pooled_w;
 
-  AT_DISPATCH_FLOATING_TYPES(
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       features.type(), "ROIPoolLaucherForward", ([&] {
         const scalar_t *bottom_data = features.data<scalar_t>();
         const scalar_t *rois_data = rois.data<scalar_t>();
         scalar_t *top_data = output.data<scalar_t>();
         int *argmax_data = argmax.data<int>();
 
-        ROIPoolForward<
-            scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
-            output_size, bottom_data, rois_data, scalar_t(spatial_scale),
-            channels, height, width, pooled_h, pooled_w, top_data, argmax_data);
+        ROIPoolForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
+                channels, height, width, pooled_h, pooled_w, top_data,
+                argmax_data);
       }));
   cudaError_t err = cudaGetLastError();
   if (cudaSuccess != err) {
@@ -135,28 +131,6 @@ __global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff,
   }
 }
 
-template <>
-__global__ void
-ROIPoolBackward<double>(const int nthreads, const double *top_diff,
-                        const double *rois, const int *argmax_data,
-                        const double spatial_scale, const int channels,
-                        const int height, const int width, const int pooled_h,
-                        const int pooled_w, double *bottom_diff) {
-  // CUDA_1D_KERNEL_LOOP(index, nthreads) {
-  //   int pw = index % pooled_w;
-  //   int ph = (index / pooled_w) % pooled_h;
-  //   int c = (index / pooled_w / pooled_h) % channels;
-  //   int n = index / pooled_w / pooled_h / channels;
-
-  //   int roi_batch_ind = rois[n * 5];
-  //   int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w +
-  //                                  ph * pooled_w + pw];
-
-  //   *(bottom_diff + (roi_batch_ind * channels + c) * height * width +
-  //                 bottom_index) +=top_diff[index];
-  // }
-}
-
 int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const at::Tensor argmax, const float spatial_scale,
                            const int batch_size, const int channels,
@@ -165,6 +139,7 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const int pooled_w, at::Tensor bottom_grad) {
   const int output_size = num_rois * pooled_h * pooled_w * channels;
 
+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF once half-precision atomicAdd is supported
   AT_DISPATCH_FLOATING_TYPES(
       top_grad.type(), "ROIPoolLaucherBackward", ([&] {
         const scalar_t *top_diff = top_grad.data<scalar_t>();
@@ -177,11 +152,11 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
           exit(-1);
         }
 
-        ROIPoolBackward<
-            scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
-            output_size, top_diff, rois_data, argmax_data,
-            scalar_t(spatial_scale), channels, height, width, pooled_h,
-            pooled_w, bottom_diff);
+        ROIPoolBackward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, top_diff, rois_data, argmax_data,
+                scalar_t(spatial_scale), channels, height, width, pooled_h,
+                pooled_w, bottom_diff);
       }));
   cudaError_t err = cudaGetLastError();
   if (cudaSuccess != err) {
diff --git a/mmdet/version.py b/mmdet/version.py
deleted file mode 100644
index 2b8877c505752cd3aaa805b09b88791d3ca0c9bb..0000000000000000000000000000000000000000
--- a/mmdet/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '0.5.0'
diff --git a/setup.py b/setup.py
index 8ed19bd5a810692f308f99617f20fe2e07e86f5a..7cb44e538e3ce611a00135a588ebe37a486e3388 100644
--- a/setup.py
+++ b/setup.py
@@ -1,3 +1,6 @@
+import os
+import subprocess
+import time
 from setuptools import find_packages, setup
 
 
@@ -7,34 +10,102 @@ def readme():
     return content
 
 
+MAJOR = 0
+MINOR = 5
+PATCH = 0
+SUFFIX = ''
+SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
+
+version_file = 'mmdet/version.py'
+
+
+def get_git_hash():
+
+    def _minimal_ext_cmd(cmd):
+        # construct minimal environment
+        env = {}
+        for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+            v = os.environ.get(k)
+            if v is not None:
+                env[k] = v
+        # LANGUAGE is used on win32
+        env['LANGUAGE'] = 'C'
+        env['LANG'] = 'C'
+        env['LC_ALL'] = 'C'
+        out = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
+        return out
+
+    try:
+        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
+        sha = out.strip().decode('ascii')
+    except OSError:
+        sha = 'unknown'
+
+    return sha
+
+
+def get_hash():
+    if os.path.exists('.git'):
+        sha = get_git_hash()[:7]
+    elif os.path.exists(version_file):
+        try:
+            from mmdet.version import __version__
+            sha = __version__.split('+')[-1]
+        except ImportError:
+            raise ImportError('Unable to get git version')
+    else:
+        sha = 'unknown'
+
+    return sha
+
+
+def write_version_py():
+    content = """# GENERATED VERSION FILE
+# TIME: {}
+
+__version__ = '{}'
+short_version = '{}'
+"""
+    sha = get_hash()
+    VERSION = SHORT_VERSION + '+' + sha
+
+    with open(version_file, 'w') as f:
+        f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
+
+
 def get_version():
-    version_file = 'mmcv/version.py'
     with open(version_file, 'r') as f:
         exec(compile(f.read(), version_file, 'exec'))
     return locals()['__version__']
 
 
-setup(
-    name='mmdet',
-    version=get_version(),
-    description='Open MMLab Detection Toolbox',
-    long_description=readme(),
-    keywords='computer vision, object detection',
-    packages=find_packages(),
-    classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Topic :: Utilities',
-    ],
-    license='GPLv3',
-    setup_requires=['pytest-runner'],
-    tests_require=['pytest'],
-    install_requires=['numpy', 'matplotlib', 'six', 'terminaltables'],
-    zip_safe=False)
+if __name__ == '__main__':
+    write_version_py()
+    setup(
+        name='mmdet',
+        version=get_version(),
+        description='Open MMLab Detection Toolbox',
+        long_description=readme(),
+        keywords='computer vision, object detection',
+        url='https://github.com/open-mmlab/mmdetection',
+        packages=find_packages(),
+        package_data={'mmdet.ops': ['*/*.so']},
+        classifiers=[
+            'Development Status :: 4 - Beta',
+            'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 2',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.4',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: 3.6',
+        ],
+        license='GPLv3',
+        setup_requires=['pytest-runner'],
+        tests_require=['pytest'],
+        install_requires=[
+            'numpy', 'matplotlib', 'six', 'terminaltables', 'pycocotools'
+        ],
+        zip_safe=False)
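The new `write_version_py` step appends the short git SHA to `SHORT_VERSION`, so the generated `mmdet/version.py` ends up looking roughly like the sketch below; the timestamp and SHA suffix are hypothetical placeholders:

```python
# GENERATED VERSION FILE
# TIME: <time.asctime() at build time>

__version__ = '0.5.0+1a2b3c4'  # '<SHORT_VERSION>+<short git SHA>'
short_version = '0.5.0'
```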
diff --git a/tools/coco_eval.py b/tools/coco_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e114ca280578cd41848a631e419d70819a662f
--- /dev/null
+++ b/tools/coco_eval.py
@@ -0,0 +1,28 @@
+from argparse import ArgumentParser
+
+from mmdet.core import coco_eval
+
+
+def main():
+    parser = ArgumentParser(description='COCO Evaluation')
+    parser.add_argument('result', help='result file path')
+    parser.add_argument('--ann', help='annotation file path')
+    parser.add_argument(
+        '--types',
+        type=str,
+        nargs='+',
+        choices=['proposal_fast', 'proposal', 'bbox', 'segm', 'keypoint'],
+        default=['bbox'],
+        help='result types')
+    parser.add_argument(
+        '--max-dets',
+        type=int,
+        nargs='+',
+        default=[100, 300, 1000],
+        help='proposal numbers, only used for recall evaluation')
+    args = parser.parse_args()
+    coco_eval(args.result, args.types, args.ann, args.max_dets)
+
+
+if __name__ == '__main__':
+    main()
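Usage note for the new standalone evaluator, assuming a COCO-format json result file such as the one produced by `tools/test.py` (paths are placeholders): `python tools/coco_eval.py <results>.json --ann <instances_val2017.json> --types bbox proposal`.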
diff --git a/tools/dist_train.sh b/tools/dist_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fa68297226b874596a54b9c819f03584008093e6
--- /dev/null
+++ b/tools/dist_train.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+PYTHON=${PYTHON:-"python"}
+
+$PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train.py $1 --launcher pytorch ${@:3}
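The launcher script takes the config path as its first argument and the number of processes per node as its second, forwarding any remaining arguments to `train.py`, e.g. `./tools/dist_train.sh <config.py> 8 --validate` (placeholders; `--validate` is one of the flags accepted by `train.py` below).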
diff --git a/tools/eval.py b/tools/eval.py
deleted file mode 100644
index 20cc571e94b2fcf228f2d0782cf8a8b16dd3688b..0000000000000000000000000000000000000000
--- a/tools/eval.py
+++ /dev/null
@@ -1,265 +0,0 @@
-from argparse import ArgumentParser
-from multiprocessing import Pool
-import matplotlib.pyplot as plt
-import numpy as np
-import copy
-import os
-
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-
-
-def generate_area_range(splitRng=32, stop_size=128):
-    areaRng = [[0**2, 1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2]]
-    start = 0
-    while start < stop_size:
-        end = start + splitRng
-        areaRng.append([start * start, end * end])
-        start = end
-    areaRng.append([start * start, 1e5**2])
-    return areaRng
-
-
-def print_summarize(iouThr=None,
-                    iouThrs=None,
-                    precision=None,
-                    recall=None,
-                    areaRng_id=4,
-                    areaRngs=None,
-                    maxDets_id=2,
-                    maxDets=None):
-    assert (precision is not None) or (recall is not None)
-    iStr = ' {:<18} {} @[ IoU={:<9} | size={:>5}-{:>5} | maxDets={:>3d} ] = {:0.3f}'
-    titleStr = 'Average Precision' if precision is not None else 'Average Recall'
-    typeStr = '(AP)' if precision is not None else '(AR)'
-    iouStr = '{:0.2f}:{:0.2f}'.format(iouThrs[0], iouThrs[-1]) \
-        if iouThr is None else '{:0.2f}'.format(iouThr)
-
-    aind = [areaRng_id]
-    mind = [maxDets_id]
-    if precision is not None:
-        # dimension of precision: [TxRxKxAxM]
-        s = precision
-        # IoU
-        if iouThr is not None:
-            t = np.where(iouThr == iouThrs)[0]
-            s = s[t]
-        s = s[:, :, :, aind, mind]
-    else:
-        # dimension of recall: [TxKxAxM]
-        s = recall
-        if iouThr is not None:
-            t = np.where(iouThr == iouThrs)[0]
-            s = s[t]
-        s = s[:, :, aind, mind]
-    if len(s[s > -1]) == 0:
-        mean_s = -1
-    else:
-        mean_s = np.mean(s[s > -1])
-    print(
-        iStr.format(
-            titleStr, typeStr, iouStr, np.sqrt(areaRngs[areaRng_id][0]),
-            np.sqrt(areaRngs[areaRng_id][1])
-            if np.sqrt(areaRngs[areaRng_id][1]) < 999 else 'max',
-            maxDets[maxDets_id], mean_s))
-
-
-def eval_results(res_file, ann_file, res_types, splitRng):
-    for res_type in res_types:
-        assert res_type in ['proposal', 'bbox', 'segm', 'keypoints']
-
-    areaRng = generate_area_range(splitRng)
-    cocoGt = COCO(ann_file)
-    cocoDt = cocoGt.loadRes(res_file)
-    imgIds = cocoGt.getImgIds()
-    for res_type in res_types:
-        iou_type = 'bbox' if res_type == 'proposal' else res_type
-        cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
-        cocoEval.params.imgIds = imgIds
-        if res_type == 'proposal':
-            cocoEval.params.useCats = 0
-            cocoEval.params.maxDets = [100, 300, 1000]
-        cocoEval.params.areaRng = areaRng
-        cocoEval.evaluate()
-        cocoEval.accumulate()
-        cocoEval.summarize()
-        ps = cocoEval.eval['precision']
-        rc = cocoEval.eval['recall']
-        for i in range(len(areaRng)):
-            print_summarize(None, cocoEval.params.iouThrs, ps, None, i,
-                            areaRng, 2, cocoEval.params.maxDets)
-
-
-def makeplot(rs, ps, outDir, class_name):
-    cs = np.vstack([
-        np.ones((2, 3)),
-        np.array([.31, .51, .74]),
-        np.array([.75, .31, .30]),
-        np.array([.36, .90, .38]),
-        np.array([.50, .39, .64]),
-        np.array([1, .6, 0])
-    ])
-    areaNames = ['all', 'small', 'medium', 'large']
-    types = ['C75', 'C50', 'Loc', 'Sim', 'Oth', 'BG', 'FN']
-    for i in range(len(areaNames)):
-        area_ps = ps[..., i, 0]
-        figure_tile = class_name + '-' + areaNames[i]
-        aps = [ps_.mean() for ps_ in area_ps]
-        ps_curve = [
-            ps_.mean(axis=1) if ps_.ndim > 1 else ps_ for ps_ in area_ps
-        ]
-        ps_curve.insert(0, np.zeros(ps_curve[0].shape))
-        fig = plt.figure()
-        ax = plt.subplot(111)
-        for k in range(len(types)):
-            ax.plot(rs, ps_curve[k + 1], color=[0, 0, 0], linewidth=0.5)
-            ax.fill_between(
-                rs,
-                ps_curve[k],
-                ps_curve[k + 1],
-                color=cs[k],
-                label=str('[{:.3f}'.format(aps[k]) + ']' + types[k]))
-        plt.xlabel('recall')
-        plt.ylabel('precision')
-        plt.xlim(0, 1.)
-        plt.ylim(0, 1.)
-        plt.title(figure_tile)
-        plt.legend()
-        # plt.show()
-        fig.savefig(outDir + '/{}.png'.format(figure_tile))
-        plt.close(fig)
-
-
-def analyze_individual_category(k, cocoDt, cocoGt, catId, iou_type):
-    nm = cocoGt.loadCats(catId)[0]
-    print('--------------analyzing {}-{}---------------'.format(
-        k + 1, nm['name']))
-    ps_ = {}
-    dt = copy.deepcopy(cocoDt)
-    nm = cocoGt.loadCats(catId)[0]
-    imgIds = cocoGt.getImgIds()
-    dt_anns = dt.dataset['annotations']
-    select_dt_anns = []
-    for ann in dt_anns:
-        if ann['category_id'] == catId:
-            select_dt_anns.append(ann)
-    dt.dataset['annotations'] = select_dt_anns
-    dt.createIndex()
-    # compute precision but ignore superclass confusion
-    gt = copy.deepcopy(cocoGt)
-    child_catIds = gt.getCatIds(supNms=[nm['supercategory']])
-    for idx, ann in enumerate(gt.dataset['annotations']):
-        if (ann['category_id'] in child_catIds
-                and ann['category_id'] != catId):
-            gt.dataset['annotations'][idx]['ignore'] = 1
-            gt.dataset['annotations'][idx]['iscrowd'] = 1
-            gt.dataset['annotations'][idx]['category_id'] = catId
-    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
-    cocoEval.params.imgIds = imgIds
-    cocoEval.params.maxDets = [100]
-    cocoEval.params.iouThrs = [.1]
-    cocoEval.params.useCats = 1
-    cocoEval.evaluate()
-    cocoEval.accumulate()
-    ps_supercategory = cocoEval.eval['precision'][0, :, k, :, :]
-    ps_['ps_supercategory'] = ps_supercategory
-    # compute precision but ignore any class confusion
-    gt = copy.deepcopy(cocoGt)
-    for idx, ann in enumerate(gt.dataset['annotations']):
-        if ann['category_id'] != catId:
-            gt.dataset['annotations'][idx]['ignore'] = 1
-            gt.dataset['annotations'][idx]['iscrowd'] = 1
-            gt.dataset['annotations'][idx]['category_id'] = catId
-    cocoEval = COCOeval(gt, copy.deepcopy(dt), iou_type)
-    cocoEval.params.imgIds = imgIds
-    cocoEval.params.maxDets = [100]
-    cocoEval.params.iouThrs = [.1]
-    cocoEval.params.useCats = 1
-    cocoEval.evaluate()
-    cocoEval.accumulate()
-    ps_allcategory = cocoEval.eval['precision'][0, :, k, :, :]
-    ps_['ps_allcategory'] = ps_allcategory
-    return k, ps_
-
-
-def analyze_results(res_file, ann_file, res_types, out_dir):
-    for res_type in res_types:
-        assert res_type in ['bbox', 'segm']
-
-    directory = os.path.dirname(out_dir + '/')
-    if not os.path.exists(directory):
-        print('-------------create {}-----------------'.format(out_dir))
-        os.makedirs(directory)
-
-    cocoGt = COCO(ann_file)
-    cocoDt = cocoGt.loadRes(res_file)
-    imgIds = cocoGt.getImgIds()
-    for res_type in res_types:
-        iou_type = res_type
-        cocoEval = COCOeval(
-            copy.deepcopy(cocoGt), copy.deepcopy(cocoDt), iou_type)
-        cocoEval.params.imgIds = imgIds
-        cocoEval.params.iouThrs = [.75, .5, .1]
-        cocoEval.params.maxDets = [100]
-        cocoEval.evaluate()
-        cocoEval.accumulate()
-        ps = cocoEval.eval['precision']
-        ps = np.vstack([ps, np.zeros((4, *ps.shape[1:]))])
-        catIds = cocoGt.getCatIds()
-        recThrs = cocoEval.params.recThrs
-        with Pool(processes=48) as pool:
-            args = [(k, cocoDt, cocoGt, catId, iou_type)
-                    for k, catId in enumerate(catIds)]
-            analyze_results = pool.starmap(analyze_individual_category, args)
-        for k, catId in enumerate(catIds):
-            nm = cocoGt.loadCats(catId)[0]
-            print('--------------saving {}-{}---------------'.format(
-                k + 1, nm['name']))
-            analyze_result = analyze_results[k]
-            assert k == analyze_result[0]
-            ps_supercategory = analyze_result[1]['ps_supercategory']
-            ps_allcategory = analyze_result[1]['ps_allcategory']
-            # compute precision but ignore superclass confusion
-            ps[3, :, k, :, :] = ps_supercategory
-            # compute precision but ignore any class confusion
-            ps[4, :, k, :, :] = ps_allcategory
-            # fill in background and false negative errors and plot
-            ps[ps == -1] = 0
-            ps[5, :, k, :, :] = (ps[4, :, k, :, :] > 0)
-            ps[6, :, k, :, :] = 1.0
-            makeplot(recThrs, ps[:, :, k], out_dir, nm['name'])
-        makeplot(recThrs, ps, out_dir, 'all')
-
-
-def main():
-    parser = ArgumentParser(description='COCO Evaluation')
-    parser.add_argument('result', help='result file path')
-    parser.add_argument(
-        '--ann',
-        default='/mnt/SSD/dataset/coco/annotations/instances_minival2017.json',
-        help='annotation file path')
-    parser.add_argument(
-        '--types', type=str, nargs='+', default=['bbox'], help='result types')
-    parser.add_argument(
-        '--analyze', action='store_true', help='whether to analyze results')
-    parser.add_argument(
-        '--out_dir',
-        type=str,
-        default=None,
-        help='dir to save analyze result images')
-    parser.add_argument(
-        '--splitRng',
-        type=int,
-        default=32,
-        help='range to split area in evaluation')
-    args = parser.parse_args()
-    if not args.analyze:
-        eval_results(args.result, args.ann, args.types, splitRng=args.splitRng)
-    else:
-        assert args.out_dir is not None
-        analyze_results(
-            args.result, args.ann, args.types, out_dir=args.out_dir)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tools/examples/r50_fpn_rpn_1x.py b/tools/examples/r50_fpn_rpn_1x.py
deleted file mode 100644
index 45c0a1a6c4649a18346251c8e81f5480f29da30f..0000000000000000000000000000000000000000
--- a/tools/examples/r50_fpn_rpn_1x.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# model settings
-model = dict(
-    pretrained=
-    '/mnt/lustre/pangjiangmiao/initmodel/pytorch/resnet50-19c8e357.pth',
-    backbone=dict(
-        type='resnet',
-        depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        style='fb'),
-    neck=dict(
-        type='FPN',
-        in_channels=[256, 512, 1024, 2048],
-        out_channels=256,
-        num_outs=5),
-    rpn_head=dict(
-        type='RPNHead',
-        in_channels=256,
-        feat_channels=256,
-        coarsest_stride=32,
-        anchor_scales=[8],
-        anchor_ratios=[0.5, 1.0, 2.0],
-        anchor_strides=[4, 8, 16, 32, 64],
-        target_means=[.0, .0, .0, .0],
-        target_stds=[1.0, 1.0, 1.0, 1.0],
-        use_sigmoid_cls=True))
-meta_params = dict(
-    rpn_train_cfg=dict(
-        pos_fraction=0.5,
-        pos_balance_sampling=False,
-        neg_pos_ub=256,
-        allowed_border=0,
-        anchor_batch_size=256,
-        pos_iou_thr=0.7,
-        neg_iou_thr=0.3,
-        neg_balance_thr=0,
-        min_pos_iou=1e-3,
-        pos_weight=-1,
-        smoothl1_beta=1 / 9.0,
-        debug=False),
-    rpn_test_cfg=dict(
-        nms_across_levels=False,
-        nms_pre=2000,
-        nms_post=2000,
-        max_num=2000,
-        nms_thr=0.7,
-        min_bbox_size=0))
-# dataset settings
-data_root = '/mnt/lustre/pangjiangmiao/dataset/coco/'
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
-img_per_gpu = 1
-data_workers = 2
-train_dataset = dict(
-    ann_file=data_root + 'annotations/instances_train2017.json',
-    img_prefix=data_root + 'train2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32,
-    flip_ratio=0.5)
-test_dataset = dict(
-    ann_file=data_root + 'annotations/instances_val2017.json',
-    img_prefix=data_root + 'val2017/',
-    img_scale=(1333, 800),
-    img_norm_cfg=img_norm_cfg,
-    size_divisor=32,
-    test_mode=True)
-# optimizer
-optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
-grad_clip_config = dict(grad_clip=True, max_norm=35, norm_type=2)
-# learning policy
-lr_policy = dict(
-    policy='step',
-    warmup='linear',
-    warmup_iters=500,
-    warmup_ratio=0.333,
-    step=[8, 11])
-max_epoch = 12
-checkpoint_config = dict(interval=1)
-dist_params = dict(backend='nccl', port='29500', master_ip='127.0.0.1')
-# logging settings
-log_level = 'INFO'
-# yapf:disable
-log_config = dict(
-    interval=50,
-    hooks=[
-        dict(type='TextLoggerHook'),
-        # ('TensorboardLoggerHook', dict(log_dir=work_dir + '/log')),
-    ])
-# yapf:enable
-work_dir = './model/r50_fpn_1x'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
diff --git a/tools/test.py b/tools/test.py
index 2d062489100f3fc6a579ec811ff0391573f48454..3b1ce2d2e04859fdcce4c977556be89298d1953d 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -1,64 +1,92 @@
-import os.path as osp
-import sys
-sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-sys.path.append('/mnt/lustre/pangjiangmiao/sensenet_folder/mmcv')
 import argparse
 
-import numpy as np
 import torch
-
 import mmcv
-from mmcv import Config
-from mmcv.torchpack import load_checkpoint, parallel_test
-from mmdet.core import _data_func, results2json
-from mmdet.datasets import CocoDataset
-from mmdet.datasets.data_engine import build_data
-from mmdet.models import Detector
+from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict
+from mmcv.parallel import scatter, MMDataParallel
+
+from mmdet import datasets
+from mmdet.core import results2json, coco_eval
+from mmdet.datasets import collate, build_dataloader
+from mmdet.models import build_detector, detectors
+
+
+def single_test(model, data_loader, show=False):
+    model.eval()
+    results = []
+    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))
+    for i, data in enumerate(data_loader):
+        with torch.no_grad():
+            result = model(**data, return_loss=False, rescale=not show)
+        results.append(result)
+
+        if show:
+            model.module.show_result(data, result,
+                                     data_loader.dataset.img_norm_cfg)
+
+        batch_size = data['img'][0].size(0)
+        for _ in range(batch_size):
+            prog_bar.update()
+    return results
+
+
+def _data_func(data, device_id):
+    data = scatter(collate([data], samples_per_gpu=1), [device_id])[0]
+    return dict(**data, return_loss=False, rescale=True)
 
 
 def parse_args():
     parser = argparse.ArgumentParser(description='MMDet test detector')
     parser.add_argument('config', help='test config file path')
     parser.add_argument('checkpoint', help='checkpoint file')
-    parser.add_argument('--world_size', default=1, type=int)
+    parser.add_argument('--gpus', default=1, type=int)
     parser.add_argument('--out', help='output result file')
     parser.add_argument(
-        '--out_json', action='store_true', help='get json output file')
+        '--eval',
+        type=str,
+        nargs='+',
+        choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'],
+        help='eval types')
+    parser.add_argument('--show', action='store_true', help='show results')
     args = parser.parse_args()
     return args
 
 
-args = parse_args()
+def main():
+    args = parse_args()
 
+    cfg = mmcv.Config.fromfile(args.config)
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
 
-def main():
-    cfg = Config.fromfile(args.config)
-    cfg.model['pretrained'] = None
-    # TODO this img_per_gpu
-    cfg.img_per_gpu == 1
-
-    if args.world_size == 1:
-        # TODO verify this part
-        args.dist = False
-        args.img_per_gpu = cfg.img_per_gpu
-        args.data_workers = cfg.data_workers
-        model = Detector(**cfg.model, **meta_params)
+    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
+    if args.gpus == 1:
+        model = build_detector(
+            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
         load_checkpoint(model, args.checkpoint)
-        test_loader = build_data(cfg.test_dataset, args)
-        model = torch.nn.DataParallel(model, device_ids=0)
-        # TODO write single_test
-        outputs = single_test(test_loader, model)
+        model = MMDataParallel(model, device_ids=[0])
+
+        data_loader = build_dataloader(
+            dataset,
+            imgs_per_gpu=1,
+            workers_per_gpu=cfg.data.workers_per_gpu,
+            num_gpus=1,
+            dist=False,
+            shuffle=False)
+        outputs = single_test(model, data_loader, args.show)
     else:
-        test_dataset = CocoDataset(**cfg.test_dataset)
-        model = dict(cfg.model, **cfg.meta_params)
-        outputs = parallel_test(Detector, model,
-                                args.checkpoint, test_dataset, _data_func,
-                                range(args.world_size))
+        model_args = cfg.model.copy()
+        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
+        model_type = getattr(detectors, model_args.pop('type'))
+        outputs = parallel_test(model_type, model_args, args.checkpoint,
+                                dataset, _data_func, range(args.gpus))
 
     if args.out:
-        mmcv.dump(outputs, args.out, protocol=4)
-        if args.out_json:
-            results2json(test_dataset, outputs, args.out + '.json')
+        mmcv.dump(outputs, args.out)
+        if args.eval:
+            json_file = args.out + '.json'
+            results2json(dataset, outputs, json_file)
+            coco_eval(json_file, args.eval, dataset.coco)
 
 
 if __name__ == '__main__':
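With the reworked entry point, single-GPU testing and evaluation can be chained in one command, e.g. `python tools/test.py <config.py> <checkpoint.pth> --out results.pkl --eval bbox` (config and checkpoint are placeholders): `--out` dumps the raw outputs, and `--eval` converts them to COCO json via `results2json` before calling `coco_eval`.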
diff --git a/tools/train.py b/tools/train.py
index 0cb2450acf511715c716594e37b0968876aad683..237ec2b21f58bdbda27339844bfdf0501700b8ca 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -1,85 +1,157 @@
 from __future__ import division
+
 import argparse
-import sys
-import os.path as osp
-sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-sys.path.append('/mnt/lustre/pangjiangmiao/sensenet_folder/mmcv')
+import logging
+import random
+from collections import OrderedDict
 
+import numpy as np
 import torch
-import torch.multiprocessing as mp
 from mmcv import Config
-from mmcv.torchpack import Runner
-from mmdet.core import (batch_processor, init_dist, broadcast_params,
-                        DistOptimizerStepperHook, DistSamplerSeedHook)
-from mmdet.datasets.data_engine import build_data
-from mmdet.models import Detector
-from mmdet.nn.parallel import MMDataParallel
+from mmcv.runner import Runner, obj_from_dict, DistSamplerSeedHook
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+
+from mmdet import datasets, __version__
+from mmdet.core import (init_dist, DistOptimizerHook, CocoDistEvalRecallHook,
+                        CocoDistEvalmAPHook)
+from mmdet.datasets import build_dataloader
+from mmdet.models import build_detector, RPN
+
+
+def parse_losses(losses):
+    log_vars = OrderedDict()
+    for loss_name, loss_value in losses.items():
+        if isinstance(loss_value, torch.Tensor):
+            log_vars[loss_name] = loss_value.mean()
+        elif isinstance(loss_value, list):
+            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+        else:
+            raise TypeError(
+                '{} is not a tensor or list of tensors'.format(loss_name))
+
+    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
+
+    log_vars['loss'] = loss
+    for name in log_vars:
+        log_vars[name] = log_vars[name].item()
+
+    return loss, log_vars
+
+
+def batch_processor(model, data, train_mode):
+    losses = model(**data)
+    loss, log_vars = parse_losses(losses)
+
+    outputs = dict(
+        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
+
+    return outputs
+
+
+def get_logger(log_level):
+    logging.basicConfig(
+        format='%(asctime)s - %(levelname)s - %(message)s', level=log_level)
+    logger = logging.getLogger()
+    return logger
+
+
+def set_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='MMDet train val detector')
+    parser = argparse.ArgumentParser(description='Train a detector')
     parser.add_argument('config', help='train config file path')
-    parser.add_argument('--validate', action='store_true', help='validate')
+    parser.add_argument('--work_dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--validate',
+        action='store_true',
+        help='whether to add a validation phase')
     parser.add_argument(
-        '--dist', action='store_true', help='distributed training or not')
-    parser.add_argument('--world_size', default=1, type=int)
-    parser.add_argument('--rank', default=0, type=int)
+        '--gpus', type=int, default=1, help='number of gpus to use')
+    parser.add_argument('--seed', type=int, help='random seed')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()
 
     return args
 
 
-args = parse_args()
-
-
 def main():
-    # Enable distributed training or not
-    if args.dist:
-        print('Enable distributed training.')
-        mp.set_start_method("spawn", force=True)
-        init_dist(
-            args.world_size,
-            args.rank,
-            **cfg.dist_params)
-    else:
-        print('Disabled distributed training.')
+    args = parse_args()
 
-    # Fetch config information
     cfg = Config.fromfile(args.config)
-    # TODO more flexible
-    args.img_per_gpu = cfg.img_per_gpu
-    args.data_workers = cfg.data_workers
+    if args.work_dir is not None:
+        cfg.work_dir = args.work_dir
+    cfg.gpus = args.gpus
+    # save the mmdet version in the checkpoint as metadata
+    cfg.checkpoint_config.meta = dict(
+        mmdet_version=__version__, config=cfg.text)
+
+    logger = get_logger(cfg.log_level)
 
-    # prepare training loader
-    train_loader = [build_data(cfg.train_dataset, args)]
-    if args.validate:
-        val_loader = build_data(cfg.val_dataset, args)
-        train_loader.append(val_loader)
+    # set random seed if specified
+    if args.seed is not None:
+        logger.info('Set random seed to {}'.format(args.seed))
+        set_random_seed(args.seed)
+
+    # init distributed environment if necessary
+    if args.launcher == 'none':
+        dist = False
+        logger.info('Non-distributed training.')
+    else:
+        dist = True
+        init_dist(args.launcher, **cfg.dist_params)
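+        # keep full logging only on the master (rank 0) process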
+        if torch.distributed.get_rank() != 0:
+            logger.setLevel('ERROR')
+        logger.info('Distributed training.')
+
+    # prepare data loaders
+    train_dataset = obj_from_dict(cfg.data.train, datasets)
+    data_loaders = [
+        build_dataloader(train_dataset, cfg.data.imgs_per_gpu,
+                         cfg.data.workers_per_gpu, cfg.gpus, dist)
+    ]
 
     # build model
-    model = Detector(**cfg.model, **cfg.meta_params)
-    if args.dist:
-        model = model.cuda()
-        broadcast_params(model)
+    model = build_detector(
+        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
+    if dist:
+        model = MMDistributedDataParallel(model.cuda())
     else:
-        device_ids = args.rank % torch.cuda.device_count()
-        model = MMDataParallel(model, device_ids=device_ids).cuda()
+        model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
 
-    # register hooks
+    # build runner
     runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
                     cfg.log_level)
-    optimizer_stepper = DistOptimizerStepperHook(
-        **cfg.grad_clip_config) if args.dist else cfg.grad_clip_config
-    runner.register_training_hooks(cfg.lr_policy, optimizer_stepper,
+
+    # register hooks
+    optimizer_config = DistOptimizerHook(
+        **cfg.optimizer_config) if dist else cfg.optimizer_config
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                    cfg.checkpoint_config, cfg.log_config)
-    if args.dist:
+    if dist:
         runner.register_hook(DistSamplerSeedHook())
+        # register eval hooks
+        if args.validate:
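+            # RPN-only models are evaluated by proposal recall,
+            # full detectors on a COCO-style val set by mAP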
+            if isinstance(model.module, RPN):
+                runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
+            elif cfg.data.val.type == 'CocoDataset':
+                runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
+
     if cfg.resume_from:
         runner.resume(cfg.resume_from)
     elif cfg.load_from:
         runner.load_checkpoint(cfg.load_from)
-    runner.run(train_loader, cfg.workflow, cfg.max_epoch, args=args)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()