diff --git a/.gitignore b/.gitignore
index 894a44cc066a027465cd26d634948d56d13af9af..01c47d6e277dba0d7b880dff88f9695f9a8eec50 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,8 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# cython generated cpp
+mmdet/ops/nms/*.cpp
+mmdet/version.py
+data
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000000000000000000000000000000000000..dd08915113371f5fabc99c73964f862625ef363f
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,13 @@
+dist: trusty
+language: python
+
+install:
+  - pip install flake8
+
+python:
+  - "2.7"
+  - "3.5"
+  - "3.6"
+
+script:
+  - flake8
\ No newline at end of file
diff --git a/compile.sh b/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8bf418054a26fc2ab5741298f3f3863273cd1c0a
--- /dev/null
+++ b/compile.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+PYTHON=${PYTHON:-"python"}
+
+echo "Building roi align op..."
+cd mmdet/ops/roi_align
+if [ -d "build" ]; then
+    rm -r build
+fi
+$PYTHON setup.py build_ext --inplace
+
+echo "Building roi pool op..."
+cd ../roi_pool
+if [ -d "build" ]; then
+    rm -r build
+fi
+$PYTHON setup.py build_ext --inplace
+
+echo "Building nms op..."
+cd ../nms
+make clean
+make PYTHON=${PYTHON}
diff --git a/configs/fast_mask_rcnn_r50_fpn_1x.py b/configs/fast_mask_rcnn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..af2070f5d9006a38432f5b4a15100bb5d1c3f9e9
--- /dev/null
+++ b/configs/fast_mask_rcnn_r50_fpn_1x.py
@@ -0,0 +1,132 @@
+# model settings
+model = dict(
+    type='FastRCNN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCRoIHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.1, 0.1, 0.2, 0.2],
+        reg_class_agnostic=False),
+    mask_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    mask_head=dict(
+        type='FCNMaskHead',
+        num_convs=4,
+        in_channels=256,
+        conv_out_channels=256,
+        num_classes=81))
+# model training and testing settings
+train_cfg = dict(
+    rcnn=dict(
+        mask_size=28,
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        crowd_thr=1.1,
+        roi_batch_size=512,
+        add_gt_as_proposals=True,
+        pos_fraction=0.25,
+        pos_balance_sampling=False,
+        neg_pos_ub=512,
+        neg_balance_thr=0,
+        min_pos_iou=0.5,
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rcnn=dict(
+        score_thr=0.05, max_per_img=100, nms_thr=0.5, mask_thr_binary=0.5))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl',
+        flip_ratio=0.5,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/fast_mask_rcnn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
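
Note: the config above is a plain Python module, and the other configs added in this change follow the same layout. A minimal sketch of how such a file might be inspected, assuming mmcv's Config.fromfile helper (not part of this diff):

    from mmcv import Config

    # hypothetical usage: load the config module and read a few fields
    cfg = Config.fromfile('configs/fast_mask_rcnn_r50_fpn_1x.py')
    print(cfg.model.type)            # 'FastRCNN'
    print(cfg.data.imgs_per_gpu)     # 2
    print(cfg.work_dir)              # './work_dirs/fast_mask_rcnn_r50_fpn_1x'
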
diff --git a/configs/fast_rcnn_r50_fpn_1x.py b/configs/fast_rcnn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..397ab431e616ee7c42cb3a8d3c9fd92333d869e9
--- /dev/null
+++ b/configs/fast_rcnn_r50_fpn_1x.py
@@ -0,0 +1,118 @@
+# model settings
+model = dict(
+    type='FastRCNN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCRoIHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.1, 0.1, 0.2, 0.2],
+        reg_class_agnostic=False))
+# model training and testing settings
+train_cfg = dict(
+    rcnn=dict(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        crowd_thr=1.1,
+        roi_batch_size=512,
+        add_gt_as_proposals=True,
+        pos_fraction=0.25,
+        pos_balance_sampling=False,
+        neg_pos_ub=512,
+        neg_balance_thr=0,
+        min_pos_iou=0.5,
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(rcnn=dict(score_thr=0.05, max_per_img=100, nms_thr=0.5))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_train2017.pkl',
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        proposal_file=data_root + 'proposals/rpn_r50_fpn_1x_val2017.pkl',
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/fast_rcnn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/faster_rcnn_r50_fpn_1x.py b/configs/faster_rcnn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c06c4cca7e0989e774e1a23ae92929b18da9d7d
--- /dev/null
+++ b/configs/faster_rcnn_r50_fpn_1x.py
@@ -0,0 +1,147 @@
+# model settings
+model = dict(
+    type='FasterRCNN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_scales=[8],
+        anchor_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        target_means=[.0, .0, .0, .0],
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        use_sigmoid_cls=True),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCRoIHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.1, 0.1, 0.2, 0.2],
+        reg_class_agnostic=False))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        pos_fraction=0.5,
+        pos_balance_sampling=False,
+        neg_pos_ub=256,
+        allowed_border=0,
+        crowd_thr=1.1,
+        anchor_batch_size=256,
+        pos_iou_thr=0.7,
+        neg_iou_thr=0.3,
+        neg_balance_thr=0,
+        min_pos_iou=0.3,
+        pos_weight=-1,
+        smoothl1_beta=1 / 9.0,
+        debug=False),
+    rcnn=dict(
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        crowd_thr=1.1,
+        roi_batch_size=512,
+        add_gt_as_proposals=True,
+        pos_fraction=0.25,
+        pos_balance_sampling=False,
+        neg_pos_ub=512,
+        neg_balance_thr=0,
+        min_pos_iou=0.5,
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(score_thr=0.05, max_per_img=100, nms_thr=0.5))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/mask_rcnn_r50_fpn_1x.py b/configs/mask_rcnn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..8868cf6ebd99914ca8f12d3227fbbd9887e9c933
--- /dev/null
+++ b/configs/mask_rcnn_r50_fpn_1x.py
@@ -0,0 +1,160 @@
+# model settings
+model = dict(
+    type='MaskRCNN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_scales=[8],
+        anchor_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        target_means=[.0, .0, .0, .0],
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        use_sigmoid_cls=True),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCRoIHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.1, 0.1, 0.2, 0.2],
+        reg_class_agnostic=False),
+    mask_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    mask_head=dict(
+        type='FCNMaskHead',
+        num_convs=4,
+        in_channels=256,
+        conv_out_channels=256,
+        num_classes=81))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        pos_fraction=0.5,
+        pos_balance_sampling=False,
+        neg_pos_ub=256,
+        allowed_border=0,
+        crowd_thr=1.1,
+        anchor_batch_size=256,
+        pos_iou_thr=0.7,
+        neg_iou_thr=0.3,
+        neg_balance_thr=0,
+        min_pos_iou=0.3,
+        pos_weight=-1,
+        smoothl1_beta=1 / 9.0,
+        debug=False),
+    rcnn=dict(
+        mask_size=28,
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        crowd_thr=1.1,
+        roi_batch_size=512,
+        add_gt_as_proposals=True,
+        pos_fraction=0.25,
+        pos_balance_sampling=False,
+        neg_pos_ub=512,
+        neg_balance_thr=0,
+        min_pos_iou=0.5,
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        score_thr=0.05, max_per_img=100, nms_thr=0.5, mask_thr_binary=0.5))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=True,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/mask_rcnn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/rpn_r50_fpn_1x.py b/configs/rpn_r50_fpn_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1b6d0ca39558694292610ba93979099eb0ada8
--- /dev/null
+++ b/configs/rpn_r50_fpn_1x.py
@@ -0,0 +1,118 @@
+# model settings
+model = dict(
+    type='RPN',
+    pretrained='modelzoo://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_scales=[8],
+        anchor_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        target_means=[.0, .0, .0, .0],
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        use_sigmoid_cls=True))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        pos_fraction=0.5,
+        pos_balance_sampling=False,
+        neg_pos_ub=256,
+        allowed_border=0,
+        crowd_thr=1.1,
+        anchor_batch_size=256,
+        pos_iou_thr=0.7,
+        neg_iou_thr=0.3,
+        neg_balance_thr=0,
+        min_pos_iou=0.3,
+        pos_weight=-1,
+        smoothl1_beta=1 / 9.0,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+# runner configs
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/rpn_r50_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/mmdet/__init__.py b/mmdet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c4f7e8fcc54041e383b72d48860ccbdc3afc41c
--- /dev/null
+++ b/mmdet/__init__.py
@@ -0,0 +1,3 @@
+from .version import __version__, short_version
+
+__all__ = ['__version__', 'short_version']
diff --git a/mmdet/apis/__init__.py b/mmdet/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..030b7de41026755359535cc309e39c7b4e0efb66
--- /dev/null
+++ b/mmdet/apis/__init__.py
@@ -0,0 +1,8 @@
+from .env import init_dist, get_root_logger, set_random_seed
+from .train import train_detector
+from .inference import inference_detector, show_result
+
+__all__ = [
+    'init_dist', 'get_root_logger', 'set_random_seed', 'train_detector',
+    'inference_detector', 'show_result'
+]
diff --git a/mmdet/apis/env.py b/mmdet/apis/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..20cd26dee8fbc258ffd4c50fef6e8468bf4ba094
--- /dev/null
+++ b/mmdet/apis/env.py
@@ -0,0 +1,57 @@
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from mmcv.runner import get_dist_info
+
+
+def init_dist(launcher, backend='nccl', **kwargs):
+    if mp.get_start_method(allow_none=True) is None:
+        mp.set_start_method('spawn')
+    if launcher == 'pytorch':
+        _init_dist_pytorch(backend, **kwargs)
+    elif launcher == 'mpi':
+        _init_dist_mpi(backend, **kwargs)
+    elif launcher == 'slurm':
+        _init_dist_slurm(backend, **kwargs)
+    else:
+        raise ValueError('Invalid launcher type: {}'.format(launcher))
+
+
+def _init_dist_pytorch(backend, **kwargs):
+    # TODO: use local_rank instead of rank % num_gpus
+    rank = int(os.environ['RANK'])
+    num_gpus = torch.cuda.device_count()
+    torch.cuda.set_device(rank % num_gpus)
+    dist.init_process_group(backend=backend, **kwargs)
+
+
+def _init_dist_mpi(backend, **kwargs):
+    raise NotImplementedError
+
+
+def _init_dist_slurm(backend, **kwargs):
+    raise NotImplementedError
+
+
+def set_random_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def get_root_logger(log_level=logging.INFO):
+    logger = logging.getLogger()
+    if not logger.hasHandlers():
+        logging.basicConfig(
+            format='%(asctime)s - %(levelname)s - %(message)s',
+            level=log_level)
+    rank, _ = get_dist_info()
+    if rank != 0:
+        logger.setLevel('ERROR')
+    return logger
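
Note: set_random_seed seeds Python, NumPy, and all CUDA devices in one call, and get_root_logger raises the level to ERROR on non-zero ranks so only rank 0 prints during distributed runs. A short usage sketch using only the functions above:

    import logging

    from mmdet.apis import get_root_logger, set_random_seed

    set_random_seed(0)
    logger = get_root_logger(log_level=logging.INFO)
    logger.info('seeded and ready')
    # for multi-GPU jobs launched via torch.distributed, one would also call
    # init_dist('pytorch', backend='nccl') before building data loaders
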
diff --git a/mmdet/apis/inference.py b/mmdet/apis/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..a87323cee1aba6e97c75f2a563c5337bc5fe32ff
--- /dev/null
+++ b/mmdet/apis/inference.py
@@ -0,0 +1,65 @@
+import mmcv
+import numpy as np
+import torch
+
+from mmdet.datasets import to_tensor
+from mmdet.datasets.transforms import ImageTransform
+from mmdet.core import get_classes
+
+
+def _prepare_data(img, img_transform, cfg, device):
+    ori_shape = img.shape
+    img, img_shape, pad_shape, scale_factor = img_transform(
+        img, scale=cfg.data.test.img_scale)
+    img = to_tensor(img).to(device).unsqueeze(0)
+    img_meta = [
+        dict(
+            ori_shape=ori_shape,
+            img_shape=img_shape,
+            pad_shape=pad_shape,
+            scale_factor=scale_factor,
+            flip=False)
+    ]
+    return dict(img=[img], img_meta=[img_meta])
+
+
+def _inference_single(model, img, img_transform, cfg, device):
+    img = mmcv.imread(img)
+    data = _prepare_data(img, img_transform, cfg, device)
+    with torch.no_grad():
+        result = model(return_loss=False, rescale=True, **data)
+    return result
+
+
+def _inference_generator(model, imgs, img_transform, cfg, device):
+    for img in imgs:
+        yield _inference_single(model, img, img_transform, cfg, device)
+
+
+def inference_detector(model, imgs, cfg, device='cuda:0'):
+    img_transform = ImageTransform(
+        size_divisor=cfg.data.test.size_divisor, **cfg.img_norm_cfg)
+    model = model.to(device)
+    model.eval()
+
+    if not isinstance(imgs, list):
+        return _inference_single(model, imgs, img_transform, cfg, device)
+    else:
+        return _inference_generator(model, imgs, img_transform, cfg, device)
+
+
+def show_result(img, result, dataset='coco', score_thr=0.3):
+    class_names = get_classes(dataset)
+    labels = [
+        np.full(bbox.shape[0], i, dtype=np.int32)
+        for i, bbox in enumerate(result)
+    ]
+    labels = np.concatenate(labels)
+    bboxes = np.vstack(result)
+    img = mmcv.imread(img)
+    mmcv.imshow_det_bboxes(
+        img.copy(),
+        bboxes,
+        labels,
+        class_names=class_names,
+        score_thr=score_thr)
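
Note: inference_detector returns a single result for one image and a lazy generator for a list of images. A minimal sketch of the intended call pattern; the build_detector factory, checkpoint loader, and file paths below are assumptions that are not part of this diff:

    from mmcv import Config
    from mmcv.runner import load_checkpoint

    from mmdet.apis import inference_detector, show_result
    from mmdet.models import build_detector  # assumed factory, not shown here

    cfg = Config.fromfile('configs/faster_rcnn_r50_fpn_1x.py')
    cfg.model.pretrained = None  # weights come from the checkpoint instead
    model = build_detector(cfg.model, test_cfg=cfg.test_cfg)
    load_checkpoint(model, 'work_dirs/faster_rcnn_r50_fpn_1x/latest.pth')  # hypothetical path

    result = inference_detector(model, 'demo.jpg', cfg, device='cuda:0')
    show_result('demo.jpg', result, dataset='coco', score_thr=0.3)
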
diff --git a/mmdet/apis/train.py b/mmdet/apis/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a589722a70fcb450ed33f3577ef37c625762682
--- /dev/null
+++ b/mmdet/apis/train.py
@@ -0,0 +1,117 @@
+from __future__ import division
+
+from collections import OrderedDict
+
+import torch
+from mmcv.runner import Runner, DistSamplerSeedHook
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+
+from mmdet.core import (DistOptimizerHook, CocoDistEvalRecallHook,
+                        CocoDistEvalmAPHook)
+from mmdet.datasets import build_dataloader
+from mmdet.models import RPN
+from .env import get_root_logger
+
+
+def parse_losses(losses):
+    log_vars = OrderedDict()
+    for loss_name, loss_value in losses.items():
+        if isinstance(loss_value, torch.Tensor):
+            log_vars[loss_name] = loss_value.mean()
+        elif isinstance(loss_value, list):
+            log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+        else:
+            raise TypeError(
+                '{} is not a tensor or list of tensors'.format(loss_name))
+
+    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
+
+    log_vars['loss'] = loss
+    for name in log_vars:
+        log_vars[name] = log_vars[name].item()
+
+    return loss, log_vars
+
+
+def batch_processor(model, data, train_mode):
+    losses = model(**data)
+    loss, log_vars = parse_losses(losses)
+
+    outputs = dict(
+        loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
+
+    return outputs
+
+
+def train_detector(model,
+                   dataset,
+                   cfg,
+                   distributed=False,
+                   validate=False,
+                   logger=None):
+    if logger is None:
+        logger = get_root_logger(cfg.log_level)
+
+    # start training
+    if distributed:
+        _dist_train(model, dataset, cfg, validate=validate)
+    else:
+        _non_dist_train(model, dataset, cfg, validate=validate)
+
+
+def _dist_train(model, dataset, cfg, validate=False):
+    # prepare data loaders
+    data_loaders = [
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            dist=True)
+    ]
+    # put model on gpus
+    model = MMDistributedDataParallel(model.cuda())
+    # build runner
+    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
+                    cfg.log_level)
+    # register hooks
+    optimizer_config = DistOptimizerHook(**cfg.optimizer_config)
+    runner.register_training_hooks(cfg.lr_config, optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+    runner.register_hook(DistSamplerSeedHook())
+    # register eval hooks
+    if validate:
+        if isinstance(model.module, RPN):
+            runner.register_hook(CocoDistEvalRecallHook(cfg.data.val))
+        elif cfg.data.val.type == 'CocoDataset':
+            runner.register_hook(CocoDistEvalmAPHook(cfg.data.val))
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
+
+
+def _non_dist_train(model, dataset, cfg, validate=False):
+    # prepare data loaders
+    data_loaders = [
+        build_dataloader(
+            dataset,
+            cfg.data.imgs_per_gpu,
+            cfg.data.workers_per_gpu,
+            cfg.gpus,
+            dist=False)
+    ]
+    # put model on gpus
+    model = MMDataParallel(model, device_ids=range(cfg.gpus)).cuda()
+    # build runner
+    runner = Runner(model, batch_processor, cfg.optimizer, cfg.work_dir,
+                    cfg.log_level)
+    runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config,
+                                   cfg.checkpoint_config, cfg.log_config)
+
+    if cfg.resume_from:
+        runner.resume(cfg.resume_from)
+    elif cfg.load_from:
+        runner.load_checkpoint(cfg.load_from)
+    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
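
Note: train_detector dispatches to _dist_train or _non_dist_train; both build data loaders with build_dataloader, wrap the model, and drive an mmcv Runner with the lr/checkpoint/log hooks from the config. A non-distributed sketch; the detector factory and dataset builder named below are assumptions, not defined in this diff:

    from mmcv import Config

    from mmdet.apis import train_detector
    from mmdet.datasets import get_dataset   # hypothetical dataset builder
    from mmdet.models import build_detector  # assumed factory, not shown here

    cfg = Config.fromfile('configs/faster_rcnn_r50_fpn_1x.py')
    cfg.gpus = 1  # _non_dist_train reads cfg.gpus for MMDataParallel

    model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
    train_set = get_dataset(cfg.data.train)
    train_detector(model, train_set, cfg, distributed=False, validate=False)
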
diff --git a/mmdet/core/__init__.py b/mmdet/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..645d5be29c039aeb2173525163b681675741d7ea
--- /dev/null
+++ b/mmdet/core/__init__.py
@@ -0,0 +1,7 @@
+from .anchor import *  # noqa: F401, F403
+from .bbox import *  # noqa: F401, F403
+from .mask import *  # noqa: F401, F403
+from .loss import *  # noqa: F401, F403
+from .evaluation import *  # noqa: F401, F403
+from .post_processing import *  # noqa: F401, F403
+from .utils import *  # noqa: F401, F403
diff --git a/mmdet/core/anchor/__init__.py b/mmdet/core/anchor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ff430a4be1825fbbaa3cb31d54de8790aa2fb90
--- /dev/null
+++ b/mmdet/core/anchor/__init__.py
@@ -0,0 +1,4 @@
+from .anchor_generator import AnchorGenerator
+from .anchor_target import anchor_target
+
+__all__ = ['AnchorGenerator', 'anchor_target']
diff --git a/mmdet/core/anchor/anchor_generator.py b/mmdet/core/anchor/anchor_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..84600be331e52d9a64f70e2cb43696b82801bf0e
--- /dev/null
+++ b/mmdet/core/anchor/anchor_generator.py
@@ -0,0 +1,83 @@
+import torch
+
+
+class AnchorGenerator(object):
+
+    def __init__(self, base_size, scales, ratios, scale_major=True):
+        self.base_size = base_size
+        self.scales = torch.Tensor(scales)
+        self.ratios = torch.Tensor(ratios)
+        self.scale_major = scale_major
+        self.base_anchors = self.gen_base_anchors()
+
+    @property
+    def num_base_anchors(self):
+        return self.base_anchors.size(0)
+
+    def gen_base_anchors(self):
+        base_anchor = torch.Tensor(
+            [0, 0, self.base_size - 1, self.base_size - 1])
+
+        w = base_anchor[2] - base_anchor[0] + 1
+        h = base_anchor[3] - base_anchor[1] + 1
+        x_ctr = base_anchor[0] + 0.5 * (w - 1)
+        y_ctr = base_anchor[1] + 0.5 * (h - 1)
+
+        h_ratios = torch.sqrt(self.ratios)
+        w_ratios = 1 / h_ratios
+        if self.scale_major:
+            ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1)
+            hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1)
+        else:
+            ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1)
+            hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1)
+
+        base_anchors = torch.stack(
+            [
+                x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),
+                x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)
+            ],
+            dim=-1).round()
+
+        return base_anchors
+
+    def _meshgrid(self, x, y, row_major=True):
+        xx = x.repeat(len(y))
+        yy = y.view(-1, 1).repeat(1, len(x)).view(-1)
+        if row_major:
+            return xx, yy
+        else:
+            return yy, xx
+
+    def grid_anchors(self, featmap_size, stride=16, device='cuda'):
+        base_anchors = self.base_anchors.to(device)
+
+        feat_h, feat_w = featmap_size
+        shift_x = torch.arange(0, feat_w, device=device) * stride
+        shift_y = torch.arange(0, feat_h, device=device) * stride
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1)
+        shifts = shifts.type_as(base_anchors)
+        # first feat_w elements correspond to the first row of shifts
+        # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get
+        # shifted anchors (K, A, 4), reshape to (K*A, 4)
+
+        all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+        all_anchors = all_anchors.view(-1, 4)
+        # first A rows correspond to A anchors of (0, 0) in feature map,
+        # then (0, 1), (0, 2), ...
+        return all_anchors
+
+    def valid_flags(self, featmap_size, valid_size, device='cuda'):
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        valid = valid[:, None].expand(
+            valid.size(0), self.num_base_anchors).contiguous().view(-1)
+        return valid
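
Note: AnchorGenerator builds its base anchors once per stride and then tiles them over a feature map, so one scale with three ratios yields 3 anchors per location. A small CPU example using only the class above:

    from mmdet.core.anchor import AnchorGenerator

    gen = AnchorGenerator(base_size=16, scales=[8], ratios=[0.5, 1.0, 2.0])
    print(gen.num_base_anchors)      # 3
    print(gen.base_anchors.shape)    # torch.Size([3, 4])

    # tile the 3 base anchors over a 2x3 feature map with stride 16
    anchors = gen.grid_anchors((2, 3), stride=16, device='cpu')
    print(anchors.shape)             # torch.Size([18, 4]) -> 2 * 3 * 3 anchors

    # flags for anchors whose grid cell lies inside the valid 2x2 region
    flags = gen.valid_flags((2, 3), (2, 2), device='cpu')
    print(flags.shape)               # torch.Size([18])
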
diff --git a/mmdet/core/anchor/anchor_target.py b/mmdet/core/anchor/anchor_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad81e390e6dcb2a064862818a34ea99adbe462e0
--- /dev/null
+++ b/mmdet/core/anchor/anchor_target.py
@@ -0,0 +1,149 @@
+import torch
+
+from ..bbox import bbox_assign, bbox2delta, bbox_sampling
+from ..utils import multi_apply
+
+
+def anchor_target(anchor_list, valid_flag_list, gt_bboxes_list, img_metas,
+                  target_means, target_stds, cfg):
+    """Compute regression and classification targets for anchors.
+
+    Args:
+        anchor_list (list[list]): Multi level anchors of each image.
+        valid_flag_list (list[list]): Multi level valid flags of each image.
+        gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+        img_metas (list[dict]): Meta info of each image.
+        target_means (Iterable): Mean value of regression targets.
+        target_stds (Iterable): Std value of regression targets.
+        cfg (dict): RPN train configs.
+
+    Returns:
+        tuple: (labels_list, label_weights_list, bbox_targets_list,
+            bbox_weights_list, num_total_samples)
+    """
+    num_imgs = len(img_metas)
+    assert len(anchor_list) == len(valid_flag_list) == num_imgs
+
+    # anchor number of multi levels
+    num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+    # concat all level anchors and flags to a single tensor
+    for i in range(num_imgs):
+        assert len(anchor_list[i]) == len(valid_flag_list[i])
+        anchor_list[i] = torch.cat(anchor_list[i])
+        valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+    # compute targets for each image
+    means_replicas = [target_means for _ in range(num_imgs)]
+    stds_replicas = [target_stds for _ in range(num_imgs)]
+    cfg_replicas = [cfg for _ in range(num_imgs)]
+    (all_labels, all_label_weights, all_bbox_targets,
+     all_bbox_weights, pos_inds_list, neg_inds_list) = multi_apply(
+         anchor_target_single, anchor_list, valid_flag_list, gt_bboxes_list,
+         img_metas, means_replicas, stds_replicas, cfg_replicas)
+    # no valid anchors
+    if any([labels is None for labels in all_labels]):
+        return None
+    # sampled anchors of all images
+    num_total_samples = sum([
+        max(pos_inds.numel() + neg_inds.numel(), 1)
+        for pos_inds, neg_inds in zip(pos_inds_list, neg_inds_list)
+    ])
+    # split targets to a list w.r.t. multiple levels
+    labels_list = images_to_levels(all_labels, num_level_anchors)
+    label_weights_list = images_to_levels(all_label_weights, num_level_anchors)
+    bbox_targets_list = images_to_levels(all_bbox_targets, num_level_anchors)
+    bbox_weights_list = images_to_levels(all_bbox_weights, num_level_anchors)
+    return (labels_list, label_weights_list, bbox_targets_list,
+            bbox_weights_list, num_total_samples)
+
+
+def images_to_levels(target, num_level_anchors):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = torch.stack(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_level_anchors:
+        end = start + n
+        level_targets.append(target[:, start:end].squeeze(0))
+        start = end
+    return level_targets
+
+
+def anchor_target_single(flat_anchors, valid_flags, gt_bboxes, img_meta,
+                         target_means, target_stds, cfg):
+    inside_flags = anchor_inside_flags(flat_anchors, valid_flags,
+                                       img_meta['img_shape'][:2],
+                                       cfg.allowed_border)
+    if not inside_flags.any():
+        return (None, ) * 6
+    # assign gt and sample anchors
+    anchors = flat_anchors[inside_flags, :]
+    assigned_gt_inds, argmax_overlaps, max_overlaps = bbox_assign(
+        anchors,
+        gt_bboxes,
+        pos_iou_thr=cfg.pos_iou_thr,
+        neg_iou_thr=cfg.neg_iou_thr,
+        min_pos_iou=cfg.min_pos_iou)
+    pos_inds, neg_inds = bbox_sampling(assigned_gt_inds, cfg.anchor_batch_size,
+                                       cfg.pos_fraction, cfg.neg_pos_ub,
+                                       cfg.pos_balance_sampling, max_overlaps,
+                                       cfg.neg_balance_thr)
+
+    bbox_targets = torch.zeros_like(anchors)
+    bbox_weights = torch.zeros_like(anchors)
+    labels = torch.zeros_like(assigned_gt_inds)
+    label_weights = torch.zeros_like(assigned_gt_inds, dtype=anchors.dtype)
+
+    if len(pos_inds) > 0:
+        pos_anchors = anchors[pos_inds, :]
+        pos_gt_bbox = gt_bboxes[assigned_gt_inds[pos_inds] - 1, :]
+        pos_bbox_targets = bbox2delta(pos_anchors, pos_gt_bbox, target_means,
+                                      target_stds)
+        bbox_targets[pos_inds, :] = pos_bbox_targets
+        bbox_weights[pos_inds, :] = 1.0
+        labels[pos_inds] = 1
+        if cfg.pos_weight <= 0:
+            label_weights[pos_inds] = 1.0
+        else:
+            label_weights[pos_inds] = cfg.pos_weight
+    if len(neg_inds) > 0:
+        label_weights[neg_inds] = 1.0
+
+    # map up to original set of anchors
+    num_total_anchors = flat_anchors.size(0)
+    labels = unmap(labels, num_total_anchors, inside_flags)
+    label_weights = unmap(label_weights, num_total_anchors, inside_flags)
+    bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags)
+    bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+    return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+            neg_inds)
+
+
+def anchor_inside_flags(flat_anchors, valid_flags, img_shape,
+                        allowed_border=0):
+    img_h, img_w = img_shape[:2]
+    if allowed_border >= 0:
+        inside_flags = valid_flags & \
+            (flat_anchors[:, 0] >= -allowed_border) & \
+            (flat_anchors[:, 1] >= -allowed_border) & \
+            (flat_anchors[:, 2] < img_w + allowed_border) & \
+            (flat_anchors[:, 3] < img_h + allowed_border)
+    else:
+        inside_flags = valid_flags
+    return inside_flags
+
+
+def unmap(data, count, inds, fill=0):
+    """ Unmap a subset of item (data) back to the original set of items (of
+    size count) """
+    if data.dim() == 1:
+        ret = data.new_full((count, ), fill)
+        ret[inds] = data
+    else:
+        new_size = (count, ) + data.size()[1:]
+        ret = data.new_full(new_size, fill)
+        ret[inds, :] = data
+    return ret
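
Note: unmap scatters results computed on the inside anchors back to the full anchor set, filling the filtered-out positions with a constant. A tiny example:

    import torch

    from mmdet.core.anchor.anchor_target import unmap

    data = torch.tensor([1., 2., 3.])
    inds = torch.tensor([0, 2, 4])   # positions of the kept anchors
    print(unmap(data, 6, inds))      # tensor([1., 0., 2., 0., 3., 0.])
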
diff --git a/mmdet/core/bbox/__init__.py b/mmdet/core/bbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5c21dce52f25781e2e4e3e760a837d4d36eec5c
--- /dev/null
+++ b/mmdet/core/bbox/__init__.py
@@ -0,0 +1,15 @@
+from .geometry import bbox_overlaps
+from .sampling import (random_choice, bbox_assign, bbox_assign_wrt_overlaps,
+                       bbox_sampling, bbox_sampling_pos, bbox_sampling_neg,
+                       sample_bboxes)
+from .transforms import (bbox2delta, delta2bbox, bbox_flip, bbox_mapping,
+                         bbox_mapping_back, bbox2roi, roi2bbox, bbox2result)
+from .bbox_target import bbox_target
+
+__all__ = [
+    'bbox_overlaps', 'random_choice', 'bbox_assign',
+    'bbox_assign_wrt_overlaps', 'bbox_sampling', 'bbox_sampling_pos',
+    'bbox_sampling_neg', 'sample_bboxes', 'bbox2delta', 'delta2bbox',
+    'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', 'bbox2roi', 'roi2bbox',
+    'bbox2result', 'bbox_target'
+]
diff --git a/mmdet/core/bbox/bbox_target.py b/mmdet/core/bbox/bbox_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e205c3850c9bc232b99826a23e79f416a3dbcfb
--- /dev/null
+++ b/mmdet/core/bbox/bbox_target.py
@@ -0,0 +1,76 @@
+import torch
+
+from .transforms import bbox2delta
+from ..utils import multi_apply
+
+
+def bbox_target(pos_proposals_list,
+                neg_proposals_list,
+                pos_gt_bboxes_list,
+                pos_gt_labels_list,
+                cfg,
+                reg_num_classes=1,
+                target_means=[.0, .0, .0, .0],
+                target_stds=[1.0, 1.0, 1.0, 1.0],
+                concat=True):
+    labels, label_weights, bbox_targets, bbox_weights = multi_apply(
+        proposal_target_single,
+        pos_proposals_list,
+        neg_proposals_list,
+        pos_gt_bboxes_list,
+        pos_gt_labels_list,
+        cfg=cfg,
+        reg_num_classes=reg_num_classes,
+        target_means=target_means,
+        target_stds=target_stds)
+
+    if concat:
+        labels = torch.cat(labels, 0)
+        label_weights = torch.cat(label_weights, 0)
+        bbox_targets = torch.cat(bbox_targets, 0)
+        bbox_weights = torch.cat(bbox_weights, 0)
+    return labels, label_weights, bbox_targets, bbox_weights
+
+
+def proposal_target_single(pos_proposals,
+                           neg_proposals,
+                           pos_gt_bboxes,
+                           pos_gt_labels,
+                           cfg,
+                           reg_num_classes=1,
+                           target_means=[.0, .0, .0, .0],
+                           target_stds=[1.0, 1.0, 1.0, 1.0]):
+    num_pos = pos_proposals.size(0)
+    num_neg = neg_proposals.size(0)
+    num_samples = num_pos + num_neg
+    labels = pos_proposals.new_zeros(num_samples, dtype=torch.long)
+    label_weights = pos_proposals.new_zeros(num_samples)
+    bbox_targets = pos_proposals.new_zeros(num_samples, 4)
+    bbox_weights = pos_proposals.new_zeros(num_samples, 4)
+    if num_pos > 0:
+        labels[:num_pos] = pos_gt_labels
+        pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight
+        label_weights[:num_pos] = pos_weight
+        pos_bbox_targets = bbox2delta(pos_proposals, pos_gt_bboxes,
+                                      target_means, target_stds)
+        bbox_targets[:num_pos, :] = pos_bbox_targets
+        bbox_weights[:num_pos, :] = 1
+    if num_neg > 0:
+        label_weights[-num_neg:] = 1.0
+    if reg_num_classes > 1:
+        bbox_targets, bbox_weights = expand_target(bbox_targets, bbox_weights,
+                                                   labels, reg_num_classes)
+
+    return labels, label_weights, bbox_targets, bbox_weights
+
+
+def expand_target(bbox_targets, bbox_weights, labels, num_classes):
+    bbox_targets_expand = bbox_targets.new_zeros((bbox_targets.size(0),
+                                                  4 * num_classes))
+    bbox_weights_expand = bbox_weights.new_zeros((bbox_weights.size(0),
+                                                  4 * num_classes))
+    for i in torch.nonzero(labels > 0).squeeze(-1):
+        start, end = labels[i] * 4, (labels[i] + 1) * 4
+        bbox_targets_expand[i, start:end] = bbox_targets[i, :]
+        bbox_weights_expand[i, start:end] = bbox_weights[i, :]
+    return bbox_targets_expand, bbox_weights_expand
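
Note: when reg_class_agnostic is False the bbox head predicts 4 deltas per class, so expand_target copies each positive sample's 4 targets into the column block of its own class. A small worked example with 3 classes:

    import torch

    from mmdet.core.bbox.bbox_target import expand_target

    bbox_targets = torch.tensor([[1., 2., 3., 4.],
                                 [5., 6., 7., 8.]])
    bbox_weights = torch.ones_like(bbox_targets)
    labels = torch.tensor([1, 2])

    targets, weights = expand_target(bbox_targets, bbox_weights, labels, num_classes=3)
    print(targets.shape)  # torch.Size([2, 12])
    print(targets[0])     # columns 4:8 hold [1, 2, 3, 4], all other columns are 0
    print(targets[1])     # columns 8:12 hold [5, 6, 7, 8]
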
diff --git a/mmdet/core/bbox/geometry.py b/mmdet/core/bbox/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..a852a06fb0c216569cf5f32385c356114c534904
--- /dev/null
+++ b/mmdet/core/bbox/geometry.py
@@ -0,0 +1,63 @@
+import torch
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
+    """Calculate overlap between two set of bboxes.
+
+    If ``is_aligned`` is ``False``, then calculate the ious between each bbox
+    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+    bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (Tensor): shape (m, 4)
+        bboxes2 (Tensor): shape (n, 4), if is_aligned is ``True``, then m and n
+            must be equal.
+        mode (str): "iou" (intersection over union) or iof (intersection over
+            foreground).
+
+    Returns:
+        ious (Tensor): shape (m, n) if is_aligned == False else shape (m, 1)
+    """
+
+    assert mode in ['iou', 'iof']
+
+    rows = bboxes1.size(0)
+    cols = bboxes2.size(0)
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols)
+
+    if is_aligned:
+        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
+        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
+
+        wh = (rb - lt + 1).clamp(min=0)  # [rows, 2]
+        overlap = wh[:, 0] * wh[:, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + 1)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + 1)
+            ious = overlap / (area1 + area2 - overlap)
+        else:
+            ious = overlap / area1
+    else:
+        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
+        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]
+
+        wh = (rb - lt + 1).clamp(min=0)  # [rows, cols, 2]
+        overlap = wh[:, :, 0] * wh[:, :, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + 1)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + 1)
+            ious = overlap / (area1[:, None] + area2 - overlap)
+        else:
+            ious = overlap / (area1[:, None])
+
+    return ious
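
Note: bbox_overlaps treats box coordinates as inclusive pixel indices, hence the +1 when computing widths and heights. A quick check with two boxes whose overlap is easy to verify by hand:

    import torch

    from mmdet.core.bbox import bbox_overlaps

    a = torch.tensor([[0., 0., 9., 9.]])     # 10 x 10 box, area 100
    b = torch.tensor([[5., 5., 14., 14.]])   # 10 x 10 box, area 100

    # the intersection is the 5 x 5 block [5..9] x [5..9] = 25 pixels
    print(bbox_overlaps(a, b))                   # 25 / (100 + 100 - 25) = 0.1429
    print(bbox_overlaps(a, b, mode='iof'))       # 25 / 100 = 0.25
    print(bbox_overlaps(a, b, is_aligned=True))  # aligned pair -> shape (1,)
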
diff --git a/mmdet/core/bbox/sampling.py b/mmdet/core/bbox/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..976cd9507f2279b663d3f5e09ed1180da5b457c1
--- /dev/null
+++ b/mmdet/core/bbox/sampling.py
@@ -0,0 +1,343 @@
+import numpy as np
+import torch
+
+from .geometry import bbox_overlaps
+
+
+def random_choice(gallery, num):
+    """Random select some elements from the gallery.
+
+    PyTorch's implementation seems to be slower than numpy's, so we use numpy
+    to randperm the indices.
+    """
+    assert len(gallery) >= num
+    if isinstance(gallery, list):
+        gallery = np.array(gallery)
+    cands = np.arange(len(gallery))
+    np.random.shuffle(cands)
+    rand_inds = cands[:num]
+    if not isinstance(gallery, np.ndarray):
+        rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device)
+    return gallery[rand_inds]
+
+
+def bbox_assign(proposals,
+                gt_bboxes,
+                gt_bboxes_ignore=None,
+                gt_labels=None,
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=.0,
+                crowd_thr=-1):
+    """Assign a corresponding gt bbox or background to each proposal/anchor.
+
+    Each proposal will be assigned `-1`, `0`, or a positive integer.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    If `gt_bboxes_ignore` is specified, bboxes which have iof (intersection
+    over foreground) with `gt_bboxes_ignore` above `crowd_thr` will be ignored.
+
+    Args:
+        proposals (Tensor): Proposals or RPN anchors, shape (n, 4).
+        gt_bboxes (Tensor): Ground truth bboxes, shape (k, 4).
+        gt_bboxes_ignore (Tensor, optional): shape(m, 4).
+        gt_labels (Tensor, optional): shape (k, ).
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum IoU for a bbox to be considered as a
+            positive bbox. For RPN, it is usually set to 0.3; for Fast R-CNN,
+            it is usually set to the same value as pos_iou_thr.
+        crowd_thr (float): IoF threshold for ignoring bboxes. Negative value
+            for not ignoring any bboxes.
+
+    Returns:
+        tuple: (assigned_gt_inds, argmax_overlaps, max_overlaps), shape (n, )
+    """
+
+    # calculate overlaps between the proposals and the gt boxes
+    overlaps = bbox_overlaps(proposals, gt_bboxes)
+    if overlaps.numel() == 0:
+        raise ValueError('No gt bbox or proposals')
+
+    # ignore proposals according to crowd bboxes
+    if (crowd_thr > 0) and (gt_bboxes_ignore is
+                            not None) and (gt_bboxes_ignore.numel() > 0):
+        crowd_overlaps = bbox_overlaps(proposals, gt_bboxes_ignore, mode='iof')
+        crowd_max_overlaps, _ = crowd_overlaps.max(dim=1)
+        crowd_bboxes_inds = torch.nonzero(
+            crowd_max_overlaps > crowd_thr).long()
+        if crowd_bboxes_inds.numel() > 0:
+            overlaps[crowd_bboxes_inds, :] = -1
+
+    return bbox_assign_wrt_overlaps(overlaps, gt_labels, pos_iou_thr,
+                                    neg_iou_thr, min_pos_iou)
+
+
+def bbox_assign_wrt_overlaps(overlaps,
+                             gt_labels=None,
+                             pos_iou_thr=0.5,
+                             neg_iou_thr=0.5,
+                             min_pos_iou=.0):
+    """Assign a corresponding gt bbox or background to each proposal/anchor.
+
+    This method assigns a gt bbox to every proposal. Each proposal will be
+    assigned -1, 0, or a positive number: -1 means don't care, 0 means a
+    negative sample, and a positive number is the 1-based index of the
+    assigned gt. The assignment is done in the following steps, and the
+    order matters:
+
+    1. assign every anchor to -1
+    2. assign proposals whose iou with all gts < neg_iou_thr to 0
+    3. for each anchor, if the iou with its nearest gt >= pos_iou_thr,
+    assign it to that bbox
+    4. for each gt bbox, assign its nearest proposals (may be more than one)
+    to itself
+
+    Args:
+        overlaps (Tensor): Overlaps between n proposals and k gt_bboxes,
+            shape(n, k).
+        gt_labels (Tensor, optional): Labels of k gt_bboxes, shape (k, ).
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum IoU for a bbox to be considered as a
+            positive bbox. This argument only affects the 4th step.
+
+    Returns:
+        tuple: (assigned_gt_inds, [assigned_labels], argmax_overlaps,
+            max_overlaps), shape (n, )
+    """
+    num_bboxes, num_gts = overlaps.size(0), overlaps.size(1)
+    # 1. assign -1 by default
+    assigned_gt_inds = overlaps.new(num_bboxes).long().fill_(-1)
+
+    if overlaps.numel() == 0:
+        raise ValueError('No gt bbox or proposals')
+
+    assert overlaps.size() == (num_bboxes, num_gts)
+    # for each anchor, which gt best overlaps with it
+    # for each anchor, the max iou of all gts
+    max_overlaps, argmax_overlaps = overlaps.max(dim=1)
+    # for each gt, which anchor best overlaps with it
+    # for each gt, the max iou of all proposals
+    gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=0)
+
+    # 2. assign negative: below
+    if isinstance(neg_iou_thr, float):
+        assigned_gt_inds[(max_overlaps >= 0)
+                         & (max_overlaps < neg_iou_thr)] = 0
+    elif isinstance(neg_iou_thr, tuple):
+        assert len(neg_iou_thr) == 2
+        assigned_gt_inds[(max_overlaps >= neg_iou_thr[0])
+                         & (max_overlaps < neg_iou_thr[1])] = 0
+
+    # 3. assign positive: above positive IoU threshold
+    pos_inds = max_overlaps >= pos_iou_thr
+    assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
+
+    # 4. assign fg: for each gt, proposals with highest IoU
+    for i in range(num_gts):
+        if gt_max_overlaps[i] >= min_pos_iou:
+            assigned_gt_inds[overlaps[:, i] == gt_max_overlaps[i]] = i + 1
+
+    if gt_labels is None:
+        return assigned_gt_inds, argmax_overlaps, max_overlaps
+    else:
+        assigned_labels = assigned_gt_inds.new(num_bboxes).fill_(0)
+        pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze()
+        if pos_inds.numel() > 0:
+            assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] -
+                                                  1]
+        return assigned_gt_inds, assigned_labels, argmax_overlaps, max_overlaps
+
+
+def bbox_sampling_pos(assigned_gt_inds, num_expected, balance_sampling=True):
+    """Balance sampling for positive bboxes/anchors.
+
+    1. calculate the average number of positives for each gt: num_per_gt
+    2. sample at most num_per_gt positives for each gt
+    3. randomly sample from the remaining anchors if there are not enough fg
+    """
+    pos_inds = torch.nonzero(assigned_gt_inds > 0)
+    if pos_inds.numel() != 0:
+        pos_inds = pos_inds.squeeze(1)
+    if pos_inds.numel() <= num_expected:
+        return pos_inds
+    elif not balance_sampling:
+        return random_choice(pos_inds, num_expected)
+    else:
+        unique_gt_inds = torch.unique(assigned_gt_inds[pos_inds].cpu())
+        num_gts = len(unique_gt_inds)
+        num_per_gt = int(round(num_expected / float(num_gts)) + 1)
+        sampled_inds = []
+        for i in unique_gt_inds:
+            inds = torch.nonzero(assigned_gt_inds == i.item())
+            if inds.numel() != 0:
+                inds = inds.squeeze(1)
+            else:
+                continue
+            if len(inds) > num_per_gt:
+                inds = random_choice(inds, num_per_gt)
+            sampled_inds.append(inds)
+        sampled_inds = torch.cat(sampled_inds)
+        if len(sampled_inds) < num_expected:
+            num_extra = num_expected - len(sampled_inds)
+            extra_inds = np.array(
+                list(set(pos_inds.cpu()) - set(sampled_inds.cpu())))
+            if len(extra_inds) > num_extra:
+                extra_inds = random_choice(extra_inds, num_extra)
+            extra_inds = torch.from_numpy(extra_inds).to(
+                assigned_gt_inds.device).long()
+            sampled_inds = torch.cat([sampled_inds, extra_inds])
+        elif len(sampled_inds) > num_expected:
+            sampled_inds = random_choice(sampled_inds, num_expected)
+        return sampled_inds
+
+
+def bbox_sampling_neg(assigned_gt_inds,
+                      num_expected,
+                      max_overlaps=None,
+                      balance_thr=0,
+                      hard_fraction=0.5):
+    """Balance sampling for negative bboxes/anchors.
+
+    Negative samples are split into two sets: hard (balance_thr <= iou <
+    neg_iou_thr) and easy (iou < balance_thr). The sampling ratio is
+    controlled by `hard_fraction`.
+    """
+    neg_inds = torch.nonzero(assigned_gt_inds == 0)
+    if neg_inds.numel() != 0:
+        neg_inds = neg_inds.squeeze(1)
+    if len(neg_inds) <= num_expected:
+        return neg_inds
+    elif balance_thr <= 0:
+        # uniform sampling among all negative samples
+        return random_choice(neg_inds, num_expected)
+    else:
+        assert max_overlaps is not None
+        max_overlaps = max_overlaps.cpu().numpy()
+        # balance sampling for negative samples
+        neg_set = set(neg_inds.cpu().numpy())
+        easy_set = set(
+            np.where(
+                np.logical_and(max_overlaps >= 0,
+                               max_overlaps < balance_thr))[0])
+        hard_set = set(np.where(max_overlaps >= balance_thr)[0])
+        easy_neg_inds = list(easy_set & neg_set)
+        hard_neg_inds = list(hard_set & neg_set)
+
+        num_expected_hard = int(num_expected * hard_fraction)
+        if len(hard_neg_inds) > num_expected_hard:
+            sampled_hard_inds = random_choice(hard_neg_inds, num_expected_hard)
+        else:
+            sampled_hard_inds = np.array(hard_neg_inds, dtype=np.int)
+        num_expected_easy = num_expected - len(sampled_hard_inds)
+        if len(easy_neg_inds) > num_expected_easy:
+            sampled_easy_inds = random_choice(easy_neg_inds, num_expected_easy)
+        else:
+            sampled_easy_inds = np.array(easy_neg_inds, dtype=np.int)
+        sampled_inds = np.concatenate((sampled_easy_inds, sampled_hard_inds))
+        if len(sampled_inds) < num_expected:
+            num_extra = num_expected - len(sampled_inds)
+            extra_inds = np.array(list(neg_set - set(sampled_inds)))
+            if len(extra_inds) > num_extra:
+                extra_inds = random_choice(extra_inds, num_extra)
+            sampled_inds = np.concatenate((sampled_inds, extra_inds))
+        sampled_inds = torch.from_numpy(sampled_inds).long().to(
+            assigned_gt_inds.device)
+        return sampled_inds
+
+
+def bbox_sampling(assigned_gt_inds,
+                  num_expected,
+                  pos_fraction,
+                  neg_pos_ub,
+                  pos_balance_sampling=True,
+                  max_overlaps=None,
+                  neg_balance_thr=0,
+                  neg_hard_fraction=0.5):
+    """Sample positive and negative bboxes given assigned results.
+
+    Args:
+        assigned_gt_inds (Tensor): Assigned gt indices for each bbox.
+        num_expected (int): Expected total samples (pos and neg).
+        pos_fraction (float): Positive sample fraction.
+        neg_pos_ub (float): Negative/Positive upper bound.
+        pos_balance_sampling (bool): Whether to sample positive samples around
+            each gt bbox evenly.
+        max_overlaps (Tensor, optional): For each bbox, the max IoU of all gts.
+            Used for negative balance sampling only.
+        neg_balance_thr (float, optional): IoU threshold for simple/hard
+            negative balance sampling.
+        neg_hard_fraction (float, optional): Fraction of hard negative samples
+            for negative balance sampling.
+
+    Returns:
+        tuple[Tensor]: positive bbox indices, negative bbox indices.
+    """
+    num_expected_pos = int(num_expected * pos_fraction)
+    pos_inds = bbox_sampling_pos(assigned_gt_inds, num_expected_pos,
+                                 pos_balance_sampling)
+    # We found that sampled indices have duplicated items occasionally.
+    # (may be a bug of PyTorch)
+    pos_inds = pos_inds.unique()
+    num_sampled_pos = pos_inds.numel()
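+    # cap the number of negatives at neg_pos_ub times the number of sampled
+    # positives (or at neg_pos_ub itself if no positive was sampled)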
+    num_neg_max = int(
+        neg_pos_ub *
+        num_sampled_pos) if num_sampled_pos > 0 else int(neg_pos_ub)
+    num_expected_neg = min(num_neg_max, num_expected - num_sampled_pos)
+    neg_inds = bbox_sampling_neg(assigned_gt_inds, num_expected_neg,
+                                 max_overlaps, neg_balance_thr,
+                                 neg_hard_fraction)
+    neg_inds = neg_inds.unique()
+    return pos_inds, neg_inds
+
+
+def sample_bboxes(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels, cfg):
+    """Sample positive and negative bboxes.
+
+    This is a simple implementation of bbox sampling given candidates and
+    ground truth bboxes, which includes 3 steps.
+
+    1. Assign gt to each bbox.
+    2. Add gt bboxes to the sampling pool (optional).
+    3. Perform positive and negative sampling.
+
+    Args:
+        bboxes (Tensor): Boxes to be sampled from.
+        gt_bboxes (Tensor): Ground truth bboxes.
+        gt_bboxes_ignore (Tensor): Ignored ground truth bboxes. In MS COCO,
+            `crowd` bboxes are considered as ignored.
+        gt_labels (Tensor): Class labels of ground truth bboxes.
+        cfg (dict): Sampling configs.
+
+    Returns:
+        tuple[Tensor]: pos_bboxes, neg_bboxes, pos_assigned_gt_inds,
+            pos_gt_bboxes, pos_gt_labels
+    """
+    bboxes = bboxes[:, :4]
+    assigned_gt_inds, assigned_labels, argmax_overlaps, max_overlaps = \
+        bbox_assign(bboxes, gt_bboxes, gt_bboxes_ignore, gt_labels,
+                    cfg.pos_iou_thr, cfg.neg_iou_thr, cfg.min_pos_iou,
+                    cfg.crowd_thr)
+
+    if cfg.add_gt_as_proposals:
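+        # prepend gt boxes to the candidate pool and assign each of them to
+        # itself (1-based index) so that they can be sampled as positives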
+        bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+        gt_assign_self = torch.arange(
+            1, len(gt_labels) + 1, dtype=torch.long, device=bboxes.device)
+        assigned_gt_inds = torch.cat([gt_assign_self, assigned_gt_inds])
+        assigned_labels = torch.cat([gt_labels, assigned_labels])
+
+    pos_inds, neg_inds = bbox_sampling(
+        assigned_gt_inds, cfg.roi_batch_size, cfg.pos_fraction, cfg.neg_pos_ub,
+        cfg.pos_balance_sampling, max_overlaps, cfg.neg_balance_thr)
+
+    pos_bboxes = bboxes[pos_inds]
+    neg_bboxes = bboxes[neg_inds]
+    pos_assigned_gt_inds = assigned_gt_inds[pos_inds] - 1
+    pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
+    pos_gt_labels = assigned_labels[pos_inds]
+
+    return (pos_bboxes, neg_bboxes, pos_assigned_gt_inds, pos_gt_bboxes,
+            pos_gt_labels)
diff --git a/mmdet/core/bbox/transforms.py b/mmdet/core/bbox/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d8f6f44f20df5c019dc8ed9ea46c2eb6c411c66
--- /dev/null
+++ b/mmdet/core/bbox/transforms.py
@@ -0,0 +1,156 @@
+import mmcv
+import numpy as np
+import torch
+
+
+def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
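+    # encode gt boxes as normalized (dx, dy, dw, dh) regression targets
+    # w.r.t. the proposals (center offsets and log-scale size ratios)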
+    assert proposals.size() == gt.size()
+
+    proposals = proposals.float()
+    gt = gt.float()
+    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
+    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
+    pw = proposals[..., 2] - proposals[..., 0] + 1.0
+    ph = proposals[..., 3] - proposals[..., 1] + 1.0
+
+    gx = (gt[..., 0] + gt[..., 2]) * 0.5
+    gy = (gt[..., 1] + gt[..., 3]) * 0.5
+    gw = gt[..., 2] - gt[..., 0] + 1.0
+    gh = gt[..., 3] - gt[..., 1] + 1.0
+
+    dx = (gx - px) / pw
+    dy = (gy - py) / ph
+    dw = torch.log(gw / pw)
+    dh = torch.log(gh / ph)
+    deltas = torch.stack([dx, dy, dw, dh], dim=-1)
+
+    means = deltas.new_tensor(means).unsqueeze(0)
+    stds = deltas.new_tensor(stds).unsqueeze(0)
+    deltas = deltas.sub_(means).div_(stds)
+
+    return deltas
+
+
+def delta2bbox(rois,
+               deltas,
+               means=[0, 0, 0, 0],
+               stds=[1, 1, 1, 1],
+               max_shape=None,
+               wh_ratio_clip=16 / 1000):
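+    # inverse of bbox2delta: decode (dx, dy, dw, dh) predictions into boxes,
+    # clipping dw/dh to avoid exp() overflow and optionally clipping boxes to
+    # the image boundary given by max_shape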
+    means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
+    stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[:, 0::4]
+    dy = denorm_deltas[:, 1::4]
+    dw = denorm_deltas[:, 2::4]
+    dh = denorm_deltas[:, 3::4]
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    dw = dw.clamp(min=-max_ratio, max=max_ratio)
+    dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
+    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
+    pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
+    ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    gx = torch.addcmul(px, 1, pw, dx)  # gx = px + pw * dx
+    gy = torch.addcmul(py, 1, ph, dy)  # gy = py + ph * dy
+    x1 = gx - gw * 0.5 + 0.5
+    y1 = gy - gh * 0.5 + 0.5
+    x2 = gx + gw * 0.5 - 0.5
+    y2 = gy + gh * 0.5 - 0.5
+    if max_shape is not None:
+        x1 = x1.clamp(min=0, max=max_shape[1] - 1)
+        y1 = y1.clamp(min=0, max=max_shape[0] - 1)
+        x2 = x2.clamp(min=0, max=max_shape[1] - 1)
+        y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
+    return bboxes
+
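+# illustrative sketch (not part of the original code): encoding a box w.r.t.
+# a proposal and decoding it again recovers the box, e.g.
+#   p = torch.tensor([[0., 0., 99., 99.]])    # proposal
+#   g = torch.tensor([[10., 10., 89., 89.]])  # ground truth
+#   d = bbox2delta(p, g)                      # regression target
+#   delta2bbox(p, d)                          # ~= g
+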
+
+def bbox_flip(bboxes, img_shape):
+    """Flip bboxes horizontally.
+
+    Args:
+        bboxes(Tensor or ndarray): Shape (..., 4*k)
+        img_shape(tuple): Image shape.
+
+    Returns:
+        Same type as `bboxes`: Flipped bboxes.
+    """
+    if isinstance(bboxes, torch.Tensor):
+        assert bboxes.shape[-1] % 4 == 0
+        flipped = bboxes.clone()
+        flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] - 1
+        flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] - 1
+        return flipped
+    elif isinstance(bboxes, np.ndarray):
+        return mmcv.bbox_flip(bboxes, img_shape)
+
+
+def bbox_mapping(bboxes, img_shape, scale_factor, flip):
+    """Map bboxes from the original image scale to testing scale"""
+    new_bboxes = bboxes * scale_factor
+    if flip:
+        new_bboxes = bbox_flip(new_bboxes, img_shape)
+    return new_bboxes
+
+
+def bbox_mapping_back(bboxes, img_shape, scale_factor, flip):
+    """Map bboxes from testing scale to original image scale"""
+    new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes
+    new_bboxes = new_bboxes / scale_factor
+    return new_bboxes
+
+
+def bbox2roi(bbox_list):
+    """Convert a list of bboxes to roi format.
+
+    Args:
+        bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+            of images.
+
+    Returns:
+        Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+    """
+    rois_list = []
+    for img_id, bboxes in enumerate(bbox_list):
+        if bboxes.size(0) > 0:
+            img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+            rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+        else:
+            rois = bboxes.new_zeros((0, 5))
+        rois_list.append(rois)
+    rois = torch.cat(rois_list, 0)
+    return rois
+
+
+def roi2bbox(rois):
+    bbox_list = []
+    img_ids = torch.unique(rois[:, 0].cpu(), sorted=True)
+    for img_id in img_ids:
+        inds = (rois[:, 0] == img_id.item())
+        bbox = rois[inds, 1:]
+        bbox_list.append(bbox)
+    return bbox_list
+
+
+def bbox2result(bboxes, labels, num_classes):
+    """Convert detection results to a list of numpy arrays.
+
+    Args:
+        bboxes (Tensor): shape (n, 5)
+        labels (Tensor): shape (n, )
+        num_classes (int): class number, including background class
+
+    Returns:
+        list(ndarray): bbox results of each class
+    """
+    if bboxes.shape[0] == 0:
+        return [
+            np.zeros((0, 5), dtype=np.float32) for i in range(num_classes - 1)
+        ]
+    else:
+        bboxes = bboxes.cpu().numpy()
+        labels = labels.cpu().numpy()
+        return [bboxes[labels == i, :] for i in range(num_classes - 1)]
diff --git a/mmdet/core/evaluation/__init__.py b/mmdet/core/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..026234fce3198fe410143d9e1578cc384005c0d4
--- /dev/null
+++ b/mmdet/core/evaluation/__init__.py
@@ -0,0 +1,18 @@
+from .class_names import (voc_classes, imagenet_det_classes,
+                          imagenet_vid_classes, coco_classes, dataset_aliases,
+                          get_classes)
+from .coco_utils import coco_eval, fast_eval_recall, results2json
+from .eval_hooks import (DistEvalHook, CocoDistEvalRecallHook,
+                         CocoDistEvalmAPHook)
+from .mean_ap import average_precision, eval_map, print_map_summary
+from .recall import (eval_recalls, print_recall_summary, plot_num_recall,
+                     plot_iou_recall)
+
+__all__ = [
+    'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes',
+    'coco_classes', 'dataset_aliases', 'get_classes', 'coco_eval',
+    'fast_eval_recall', 'results2json', 'DistEvalHook',
+    'CocoDistEvalRecallHook', 'CocoDistEvalmAPHook', 'average_precision',
+    'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary',
+    'plot_num_recall', 'plot_iou_recall'
+]
diff --git a/mmdet/core/evaluation/bbox_overlaps.py b/mmdet/core/evaluation/bbox_overlaps.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad4c70523fdaa5d89a2b80ada559e1822d0ecd22
--- /dev/null
+++ b/mmdet/core/evaluation/bbox_overlaps.py
@@ -0,0 +1,49 @@
+import numpy as np
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
+    """Calculate the ious between each bbox of bboxes1 and bboxes2.
+
+    Args:
+        bboxes1(ndarray): shape (n, 4)
+        bboxes2(ndarray): shape (k, 4)
+        mode(str): iou (intersection over union) or iof (intersection
+            over foreground)
+
+    Returns:
+        ious(ndarray): shape (n, k)
+    """
+
+    assert mode in ['iou', 'iof']
+
+    bboxes1 = bboxes1.astype(np.float32)
+    bboxes2 = bboxes2.astype(np.float32)
+    rows = bboxes1.shape[0]
+    cols = bboxes2.shape[0]
+    ious = np.zeros((rows, cols), dtype=np.float32)
+    if rows * cols == 0:
+        return ious
+    exchange = False
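+    # iterate over the smaller set of boxes for efficiency; if the inputs are
+    # swapped here, the result is transposed back before returning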
+    if bboxes1.shape[0] > bboxes2.shape[0]:
+        bboxes1, bboxes2 = bboxes2, bboxes1
+        ious = np.zeros((cols, rows), dtype=np.float32)
+        exchange = True
+    area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
+        bboxes1[:, 3] - bboxes1[:, 1] + 1)
+    area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
+        bboxes2[:, 3] - bboxes2[:, 1] + 1)
+    for i in range(bboxes1.shape[0]):
+        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
+        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
+        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
+        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
+        overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum(
+            y_end - y_start + 1, 0)
+        if mode == 'iou':
+            union = area1[i] + area2 - overlap
+        else:
+            union = area1[i] if not exchange else area2
+        ious[i, :] = overlap / union
+    if exchange:
+        ious = ious.T
+    return ious
diff --git a/mmdet/core/evaluation/class_names.py b/mmdet/core/evaluation/class_names.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f806315b7c6ef47419efa61e38d2f7ec3ebd2a
--- /dev/null
+++ b/mmdet/core/evaluation/class_names.py
@@ -0,0 +1,103 @@
+import mmcv
+
+
+def voc_classes():
+    return [
+        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
+        'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
+        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+    ]
+
+
+def imagenet_det_classes():
+    return [
+        'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
+        'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
+        'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
+        'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
+        'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
+        'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
+        'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
+        'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
+        'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
+        'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
+        'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
+        'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
+        'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
+        'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
+        'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
+        'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
+        'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
+        'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
+        'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
+        'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
+        'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
+        'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
+        'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
+        'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
+        'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
+        'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
+        'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
+        'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
+        'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
+        'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
+        'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
+        'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
+        'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
+        'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
+        'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
+        'whale', 'wine_bottle', 'zebra'
+    ]
+
+
+def imagenet_vid_classes():
+    return [
+        'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
+        'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
+        'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
+        'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
+        'watercraft', 'whale', 'zebra'
+    ]
+
+
+def coco_classes():
+    return [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+    ]
+
+
+dataset_aliases = {
+    'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+    'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+    'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+    'coco': ['coco', 'mscoco', 'ms_coco']
+}
+
+
+def get_classes(dataset):
+    """Get class names of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if mmcv.is_str(dataset):
+        if dataset in alias2name:
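+            # resolve the alias to one of the *_classes() functions above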
+            labels = eval(alias2name[dataset] + '_classes()')
+        else:
+            raise ValueError('Unrecognized dataset: {}'.format(dataset))
+    else:
+        raise TypeError('dataset must a str, but got {}'.format(type(dataset)))
+    return labels
diff --git a/mmdet/core/evaluation/coco_utils.py b/mmdet/core/evaluation/coco_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ed056b2e6dfae4c379c3ac817f89a91607aacf3
--- /dev/null
+++ b/mmdet/core/evaluation/coco_utils.py
@@ -0,0 +1,149 @@
+import mmcv
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from .recall import eval_recalls
+
+
+def coco_eval(result_file, result_types, coco, max_dets=(100, 300, 1000)):
+    for res_type in result_types:
+        assert res_type in [
+            'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'
+        ]
+
+    if mmcv.is_str(coco):
+        coco = COCO(coco)
+    assert isinstance(coco, COCO)
+
+    if result_types == ['proposal_fast']:
+        ar = fast_eval_recall(result_file, coco, np.array(max_dets))
+        for i, num in enumerate(max_dets):
+            print('AR@{}\t= {:.4f}'.format(num, ar[i]))
+        return
+
+    assert result_file.endswith('.json')
+    coco_dets = coco.loadRes(result_file)
+
+    img_ids = coco.getImgIds()
+    for res_type in result_types:
+        iou_type = 'bbox' if res_type == 'proposal' else res_type
+        cocoEval = COCOeval(coco, coco_dets, iou_type)
+        cocoEval.params.imgIds = img_ids
+        if res_type == 'proposal':
+            cocoEval.params.useCats = 0
+            cocoEval.params.maxDets = list(max_dets)
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        cocoEval.summarize()
+
+
+def fast_eval_recall(results,
+                     coco,
+                     max_dets,
+                     iou_thrs=np.arange(0.5, 0.96, 0.05)):
+    if mmcv.is_str(results):
+        assert results.endswith('.pkl')
+        results = mmcv.load(results)
+    elif not isinstance(results, list):
+        raise TypeError(
+            'results must be a list of numpy arrays or a filename, not {}'.
+            format(type(results)))
+
+    gt_bboxes = []
+    img_ids = coco.getImgIds()
+    for i in range(len(img_ids)):
+        ann_ids = coco.getAnnIds(imgIds=img_ids[i])
+        ann_info = coco.loadAnns(ann_ids)
+        if len(ann_info) == 0:
+            gt_bboxes.append(np.zeros((0, 4)))
+            continue
+        bboxes = []
+        for ann in ann_info:
+            if ann.get('ignore', False) or ann['iscrowd']:
+                continue
+            x1, y1, w, h = ann['bbox']
+            bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1])
+        bboxes = np.array(bboxes, dtype=np.float32)
+        if bboxes.shape[0] == 0:
+            bboxes = np.zeros((0, 4))
+        gt_bboxes.append(bboxes)
+
+    recalls = eval_recalls(
+        gt_bboxes, results, max_dets, iou_thrs, print_summary=False)
+    ar = recalls.mean(axis=1)
+    return ar
+
+
+def xyxy2xywh(bbox):
+    _bbox = bbox.tolist()
+    return [
+        _bbox[0],
+        _bbox[1],
+        _bbox[2] - _bbox[0] + 1,
+        _bbox[3] - _bbox[1] + 1,
+    ]
+
+
+def proposal2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        bboxes = results[idx]
+        for i in range(bboxes.shape[0]):
+            data = dict()
+            data['image_id'] = img_id
+            data['bbox'] = xyxy2xywh(bboxes[i])
+            data['score'] = float(bboxes[i][4])
+            data['category_id'] = 1
+            json_results.append(data)
+    return json_results
+
+
+def det2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        result = results[idx]
+        for label in range(len(result)):
+            bboxes = result[label]
+            for i in range(bboxes.shape[0]):
+                data = dict()
+                data['image_id'] = img_id
+                data['bbox'] = xyxy2xywh(bboxes[i])
+                data['score'] = float(bboxes[i][4])
+                data['category_id'] = dataset.cat_ids[label]
+                json_results.append(data)
+    return json_results
+
+
+def segm2json(dataset, results):
+    json_results = []
+    for idx in range(len(dataset)):
+        img_id = dataset.img_ids[idx]
+        det, seg = results[idx]
+        for label in range(len(det)):
+            bboxes = det[label]
+            segms = seg[label]
+            for i in range(bboxes.shape[0]):
+                data = dict()
+                data['image_id'] = img_id
+                data['bbox'] = xyxy2xywh(bboxes[i])
+                data['score'] = float(bboxes[i][4])
+                data['category_id'] = dataset.cat_ids[label]
+                segms[i]['counts'] = segms[i]['counts'].decode()
+                data['segmentation'] = segms[i]
+                json_results.append(data)
+    return json_results
+
+
+def results2json(dataset, results, out_file):
+    if isinstance(results[0], list):
+        json_results = det2json(dataset, results)
+    elif isinstance(results[0], tuple):
+        json_results = segm2json(dataset, results)
+    elif isinstance(results[0], np.ndarray):
+        json_results = proposal2json(dataset, results)
+    else:
+        raise TypeError('invalid type of results')
+    mmcv.dump(json_results, out_file)
diff --git a/mmdet/core/evaluation/eval_hooks.py b/mmdet/core/evaluation/eval_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..1402f7f3c73152eae7b5c1129d2c298d36b2ad45
--- /dev/null
+++ b/mmdet/core/evaluation/eval_hooks.py
@@ -0,0 +1,146 @@
+import os
+import os.path as osp
+import shutil
+import time
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.runner import Hook, obj_from_dict
+from mmcv.parallel import scatter, collate
+from pycocotools.cocoeval import COCOeval
+from torch.utils.data import Dataset
+
+from .coco_utils import results2json, fast_eval_recall
+from mmdet import datasets
+
+
+class DistEvalHook(Hook):
+
+    def __init__(self, dataset, interval=1):
+        if isinstance(dataset, Dataset):
+            self.dataset = dataset
+        elif isinstance(dataset, dict):
+            self.dataset = obj_from_dict(dataset, datasets,
+                                         {'test_mode': True})
+        else:
+            raise TypeError(
+                'dataset must be a Dataset object or a dict, not {}'.format(
+                    type(dataset)))
+        self.interval = interval
+        self.lock_dir = None
+
+    def _barrier(self, rank, world_size):
+        """Due to some issues with `torch.distributed.barrier()`, we have to
+        implement this ugly barrier function.
+        """
+        if rank == 0:
+            for i in range(1, world_size):
+                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
+                while not (osp.exists(tmp)):
+                    time.sleep(1)
+            for i in range(1, world_size):
+                tmp = osp.join(self.lock_dir, '{}.pkl'.format(i))
+                os.remove(tmp)
+        else:
+            tmp = osp.join(self.lock_dir, '{}.pkl'.format(rank))
+            mmcv.dump([], tmp)
+            while osp.exists(tmp):
+                time.sleep(1)
+
+    def before_run(self, runner):
+        self.lock_dir = osp.join(runner.work_dir, '.lock_map_hook')
+        if runner.rank == 0:
+            if osp.exists(self.lock_dir):
+                shutil.rmtree(self.lock_dir)
+            mmcv.mkdir_or_exist(self.lock_dir)
+
+    def after_run(self, runner):
+        if runner.rank == 0:
+            shutil.rmtree(self.lock_dir)
+
+    def after_train_epoch(self, runner):
+        if not self.every_n_epochs(runner, self.interval):
+            return
+        runner.model.eval()
+        results = [None for _ in range(len(self.dataset))]
+        prog_bar = mmcv.ProgressBar(len(self.dataset))
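+        # each rank evaluates a strided subset of the dataset
+        # (indices rank, rank + world_size, rank + 2 * world_size, ...)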
+        for idx in range(runner.rank, len(self.dataset), runner.world_size):
+            data = self.dataset[idx]
+            data_gpu = scatter(
+                collate([data], samples_per_gpu=1),
+                [torch.cuda.current_device()])[0]
+
+            # compute output
+            with torch.no_grad():
+                result = runner.model(
+                    return_loss=False, rescale=True, **data_gpu)
+            results[idx] = result
+
+            batch_size = runner.world_size
+            for _ in range(batch_size):
+                prog_bar.update()
+
+        if runner.rank == 0:
+            print('\n')
+            self._barrier(runner.rank, runner.world_size)
+            for i in range(1, runner.world_size):
+                tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
+                tmp_results = mmcv.load(tmp_file)
+                for idx in range(i, len(results), runner.world_size):
+                    results[idx] = tmp_results[idx]
+                os.remove(tmp_file)
+            self.evaluate(runner, results)
+        else:
+            tmp_file = osp.join(runner.work_dir,
+                                'temp_{}.pkl'.format(runner.rank))
+            mmcv.dump(results, tmp_file)
+            self._barrier(runner.rank, runner.world_size)
+        self._barrier(runner.rank, runner.world_size)
+
+    def evaluate(self, runner, results):
+        raise NotImplementedError
+
+
+class CocoDistEvalRecallHook(DistEvalHook):
+
+    def __init__(self,
+                 dataset,
+                 proposal_nums=(100, 300, 1000),
+                 iou_thrs=np.arange(0.5, 0.96, 0.05)):
+        super(CocoDistEvalRecallHook, self).__init__(dataset)
+        self.proposal_nums = np.array(proposal_nums, dtype=np.int32)
+        self.iou_thrs = np.array(iou_thrs, dtype=np.float32)
+
+    def evaluate(self, runner, results):
+        # the official coco evaluation is too slow, here we use our own
+        # implementation instead, which may get slightly different results
+        ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums,
+                              self.iou_thrs)
+        for i, num in enumerate(self.proposal_nums):
+            runner.log_buffer.output['AR@{}'.format(num)] = ar[i]
+        runner.log_buffer.ready = True
+
+
+class CocoDistEvalmAPHook(DistEvalHook):
+
+    def evaluate(self, runner, results):
+        tmp_file = osp.join(runner.work_dir, 'temp_0.json')
+        results2json(self.dataset, results, tmp_file)
+
+        res_types = ['bbox',
+                     'segm'] if runner.model.module.with_mask else ['bbox']
+        cocoGt = self.dataset.coco
+        cocoDt = cocoGt.loadRes(tmp_file)
+        imgIds = cocoGt.getImgIds()
+        for res_type in res_types:
+            iou_type = res_type
+            cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
+            cocoEval.params.imgIds = imgIds
+            cocoEval.evaluate()
+            cocoEval.accumulate()
+            cocoEval.summarize()
+            field = '{}_mAP'.format(res_type)
+            runner.log_buffer.output[field] = cocoEval.stats[0]
+        runner.log_buffer.ready = True
+        os.remove(tmp_file)
diff --git a/mmdet/core/evaluation/mean_ap.py b/mmdet/core/evaluation/mean_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f47c1368af0e3385bc8e49cc5d35b99726ce722
--- /dev/null
+++ b/mmdet/core/evaluation/mean_ap.py
@@ -0,0 +1,374 @@
+import numpy as np
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
+
+    Args:
+        recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
+        mode (str): 'area' or '11points', 'area' means calculating the area
+            under precision-recall curve, '11points' means calculating
+            the average precision of recalls at [0, 0.1, ..., 1]
+
+    Returns:
+        float or ndarray: calculated average precision
+    """
+    no_scale = False
+    if recalls.ndim == 1:
+        no_scale = True
+        recalls = recalls[np.newaxis, :]
+        precisions = precisions[np.newaxis, :]
+    assert recalls.shape == precisions.shape and recalls.ndim == 2
+    num_scales = recalls.shape[0]
+    ap = np.zeros(num_scales, dtype=np.float32)
+    if mode == 'area':
+        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
+        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
+        mrec = np.hstack((zeros, recalls, ones))
+        mpre = np.hstack((zeros, precisions, zeros))
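+        # make the precision envelope monotonically non-increasing, then sum
+        # the areas of the rectangles where recall changes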
+        for i in range(mpre.shape[1] - 1, 0, -1):
+            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
+        for i in range(num_scales):
+            ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
+            ap[i] = np.sum(
+                (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
+    elif mode == '11points':
+        for i in range(num_scales):
+            for thr in np.arange(0, 1 + 1e-3, 0.1):
+                precs = precisions[i, recalls[i, :] >= thr]
+                prec = precs.max() if precs.size > 0 else 0
+                ap[i] += prec
+            ap[i] /= 11
+    else:
+        raise ValueError(
+            'Unrecognized mode, only "area" and "11points" are supported')
+    if no_scale:
+        ap = ap[0]
+    return ap
+
+
+def tpfp_imagenet(det_bboxes,
+                  gt_bboxes,
+                  gt_ignore,
+                  default_iou_thr,
+                  area_ranges=None):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bboxes (ndarray): detected bboxes of this image, of shape (m, 5)
+        gt_bboxes (ndarray): ground truth bboxes of this image
+        gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
+        default_iou_thr (float): the iou threshold for medium and large bboxes
+        area_ranges (list or None): gt bbox area ranges
+
+    Returns:
+        tuple: two arrays (tp, fp) whose elements are 0 and 1
+    """
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp
+    # of a certain scale.
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
+                det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+    ious = bbox_overlaps(det_bboxes, gt_bboxes - 1)
+    gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1
+    gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1
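+    # ImageNet DET uses a size-adaptive IoU threshold: small gt boxes get a
+    # threshold below default_iou_thr, larger boxes use default_iou_thr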
+    iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
+                          default_iou_thr)
+    # sort all detections by scores in descending order
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
+        else:
+            gt_areas = gt_w * gt_h
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            max_iou = -1
+            matched_gt = -1
+            # find best overlapped available gt
+            for j in range(num_gts):
+                # different from PASCAL VOC: allow finding other gts if the
+                # best overlapped ones are already matched by other det bboxes
+                if gt_covered[j]:
+                    continue
+                elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
+                    max_iou = ious[i, j]
+                    matched_gt = j
+            # there are 4 cases for a det bbox:
+            # 1. it matches a gt, tp = 1, fp = 0
+            # 2. it matches an ignored gt, tp = 0, fp = 0
+            # 3. it matches no gt and within area range, tp = 0, fp = 1
+            # 4. it matches no gt but is beyond area range, tp = 0, fp = 0
+            if matched_gt >= 0:
+                gt_covered[matched_gt] = 1
+                if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
+                    tp[k, i] = 1
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None):
+    """Check if detected bboxes are true positive or false positive.
+
+    Args:
+        det_bboxes (ndarray): detected bboxes of this image, of shape (m, 5)
+        gt_bboxes (ndarray): ground truth bboxes of this image
+        gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
+        iou_thr (float): the iou threshold
+        area_ranges (list or None): gt bbox area ranges
+
+    Returns:
+        tuple: (tp, fp), two arrays whose elements are 0 and 1
+    """
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
+                det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp
+    ious = bbox_overlaps(det_bboxes, gt_bboxes)
+    ious_max = ious.max(axis=1)
+    ious_argmax = ious.argmax(axis=1)
+    sort_inds = np.argsort(-det_bboxes[:, -1])
+    for k, (min_area, max_area) in enumerate(area_ranges):
+        gt_covered = np.zeros(num_gts, dtype=bool)
+        # if no area range is specified, gt_area_ignore is all False
+        if min_area is None:
+            gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
+        else:
+            gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * (
+                gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)
+            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
+        for i in sort_inds:
+            if ious_max[i] >= iou_thr:
+                matched_gt = ious_argmax[i]
+                if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
+                    if not gt_covered[matched_gt]:
+                        gt_covered[matched_gt] = True
+                        tp[k, i] = 1
+                    else:
+                        fp[k, i] = 1
+                # otherwise ignore this detected bbox, tp = 0, fp = 0
+            elif min_area is None:
+                fp[k, i] = 1
+            else:
+                bbox = det_bboxes[i, :4]
+                area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
+                if area >= min_area and area < max_area:
+                    fp[k, i] = 1
+    return tp, fp
+
+
+def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id):
+    """Get det results and gt information of a certain class."""
+    cls_dets = [det[class_id]
+                for det in det_results]  # det bboxes of this class
+    cls_gts = []  # gt bboxes of this class
+    cls_gt_ignore = []
+    for j in range(len(gt_bboxes)):
+        gt_bbox = gt_bboxes[j]
+        cls_inds = (gt_labels[j] == class_id + 1)
+        cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox
+        cls_gts.append(cls_gt)
+        if gt_ignore is None:
+            cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32))
+        else:
+            cls_gt_ignore.append(gt_ignore[j][cls_inds])
+    return cls_dets, cls_gts, cls_gt_ignore
+
+
+def eval_map(det_results,
+             gt_bboxes,
+             gt_labels,
+             gt_ignore=None,
+             scale_ranges=None,
+             iou_thr=0.5,
+             dataset=None,
+             print_summary=True):
+    """Evaluate mAP of a dataset.
+
+    Args:
+        det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...]
+        gt_bboxes (list): ground truth bboxes of each image, a list of K*4
+            array.
+        gt_labels (list): ground truth labels of each image, a list of K array
+        gt_ignore (list): gt ignore indicators of each image, a list of K array
+        scale_ranges (list, optional): [(min1, max1), (min2, max2), ...]
+        iou_thr (float): IoU threshold
+        dataset (None or str): dataset name, there are minor differences in
+            metrics for different datasets, e.g. "voc07", "imagenet_det", etc.
+        print_summary (bool): whether to print the mAP summary
+
+    Returns:
+        tuple: (mAP, [dict, dict, ...])
+    """
+    assert len(det_results) == len(gt_bboxes) == len(gt_labels)
+    if gt_ignore is not None:
+        assert len(gt_ignore) == len(gt_labels)
+        for i in range(len(gt_ignore)):
+            assert len(gt_labels[i]) == len(gt_ignore[i])
+    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
+                   if scale_ranges is not None else None)
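+    # scale ranges are given as side lengths and converted to area ranges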
+    num_scales = len(scale_ranges) if scale_ranges is not None else 1
+    eval_results = []
+    num_classes = len(det_results[0])  # positive class num
+    gt_labels = [
+        label if label.ndim == 1 else label[:, 0] for label in gt_labels
+    ]
+    for i in range(num_classes):
+        # get gt and det bboxes of this class
+        cls_dets, cls_gts, cls_gt_ignore = get_cls_results(
+            det_results, gt_bboxes, gt_labels, gt_ignore, i)
+        # calculate tp and fp for each image
+        tpfp_func = (tpfp_imagenet
+                     if dataset in ['det', 'vid'] else tpfp_default)
+        tpfp = [
+            tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr,
+                      area_ranges) for j in range(len(cls_dets))
+        ]
+        tp, fp = tuple(zip(*tpfp))
+        # calculate gt number of each scale, gts ignored or beyond scale
+        # are not counted
+        num_gts = np.zeros(num_scales, dtype=int)
+        for j, bbox in enumerate(cls_gts):
+            if area_ranges is None:
+                num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j]))
+            else:
+                gt_areas = (bbox[:, 2] - bbox[:, 0] + 1) * (
+                    bbox[:, 3] - bbox[:, 1] + 1)
+                for k, (min_area, max_area) in enumerate(area_ranges):
+                    num_gts[k] += np.sum(
+                        np.logical_not(cls_gt_ignore[j]) &
+                        (gt_areas >= min_area) & (gt_areas < max_area))
+        # sort all det bboxes by score, also sort tp and fp
+        cls_dets = np.vstack(cls_dets)
+        num_dets = cls_dets.shape[0]
+        sort_inds = np.argsort(-cls_dets[:, -1])
+        tp = np.hstack(tp)[:, sort_inds]
+        fp = np.hstack(fp)[:, sort_inds]
+        # calculate recall and precision with tp and fp
+        tp = np.cumsum(tp, axis=1)
+        fp = np.cumsum(fp, axis=1)
+        eps = np.finfo(np.float32).eps
+        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
+        precisions = tp / np.maximum((tp + fp), eps)
+        # calculate AP
+        if scale_ranges is None:
+            recalls = recalls[0, :]
+            precisions = precisions[0, :]
+            num_gts = num_gts.item()
+        mode = 'area' if dataset != 'voc07' else '11points'
+        ap = average_precision(recalls, precisions, mode)
+        eval_results.append({
+            'num_gts': num_gts,
+            'num_dets': num_dets,
+            'recall': recalls,
+            'precision': precisions,
+            'ap': ap
+        })
+    if scale_ranges is not None:
+        # shape (num_classes, num_scales)
+        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
+        all_num_gts = np.vstack(
+            [cls_result['num_gts'] for cls_result in eval_results])
+        mean_ap = [
+            all_ap[all_num_gts[:, i] > 0, i].mean()
+            if np.any(all_num_gts[:, i] > 0) else 0.0
+            for i in range(num_scales)
+        ]
+    else:
+        aps = []
+        for cls_result in eval_results:
+            if cls_result['num_gts'] > 0:
+                aps.append(cls_result['ap'])
+        mean_ap = np.array(aps).mean().item() if aps else 0.0
+    if print_summary:
+        print_map_summary(mean_ap, eval_results, dataset)
+
+    return mean_ap, eval_results
+
+
+def print_map_summary(mean_ap, results, dataset=None):
+    """Print mAP and results of each class.
+
+    Args:
+        mean_ap(float): calculated from `eval_map`
+        results(list): calculated from `eval_map`
+        dataset(None or str or list): dataset name.
+    """
+    num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'],
+                                                     np.ndarray) else 1
+    num_classes = len(results)
+
+    recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
+    precisions = np.zeros((num_scales, num_classes), dtype=np.float32)
+    aps = np.zeros((num_scales, num_classes), dtype=np.float32)
+    num_gts = np.zeros((num_scales, num_classes), dtype=int)
+    for i, cls_result in enumerate(results):
+        if cls_result['recall'].size > 0:
+            recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
+            precisions[:, i] = np.array(
+                cls_result['precision'], ndmin=2)[:, -1]
+        aps[:, i] = cls_result['ap']
+        num_gts[:, i] = cls_result['num_gts']
+
+    if dataset is None:
+        label_names = [str(i) for i in range(1, num_classes + 1)]
+    else:
+        label_names = get_classes(dataset)
+
+    if not isinstance(mean_ap, list):
+        mean_ap = [mean_ap]
+    header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap']
+    for i in range(num_scales):
+        table_data = [header]
+        for j in range(num_classes):
+            row_data = [
+                label_names[j], num_gts[i, j], results[j]['num_dets'],
+                '{:.3f}'.format(recalls[i, j]), '{:.3f}'.format(
+                    precisions[i, j]), '{:.3f}'.format(aps[i, j])
+            ]
+            table_data.append(row_data)
+        table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])])
+        table = AsciiTable(table_data)
+        table.inner_footing_row_border = True
+        print(table.table)
diff --git a/mmdet/core/evaluation/recall.py b/mmdet/core/evaluation/recall.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a56f42fdef33341d4b9ec7a654832282b44a7c2
--- /dev/null
+++ b/mmdet/core/evaluation/recall.py
@@ -0,0 +1,185 @@
+import numpy as np
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+
+
+def _recalls(all_ious, proposal_nums, thrs):
+
+    img_num = all_ious.shape[0]
+    total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+    _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+    for k, proposal_num in enumerate(proposal_nums):
+        tmp_ious = np.zeros(0)
+        for i in range(img_num):
+            ious = all_ious[i][:, :proposal_num].copy()
+            gt_ious = np.zeros((ious.shape[0]))
+            if ious.size == 0:
+                tmp_ious = np.hstack((tmp_ious, gt_ious))
+                continue
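+            # greedy matching: in each round, pick the (gt, proposal) pair
+            # with the highest IoU, record it, and remove both from the pool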
+            for j in range(ious.shape[0]):
+                gt_max_overlaps = ious.argmax(axis=1)
+                max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+                gt_idx = max_ious.argmax()
+                gt_ious[j] = max_ious[gt_idx]
+                box_idx = gt_max_overlaps[gt_idx]
+                ious[gt_idx, :] = -1
+                ious[:, box_idx] = -1
+            tmp_ious = np.hstack((tmp_ious, gt_ious))
+        _ious[k, :] = tmp_ious
+
+    _ious = np.fliplr(np.sort(_ious, axis=1))
+    recalls = np.zeros((proposal_nums.size, thrs.size))
+    for i, thr in enumerate(thrs):
+        recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
+
+    return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+    """Check proposal_nums and iou_thrs and set correct format.
+    """
+    if isinstance(proposal_nums, list):
+        _proposal_nums = np.array(proposal_nums)
+    elif isinstance(proposal_nums, int):
+        _proposal_nums = np.array([proposal_nums])
+    else:
+        _proposal_nums = proposal_nums
+
+    if iou_thrs is None:
+        _iou_thrs = np.array([0.5])
+    elif isinstance(iou_thrs, list):
+        _iou_thrs = np.array(iou_thrs)
+    elif isinstance(iou_thrs, float):
+        _iou_thrs = np.array([iou_thrs])
+    else:
+        _iou_thrs = iou_thrs
+
+    return _proposal_nums, _iou_thrs
+
+
+def eval_recalls(gts,
+                 proposals,
+                 proposal_nums=None,
+                 iou_thrs=None,
+                 print_summary=True):
+    """Calculate recalls.
+
+    Args:
+        gts(list or ndarray): a list of arrays of shape (n, 4)
+        proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5)
+        proposal_nums(int or list of int or ndarray): top N proposals
+        iou_thrs(float or list or ndarray): iou thresholds
+
+    Returns:
+        ndarray: recalls of different ious and proposal nums
+    """
+
+    img_num = len(gts)
+    assert img_num == len(proposals)
+
+    proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
+
+    all_ious = []
+    for i in range(img_num):
+        if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
+            scores = proposals[i][:, 4]
+            sort_idx = np.argsort(scores)[::-1]
+            img_proposal = proposals[i][sort_idx, :]
+        else:
+            img_proposal = proposals[i]
+        prop_num = min(img_proposal.shape[0], proposal_nums[-1])
+        if gts[i] is None or gts[i].shape[0] == 0:
+            ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
+        else:
+            ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4])
+        all_ious.append(ious)
+    all_ious = np.array(all_ious)
+    recalls = _recalls(all_ious, proposal_nums, iou_thrs)
+    if print_summary:
+        print_recall_summary(recalls, proposal_nums, iou_thrs)
+    return recalls
+
+
+def print_recall_summary(recalls,
+                         proposal_nums,
+                         iou_thrs,
+                         row_idxs=None,
+                         col_idxs=None):
+    """Print recalls in a table.
+
+    Args:
+        recalls(ndarray): calculated from `eval_recalls`
+        proposal_nums(ndarray or list): top N proposals
+        iou_thrs(ndarray or list): iou thresholds
+        row_idxs(ndarray): which rows (proposal nums) to print
+        col_idxs(ndarray): which cols (iou thresholds) to print
+    """
+    proposal_nums = np.array(proposal_nums, dtype=np.int32)
+    iou_thrs = np.array(iou_thrs)
+    if row_idxs is None:
+        row_idxs = np.arange(proposal_nums.size)
+    if col_idxs is None:
+        col_idxs = np.arange(iou_thrs.size)
+    row_header = [''] + iou_thrs[col_idxs].tolist()
+    table_data = [row_header]
+    for i, num in enumerate(proposal_nums[row_idxs]):
+        row = [
+            '{:.3f}'.format(val)
+            for val in recalls[row_idxs[i], col_idxs].tolist()
+        ]
+        row.insert(0, num)
+        table_data.append(row)
+    table = AsciiTable(table_data)
+    print(table.table)
+
+
+def plot_num_recall(recalls, proposal_nums):
+    """Plot Proposal_num-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        proposal_nums(ndarray or list): same shape as `recalls`
+    """
+    if isinstance(proposal_nums, np.ndarray):
+        _proposal_nums = proposal_nums.tolist()
+    else:
+        _proposal_nums = proposal_nums
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = recalls
+
+    import matplotlib.pyplot as plt
+    f = plt.figure()
+    plt.plot([0] + _proposal_nums, [0] + _recalls)
+    plt.xlabel('Proposal num')
+    plt.ylabel('Recall')
+    plt.axis([0, np.max(_proposal_nums), 0, 1])
+    f.show()
+
+
+def plot_iou_recall(recalls, iou_thrs):
+    """Plot IoU-Recalls curve.
+
+    Args:
+        recalls(ndarray or list): shape (k,)
+        iou_thrs(ndarray or list): same shape as `recalls`
+    """
+    if isinstance(iou_thrs, np.ndarray):
+        _iou_thrs = iou_thrs.tolist()
+    else:
+        _iou_thrs = iou_thrs
+    if isinstance(recalls, np.ndarray):
+        _recalls = recalls.tolist()
+    else:
+        _recalls = recalls
+
+    import matplotlib.pyplot as plt
+    f = plt.figure()
+    plt.plot(_iou_thrs + [1.0], _recalls + [0.])
+    plt.xlabel('IoU')
+    plt.ylabel('Recall')
+    plt.axis([np.min(_iou_thrs), 1, 0, 1])
+    f.show()
diff --git a/mmdet/core/loss/__init__.py b/mmdet/core/loss/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..661f0d6426602b5bed7dc3367e1322374922ae1c
--- /dev/null
+++ b/mmdet/core/loss/__init__.py
@@ -0,0 +1,11 @@
+from .losses import (weighted_nll_loss, weighted_cross_entropy,
+                     weighted_binary_cross_entropy, sigmoid_focal_loss,
+                     weighted_sigmoid_focal_loss, mask_cross_entropy,
+                     smooth_l1_loss, weighted_smoothl1, accuracy)
+
+__all__ = [
+    'weighted_nll_loss', 'weighted_cross_entropy',
+    'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
+    'weighted_sigmoid_focal_loss', 'mask_cross_entropy', 'smooth_l1_loss',
+    'weighted_smoothl1', 'accuracy'
+]
diff --git a/mmdet/core/loss/losses.py b/mmdet/core/loss/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..14b49f5cb90ccc29240622a0c2a6764ae4c68520
--- /dev/null
+++ b/mmdet/core/loss/losses.py
@@ -0,0 +1,101 @@
+# TODO merge naive and weighted loss.
+import torch
+import torch.nn.functional as F
+
+
+def weighted_nll_loss(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    raw = F.nll_loss(pred, label, reduction='none')
+    return torch.sum(raw * weight)[None] / avg_factor
+
+
+def weighted_cross_entropy(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    raw = F.cross_entropy(pred, label, reduction='none')
+    return torch.sum(raw * weight)[None] / avg_factor
+
+
+def weighted_binary_cross_entropy(pred, label, weight, avg_factor=None):
+    if avg_factor is None:
+        avg_factor = max(torch.sum(weight > 0).float().item(), 1.)
+    return F.binary_cross_entropy_with_logits(
+        pred, label.float(), weight.float(),
+        reduction='sum')[None] / avg_factor
+
+
+def sigmoid_focal_loss(pred,
+                       target,
+                       weight,
+                       gamma=2.0,
+                       alpha=0.25,
+                       reduction='elementwise_mean'):
+    pred_sigmoid = pred.sigmoid()
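+    # `pt` below equals (1 - p_t) in the focal loss paper's notation, so
+    # weight * pt.pow(gamma) applies the (1 - p_t)^gamma modulating factor,
+    # and alpha / (1 - alpha) balances positive / negative examples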
+    pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
+    weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
+    weight = weight * pt.pow(gamma)
+    return F.binary_cross_entropy_with_logits(
+        pred, target, weight, reduction=reduction)
+
+
+def weighted_sigmoid_focal_loss(pred,
+                                target,
+                                weight,
+                                gamma=2.0,
+                                alpha=0.25,
+                                avg_factor=None,
+                                num_classes=80):
+    if avg_factor is None:
+        avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6
+    return sigmoid_focal_loss(
+        pred, target, weight, gamma=gamma, alpha=alpha,
+        reduction='sum')[None] / avg_factor
+
+
+def mask_cross_entropy(pred, target, label):
+    num_rois = pred.size()[0]
+    inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
+    pred_slice = pred[inds, label].squeeze(1)
+    return F.binary_cross_entropy_with_logits(
+        pred_slice, target, reduction='elementwise_mean')[None]
+
+
+def smooth_l1_loss(pred, target, beta=1.0, reduction='elementwise_mean'):
+    assert beta > 0
+    assert pred.size() == target.size() and target.numel() > 0
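+    # quadratic (0.5 * x^2 / beta) for |x| < beta, linear (|x| - 0.5 * beta)
+    # otherwise, so the loss is smooth near zero and robust to large errors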
+    diff = torch.abs(pred - target)
+    loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+                       diff - 0.5 * beta)
+    reduction = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction == 0:
+        return loss
+    elif reduction == 1:
+        return loss.sum() / pred.numel()
+    elif reduction == 2:
+        return loss.sum()
+
+
+def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
+    if avg_factor is None:
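+        # each positive box contributes 4 regression targets, hence the
+        # division by 4; the small epsilon avoids division by zero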
+        avg_factor = torch.sum(weight > 0).float().item() / 4 + 1e-6
+    loss = smooth_l1_loss(pred, target, beta, reduction='none')
+    return torch.sum(loss * weight)[None] / avg_factor
+
+
+def accuracy(pred, target, topk=1):
+    if isinstance(topk, int):
+        topk = (topk, )
+        return_single = True
+    else:
+        return_single = False
+
+    maxk = max(topk)
+    _, pred_label = pred.topk(maxk, 1, True, True)
+    pred_label = pred_label.t()
+    correct = pred_label.eq(target.view(1, -1).expand_as(pred_label))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / pred.size(0)))
+    return res[0] if return_single else res
diff --git a/mmdet/core/mask/__init__.py b/mmdet/core/mask/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b703b55d3eb92821c28ef38579fcbebeb1fa12cf
--- /dev/null
+++ b/mmdet/core/mask/__init__.py
@@ -0,0 +1,4 @@
+from .utils import split_combined_polys
+from .mask_target import mask_target
+
+__all__ = ['split_combined_polys', 'mask_target']
diff --git a/mmdet/core/mask/mask_target.py b/mmdet/core/mask/mask_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..be93dfc28934052a7497b3c42aa3e9dd1b3b3fe6
--- /dev/null
+++ b/mmdet/core/mask/mask_target.py
@@ -0,0 +1,36 @@
+import torch
+import numpy as np
+import mmcv
+
+
+def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,
+                cfg):
+    cfg_list = [cfg for _ in range(len(pos_proposals_list))]
+    mask_targets = map(mask_target_single, pos_proposals_list,
+                       pos_assigned_gt_inds_list, gt_masks_list, cfg_list)
+    mask_targets = torch.cat(list(mask_targets))
+    return mask_targets
+
+
+def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+    mask_size = cfg.mask_size
+    num_pos = pos_proposals.size(0)
+    mask_targets = []
+    if num_pos > 0:
+        proposals_np = pos_proposals.cpu().numpy()
+        pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
+        for i in range(num_pos):
+            gt_mask = gt_masks[pos_assigned_gt_inds[i]]
+            bbox = proposals_np[i, :].astype(np.int32)
+            x1, y1, x2, y2 = bbox
+            w = np.maximum(x2 - x1 + 1, 1)
+            h = np.maximum(y2 - y1 + 1, 1)
+            # mask is uint8 both before and after resizing
+            target = mmcv.imresize(gt_mask[y1:y1 + h, x1:x1 + w],
+                                   (mask_size, mask_size))
+            mask_targets.append(target)
+        mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to(
+            pos_proposals.device)
+    else:
+        mask_targets = pos_proposals.new_zeros((0, mask_size, mask_size))
+    return mask_targets
diff --git a/mmdet/core/mask/utils.py b/mmdet/core/mask/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a68312b179e56cb0e93e967ecfeeb602d48ca866
--- /dev/null
+++ b/mmdet/core/mask/utils.py
@@ -0,0 +1,30 @@
+import mmcv
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+    """Split the combined 1-D polys into masks.
+
+    A mask is represented as a list of polys, and a poly is represented as
+    a 1-D array. In dataset, all masks are concatenated into a single 1-D
+    tensor. Here we need to split the tensor into original representations.
+
+    Args:
+        polys (list): a list (length = image num) of 1-D tensors
+        poly_lens (list): a list (length = image num) of poly length
+        polys_per_mask (list): a list (length = image num) of poly number
+            of each mask
+
+    Returns:
+        list: a list (length = image num) of list (length = mask num) of
+            list (length = poly num) of numpy array
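+
+    Example (illustrative): with poly_lens[0] = [6, 8] and
+        polys_per_mask[0] = [2], the 14 values in polys[0] are split into two
+        polys (of length 6 and 8) that together form a single mask.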
+    """
+    mask_polys_list = []
+    for img_id in range(len(polys)):
+        polys_single = polys[img_id]
+        polys_lens_single = poly_lens[img_id].tolist()
+        polys_per_mask_single = polys_per_mask[img_id].tolist()
+
+        split_polys = mmcv.slice_list(polys_single, polys_lens_single)
+        mask_polys = mmcv.slice_list(split_polys, polys_per_mask_single)
+        mask_polys_list.append(mask_polys)
+    return mask_polys_list
diff --git a/mmdet/core/post_processing/__init__.py b/mmdet/core/post_processing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b24a3fc68525de1c73d687404990bd521bdf5b0
--- /dev/null
+++ b/mmdet/core/post_processing/__init__.py
@@ -0,0 +1,8 @@
+from .bbox_nms import multiclass_nms
+from .merge_augs import (merge_aug_proposals, merge_aug_bboxes,
+                         merge_aug_scores, merge_aug_masks)
+
+__all__ = [
+    'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
+    'merge_aug_scores', 'merge_aug_masks'
+]
diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py
new file mode 100644
index 0000000000000000000000000000000000000000..f619d2682a035344c6fda6974cd03c5cbfeb0f26
--- /dev/null
+++ b/mmdet/core/post_processing/bbox_nms.py
@@ -0,0 +1,54 @@
+import torch
+
+from mmdet.ops import nms
+
+
+def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_thr, max_num=-1):
+    """NMS for multi-class bboxes.
+
+    Args:
+        multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_scores (Tensor): shape (n, #class)
+        score_thr (float): bbox threshold, bboxes with scores lower than it
+            will not be considered.
+        nms_thr (float): NMS IoU threshold
+        max_num (int): if there are more than max_num bboxes after NMS,
+            only top max_num will be kept.
+
+    Returns:
+        tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels
+            are 0-based.
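+
+    Example (an illustrative sketch; assumes the compiled nms op is built):
+        >>> import torch
+        >>> xy = torch.rand(100, 2) * 100
+        >>> multi_bboxes = torch.cat([xy, xy + torch.rand(100, 2) * 50], 1)
+        >>> multi_scores = torch.rand(100, 81)  # column 0 is background
+        >>> dets, cls = multiclass_nms(
+        ...     multi_bboxes, multi_scores, 0.05, 0.5, max_num=100)
+        >>> # dets: (k, 5) as [x1, y1, x2, y2, score], cls: (k, ), 0-based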
+    """
+    num_classes = multi_scores.shape[1]
+    bboxes, labels = [], []
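+    # index 0 is the background class, so only foreground classes are visited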
+    for i in range(1, num_classes):
+        cls_inds = multi_scores[:, i] > score_thr
+        if not cls_inds.any():
+            continue
+        # get bboxes and scores of this class
+        if multi_bboxes.shape[1] == 4:
+            _bboxes = multi_bboxes[cls_inds, :]
+        else:
+            _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4]
+        _scores = multi_scores[cls_inds, i]
+        cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1)
+        # perform nms
+        nms_keep = nms(cls_dets, nms_thr)
+        cls_dets = cls_dets[nms_keep, :]
+        cls_labels = multi_bboxes.new_full(
+            (len(nms_keep), ), i - 1, dtype=torch.long)
+        bboxes.append(cls_dets)
+        labels.append(cls_labels)
+    if bboxes:
+        bboxes = torch.cat(bboxes)
+        labels = torch.cat(labels)
+        if max_num > 0 and bboxes.shape[0] > max_num:
+            _, inds = bboxes[:, -1].sort(descending=True)
+            inds = inds[:max_num]
+            bboxes = bboxes[inds]
+            labels = labels[inds]
+    else:
+        bboxes = multi_bboxes.new_zeros((0, 5))
+        labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
+
+    return bboxes, labels
diff --git a/mmdet/core/post_processing/merge_augs.py b/mmdet/core/post_processing/merge_augs.py
new file mode 100644
index 0000000000000000000000000000000000000000..00f65b049ccf2b00a0fee73cc64ac257415425ea
--- /dev/null
+++ b/mmdet/core/post_processing/merge_augs.py
@@ -0,0 +1,98 @@
+import torch
+
+import numpy as np
+
+from mmdet.ops import nms
+from ..bbox import bbox_mapping_back
+
+
+def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
+    """Merge augmented proposals (multiscale, flip, etc.)
+
+    Args:
+        aug_proposals (list[Tensor]): proposals from different testing
+            schemes, shape (n, 5). Note that they are not rescaled to the
+            original image size.
+        img_metas (list[dict]): image info including "img_shape",
+            "scale_factor" and "flip".
+        rpn_test_cfg (dict): rpn test config.
+
+    Returns:
+        Tensor: shape (n, 5), merged proposals (with scores) corresponding to
+            the original image scale.
+    """
+    recovered_proposals = []
+    for proposals, img_info in zip(aug_proposals, img_metas):
+        img_shape = img_info['img_shape']
+        scale_factor = img_info['scale_factor']
+        flip = img_info['flip']
+        _proposals = proposals.clone()
+        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+                                              scale_factor, flip)
+        recovered_proposals.append(_proposals)
+    aug_proposals = torch.cat(recovered_proposals, dim=0)
+    nms_keep = nms(aug_proposals, rpn_test_cfg.nms_thr,
+                   aug_proposals.get_device())
+    merged_proposals = aug_proposals[nms_keep, :]
+    scores = merged_proposals[:, 4]
+    _, order = scores.sort(0, descending=True)
+    num = min(rpn_test_cfg.max_num, merged_proposals.shape[0])
+    order = order[:num]
+    merged_proposals = merged_proposals[order, :]
+    return merged_proposals
+
+
+def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
+    """Merge augmented detection bboxes and scores.
+
+    Args:
+        aug_bboxes (list[Tensor]): shape (n, 4*#class)
+        aug_scores (list[Tensor] or None): shape (n, #class)
+        img_metas (list[list[dict]]): image info including "img_shape",
+            "scale_factor" and "flip".
+        rcnn_test_cfg (dict): rcnn test config.
+
+    Returns:
+        tuple: (bboxes, scores)
+    """
+    recovered_bboxes = []
+    for bboxes, img_info in zip(aug_bboxes, img_metas):
+        img_shape = img_info[0]['img_shape']
+        scale_factor = img_info[0]['scale_factor']
+        flip = img_info[0]['flip']
+        bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
+        recovered_bboxes.append(bboxes)
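+    # all predictions are now aligned to the original image space, so they can
+    # be averaged element-wise across augmentations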
+    bboxes = torch.stack(recovered_bboxes).mean(dim=0)
+    if aug_scores is None:
+        return bboxes
+    else:
+        scores = torch.stack(aug_scores).mean(dim=0)
+        return bboxes, scores
+
+
+def merge_aug_scores(aug_scores):
+    """Merge augmented bbox scores."""
+    if isinstance(aug_scores[0], torch.Tensor):
+        return torch.mean(torch.stack(aug_scores), dim=0)
+    else:
+        return np.mean(aug_scores, axis=0)
+
+
+def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
+    """Merge augmented mask prediction.
+
+    Args:
+        aug_masks (list[ndarray]): shape (n, #class, h, w)
+        img_metas (list[list[dict]]): image info including the "flip" flag.
+        rcnn_test_cfg (dict): rcnn test config.
+        weights (list[float], optional): per-prediction weights for averaging.
+
+    Returns:
+        ndarray: merged masks, same shape as a single element of aug_masks.
+    """
+    recovered_masks = [
+        mask if not img_info[0]['flip'] else mask[..., ::-1]
+        for mask, img_info in zip(aug_masks, img_metas)
+    ]
+    if weights is None:
+        merged_masks = np.mean(recovered_masks, axis=0)
+    else:
+        merged_masks = np.average(
+            np.array(recovered_masks), axis=0, weights=np.array(weights))
+    return merged_masks
diff --git a/mmdet/core/utils/__init__.py b/mmdet/core/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e952ee5f9c2a546485c270834b36df1412fffa
--- /dev/null
+++ b/mmdet/core/utils/__init__.py
@@ -0,0 +1,7 @@
+from .dist_utils import allreduce_grads, DistOptimizerHook
+from .misc import tensor2imgs, unmap, multi_apply
+
+__all__ = [
+    'allreduce_grads', 'DistOptimizerHook', 'tensor2imgs', 'unmap',
+    'multi_apply'
+]
diff --git a/mmdet/core/utils/dist_utils.py b/mmdet/core/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec84bb48693a99fc480bd0f251ce6bb2a2389eaf
--- /dev/null
+++ b/mmdet/core/utils/dist_utils.py
@@ -0,0 +1,57 @@
+from collections import OrderedDict
+
+import torch.distributed as dist
+from torch._utils import (_flatten_dense_tensors, _unflatten_dense_tensors,
+                          _take_tensors)
+from mmcv.runner import OptimizerHook
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
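+    # group tensors into buckets (by byte budget if given, otherwise by dtype)
+    # so that each all_reduce call operates on a single flattened buffer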
+    if bucket_size_mb > 0:
+        bucket_size_bytes = bucket_size_mb * 1024 * 1024
+        buckets = _take_tensors(tensors, bucket_size_bytes)
+    else:
+        buckets = OrderedDict()
+        for tensor in tensors:
+            tp = tensor.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(tensor)
+        buckets = buckets.values()
+
+    for bucket in buckets:
+        flat_tensors = _flatten_dense_tensors(bucket)
+        dist.all_reduce(flat_tensors)
+        flat_tensors.div_(world_size)
+        for tensor, synced in zip(
+                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+            tensor.copy_(synced)
+
+
+def allreduce_grads(model, coalesce=True, bucket_size_mb=-1):
+    grads = [
+        param.grad.data for param in model.parameters()
+        if param.requires_grad and param.grad is not None
+    ]
+    world_size = dist.get_world_size()
+    if coalesce:
+        _allreduce_coalesced(grads, world_size, bucket_size_mb)
+    else:
+        for tensor in grads:
+            dist.all_reduce(tensor.div_(world_size))
+
+
+class DistOptimizerHook(OptimizerHook):
+
+    def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
+        self.grad_clip = grad_clip
+        self.coalesce = coalesce
+        self.bucket_size_mb = bucket_size_mb
+
+    def after_train_iter(self, runner):
+        runner.optimizer.zero_grad()
+        runner.outputs['loss'].backward()
+        allreduce_grads(runner.model, self.coalesce, self.bucket_size_mb)
+        if self.grad_clip is not None:
+            self.clip_grads(runner.model.parameters())
+        runner.optimizer.step()
diff --git a/mmdet/core/utils/misc.py b/mmdet/core/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..262f168e646089a535a9ad393947d57198873d93
--- /dev/null
+++ b/mmdet/core/utils/misc.py
@@ -0,0 +1,37 @@
+from functools import partial
+
+import mmcv
+import numpy as np
+from six.moves import map, zip
+
+
+def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
+    num_imgs = tensor.size(0)
+    mean = np.array(mean, dtype=np.float32)
+    std = np.array(std, dtype=np.float32)
+    imgs = []
+    for img_id in range(num_imgs):
+        img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+        img = mmcv.imdenormalize(
+            img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+        imgs.append(np.ascontiguousarray(img))
+    return imgs
+
+
+def multi_apply(func, *args, **kwargs):
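+    # apply func to each group of per-item args and transpose the resulting
+    # list of tuples into a tuple of lists, e.g. a func returning (x, y) per
+    # item yields ([x1, x2, ...], [y1, y2, ...])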
+    pfunc = partial(func, **kwargs) if kwargs else func
+    map_results = map(pfunc, *args)
+    return tuple(map(list, zip(*map_results)))
+
+
+def unmap(data, count, inds, fill=0):
+    """ Unmap a subset of item (data) back to the original set of items (of
+    size count) """
+    if data.dim() == 1:
+        ret = data.new_full((count, ), fill)
+        ret[inds] = data
+    else:
+        new_size = (count, ) + data.size()[1:]
+        ret = data.new_full(new_size, fill)
+        ret[inds, :] = data
+    return ret
diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..425ea72535a144544f44ebe8b5d63dd31336a54c
--- /dev/null
+++ b/mmdet/datasets/__init__.py
@@ -0,0 +1,8 @@
+from .coco import CocoDataset
+from .loader import GroupSampler, DistributedGroupSampler, build_dataloader
+from .utils import to_tensor, random_scale, show_ann
+
+__all__ = [
+    'CocoDataset', 'GroupSampler', 'DistributedGroupSampler',
+    'build_dataloader', 'to_tensor', 'random_scale', 'show_ann'
+]
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..9049f1af9703a5d97a6a6f53c33eac3190468c97
--- /dev/null
+++ b/mmdet/datasets/coco.py
@@ -0,0 +1,305 @@
+import os.path as osp
+
+import mmcv
+import numpy as np
+from mmcv.parallel import DataContainer as DC
+from pycocotools.coco import COCO
+from torch.utils.data import Dataset
+
+from .transforms import (ImageTransform, BboxTransform, MaskTransform,
+                         Numpy2Tensor)
+from .utils import to_tensor, show_ann, random_scale
+
+
+class CocoDataset(Dataset):
+
+    def __init__(self,
+                 ann_file,
+                 img_prefix,
+                 img_scale,
+                 img_norm_cfg,
+                 size_divisor=None,
+                 proposal_file=None,
+                 num_max_proposals=1000,
+                 flip_ratio=0,
+                 with_mask=True,
+                 with_crowd=True,
+                 with_label=True,
+                 test_mode=False,
+                 debug=False):
+        # path of the data file
+        self.coco = COCO(ann_file)
+        # filter images with no annotation during training
+        if not test_mode:
+            self.img_ids, self.img_infos = self._filter_imgs()
+        else:
+            self.img_ids = self.coco.getImgIds()
+            self.img_infos = [
+                self.coco.loadImgs(idx)[0] for idx in self.img_ids
+            ]
+        assert len(self.img_ids) == len(self.img_infos)
+        # get the mapping from original category ids to labels
+        self.cat_ids = self.coco.getCatIds()
+        self.cat2label = {
+            cat_id: i + 1
+            for i, cat_id in enumerate(self.cat_ids)
+        }
+        # prefix of images path
+        self.img_prefix = img_prefix
+        # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...]
+        self.img_scales = img_scale if isinstance(img_scale,
+                                                  list) else [img_scale]
+        assert mmcv.is_list_of(self.img_scales, tuple)
+        # color channel order and normalize configs
+        self.img_norm_cfg = img_norm_cfg
+        # proposals
+        # TODO: revise _filter_imgs to be more flexible
+        if proposal_file is not None:
+            self.proposals = mmcv.load(proposal_file)
+            ori_ids = self.coco.getImgIds()
+            sorted_idx = [ori_ids.index(id) for id in self.img_ids]
+            self.proposals = [self.proposals[idx] for idx in sorted_idx]
+        else:
+            self.proposals = None
+        self.num_max_proposals = num_max_proposals
+        # flip ratio
+        self.flip_ratio = flip_ratio
+        assert flip_ratio >= 0 and flip_ratio <= 1
+        # padding border to ensure the image size can be divided by
+        # size_divisor (used for FPN)
+        self.size_divisor = size_divisor
+        # with crowd or not, False when using RetinaNet
+        self.with_crowd = with_crowd
+        # with mask or not
+        self.with_mask = with_mask
+        # with label is False for RPN
+        self.with_label = with_label
+        # in test mode or not
+        self.test_mode = test_mode
+        # debug mode or not
+        self.debug = debug
+
+        # set group flag for the sampler
+        self._set_group_flag()
+        # transforms
+        self.img_transform = ImageTransform(
+            size_divisor=self.size_divisor, **self.img_norm_cfg)
+        self.bbox_transform = BboxTransform()
+        self.mask_transform = MaskTransform()
+        self.numpy2tensor = Numpy2Tensor()
+
+    def __len__(self):
+        return len(self.img_ids)
+
+    def _filter_imgs(self, min_size=32):
+        """Filter images too small or without ground truths."""
+        img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()]))
+        valid_ids = []
+        img_infos = []
+        for i in img_ids:
+            info = self.coco.loadImgs(i)[0]
+            if min(info['width'], info['height']) >= min_size:
+                valid_ids.append(i)
+                img_infos.append(info)
+        return valid_ids, img_infos
+
+    def _load_ann_info(self, idx):
+        img_id = self.img_ids[idx]
+        ann_ids = self.coco.getAnnIds(imgIds=img_id)
+        ann_info = self.coco.loadAnns(ann_ids)
+        return ann_info
+
+    def _parse_ann_info(self, ann_info, with_mask=True):
+        """Parse bbox and mask annotation.
+
+        Args:
+            ann_info (list[dict]): Annotation info of an image.
+            with_mask (bool): Whether to parse mask annotations.
+
+        Returns:
+            dict: A dict containing the following keys: bboxes, bboxes_ignore,
+                labels, masks, mask_polys, poly_lens.
+        """
+        gt_bboxes = []
+        gt_labels = []
+        gt_bboxes_ignore = []
+        # Two formats are provided.
+        # 1. mask: a binary map of the same size of the image.
+        # 2. polys: each mask consists of one or several polys, each poly is a
+        # list of float.
+        if with_mask:
+            gt_masks = []
+            gt_mask_polys = []
+            gt_poly_lens = []
+        for i, ann in enumerate(ann_info):
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
+            if ann['iscrowd']:
+                gt_bboxes_ignore.append(bbox)
+            else:
+                gt_bboxes.append(bbox)
+                gt_labels.append(self.cat2label[ann['category_id']])
+            if with_mask:
+                gt_masks.append(self.coco.annToMask(ann))
+                mask_polys = [
+                    p for p in ann['segmentation'] if len(p) >= 6
+                ]  # valid polygons have >= 3 points (6 coordinates)
+                poly_lens = [len(p) for p in mask_polys]
+                gt_mask_polys.append(mask_polys)
+                gt_poly_lens.extend(poly_lens)
+        if gt_bboxes:
+            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
+            gt_labels = np.array(gt_labels, dtype=np.int64)
+        else:
+            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
+            gt_labels = np.array([], dtype=np.int64)
+
+        if gt_bboxes_ignore:
+            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
+        else:
+            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
+
+        ann = dict(
+            bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
+
+        if with_mask:
+            ann['masks'] = gt_masks
+            # poly format is not used in the current implementation
+            ann['mask_polys'] = gt_mask_polys
+            ann['poly_lens'] = gt_poly_lens
+        return ann
+
+    def _set_group_flag(self):
+        """Set flag according to image aspect ratio.
+
+        Images with aspect ratio greater than 1 will be set as group 1,
+        otherwise group 0.
+        """
+        self.flag = np.zeros(len(self.img_ids), dtype=np.uint8)
+        for i in range(len(self.img_ids)):
+            img_info = self.img_infos[i]
+            if img_info['width'] / img_info['height'] > 1:
+                self.flag[i] = 1
+
+    def _rand_another(self, idx):
+        pool = np.where(self.flag == self.flag[idx])[0]
+        return np.random.choice(pool)
+
+    def __getitem__(self, idx):
+        if self.test_mode:
+            return self.prepare_test_img(idx)
+        while True:
+            img_info = self.img_infos[idx]
+            ann_info = self._load_ann_info(idx)
+
+            # load image
+            img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
+            if self.debug:
+                show_ann(self.coco, img, ann_info)
+
+            # load proposals if necessary
+            if self.proposals is not None:
+                proposals = self.proposals[idx][:self.num_max_proposals, :4]
+                # TODO: Handle empty proposals properly. Currently images with
+                # no proposals are simply skipped, although in principle they
+                # could still be used for training.
+                if len(proposals) == 0:
+                    idx = self._rand_another(idx)
+                    continue
+
+            ann = self._parse_ann_info(ann_info, self.with_mask)
+            gt_bboxes = ann['bboxes']
+            gt_labels = ann['labels']
+            gt_bboxes_ignore = ann['bboxes_ignore']
+            # skip the image if there is no valid gt bbox
+            if len(gt_bboxes) == 0:
+                idx = self._rand_another(idx)
+                continue
+
+            # apply transforms
+            flip = True if np.random.rand() < self.flip_ratio else False
+            img_scale = random_scale(self.img_scales)  # sample a scale
+            img, img_shape, pad_shape, scale_factor = self.img_transform(
+                img, img_scale, flip)
+            if self.proposals is not None:
+                proposals = self.bbox_transform(proposals, img_shape,
+                                                scale_factor, flip)
+            gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor,
+                                            flip)
+            gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape,
+                                                   scale_factor, flip)
+
+            if self.with_mask:
+                gt_masks = self.mask_transform(ann['masks'], pad_shape,
+                                               scale_factor, flip)
+
+            ori_shape = (img_info['height'], img_info['width'], 3)
+            img_meta = dict(
+                ori_shape=ori_shape,
+                img_shape=img_shape,
+                pad_shape=pad_shape,
+                scale_factor=scale_factor,
+                flip=flip)
+
+            data = dict(
+                img=DC(to_tensor(img), stack=True),
+                img_meta=DC(img_meta, cpu_only=True),
+                gt_bboxes=DC(to_tensor(gt_bboxes)))
+            if self.proposals is not None:
+                data['proposals'] = DC(to_tensor(proposals))
+            if self.with_label:
+                data['gt_labels'] = DC(to_tensor(gt_labels))
+            if self.with_crowd:
+                data['gt_bboxes_ignore'] = DC(to_tensor(gt_bboxes_ignore))
+            if self.with_mask:
+                data['gt_masks'] = DC(gt_masks, cpu_only=True)
+            return data
+
+    def prepare_test_img(self, idx):
+        """Prepare an image for testing (multi-scale and flipping)"""
+        img_info = self.img_infos[idx]
+        img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name']))
+        proposal = (self.proposals[idx][:, :4]
+                    if self.proposals is not None else None)
+
+        def prepare_single(img, scale, flip, proposal=None):
+            _img, img_shape, pad_shape, scale_factor = self.img_transform(
+                img, scale, flip)
+            _img = to_tensor(_img)
+            _img_meta = dict(
+                ori_shape=(img_info['height'], img_info['width'], 3),
+                img_shape=img_shape,
+                pad_shape=pad_shape,
+                scale_factor=scale_factor,
+                flip=flip)
+            if proposal is not None:
+                _proposal = self.bbox_transform(proposal, img_shape,
+                                                scale_factor, flip)
+                _proposal = to_tensor(_proposal)
+            else:
+                _proposal = None
+            return _img, _img_meta, _proposal
+
+        imgs = []
+        img_metas = []
+        proposals = []
+        for scale in self.img_scales:
+            _img, _img_meta, _proposal = prepare_single(
+                img, scale, False, proposal)
+            imgs.append(_img)
+            img_metas.append(DC(_img_meta, cpu_only=True))
+            proposals.append(_proposal)
+            if self.flip_ratio > 0:
+                _img, _img_meta, _proposal = prepare_single(
+                    img, scale, True, proposal)
+                imgs.append(_img)
+                img_metas.append(DC(_img_meta, cpu_only=True))
+                proposals.append(_proposal)
+        data = dict(img=imgs, img_meta=img_metas)
+        if self.proposals is not None:
+            data['proposals'] = proposals
+        return data
diff --git a/mmdet/datasets/loader/__init__.py b/mmdet/datasets/loader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d4fdd2cbbe85b26b4c5fa4898315accbe94c0a
--- /dev/null
+++ b/mmdet/datasets/loader/__init__.py
@@ -0,0 +1,6 @@
+from .build_loader import build_dataloader
+from .sampler import GroupSampler, DistributedGroupSampler
+
+__all__ = [
+    'GroupSampler', 'DistributedGroupSampler', 'build_dataloader'
+]
diff --git a/mmdet/datasets/loader/build_loader.py b/mmdet/datasets/loader/build_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..761d9aea1884c8741a6653d5e9405ff5acc530a9
--- /dev/null
+++ b/mmdet/datasets/loader/build_loader.py
@@ -0,0 +1,44 @@
+from functools import partial
+
+from mmcv.runner import get_dist_info
+from mmcv.parallel import collate
+from torch.utils.data import DataLoader
+
+from .sampler import GroupSampler, DistributedGroupSampler
+
+# https://github.com/pytorch/pytorch/issues/973
+import resource
+rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
+
+
+def build_dataloader(dataset,
+                     imgs_per_gpu,
+                     workers_per_gpu,
+                     num_gpus=1,
+                     dist=True,
+                     **kwargs):
+    if dist:
+        rank, world_size = get_dist_info()
+        sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size,
+                                          rank)
+        batch_size = imgs_per_gpu
+        num_workers = workers_per_gpu
+    else:
+        sampler = GroupSampler(dataset, imgs_per_gpu)
+        batch_size = num_gpus * imgs_per_gpu
+        num_workers = num_gpus * workers_per_gpu
+
+    if not kwargs.get('shuffle', True):
+        sampler = None
+
+    data_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
+        pin_memory=False,
+        **kwargs)
+
+    return data_loader
diff --git a/mmdet/datasets/loader/sampler.py b/mmdet/datasets/loader/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c060cd926ea50d232d0f765b86933ca8fad0969
--- /dev/null
+++ b/mmdet/datasets/loader/sampler.py
@@ -0,0 +1,132 @@
+from __future__ import division
+
+import math
+import torch
+import numpy as np
+
+from torch.distributed import get_world_size, get_rank
+from torch.utils.data.sampler import Sampler
+
+
+class GroupSampler(Sampler):
+
+    def __init__(self, dataset, samples_per_gpu=1):
+        assert hasattr(dataset, 'flag')
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.flag = dataset.flag.astype(np.int64)
+        self.group_sizes = np.bincount(self.flag)
+        self.num_samples = 0
+        for i, size in enumerate(self.group_sizes):
+            self.num_samples += int(np.ceil(
+                size / self.samples_per_gpu)) * self.samples_per_gpu
+
+    def __iter__(self):
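+        # shuffle indices within each aspect-ratio group, pad every group to a
+        # multiple of samples_per_gpu, then permute GPU-sized chunks so each
+        # chunk contains images from a single group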
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size == 0:
+                continue
+            indice = np.where(self.flag == i)[0]
+            assert len(indice) == size
+            np.random.shuffle(indice)
+            num_extra = int(np.ceil(size / self.samples_per_gpu)
+                            ) * self.samples_per_gpu - len(indice)
+            indice = np.concatenate([indice, indice[:num_extra]])
+            indices.append(indice)
+        indices = np.concatenate(indices)
+        indices = [
+            indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
+            for i in np.random.permutation(
+                range(len(indices) // self.samples_per_gpu))
+        ]
+        indices = np.concatenate(indices)
+        indices = torch.from_numpy(indices).long()
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+
+class DistributedGroupSampler(Sampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Arguments:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None):
+        if num_replicas is None:
+            num_replicas = get_world_size()
+        if rank is None:
+            rank = get_rank()
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+
+        self.num_samples = 0
+        for size in self.group_sizes:
+            self.num_samples += int(
+                math.ceil(size * 1.0 / self.samples_per_gpu /
+                          self.num_replicas)) * self.samples_per_gpu
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+
+        indices = []
+        for i, size in enumerate(self.group_sizes):
+            if size > 0:
+                indice = np.where(self.flag == i)[0]
+                assert len(indice) == size
+                indice = indice[list(torch.randperm(int(size),
+                                                    generator=g))].tolist()
+                extra = int(
+                    math.ceil(
+                        size * 1.0 / self.samples_per_gpu / self.num_replicas)
+                ) * self.samples_per_gpu * self.num_replicas - len(indice)
+                indice += indice[:extra]
+                indices += indice
+
+        assert len(indices) == self.total_size
+
+        indices = [
+            indices[j] for i in list(
+                torch.randperm(
+                    len(indices) // self.samples_per_gpu, generator=g))
+            for j in range(i * self.samples_per_gpu, (i + 1) *
+                           self.samples_per_gpu)
+        ]
+
+        # subsample
+        offset = self.num_samples * self.rank
+        indices = indices[offset:offset + self.num_samples]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
diff --git a/mmdet/datasets/transforms.py b/mmdet/datasets/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddb2fb2c2f483326e8703a108d086a919542b212
--- /dev/null
+++ b/mmdet/datasets/transforms.py
@@ -0,0 +1,115 @@
+import mmcv
+import numpy as np
+import torch
+
+__all__ = ['ImageTransform', 'BboxTransform', 'MaskTransform', 'Numpy2Tensor']
+
+
+class ImageTransform(object):
+    """Preprocess an image.
+
+    1. rescale the image to expected size
+    2. normalize the image
+    3. flip the image (if needed)
+    4. pad the image (if needed)
+    5. transpose to (c, h, w)
+    """
+
+    def __init__(self,
+                 mean=(0, 0, 0),
+                 std=(1, 1, 1),
+                 to_rgb=True,
+                 size_divisor=None):
+        self.mean = np.array(mean, dtype=np.float32)
+        self.std = np.array(std, dtype=np.float32)
+        self.to_rgb = to_rgb
+        self.size_divisor = size_divisor
+
+    def __call__(self, img, scale, flip=False):
+        img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
+        img_shape = img.shape
+        img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
+        if flip:
+            img = mmcv.imflip(img)
+        if self.size_divisor is not None:
+            img = mmcv.impad_to_multiple(img, self.size_divisor)
+            pad_shape = img.shape
+        else:
+            pad_shape = img_shape
+        img = img.transpose(2, 0, 1)
+        return img, img_shape, pad_shape, scale_factor
+
+
+def bbox_flip(bboxes, img_shape):
+    """Flip bboxes horizontally.
+
+    Args:
+        bboxes(ndarray): shape (..., 4*k)
+        img_shape(tuple): (height, width)
+    """
+    assert bboxes.shape[-1] % 4 == 0
+    w = img_shape[1]
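+    # horizontal flip in inclusive pixel coordinates:
+    # new_x1 = w - x2 - 1, new_x2 = w - x1 - 1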
+    flipped = bboxes.copy()
+    flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
+    flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
+    return flipped
+
+
+class BboxTransform(object):
+    """Preprocess gt bboxes.
+
+    1. rescale bboxes according to image size
+    2. flip bboxes (if needed)
+    3. pad the first dimension to `max_num_gts`
+    """
+
+    def __init__(self, max_num_gts=None):
+        self.max_num_gts = max_num_gts
+
+    def __call__(self, bboxes, img_shape, scale_factor, flip=False):
+        gt_bboxes = bboxes * scale_factor
+        if flip:
+            gt_bboxes = bbox_flip(gt_bboxes, img_shape)
+        gt_bboxes[:, 0::2] = np.clip(gt_bboxes[:, 0::2], 0, img_shape[1])
+        gt_bboxes[:, 1::2] = np.clip(gt_bboxes[:, 1::2], 0, img_shape[0])
+        if self.max_num_gts is None:
+            return gt_bboxes
+        else:
+            num_gts = gt_bboxes.shape[0]
+            padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32)
+            padded_bboxes[:num_gts, :] = gt_bboxes
+            return padded_bboxes
+
+
+class MaskTransform(object):
+    """Preprocess masks.
+
+    1. resize masks to expected size and stack to a single array
+    2. flip the masks (if needed)
+    3. pad the masks (if needed)
+    """
+
+    def __call__(self, masks, pad_shape, scale_factor, flip=False):
+        masks = [
+            mmcv.imrescale(mask, scale_factor, interpolation='nearest')
+            for mask in masks
+        ]
+        if flip:
+            masks = [mask[:, ::-1] for mask in masks]
+        padded_masks = [
+            mmcv.impad(mask, pad_shape[:2], pad_val=0) for mask in masks
+        ]
+        padded_masks = np.stack(padded_masks, axis=0)
+        return padded_masks
+
+
+class Numpy2Tensor(object):
+
+    def __init__(self):
+        pass
+
+    def __call__(self, *args):
+        if len(args) == 1:
+            return torch.from_numpy(args[0])
+        else:
+            return tuple([torch.from_numpy(np.array(array)) for array in args])
diff --git a/mmdet/datasets/utils.py b/mmdet/datasets/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a248ef6890ea348ea7ad98154cc163ae1e035c5
--- /dev/null
+++ b/mmdet/datasets/utils.py
@@ -0,0 +1,69 @@
+from collections import Sequence
+
+import mmcv
+import torch
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def to_tensor(data):
+    """Convert objects of various python types to :obj:`torch.Tensor`.
+
+    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+    :class:`Sequence`, :class:`int` and :class:`float`.
+    """
+    if isinstance(data, torch.Tensor):
+        return data
+    elif isinstance(data, np.ndarray):
+        return torch.from_numpy(data)
+    elif isinstance(data, Sequence) and not mmcv.is_str(data):
+        return torch.tensor(data)
+    elif isinstance(data, int):
+        return torch.LongTensor([data])
+    elif isinstance(data, float):
+        return torch.FloatTensor([data])
+    else:
+        raise TypeError('type {} cannot be converted to tensor.'.format(
+            type(data)))
+
+
+def random_scale(img_scales, mode='range'):
+    """Randomly select a scale from a list of scales or scale ranges.
+
+    Args:
+        img_scales (list[tuple]): Image scale or scale range.
+        mode (str): "range" or "value".
+
+    Returns:
+        tuple: Sampled image scale.
+    """
+    num_scales = len(img_scales)
+    if num_scales == 1:  # fixed scale is specified
+        img_scale = img_scales[0]
+    elif num_scales == 2:  # randomly sample a scale
+        if mode == 'range':
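+            # sample the long and short edges independently from the ranges
+            # spanned by the two scales, e.g. [(1333, 640), (1333, 800)] gives
+            # a short edge in [640, 800] and a long edge of 1333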
+            img_scale_long = [max(s) for s in img_scales]
+            img_scale_short = [min(s) for s in img_scales]
+            long_edge = np.random.randint(
+                min(img_scale_long),
+                max(img_scale_long) + 1)
+            short_edge = np.random.randint(
+                min(img_scale_short),
+                max(img_scale_short) + 1)
+            img_scale = (long_edge, short_edge)
+        elif mode == 'value':
+            img_scale = img_scales[np.random.randint(num_scales)]
+    else:
+        if mode != 'value':
+            raise ValueError(
+                'Only "value" mode supports more than 2 image scales')
+        img_scale = img_scales[np.random.randint(num_scales)]
+    return img_scale
+
+
+def show_ann(coco, img, ann_info):
+    plt.imshow(mmcv.bgr2rgb(img))
+    plt.axis('off')
+    coco.showAnns(ann_info)
+    plt.show()
diff --git a/mmdet/models/__init__.py b/mmdet/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8232fda616c11deefd20ba34a12a0679db7b3e4d
--- /dev/null
+++ b/mmdet/models/__init__.py
@@ -0,0 +1,11 @@
+from .detectors import (BaseDetector, TwoStageDetector, RPN, FastRCNN,
+                        FasterRCNN, MaskRCNN)
+from .builder import (build_backbone, build_neck, build_rpn_head,
+                      build_roi_extractor, build_bbox_head, build_mask_head,
+                      build_detector)
+
+__all__ = [
+    'BaseDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN',
+    'MaskRCNN', 'build_backbone', 'build_neck', 'build_rpn_head',
+    'build_roi_extractor', 'build_bbox_head', 'build_mask_head',
+    'build_detector'
+]
diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f82f92aad10ed86b6528f0554615d7e9589ce1c
--- /dev/null
+++ b/mmdet/models/backbones/__init__.py
@@ -0,0 +1,3 @@
+from .resnet import ResNet
+
+__all__ = ['ResNet']
diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..66684b154b5aea3364789495b43c8b31ab97745b
--- /dev/null
+++ b/mmdet/models/backbones/resnet.py
@@ -0,0 +1,313 @@
+import logging
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+
+from mmcv.cnn import constant_init, kaiming_init
+from mmcv.runner import load_checkpoint
+
+
+def conv3x3(in_planes, out_planes, stride=1, dilation=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=dilation,
+        dilation=dilation,
+        bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride, dilation)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        assert not with_cp
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 dilation=1,
+                 downsample=None,
+                 style='pytorch',
+                 with_cp=False):
+        """Bottleneck block.
+        If style is "pytorch", the stride-two layer is the 3x3 conv layer,
+        if it is "caffe", the stride-two layer is the first 1x1 conv layer.
+        """
+        super(Bottleneck, self).__init__()
+        assert style in ['pytorch', 'caffe']
+        if style == 'pytorch':
+            conv1_stride = 1
+            conv2_stride = stride
+        else:
+            conv1_stride = stride
+            conv2_stride = 1
+        self.conv1 = nn.Conv2d(
+            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
+        self.conv2 = nn.Conv2d(
+            planes,
+            planes,
+            kernel_size=3,
+            stride=conv2_stride,
+            padding=dilation,
+            dilation=dilation,
+            bias=False)
+
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(
+            planes, planes * self.expansion, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.with_cp = with_cp
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            residual = x
+
+            out = self.conv1(x)
+            out = self.bn1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.bn2(out)
+            out = self.relu(out)
+
+            out = self.conv3(out)
+            out = self.bn3(out)
+
+            if self.downsample is not None:
+                residual = self.downsample(x)
+
+            out += residual
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+def make_res_layer(block,
+                   inplanes,
+                   planes,
+                   blocks,
+                   stride=1,
+                   dilation=1,
+                   style='pytorch',
+                   with_cp=False):
+    downsample = None
+    if stride != 1 or inplanes != planes * block.expansion:
+        downsample = nn.Sequential(
+            nn.Conv2d(
+                inplanes,
+                planes * block.expansion,
+                kernel_size=1,
+                stride=stride,
+                bias=False),
+            nn.BatchNorm2d(planes * block.expansion),
+        )
+
+    layers = []
+    layers.append(
+        block(
+            inplanes,
+            planes,
+            stride,
+            dilation,
+            downsample,
+            style=style,
+            with_cp=with_cp))
+    inplanes = planes * block.expansion
+    for i in range(1, blocks):
+        layers.append(
+            block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))
+
+    return nn.Sequential(*layers)
+
+
+class ResNet(nn.Module):
+    """ResNet backbone.
+
+    Args:
+        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+        num_stages (int): Resnet stages, normally 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        bn_eval (bool): Whether to set BN layers to eval mode, namely, freeze
+            running stats (mean and var).
+        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+    """
+
+    arch_settings = {
+        18: (BasicBlock, (2, 2, 2, 2)),
+        34: (BasicBlock, (3, 4, 6, 3)),
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 depth,
+                 num_stages=4,
+                 strides=(1, 2, 2, 2),
+                 dilations=(1, 1, 1, 1),
+                 out_indices=(0, 1, 2, 3),
+                 style='pytorch',
+                 frozen_stages=-1,
+                 bn_eval=True,
+                 bn_frozen=False,
+                 with_cp=False):
+        super(ResNet, self).__init__()
+        if depth not in self.arch_settings:
+            raise KeyError('invalid depth {} for resnet'.format(depth))
+        assert num_stages >= 1 and num_stages <= 4
+        block, stage_blocks = self.arch_settings[depth]
+        stage_blocks = stage_blocks[:num_stages]
+        assert len(strides) == len(dilations) == num_stages
+        assert max(out_indices) < num_stages
+
+        self.out_indices = out_indices
+        self.style = style
+        self.frozen_stages = frozen_stages
+        self.bn_eval = bn_eval
+        self.bn_frozen = bn_frozen
+        self.with_cp = with_cp
+
+        self.inplanes = 64
+        self.conv1 = nn.Conv2d(
+            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.res_layers = []
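+        # build the residual stages; base channels double every stage
+        # (64, 128, 256, 512) while stride and dilation follow the settings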
+        for i, num_blocks in enumerate(stage_blocks):
+            stride = strides[i]
+            dilation = dilations[i]
+            planes = 64 * 2**i
+            res_layer = make_res_layer(
+                block,
+                self.inplanes,
+                planes,
+                num_blocks,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                with_cp=with_cp)
+            self.inplanes = planes * block.expansion
+            layer_name = 'layer{}'.format(i + 1)
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1)
+
+    def init_weights(self, pretrained=None):
+        if isinstance(pretrained, str):
+            logger = logging.getLogger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, nn.BatchNorm2d):
+                    constant_init(m, 1)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+    def train(self, mode=True):
+        super(ResNet, self).train(mode)
+        if self.bn_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
+                    if self.bn_frozen:
+                        for params in m.parameters():
+                            params.requires_grad = False
+        if mode and self.frozen_stages >= 0:
+            for param in self.conv1.parameters():
+                param.requires_grad = False
+            for param in self.bn1.parameters():
+                param.requires_grad = False
+            self.bn1.eval()
+            self.bn1.weight.requires_grad = False
+            self.bn1.bias.requires_grad = False
+            for i in range(1, self.frozen_stages + 1):
+                mod = getattr(self, 'layer{}'.format(i))
+                mod.eval()
+                for param in mod.parameters():
+                    param.requires_grad = False
diff --git a/mmdet/models/bbox_heads/__init__.py b/mmdet/models/bbox_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..49a863594290ce0b0e748ffc45c6d4a4381e2140
--- /dev/null
+++ b/mmdet/models/bbox_heads/__init__.py
@@ -0,0 +1,4 @@
+from .bbox_head import BBoxHead
+from .convfc_bbox_head import ConvFCRoIHead, SharedFCRoIHead
+
+__all__ = ['BBoxHead', 'ConvFCRoIHead', 'SharedFCRoIHead']
diff --git a/mmdet/models/bbox_heads/bbox_head.py b/mmdet/models/bbox_heads/bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..67dba03959231b5ed0f784ac97542911b56cc785
--- /dev/null
+++ b/mmdet/models/bbox_heads/bbox_head.py
@@ -0,0 +1,120 @@
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.core import (delta2bbox, multiclass_nms, bbox_target,
+                        weighted_cross_entropy, weighted_smoothl1, accuracy)
+
+
+class BBoxHead(nn.Module):
+    """Simplest RoI head, with only two fc layers for classification and
+    regression respectively"""
+
+    def __init__(self,
+                 with_avg_pool=False,
+                 with_cls=True,
+                 with_reg=True,
+                 roi_feat_size=7,
+                 in_channels=256,
+                 num_classes=81,
+                 target_means=[0., 0., 0., 0.],
+                 target_stds=[0.1, 0.1, 0.2, 0.2],
+                 reg_class_agnostic=False):
+        super(BBoxHead, self).__init__()
+        assert with_cls or with_reg
+        self.with_avg_pool = with_avg_pool
+        self.with_cls = with_cls
+        self.with_reg = with_reg
+        self.roi_feat_size = roi_feat_size
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.target_means = target_means
+        self.target_stds = target_stds
+        self.reg_class_agnostic = reg_class_agnostic
+
+        in_channels = self.in_channels
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(roi_feat_size)
+        else:
+            in_channels *= (self.roi_feat_size * self.roi_feat_size)
+        if self.with_cls:
+            self.fc_cls = nn.Linear(in_channels, num_classes)
+        if self.with_reg:
+            out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes
+            self.fc_reg = nn.Linear(in_channels, out_dim_reg)
+        self.debug_imgs = None
+
+    def init_weights(self):
+        if self.with_cls:
+            nn.init.normal_(self.fc_cls.weight, 0, 0.01)
+            nn.init.constant_(self.fc_cls.bias, 0)
+        if self.with_reg:
+            nn.init.normal_(self.fc_reg.weight, 0, 0.001)
+            nn.init.constant_(self.fc_reg.bias, 0)
+
+    def forward(self, x):
+        if self.with_avg_pool:
+            x = self.avg_pool(x)
+        x = x.view(x.size(0), -1)
+        cls_score = self.fc_cls(x) if self.with_cls else None
+        bbox_pred = self.fc_reg(x) if self.with_reg else None
+        return cls_score, bbox_pred
+
+    def get_bbox_target(self, pos_proposals, neg_proposals, pos_gt_bboxes,
+                        pos_gt_labels, rcnn_train_cfg):
+        reg_num_classes = 1 if self.reg_class_agnostic else self.num_classes
+        cls_reg_targets = bbox_target(
+            pos_proposals,
+            neg_proposals,
+            pos_gt_bboxes,
+            pos_gt_labels,
+            rcnn_train_cfg,
+            reg_num_classes,
+            target_means=self.target_means,
+            target_stds=self.target_stds)
+        return cls_reg_targets
+
+    def loss(self, cls_score, bbox_pred, labels, label_weights, bbox_targets,
+             bbox_weights):
+        losses = dict()
+        if cls_score is not None:
+            losses['loss_cls'] = weighted_cross_entropy(
+                cls_score, labels, label_weights)
+            losses['acc'] = accuracy(cls_score, labels)
+        if bbox_pred is not None:
+            losses['loss_reg'] = weighted_smoothl1(
+                bbox_pred,
+                bbox_targets,
+                bbox_weights,
+                avg_factor=bbox_targets.size(0))
+        return losses
+
+    def get_det_bboxes(self,
+                       rois,
+                       cls_score,
+                       bbox_pred,
+                       img_shape,
+                       scale_factor,
+                       rescale=False,
+                       nms_cfg=None):
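+        # decode box deltas into absolute boxes, optionally rescale them to
+        # the original image scale, then either return raw (bboxes, scores)
+        # for external merging or apply multi-class NMS when nms_cfg is given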
+        if isinstance(cls_score, list):
+            cls_score = sum(cls_score) / float(len(cls_score))
+        scores = F.softmax(cls_score, dim=1) if cls_score is not None else None
+
+        if bbox_pred is not None:
+            bboxes = delta2bbox(rois[:, 1:], bbox_pred, self.target_means,
+                                self.target_stds, img_shape)
+        else:
+            bboxes = rois[:, 1:]
+            # TODO: add clip here
+
+        if rescale:
+            bboxes /= scale_factor
+
+        if nms_cfg is None:
+            return bboxes, scores
+        else:
+            det_bboxes, det_labels = multiclass_nms(
+                bboxes, scores, nms_cfg.score_thr, nms_cfg.nms_thr,
+                nms_cfg.max_per_img)
+
+            return det_bboxes, det_labels
diff --git a/mmdet/models/bbox_heads/convfc_bbox_head.py b/mmdet/models/bbox_heads/convfc_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7bd7f80a9fc00bd3fc020ccd7d834eb45905067
--- /dev/null
+++ b/mmdet/models/bbox_heads/convfc_bbox_head.py
@@ -0,0 +1,178 @@
+import torch.nn as nn
+
+from .bbox_head import BBoxHead
+from ..utils import ConvModule
+
+
+class ConvFCRoIHead(BBoxHead):
+    """More general bbox head, with shared conv and fc layers and two optional
+    separated branches.
+
+                                /-> cls convs -> cls fcs -> cls
+    shared convs -> shared fcs
+                                \-> reg convs -> reg fcs -> reg
+    """
+
+    def __init__(self,
+                 num_shared_convs=0,
+                 num_shared_fcs=0,
+                 num_cls_convs=0,
+                 num_cls_fcs=0,
+                 num_reg_convs=0,
+                 num_reg_fcs=0,
+                 conv_out_channels=256,
+                 fc_out_channels=1024,
+                 *args,
+                 **kwargs):
+        super(ConvFCRoIHead, self).__init__(*args, **kwargs)
+        assert (num_shared_convs + num_shared_fcs + num_cls_convs + num_cls_fcs
+                + num_reg_convs + num_reg_fcs > 0)
+        if num_cls_convs > 0 or num_reg_convs > 0:
+            assert num_shared_fcs == 0
+        if not self.with_cls:
+            assert num_cls_convs == 0 and num_cls_fcs == 0
+        if not self.with_reg:
+            assert num_reg_convs == 0 and num_reg_fcs == 0
+        self.num_shared_convs = num_shared_convs
+        self.num_shared_fcs = num_shared_fcs
+        self.num_cls_convs = num_cls_convs
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_convs = num_reg_convs
+        self.num_reg_fcs = num_reg_fcs
+        self.conv_out_channels = conv_out_channels
+        self.fc_out_channels = fc_out_channels
+
+        # add shared convs and fcs
+        self.shared_convs, self.shared_fcs, last_layer_dim = \
+            self._add_conv_fc_branch(
+                self.num_shared_convs, self.num_shared_fcs, self.in_channels,
+                True)
+        self.shared_out_channels = last_layer_dim
+
+        # add cls specific branch
+        self.cls_convs, self.cls_fcs, self.cls_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels)
+
+        # add reg specific branch
+        self.reg_convs, self.reg_fcs, self.reg_last_dim = \
+            self._add_conv_fc_branch(
+                self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels)
+
+        if self.num_shared_fcs == 0 and not self.with_avg_pool:
+            if self.num_cls_fcs == 0:
+                self.cls_last_dim *= (self.roi_feat_size * self.roi_feat_size)
+            if self.num_reg_fcs == 0:
+                self.reg_last_dim *= (self.roi_feat_size * self.roi_feat_size)
+
+        self.relu = nn.ReLU(inplace=True)
+        # reconstruct fc_cls and fc_reg since input channels are changed
+        if self.with_cls:
+            self.fc_cls = nn.Linear(self.cls_last_dim, self.num_classes)
+        if self.with_reg:
+            out_dim_reg = (4 if self.reg_class_agnostic else
+                           4 * self.num_classes)
+            self.fc_reg = nn.Linear(self.reg_last_dim, out_dim_reg)
+
+    def _add_conv_fc_branch(self,
+                            num_branch_convs,
+                            num_branch_fcs,
+                            in_channels,
+                            is_shared=False):
+        """Add shared or separable branch
+
+        convs -> avg pool (optional) -> fcs
+        """
+        last_layer_dim = in_channels
+        # add branch specific conv layers
+        branch_convs = nn.ModuleList()
+        if num_branch_convs > 0:
+            for i in range(num_branch_convs):
+                conv_in_channels = (last_layer_dim
+                                    if i == 0 else self.conv_out_channels)
+                branch_convs.append(
+                    ConvModule(
+                        conv_in_channels,
+                        self.conv_out_channels,
+                        3,
+                        padding=1,
+                        normalize=self.normalize,
+                        bias=self.with_bias))
+            last_layer_dim = self.conv_out_channels
+        # add branch specific fc layers
+        branch_fcs = nn.ModuleList()
+        if num_branch_fcs > 0:
+            # for shared branch, only consider self.with_avg_pool
+            # for separated branches, also consider self.num_shared_fcs
+            if (is_shared
+                    or self.num_shared_fcs == 0) and not self.with_avg_pool:
+                last_layer_dim *= (self.roi_feat_size * self.roi_feat_size)
+            for i in range(num_branch_fcs):
+                fc_in_channels = (last_layer_dim
+                                  if i == 0 else self.fc_out_channels)
+                branch_fcs.append(
+                    nn.Linear(fc_in_channels, self.fc_out_channels))
+            last_layer_dim = self.fc_out_channels
+        return branch_convs, branch_fcs, last_layer_dim
+
+    def init_weights(self):
+        super(ConvFCRoIHead, self).init_weights()
+        for module_list in [self.shared_fcs, self.cls_fcs, self.reg_fcs]:
+            for m in module_list.modules():
+                if isinstance(m, nn.Linear):
+                    nn.init.xavier_uniform_(m.weight)
+                    nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        # shared part
+        if self.num_shared_convs > 0:
+            for conv in self.shared_convs:
+                x = conv(x)
+
+        if self.num_shared_fcs > 0:
+            if self.with_avg_pool:
+                x = self.avg_pool(x)
+            x = x.view(x.size(0), -1)
+            for fc in self.shared_fcs:
+                x = self.relu(fc(x))
+        # separate branches
+        x_cls = x
+        x_reg = x
+
+        for conv in self.cls_convs:
+            x_cls = conv(x_cls)
+        if x_cls.dim() > 2:
+            if self.with_avg_pool:
+                x_cls = self.avg_pool(x_cls)
+            x_cls = x_cls.view(x_cls.size(0), -1)
+        for fc in self.cls_fcs:
+            x_cls = self.relu(fc(x_cls))
+
+        for conv in self.reg_convs:
+            x_reg = conv(x_reg)
+        if x_reg.dim() > 2:
+            if self.with_avg_pool:
+                x_reg = self.avg_pool(x_reg)
+            x_reg = x_reg.view(x_reg.size(0), -1)
+        for fc in self.reg_fcs:
+            x_reg = self.relu(fc(x_reg))
+
+        cls_score = self.fc_cls(x_cls) if self.with_cls else None
+        bbox_pred = self.fc_reg(x_reg) if self.with_reg else None
+        return cls_score, bbox_pred
+
+
+class SharedFCRoIHead(ConvFCRoIHead):
+
+    def __init__(self, num_fcs=2, fc_out_channels=1024, *args, **kwargs):
+        assert num_fcs >= 1
+        super(SharedFCRoIHead, self).__init__(
+            num_shared_convs=0,
+            num_shared_fcs=num_fcs,
+            num_cls_convs=0,
+            num_cls_fcs=0,
+            num_reg_convs=0,
+            num_reg_fcs=0,
+            fc_out_channels=fc_out_channels,
+            *args,
+            **kwargs)
diff --git a/mmdet/models/builder.py b/mmdet/models/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee5ae0b14b01e147f5f9199141709bdac4dbe0af
--- /dev/null
+++ b/mmdet/models/builder.py
@@ -0,0 +1,52 @@
+from mmcv.runner import obj_from_dict
+from torch import nn
+
+from . import (backbones, necks, roi_extractors, rpn_heads, bbox_heads,
+               mask_heads)
+
+__all__ = [
+    'build_backbone', 'build_neck', 'build_rpn_head', 'build_roi_extractor',
+    'build_bbox_head', 'build_mask_head', 'build_detector'
+]
+
+
+def _build_module(cfg, parent=None, default_args=None):
+    return cfg if isinstance(cfg, nn.Module) else obj_from_dict(
+        cfg, parent, default_args)
+
+
+def build(cfg, parent=None, default_args=None):
+    if isinstance(cfg, list):
+        modules = [_build_module(cfg_, parent, default_args) for cfg_ in cfg]
+        return nn.Sequential(*modules)
+    else:
+        return _build_module(cfg, parent, default_args)
+
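+# illustrative note: build(dict(type='FPN', in_channels=[256],
+# out_channels=256, num_outs=1), necks) resolves to
+# necks.FPN(in_channels=[256], out_channels=256, num_outs=1), and a list of
+# cfg dicts is built into an nn.Sequential of the resulting modules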
+
+def build_backbone(cfg):
+    return build(cfg, backbones)
+
+
+def build_neck(cfg):
+    return build(cfg, necks)
+
+
+def build_rpn_head(cfg):
+    return build(cfg, rpn_heads)
+
+
+def build_roi_extractor(cfg):
+    return build(cfg, roi_extractors)
+
+
+def build_bbox_head(cfg):
+    return build(cfg, bbox_heads)
+
+
+def build_mask_head(cfg):
+    return build(cfg, mask_heads)
+
+
+def build_detector(cfg, train_cfg=None, test_cfg=None):
+    from . import detectors
+    return build(cfg, detectors, dict(train_cfg=train_cfg, test_cfg=test_cfg))
diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a784d5f345605d08df19f258e541f99dc1a794e4
--- /dev/null
+++ b/mmdet/models/detectors/__init__.py
@@ -0,0 +1,11 @@
+from .base import BaseDetector
+from .two_stage import TwoStageDetector
+from .rpn import RPN
+from .fast_rcnn import FastRCNN
+from .faster_rcnn import FasterRCNN
+from .mask_rcnn import MaskRCNN
+
+__all__ = [
+    'BaseDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN',
+    'MaskRCNN'
+]
diff --git a/mmdet/models/detectors/base.py b/mmdet/models/detectors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d26dc3a5aba773e521f3ffdcaf9ee7958b88843
--- /dev/null
+++ b/mmdet/models/detectors/base.py
@@ -0,0 +1,116 @@
+import logging
+from abc import ABCMeta, abstractmethod
+
+import mmcv
+import numpy as np
+import torch.nn as nn
+
+from mmdet.core import tensor2imgs, get_classes
+
+
+class BaseDetector(nn.Module):
+    """Base class for detectors"""
+
+    __metaclass__ = ABCMeta
+
+    def __init__(self):
+        super(BaseDetector, self).__init__()
+
+    @property
+    def with_neck(self):
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_bbox(self):
+        return hasattr(self, 'bbox_head') and self.bbox_head is not None
+
+    @property
+    def with_mask(self):
+        return hasattr(self, 'mask_head') and self.mask_head is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        pass
+
+    def extract_feats(self, imgs):
+        assert isinstance(imgs, list)
+        for img in imgs:
+            yield self.extract_feat(img)
+
+    @abstractmethod
+    def forward_train(self, imgs, img_metas, **kwargs):
+        pass
+
+    @abstractmethod
+    def simple_test(self, img, img_meta, **kwargs):
+        pass
+
+    @abstractmethod
+    def aug_test(self, imgs, img_metas, **kwargs):
+        pass
+
+    def init_weights(self, pretrained=None):
+        if pretrained is not None:
+            logger = logging.getLogger()
+            logger.info('load model from: {}'.format(pretrained))
+
+    def forward_test(self, imgs, img_metas, **kwargs):
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(
+                'num of augmentations ({}) != num of image meta ({})'.format(
+                    len(imgs), len(img_metas)))
+        # TODO: remove the restriction of imgs_per_gpu == 1 when prepared
+        imgs_per_gpu = imgs[0].size(0)
+        assert imgs_per_gpu == 1
+
+        if num_augs == 1:
+            return self.simple_test(imgs[0], img_metas[0], **kwargs)
+        else:
+            return self.aug_test(imgs, img_metas, **kwargs)
+
+    def forward(self, img, img_meta, return_loss=True, **kwargs):
+        if return_loss:
+            return self.forward_train(img, img_meta, **kwargs)
+        else:
+            return self.forward_test(img, img_meta, **kwargs)
+
+    def show_result(self,
+                    data,
+                    result,
+                    img_norm_cfg,
+                    dataset='coco',
+                    score_thr=0.3):
+        img_tensor = data['img'][0]
+        img_metas = data['img_meta'][0].data[0]
+        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        assert len(imgs) == len(img_metas)
+
+        if isinstance(dataset, str):
+            class_names = get_classes(dataset)
+        elif isinstance(dataset, list):
+            class_names = dataset
+        else:
+            raise TypeError('dataset must be a valid dataset name or a list'
+                            ' of class names, not {}'.format(type(dataset)))
+
+        for img, img_meta in zip(imgs, img_metas):
+            h, w, _ = img_meta['img_shape']
+            img_show = img[:h, :w, :]
+            labels = [
+                np.full(bbox.shape[0], i, dtype=np.int32)
+                for i, bbox in enumerate(result)
+            ]
+            labels = np.concatenate(labels)
+            bboxes = np.vstack(result)
+            mmcv.imshow_det_bboxes(
+                img_show,
+                bboxes,
+                labels,
+                class_names=class_names,
+                score_thr=score_thr)
diff --git a/mmdet/models/detectors/fast_rcnn.py b/mmdet/models/detectors/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd80a87f69d67a2c77378c926a39c2ddb3208ac0
--- /dev/null
+++ b/mmdet/models/detectors/fast_rcnn.py
@@ -0,0 +1,46 @@
+from .two_stage import TwoStageDetector
+
+
+class FastRCNN(TwoStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 bbox_roi_extractor,
+                 bbox_head,
+                 train_cfg,
+                 test_cfg,
+                 mask_roi_extractor=None,
+                 mask_head=None,
+                 pretrained=None):
+        super(FastRCNN, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_roi_extractor=bbox_roi_extractor,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            mask_roi_extractor=mask_roi_extractor,
+            mask_head=mask_head,
+            pretrained=pretrained)
+
+    def forward_test(self, imgs, img_metas, proposals, **kwargs):
+        for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+            if not isinstance(var, list):
+                raise TypeError('{} must be a list, but got {}'.format(
+                    name, type(var)))
+
+        num_augs = len(imgs)
+        if num_augs != len(img_metas):
+            raise ValueError(
+                'num of augmentations ({}) != num of image meta ({})'.format(
+                    len(imgs), len(img_metas)))
+        # TODO: remove the restriction of imgs_per_gpu == 1 when prepared
+        imgs_per_gpu = imgs[0].size(0)
+        assert imgs_per_gpu == 1
+
+        if num_augs == 1:
+            return self.simple_test(imgs[0], img_metas[0], proposals[0],
+                                    **kwargs)
+        else:
+            return self.aug_test(imgs, img_metas, proposals, **kwargs)
diff --git a/mmdet/models/detectors/faster_rcnn.py b/mmdet/models/detectors/faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd31f60c1d819b6c7ba47a67ecb3285a46e09636
--- /dev/null
+++ b/mmdet/models/detectors/faster_rcnn.py
@@ -0,0 +1,23 @@
+from .two_stage import TwoStageDetector
+
+
+class FasterRCNN(TwoStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 bbox_roi_extractor,
+                 bbox_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(FasterRCNN, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained)
diff --git a/mmdet/models/detectors/mask_rcnn.py b/mmdet/models/detectors/mask_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..25a363e398f6c0d01e2f8bd53e05c9046a5275ac
--- /dev/null
+++ b/mmdet/models/detectors/mask_rcnn.py
@@ -0,0 +1,34 @@
+from .two_stage import TwoStageDetector
+
+
+class MaskRCNN(TwoStageDetector):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 bbox_roi_extractor,
+                 bbox_head,
+                 mask_roi_extractor,
+                 mask_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(MaskRCNN, self).__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            bbox_roi_extractor=bbox_roi_extractor,
+            bbox_head=bbox_head,
+            mask_roi_extractor=mask_roi_extractor,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            pretrained=pretrained)
+
+    def show_result(self, data, result, img_norm_cfg, **kwargs):
+        # TODO: show segmentation masks
+        assert isinstance(result, tuple)
+        assert len(result) == 2  # (bbox_results, segm_results)
+        super(MaskRCNN, self).show_result(data, result[0], img_norm_cfg,
+                                          **kwargs)
diff --git a/mmdet/models/detectors/rpn.py b/mmdet/models/detectors/rpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d700fe3e3c3af357256b36f1582c6a8c7249580
--- /dev/null
+++ b/mmdet/models/detectors/rpn.py
@@ -0,0 +1,85 @@
+import mmcv
+
+from mmdet.core import tensor2imgs, bbox_mapping
+from .base import BaseDetector
+from .test_mixins import RPNTestMixin
+from .. import builder
+
+
+class RPN(BaseDetector, RPNTestMixin):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 train_cfg,
+                 test_cfg,
+                 pretrained=None):
+        super(RPN, self).__init__()
+        self.backbone = builder.build_backbone(backbone)
+        self.neck = builder.build_neck(neck) if neck is not None else None
+        self.rpn_head = builder.build_rpn_head(rpn_head)
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+        self.init_weights(pretrained=pretrained)
+
+    def init_weights(self, pretrained=None):
+        super(RPN, self).init_weights(pretrained)
+        self.backbone.init_weights(pretrained=pretrained)
+        if self.with_neck:
+            self.neck.init_weights()
+        self.rpn_head.init_weights()
+
+    def extract_feat(self, img):
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def forward_train(self, img, img_meta, gt_bboxes=None):
+        if self.train_cfg.rpn.get('debug', False):
+            self.rpn_head.debug_imgs = tensor2imgs(img)
+
+        x = self.extract_feat(img)
+        rpn_outs = self.rpn_head(x)
+
+        rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta, self.train_cfg.rpn)
+        losses = self.rpn_head.loss(*rpn_loss_inputs)
+        return losses
+
+    def simple_test(self, img, img_meta, rescale=False):
+        x = self.extract_feat(img)
+        proposal_list = self.simple_test_rpn(x, img_meta, self.test_cfg.rpn)
+        if rescale:
+            for proposals, meta in zip(proposal_list, img_meta):
+                proposals[:, :4] /= meta['scale_factor']
+        # TODO: remove this restriction
+        return proposal_list[0].cpu().numpy()
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        proposal_list = self.aug_test_rpn(
+            self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
+        if not rescale:
+            for proposals, img_meta in zip(proposal_list, img_metas[0]):
+                img_shape = img_meta['img_shape']
+                scale_factor = img_meta['scale_factor']
+                flip = img_meta['flip']
+                proposals[:, :4] = bbox_mapping(proposals[:, :4], img_shape,
+                                                scale_factor, flip)
+        # TODO: remove this restriction
+        return proposal_list[0].cpu().numpy()
+
+    def show_result(self, data, result, img_norm_cfg):
+        """Show RPN proposals on the image.
+
+        Although we assume batch size is 1, this method supports arbitrary
+        batch size.
+        """
+        img_tensor = data['img'][0]
+        img_metas = data['img_meta'][0].data[0]
+        imgs = tensor2imgs(img_tensor, **img_norm_cfg)
+        assert len(imgs) == len(img_metas)
+        for img, img_meta in zip(imgs, img_metas):
+            h, w, _ = img_meta['img_shape']
+            img_show = img[:h, :w, :]
+            mmcv.imshow_bboxes(img_show, result, top_k=20)
diff --git a/mmdet/models/detectors/test_mixins.py b/mmdet/models/detectors/test_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..38136f47545c49d88253fee321c91f9408058ca9
--- /dev/null
+++ b/mmdet/models/detectors/test_mixins.py
@@ -0,0 +1,145 @@
+from mmdet.core import (bbox2roi, bbox_mapping, merge_aug_proposals,
+                        merge_aug_bboxes, merge_aug_masks, multiclass_nms)
+
+
+class RPNTestMixin(object):
+
+    def simple_test_rpn(self, x, img_meta, rpn_test_cfg):
+        rpn_outs = self.rpn_head(x)
+        proposal_inputs = rpn_outs + (img_meta, rpn_test_cfg)
+        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
+        return proposal_list
+
+    def aug_test_rpn(self, feats, img_metas, rpn_test_cfg):
+        imgs_per_gpu = len(img_metas[0])
+        aug_proposals = [[] for _ in range(imgs_per_gpu)]
+        for x, img_meta in zip(feats, img_metas):
+            proposal_list = self.simple_test_rpn(x, img_meta, rpn_test_cfg)
+            for i, proposals in enumerate(proposal_list):
+                aug_proposals[i].append(proposals)
+        # after merging, proposals will be rescaled to the original image size
+        merged_proposals = [
+            merge_aug_proposals(proposals, img_meta, rpn_test_cfg)
+            for proposals, img_meta in zip(aug_proposals, img_metas)
+        ]
+        return merged_proposals
+
+
+class BBoxTestMixin(object):
+
+    def simple_test_bboxes(self,
+                           x,
+                           img_meta,
+                           proposals,
+                           rcnn_test_cfg,
+                           rescale=False):
+        """Test only det bboxes without augmentation."""
+        rois = bbox2roi(proposals)
+        roi_feats = self.bbox_roi_extractor(
+            x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+        cls_score, bbox_pred = self.bbox_head(roi_feats)
+        img_shape = img_meta[0]['img_shape']
+        scale_factor = img_meta[0]['scale_factor']
+        det_bboxes, det_labels = self.bbox_head.get_det_bboxes(
+            rois,
+            cls_score,
+            bbox_pred,
+            img_shape,
+            scale_factor,
+            rescale=rescale,
+            nms_cfg=rcnn_test_cfg)
+        return det_bboxes, det_labels
+
+    def aug_test_bboxes(self, feats, img_metas, proposal_list, rcnn_test_cfg):
+        aug_bboxes = []
+        aug_scores = []
+        for x, img_meta in zip(feats, img_metas):
+            # only one image in the batch
+            img_shape = img_meta[0]['img_shape']
+            scale_factor = img_meta[0]['scale_factor']
+            flip = img_meta[0]['flip']
+            # TODO more flexible
+            proposals = bbox_mapping(proposal_list[0][:, :4], img_shape,
+                                     scale_factor, flip)
+            rois = bbox2roi([proposals])
+            # recompute feature maps to save GPU memory
+            roi_feats = self.bbox_roi_extractor(
+                x[:len(self.bbox_roi_extractor.featmap_strides)], rois)
+            cls_score, bbox_pred = self.bbox_head(roi_feats)
+            bboxes, scores = self.bbox_head.get_det_bboxes(
+                rois,
+                cls_score,
+                bbox_pred,
+                img_shape,
+                scale_factor,
+                rescale=False,
+                nms_cfg=None)
+            aug_bboxes.append(bboxes)
+            aug_scores.append(scores)
+        # after merging, bboxes will be rescaled to the original image size
+        merged_bboxes, merged_scores = merge_aug_bboxes(
+            aug_bboxes, aug_scores, img_metas, self.test_cfg.rcnn)
+        det_bboxes, det_labels = multiclass_nms(
+            merged_bboxes, merged_scores, self.test_cfg.rcnn.score_thr,
+            self.test_cfg.rcnn.nms_thr, self.test_cfg.rcnn.max_per_img)
+        return det_bboxes, det_labels
+
+
+class MaskTestMixin(object):
+
+    def simple_test_mask(self,
+                         x,
+                         img_meta,
+                         det_bboxes,
+                         det_labels,
+                         rescale=False):
+        # image shape of the first image in the batch (only one)
+        ori_shape = img_meta[0]['ori_shape']
+        scale_factor = img_meta[0]['scale_factor']
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
+        else:
+            # if det_bboxes is rescaled to the original image size, we need to
+            # rescale it back to the testing scale to obtain RoIs.
+            _bboxes = (det_bboxes[:, :4] * scale_factor
+                       if rescale else det_bboxes)
+            mask_rois = bbox2roi([_bboxes])
+            mask_feats = self.mask_roi_extractor(
+                x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois)
+            mask_pred = self.mask_head(mask_feats)
+            segm_result = self.mask_head.get_seg_masks(
+                mask_pred, _bboxes, det_labels, self.test_cfg.rcnn, ori_shape,
+                scale_factor, rescale)
+        return segm_result
+
+    def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels):
+        if det_bboxes.shape[0] == 0:
+            segm_result = [[] for _ in range(self.mask_head.num_classes - 1)]
+        else:
+            aug_masks = []
+            for x, img_meta in zip(feats, img_metas):
+                img_shape = img_meta[0]['img_shape']
+                scale_factor = img_meta[0]['scale_factor']
+                flip = img_meta[0]['flip']
+                _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape,
+                                       scale_factor, flip)
+                mask_rois = bbox2roi([_bboxes])
+                mask_feats = self.mask_roi_extractor(
+                    x[:len(self.mask_roi_extractor.featmap_strides)],
+                    mask_rois)
+                mask_pred = self.mask_head(mask_feats)
+                # convert to numpy array to save memory
+                aug_masks.append(mask_pred.sigmoid().cpu().numpy())
+            merged_masks = merge_aug_masks(aug_masks, img_metas,
+                                           self.test_cfg.rcnn)
+
+            ori_shape = img_metas[0][0]['ori_shape']
+            segm_result = self.mask_head.get_seg_masks(
+                merged_masks,
+                det_bboxes,
+                det_labels,
+                self.test_cfg.rcnn,
+                ori_shape,
+                scale_factor=1.0,
+                rescale=False)
+        return segm_result
diff --git a/mmdet/models/detectors/two_stage.py b/mmdet/models/detectors/two_stage.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a818d164c4946399a89676dc1a84eeda9e0ff6
--- /dev/null
+++ b/mmdet/models/detectors/two_stage.py
@@ -0,0 +1,190 @@
+import torch
+import torch.nn as nn
+
+from .base import BaseDetector
+from .test_mixins import RPNTestMixin, BBoxTestMixin, MaskTestMixin
+from .. import builder
+from mmdet.core import sample_bboxes, bbox2roi, bbox2result, multi_apply
+
+
+class TwoStageDetector(BaseDetector, RPNTestMixin, BBoxTestMixin,
+                       MaskTestMixin):
+
+    def __init__(self,
+                 backbone,
+                 neck=None,
+                 rpn_head=None,
+                 bbox_roi_extractor=None,
+                 bbox_head=None,
+                 mask_roi_extractor=None,
+                 mask_head=None,
+                 train_cfg=None,
+                 test_cfg=None,
+                 pretrained=None):
+        super(TwoStageDetector, self).__init__()
+        self.backbone = builder.build_backbone(backbone)
+
+        if neck is not None:
+            self.neck = builder.build_neck(neck)
+        else:
+            raise NotImplementedError
+
+        if rpn_head is not None:
+            self.rpn_head = builder.build_rpn_head(rpn_head)
+
+        if bbox_head is not None:
+            self.bbox_roi_extractor = builder.build_roi_extractor(
+                bbox_roi_extractor)
+            self.bbox_head = builder.build_bbox_head(bbox_head)
+
+        if mask_head is not None:
+            self.mask_roi_extractor = builder.build_roi_extractor(
+                mask_roi_extractor)
+            self.mask_head = builder.build_mask_head(mask_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        self.init_weights(pretrained=pretrained)
+
+    @property
+    def with_rpn(self):
+        return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+    def init_weights(self, pretrained=None):
+        super(TwoStageDetector, self).init_weights(pretrained)
+        self.backbone.init_weights(pretrained=pretrained)
+        if self.with_neck:
+            if isinstance(self.neck, nn.Sequential):
+                for m in self.neck:
+                    m.init_weights()
+            else:
+                self.neck.init_weights()
+        if self.with_rpn:
+            self.rpn_head.init_weights()
+        if self.with_bbox:
+            self.bbox_roi_extractor.init_weights()
+            self.bbox_head.init_weights()
+        if self.with_mask:
+            self.mask_roi_extractor.init_weights()
+            self.mask_head.init_weights()
+
+    def extract_feat(self, img):
+        x = self.backbone(img)
+        if self.with_neck:
+            x = self.neck(x)
+        return x
+
+    def forward_train(self,
+                      img,
+                      img_meta,
+                      gt_bboxes,
+                      gt_bboxes_ignore,
+                      gt_labels,
+                      gt_masks=None,
+                      proposals=None):
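+        # training pipeline: extract features, compute RPN losses and
+        # proposals (unless precomputed proposals are provided), sample
+        # positive/negative RoIs per image, then compute bbox (and
+        # optionally mask) losses on the sampled RoIs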
+        losses = dict()
+
+        x = self.extract_feat(img)
+
+        if self.with_rpn:
+            rpn_outs = self.rpn_head(x)
+            rpn_loss_inputs = rpn_outs + (gt_bboxes, img_meta,
+                                          self.train_cfg.rpn)
+            rpn_losses = self.rpn_head.loss(*rpn_loss_inputs)
+            losses.update(rpn_losses)
+
+            proposal_inputs = rpn_outs + (img_meta, self.test_cfg.rpn)
+            proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
+        else:
+            proposal_list = proposals
+
+        if self.with_bbox:
+            (pos_proposals, neg_proposals, pos_assigned_gt_inds, pos_gt_bboxes,
+             pos_gt_labels) = multi_apply(
+                 sample_bboxes,
+                 proposal_list,
+                 gt_bboxes,
+                 gt_bboxes_ignore,
+                 gt_labels,
+                 cfg=self.train_cfg.rcnn)
+            (labels, label_weights, bbox_targets,
+             bbox_weights) = self.bbox_head.get_bbox_target(
+                 pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
+                 self.train_cfg.rcnn)
+
+            rois = bbox2roi([
+                torch.cat([pos, neg], dim=0)
+                for pos, neg in zip(pos_proposals, neg_proposals)
+            ])
+            # TODO: a more flexible way to configure feat maps
+            roi_feats = self.bbox_roi_extractor(
+                x[:self.bbox_roi_extractor.num_inputs], rois)
+            cls_score, bbox_pred = self.bbox_head(roi_feats)
+
+            loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels,
+                                            label_weights, bbox_targets,
+                                            bbox_weights)
+            losses.update(loss_bbox)
+
+        if self.with_mask:
+            mask_targets = self.mask_head.get_mask_target(
+                pos_proposals, pos_assigned_gt_inds, gt_masks,
+                self.train_cfg.rcnn)
+            pos_rois = bbox2roi(pos_proposals)
+            mask_feats = self.mask_roi_extractor(
+                x[:self.mask_roi_extractor.num_inputs], pos_rois)
+            mask_pred = self.mask_head(mask_feats)
+            loss_mask = self.mask_head.loss(mask_pred, mask_targets,
+                                            torch.cat(pos_gt_labels))
+            losses.update(loss_mask)
+
+        return losses
+
+    def simple_test(self, img, img_meta, proposals=None, rescale=False):
+        """Test without augmentation."""
+        assert self.with_bbox, "Bbox head must be implemented."
+
+        x = self.extract_feat(img)
+
+        proposal_list = self.simple_test_rpn(
+            x, img_meta,
+            self.test_cfg.rpn) if proposals is None else proposals
+
+        det_bboxes, det_labels = self.simple_test_bboxes(
+            x, img_meta, proposal_list, self.test_cfg.rcnn, rescale=rescale)
+        bbox_results = bbox2result(det_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+
+        if not self.with_mask:
+            return bbox_results
+        else:
+            segm_results = self.simple_test_mask(
+                x, img_meta, det_bboxes, det_labels, rescale=rescale)
+            return bbox_results, segm_results
+
+    def aug_test(self, imgs, img_metas, rescale=False):
+        """Test with augmentations.
+
+        If rescale is False, then returned bboxes and masks will fit the scale
+        of imgs[0].
+        """
+        # recompute feats to save memory
+        proposal_list = self.aug_test_rpn(
+            self.extract_feats(imgs), img_metas, self.test_cfg.rpn)
+        det_bboxes, det_labels = self.aug_test_bboxes(
+            self.extract_feats(imgs), img_metas, proposal_list,
+            self.test_cfg.rcnn)
+
+        if rescale:
+            _det_bboxes = det_bboxes
+        else:
+            _det_bboxes = det_bboxes.clone()
+            _det_bboxes[:, :4] *= img_metas[0][0]['scale_factor']
+        bbox_results = bbox2result(_det_bboxes, det_labels,
+                                   self.bbox_head.num_classes)
+
+        # det_bboxes always keep the original scale
+        if self.with_mask:
+            segm_results = self.aug_test_mask(
+                self.extract_feats(imgs), img_metas, det_bboxes, det_labels)
+            return bbox_results, segm_results
+        else:
+            return bbox_results
diff --git a/mmdet/models/mask_heads/__init__.py b/mmdet/models/mask_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a21ae9add5a78d23781bf36a696b28606e19b0ce
--- /dev/null
+++ b/mmdet/models/mask_heads/__init__.py
@@ -0,0 +1,3 @@
+from .fcn_mask_head import FCNMaskHead
+
+__all__ = ['FCNMaskHead']
diff --git a/mmdet/models/mask_heads/fcn_mask_head.py b/mmdet/models/mask_heads/fcn_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba46bea77e16115378f5b8d36626e3097943bd75
--- /dev/null
+++ b/mmdet/models/mask_heads/fcn_mask_head.py
@@ -0,0 +1,154 @@
+import mmcv
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+import torch.nn as nn
+
+from ..utils import ConvModule
+from mmdet.core import mask_cross_entropy, mask_target
+
+
+class FCNMaskHead(nn.Module):
+
+    def __init__(self,
+                 num_convs=4,
+                 roi_feat_size=14,
+                 in_channels=256,
+                 conv_kernel_size=3,
+                 conv_out_channels=256,
+                 upsample_method='deconv',
+                 upsample_ratio=2,
+                 num_classes=81,
+                 class_agnostic=False,
+                 normalize=None):
+        super(FCNMaskHead, self).__init__()
+        if upsample_method not in [None, 'deconv', 'nearest', 'bilinear']:
+            raise ValueError(
+                'Invalid upsample method {}, accepted methods are '
+                'None, "deconv", "nearest", "bilinear"'.format(
+                    upsample_method))
+        self.num_convs = num_convs
+        self.roi_feat_size = roi_feat_size  # WARN: not used and reserved
+        self.in_channels = in_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_out_channels = conv_out_channels
+        self.upsample_method = upsample_method
+        self.upsample_ratio = upsample_ratio
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+        self.normalize = normalize
+        self.with_bias = normalize is None
+
+        self.convs = nn.ModuleList()
+        for i in range(self.num_convs):
+            in_channels = (self.in_channels
+                           if i == 0 else self.conv_out_channels)
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    padding=padding,
+                    normalize=normalize,
+                    bias=self.with_bias))
+        if self.upsample_method is None:
+            self.upsample = None
+        elif self.upsample_method == 'deconv':
+            self.upsample = nn.ConvTranspose2d(
+                self.conv_out_channels,
+                self.conv_out_channels,
+                self.upsample_ratio,
+                stride=self.upsample_ratio)
+        else:
+            self.upsample = nn.Upsample(
+                scale_factor=self.upsample_ratio, mode=self.upsample_method)
+
+        out_channels = 1 if self.class_agnostic else self.num_classes
+        self.conv_logits = nn.Conv2d(self.conv_out_channels, out_channels, 1)
+        self.relu = nn.ReLU(inplace=True)
+        self.debug_imgs = None
+
+    def init_weights(self):
+        for m in [self.upsample, self.conv_logits]:
+            # skip None and parameter-free layers such as nn.Upsample
+            if m is None or not hasattr(m, 'weight'):
+                continue
+            nn.init.kaiming_normal_(
+                m.weight, mode='fan_out', nonlinearity='relu')
+            nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        for conv in self.convs:
+            x = conv(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+            if self.upsample_method == 'deconv':
+                x = self.relu(x)
+        mask_pred = self.conv_logits(x)
+        return mask_pred
+
+    def get_mask_target(self, pos_proposals, pos_assigned_gt_inds, gt_masks,
+                        rcnn_train_cfg):
+        mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds,
+                                   gt_masks, rcnn_train_cfg)
+        return mask_targets
+
+    def loss(self, mask_pred, mask_targets, labels):
+        loss = dict()
+        loss_mask = mask_cross_entropy(mask_pred, mask_targets, labels)
+        loss['loss_mask'] = loss_mask
+        return loss
+
+    def get_seg_masks(self, mask_pred, det_bboxes, det_labels, rcnn_test_cfg,
+                      ori_shape, scale_factor, rescale):
+        """Get segmentation masks from mask_pred and bboxes.
+
+        Args:
+            mask_pred (Tensor or ndarray): shape (n, #class+1, h, w).
+                For single-scale testing, mask_pred is the direct output of
+                model, whose type is Tensor, while for multi-scale testing,
+                it will be converted to numpy array outside of this method.
+            det_bboxes (Tensor): shape (n, 4/5)
+            det_labels (Tensor): shape (n, )
+            rcnn_test_cfg (dict): rcnn testing config
+            ori_shape (tuple): original image shape (h, w, c)
+            scale_factor (float): scale factor from the original image to the
+                testing scale
+            rescale (bool): whether to paste masks on a canvas of the
+                original image size (True) or of the testing scale (False)
+
+        Returns:
+            list[list]: encoded masks
+        """
+        if isinstance(mask_pred, torch.Tensor):
+            mask_pred = mask_pred.sigmoid().cpu().numpy()
+        assert isinstance(mask_pred, np.ndarray)
+
+        cls_segms = [[] for _ in range(self.num_classes - 1)]
+        bboxes = det_bboxes.cpu().numpy()[:, :4]
+        labels = det_labels.cpu().numpy() + 1
+
+        if rescale:
+            img_h, img_w = ori_shape[:2]
+        else:
+            img_h = np.round(ori_shape[0] * scale_factor).astype(np.int32)
+            img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
+            scale_factor = 1.0
+
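+        # for each detection: resize its predicted mask to the bbox size,
+        # binarize with mask_thr_binary, paste it into a full-image canvas
+        # and encode the canvas as COCO RLE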
+        for i in range(bboxes.shape[0]):
+            bbox = (bboxes[i, :] / scale_factor).astype(np.int32)
+            label = labels[i]
+            w = max(bbox[2] - bbox[0] + 1, 1)
+            h = max(bbox[3] - bbox[1] + 1, 1)
+
+            if not self.class_agnostic:
+                mask_pred_ = mask_pred[i, label, :, :]
+            else:
+                mask_pred_ = mask_pred[i, 0, :, :]
+            im_mask = np.zeros((img_h, img_w), dtype=np.uint8)
+
+            bbox_mask = mmcv.imresize(mask_pred_, (w, h))
+            bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype(
+                np.uint8)
+            im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask
+            rle = mask_util.encode(
+                np.array(im_mask[:, :, np.newaxis], order='F'))[0]
+            cls_segms[label - 1].append(rle)
+
+        return cls_segms
diff --git a/mmdet/models/necks/__init__.py b/mmdet/models/necks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0093021ebac1e46fbb798ed6ee96a192dbd8604c
--- /dev/null
+++ b/mmdet/models/necks/__init__.py
@@ -0,0 +1,3 @@
+from .fpn import FPN
+
+__all__ = ['FPN']
diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a256cae3647bcafa54ee2671cb7167f75fc9f95
--- /dev/null
+++ b/mmdet/models/necks/fpn.py
@@ -0,0 +1,126 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from ..utils import ConvModule, xavier_init
+
+
+class FPN(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 normalize=None,
+                 activation=None):
+        super(FPN, self).__init__()
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        self.activation = activation
+        self.with_bias = normalize is None
+
+        if end_level == -1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level < inputs, no extra level is allowed
+            self.backbone_end_level = end_level
+            assert end_level <= len(in_channels)
+            assert num_outs == end_level - start_level
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_convs = nn.ModuleList()
+
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = ConvModule(
+                in_channels[i],
+                out_channels,
+                1,
+                normalize=normalize,
+                bias=self.with_bias,
+                activation=self.activation,
+                inplace=False)
+            fpn_conv = ConvModule(
+                out_channels,
+                out_channels,
+                3,
+                padding=1,
+                normalize=normalize,
+                bias=self.with_bias,
+                activation=self.activation,
+                inplace=False)
+
+            self.lateral_convs.append(l_conv)
+            self.fpn_convs.append(fpn_conv)
+
+            # lvl_id = i - self.start_level
+            # setattr(self, 'lateral_conv{}'.format(lvl_id), l_conv)
+            # setattr(self, 'fpn_conv{}'.format(lvl_id), fpn_conv)
+
+        # add extra conv layers (e.g., RetinaNet)
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
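+        # e.g. with 4 backbone levels, start_level=0 and num_outs=5 there is
+        # one extra level, produced either by the stride-2 convs below (when
+        # add_extra_convs) or by max pooling in forward()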
+        if add_extra_convs and extra_levels >= 1:
+            for i in range(extra_levels):
+                in_channels = (self.in_channels[self.backbone_end_level - 1]
+                               if i == 0 else out_channels)
+                extra_fpn_conv = ConvModule(
+                    in_channels,
+                    out_channels,
+                    3,
+                    stride=2,
+                    padding=1,
+                    normalize=normalize,
+                    bias=self.with_bias,
+                    activation=self.activation,
+                    inplace=False)
+                self.fpn_convs.append(extra_fpn_conv)
+
+    # initialize all conv layers (lateral and fpn convs) with Xavier uniform
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                xavier_init(m, distribution='uniform')
+
+    def forward(self, inputs):
+        assert len(inputs) == len(self.in_channels)
+
+        # build laterals
+        laterals = [
+            lateral_conv(inputs[i + self.start_level])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+
+        # build top-down path
+        used_backbone_levels = len(laterals)
+        for i in range(used_backbone_levels - 1, 0, -1):
+            laterals[i - 1] += F.interpolate(
+                laterals[i], scale_factor=2, mode='nearest')
+
+        # build outputs
+        # part 1: from original levels
+        outs = [
+            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+        ]
+        # part 2: add extra levels
+        if self.num_outs > len(outs):
+            # use max pool to get more levels on top of outputs
+            # (e.g., Faster R-CNN, Mask R-CNN)
+            if not self.add_extra_convs:
+                for i in range(self.num_outs - used_backbone_levels):
+                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+            # add conv layers on top of original feature maps (RetinaNet)
+            else:
+                orig = inputs[self.backbone_end_level - 1]
+                outs.append(self.fpn_convs[used_backbone_levels](orig))
+                for i in range(used_backbone_levels + 1, self.num_outs):
+                    # BUG: we should add relu before each extra conv
+                    outs.append(self.fpn_convs[i](outs[-1]))
+        return tuple(outs)
diff --git a/mmdet/models/roi_extractors/__init__.py b/mmdet/models/roi_extractors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9161708ce13fa4f0a6bb188e82a19a163b9b7e4f
--- /dev/null
+++ b/mmdet/models/roi_extractors/__init__.py
@@ -0,0 +1,3 @@
+from .single_level import SingleRoIExtractor
+
+__all__ = ['SingleRoIExtractor']
diff --git a/mmdet/models/roi_extractors/single_level.py b/mmdet/models/roi_extractors/single_level.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f97a631f987104422f65110a2cb6b49e080de0e
--- /dev/null
+++ b/mmdet/models/roi_extractors/single_level.py
@@ -0,0 +1,86 @@
+from __future__ import division
+
+import torch
+import torch.nn as nn
+
+from mmdet import ops
+
+
+class SingleRoIExtractor(nn.Module):
+    """Extract RoI features from a single level feature map.
+
+    If there are multiple input feature levels, each RoI is mapped to a level
+    according to its scale.
+
+    Args:
+        roi_layer (dict): Specify RoI layer type and arguments.
+        out_channels (int): Output channels of RoI layers.
+        featmap_strides (list[int]): Strides of the input feature maps.
+        finest_scale (int): Scale threshold of mapping to level 0.
+    """
+
+    def __init__(self,
+                 roi_layer,
+                 out_channels,
+                 featmap_strides,
+                 finest_scale=56):
+        super(SingleRoIExtractor, self).__init__()
+        self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides)
+        self.out_channels = out_channels
+        self.featmap_strides = featmap_strides
+        self.finest_scale = finest_scale
+
+    @property
+    def num_inputs(self):
+        """int: Input feature map levels."""
+        return len(self.featmap_strides)
+
+    def init_weights(self):
+        pass
+
+    def build_roi_layers(self, layer_cfg, featmap_strides):
+        cfg = layer_cfg.copy()
+        layer_type = cfg.pop('type')
+        assert hasattr(ops, layer_type)
+        layer_cls = getattr(ops, layer_type)
+        roi_layers = nn.ModuleList(
+            [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides])
+        return roi_layers
+
+    def map_roi_levels(self, rois, num_levels):
+        """Map rois to corresponding feature levels by scales.
+
+        - scale < finest_scale: level 0
+        - finest_scale <= scale < finest_scale * 2: level 1
+        - finest_scale * 2 <= scale < finest_scale * 4: level 2
+        - scale >= finest_scale * 4: level 3
+
+        Args:
+            rois (Tensor): Input RoIs, shape (k, 5).
+            num_levels (int): Total level number.
+
+        Returns:
+            Tensor: Level index (0-based) of each RoI, shape (k, )
+        """
+        scale = torch.sqrt(
+            (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1))
+        target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6))
+        target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long()
+        return target_lvls
+
+    def forward(self, feats, rois):
+        if len(feats) == 1:
+            return self.roi_layers[0](feats[0], rois)
+
+        out_size = self.roi_layers[0].out_size
+        num_levels = len(feats)
+        target_lvls = self.map_roi_levels(rois, num_levels)
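+        # note: the output buffer is allocated directly on the GPU, so this
+        # extractor currently assumes CUDA input tensors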
+        roi_feats = torch.cuda.FloatTensor(rois.size()[0], self.out_channels,
+                                           out_size, out_size).fill_(0)
+        for i in range(num_levels):
+            inds = target_lvls == i
+            if inds.any():
+                rois_ = rois[inds, :]
+                roi_feats_t = self.roi_layers[i](feats[i], rois_)
+                roi_feats[inds] += roi_feats_t
+        return roi_feats
diff --git a/mmdet/models/rpn_heads/__init__.py b/mmdet/models/rpn_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbc4b3affbf31059fdcbb1b4b43eeb1544c631f0
--- /dev/null
+++ b/mmdet/models/rpn_heads/__init__.py
@@ -0,0 +1,3 @@
+from .rpn_head import RPNHead
+
+__all__ = ['RPNHead']
diff --git a/mmdet/models/rpn_heads/rpn_head.py b/mmdet/models/rpn_heads/rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e6e199ac0407bd23226701e3117c02ec16171d
--- /dev/null
+++ b/mmdet/models/rpn_heads/rpn_head.py
@@ -0,0 +1,250 @@
+from __future__ import division
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmdet.core import (AnchorGenerator, anchor_target, delta2bbox,
+                        multi_apply, weighted_cross_entropy, weighted_smoothl1,
+                        weighted_binary_cross_entropy)
+from mmdet.ops import nms
+from ..utils import normal_init
+
+
+class RPNHead(nn.Module):
+    """Network head of RPN.
+
+                                  / - rpn_cls (1x1 conv)
+    input - rpn_conv (3x3 conv) -
+                                  \ - rpn_reg (1x1 conv)
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels for the RPN feature map.
+        anchor_scales (Iterable): Anchor scales.
+        anchor_ratios (Iterable): Anchor aspect ratios.
+        anchor_strides (Iterable): Anchor strides.
+        anchor_base_sizes (Iterable): Anchor base sizes.
+        target_means (Iterable): Mean values of regression targets.
+        target_stds (Iterable): Std values of regression targets.
+        use_sigmoid_cls (bool): Whether to use sigmoid loss for classification.
+            (softmax by default)
+    """
+
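+    # with the default 3 scales x 3 ratios there are 9 anchors per location:
+    # rpn_cls then outputs 9 channels (sigmoid) or 18 (softmax) and rpn_reg
+    # outputs 9 * 4 = 36 box delta channels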
+    def __init__(self,
+                 in_channels,
+                 feat_channels=256,
+                 anchor_scales=[8, 16, 32],
+                 anchor_ratios=[0.5, 1.0, 2.0],
+                 anchor_strides=[4, 8, 16, 32, 64],
+                 anchor_base_sizes=None,
+                 target_means=(.0, .0, .0, .0),
+                 target_stds=(1.0, 1.0, 1.0, 1.0),
+                 use_sigmoid_cls=False):
+        super(RPNHead, self).__init__()
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.anchor_scales = anchor_scales
+        self.anchor_ratios = anchor_ratios
+        self.anchor_strides = anchor_strides
+        self.anchor_base_sizes = list(
+            anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
+        self.target_means = target_means
+        self.target_stds = target_stds
+        self.use_sigmoid_cls = use_sigmoid_cls
+
+        self.anchor_generators = []
+        for anchor_base in self.anchor_base_sizes:
+            self.anchor_generators.append(
+                AnchorGenerator(anchor_base, anchor_scales, anchor_ratios))
+        self.rpn_conv = nn.Conv2d(in_channels, feat_channels, 3, padding=1)
+        self.relu = nn.ReLU(inplace=True)
+        self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)
+        out_channels = (self.num_anchors
+                        if self.use_sigmoid_cls else self.num_anchors * 2)
+        self.rpn_cls = nn.Conv2d(feat_channels, out_channels, 1)
+        self.rpn_reg = nn.Conv2d(feat_channels, self.num_anchors * 4, 1)
+        self.debug_imgs = None
+
+    def init_weights(self):
+        normal_init(self.rpn_conv, std=0.01)
+        normal_init(self.rpn_cls, std=0.01)
+        normal_init(self.rpn_reg, std=0.01)
+
+    def forward_single(self, x):
+        rpn_feat = self.relu(self.rpn_conv(x))
+        rpn_cls_score = self.rpn_cls(rpn_feat)
+        rpn_bbox_pred = self.rpn_reg(rpn_feat)
+        return rpn_cls_score, rpn_bbox_pred
+
+    def forward(self, feats):
+        return multi_apply(self.forward_single, feats)
+
+    def get_anchors(self, featmap_sizes, img_metas):
+        """Get anchors according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+
+        Returns:
+            tuple: anchors of each image, valid flags of each image
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since the feature map sizes of all images are the same, we only
+        # compute anchors once
+        multi_level_anchors = []
+        for i in range(num_levels):
+            anchors = self.anchor_generators[i].grid_anchors(
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_anchors.append(anchors)
+        anchor_list = [multi_level_anchors for _ in range(num_imgs)]
+
+        # for each image, we compute valid flags of multi level anchors
+        valid_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = []
+            for i in range(num_levels):
+                anchor_stride = self.anchor_strides[i]
+                feat_h, feat_w = featmap_sizes[i]
+                h, w, _ = img_meta['pad_shape']
+                valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h)
+                valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w)
+                flags = self.anchor_generators[i].valid_flags(
+                    (feat_h, feat_w), (valid_feat_h, valid_feat_w))
+                multi_level_flags.append(flags)
+            valid_flag_list.append(multi_level_flags)
+
+        return anchor_list, valid_flag_list
+
+    def loss_single(self, rpn_cls_score, rpn_bbox_pred, labels, label_weights,
+                    bbox_targets, bbox_weights, num_total_samples, cfg):
+        # classification loss
+        labels = labels.contiguous().view(-1)
+        label_weights = label_weights.contiguous().view(-1)
+        if self.use_sigmoid_cls:
+            rpn_cls_score = rpn_cls_score.permute(0, 2, 3,
+                                                  1).contiguous().view(-1)
+            criterion = weighted_binary_cross_entropy
+        else:
+            rpn_cls_score = rpn_cls_score.permute(0, 2, 3,
+                                                  1).contiguous().view(-1, 2)
+            criterion = weighted_cross_entropy
+        loss_cls = criterion(
+            rpn_cls_score, labels, label_weights, avg_factor=num_total_samples)
+        # regression loss
+        bbox_targets = bbox_targets.contiguous().view(-1, 4)
+        bbox_weights = bbox_weights.contiguous().view(-1, 4)
+        rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view(
+            -1, 4)
+        loss_reg = weighted_smoothl1(
+            rpn_bbox_pred,
+            bbox_targets,
+            bbox_weights,
+            beta=cfg.smoothl1_beta,
+            avg_factor=num_total_samples)
+        return loss_cls, loss_reg
+
+    def loss(self, rpn_cls_scores, rpn_bbox_preds, gt_bboxes, img_shapes, cfg):
+        featmap_sizes = [featmap.size()[-2:] for featmap in rpn_cls_scores]
+        assert len(featmap_sizes) == len(self.anchor_generators)
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, img_shapes)
+        cls_reg_targets = anchor_target(
+            anchor_list, valid_flag_list, gt_bboxes, img_shapes,
+            self.target_means, self.target_stds, cfg)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_samples) = cls_reg_targets
+        losses_cls, losses_reg = multi_apply(
+            self.loss_single,
+            rpn_cls_scores,
+            rpn_bbox_preds,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples,
+            cfg=cfg)
+        return dict(loss_rpn_cls=losses_cls, loss_rpn_reg=losses_reg)
+
+    def get_proposals(self, rpn_cls_scores, rpn_bbox_preds, img_meta, cfg):
+        num_imgs = len(img_meta)
+        featmap_sizes = [featmap.size()[-2:] for featmap in rpn_cls_scores]
+        mlvl_anchors = [
+            self.anchor_generators[idx].grid_anchors(featmap_sizes[idx],
+                                                     self.anchor_strides[idx])
+            for idx in range(len(featmap_sizes))
+        ]
+        proposal_list = []
+        for img_id in range(num_imgs):
+            rpn_cls_score_list = [
+                rpn_cls_scores[idx][img_id].detach()
+                for idx in range(len(rpn_cls_scores))
+            ]
+            rpn_bbox_pred_list = [
+                rpn_bbox_preds[idx][img_id].detach()
+                for idx in range(len(rpn_bbox_preds))
+            ]
+            assert len(rpn_cls_score_list) == len(rpn_bbox_pred_list)
+            proposals = self._get_proposals_single(
+                rpn_cls_score_list, rpn_bbox_pred_list, mlvl_anchors,
+                img_meta[img_id]['img_shape'], cfg)
+            proposal_list.append(proposals)
+        return proposal_list
+
+    def _get_proposals_single(self, rpn_cls_scores, rpn_bbox_preds,
+                              mlvl_anchors, img_shape, cfg):
+        mlvl_proposals = []
+        for idx in range(len(rpn_cls_scores)):
+            rpn_cls_score = rpn_cls_scores[idx]
+            rpn_bbox_pred = rpn_bbox_preds[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            anchors = mlvl_anchors[idx]
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.permute(1, 2,
+                                                      0).contiguous().view(-1)
+                rpn_cls_prob = rpn_cls_score.sigmoid()
+                scores = rpn_cls_prob
+            else:
+                rpn_cls_score = rpn_cls_score.permute(1, 2,
+                                                      0).contiguous().view(
+                                                          -1, 2)
+                rpn_cls_prob = F.softmax(rpn_cls_score, dim=1)
+                scores = rpn_cls_prob[:, 1]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).contiguous().view(
+                -1, 4)
+            _, order = scores.sort(0, descending=True)
+            if cfg.nms_pre > 0:
+                order = order[:cfg.nms_pre]
+                rpn_bbox_pred = rpn_bbox_pred[order, :]
+                anchors = anchors[order, :]
+                scores = scores[order]
+            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
+                                   self.target_stds, img_shape)
+            w = proposals[:, 2] - proposals[:, 0] + 1
+            h = proposals[:, 3] - proposals[:, 1] + 1
+            valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
+                                       (h >= cfg.min_bbox_size)).squeeze()
+            proposals = proposals[valid_inds, :]
+            scores = scores[valid_inds]
+            proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
+            nms_keep = nms(proposals, cfg.nms_thr)[:cfg.nms_post]
+            proposals = proposals[nms_keep, :]
+            mlvl_proposals.append(proposals)
+        proposals = torch.cat(mlvl_proposals, 0)
+        if cfg.nms_across_levels:
+            nms_keep = nms(proposals, cfg.nms_thr)[:cfg.max_num]
+            proposals = proposals[nms_keep, :]
+        else:
+            scores = proposals[:, 4]
+            _, order = scores.sort(0, descending=True)
+            num = min(cfg.max_num, proposals.shape[0])
+            order = order[:num]
+            proposals = proposals[order, :]
+        return proposals
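As a reference for the cfg object threaded through loss_single and _get_proposals_single above, here is a hedged sketch of the fields those two methods read directly (names taken from the code above; the values are illustrative only, not defaults from any shipped config, and anchor_target consumes further fields not listed here). Any attribute-style container such as mmcv.Config works, since the head only performs cfg.<field> lookups.

    from types import SimpleNamespace

    # Illustrative values only.
    train_rpn_cfg = SimpleNamespace(
        smoothl1_beta=1.0 / 9.0)      # read by loss_single()
    test_rpn_cfg = SimpleNamespace(
        nms_across_levels=False,      # run NMS per level, then merge
        nms_pre=2000,                 # top-k anchors kept per level before NMS
        nms_post=2000,                # proposals kept per level after NMS
        max_num=2000,                 # proposals kept per image in the end
        nms_thr=0.7,                  # IoU threshold passed to nms()
        min_bbox_size=0)              # drop boxes smaller than this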
diff --git a/mmdet/models/utils/__init__.py b/mmdet/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c759ca9aba1a07d983ae3a0d0305faab910b17a5
--- /dev/null
+++ b/mmdet/models/utils/__init__.py
@@ -0,0 +1,8 @@
+from .conv_module import ConvModule
+from .norm import build_norm_layer
+from .weight_init import xavier_init, normal_init, uniform_init, kaiming_init
+
+__all__ = [
+    'ConvModule', 'build_norm_layer', 'xavier_init', 'normal_init',
+    'uniform_init', 'kaiming_init'
+]
diff --git a/mmdet/models/utils/conv_module.py b/mmdet/models/utils/conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..25121972da29d8e4e83fb2301b8f8d25a1727f7e
--- /dev/null
+++ b/mmdet/models/utils/conv_module.py
@@ -0,0 +1,95 @@
+import warnings
+
+import torch.nn as nn
+
+from .norm import build_norm_layer
+
+
+class ConvModule(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True,
+                 normalize=None,
+                 activation='relu',
+                 inplace=True,
+                 activate_last=True):
+        super(ConvModule, self).__init__()
+        self.with_norm = normalize is not None
+        self.with_activation = activation is not None
+        self.with_bias = bias
+        self.activation = activation
+        self.activate_last = activate_last
+
+        if self.with_norm and self.with_bias:
+            warnings.warn('ConvModule has norm and bias at the same time')
+
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias=bias)
+
+        self.in_channels = self.conv.in_channels
+        self.out_channels = self.conv.out_channels
+        self.kernel_size = self.conv.kernel_size
+        self.stride = self.conv.stride
+        self.padding = self.conv.padding
+        self.dilation = self.conv.dilation
+        self.transposed = self.conv.transposed
+        self.output_padding = self.conv.output_padding
+        self.groups = self.conv.groups
+
+        if self.with_norm:
+            # self.norm_type, self.norm_params = parse_norm(normalize)
+            # assert self.norm_type in [None, 'BN', 'SyncBN', 'GN', 'SN']
+            # self.Norm2d = norm_cfg[self.norm_type]
+            if self.activate_last:
+                self.norm = build_norm_layer(normalize, out_channels)
+                # self.norm = self.Norm2d(out_channels, **self.norm_params)
+            else:
+                self.norm = build_norm_layer(normalize, in_channels)
+                # self.norm = self.Norm2d(in_channels, **self.norm_params)
+
+        if self.with_activation:
+            assert activation in ['relu'], 'Only ReLU supported.'
+            if self.activation == 'relu':
+                self.activate = nn.ReLU(inplace=inplace)
+
+        # Default using msra init
+        self.init_weights()
+
+    def init_weights(self):
+        nonlinearity = 'relu' if self.activation is None else self.activation
+        nn.init.kaiming_normal_(
+            self.conv.weight, mode='fan_out', nonlinearity=nonlinearity)
+        if self.with_bias:
+            nn.init.constant_(self.conv.bias, 0)
+        if self.with_norm:
+            nn.init.constant_(self.norm.weight, 1)
+            nn.init.constant_(self.norm.bias, 0)
+
+    def forward(self, x, activate=True, norm=True):
+        if self.activate_last:
+            x = self.conv(x)
+            if norm and self.with_norm:
+                x = self.norm(x)
+            if activate and self.with_activation:
+                x = self.activate(x)
+        else:
+            if norm and self.with_norm:
+                x = self.norm(x)
+            if activate and self.with_activation:
+                x = self.activate(x)
+            x = self.conv(x)
+        return x
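A short usage sketch (illustrative, not part of the patch): a 3x3 conv followed by BN and ReLU in the default activate_last=True ordering, with bias disabled because the norm layer already provides a shift term.

    import torch
    from mmdet.models.utils import ConvModule

    m = ConvModule(3, 16, 3, padding=1, bias=False, normalize=dict(type='BN'))
    x = torch.rand(2, 3, 32, 32)
    out = m(x)                      # conv + norm + activation
    feat = m(x, activate=False)     # conv + norm only
    print(out.shape)                # torch.Size([2, 16, 32, 32])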
diff --git a/mmdet/models/utils/norm.py b/mmdet/models/utils/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b82cd046e82e8ece24c5552687ae2952cfd9932
--- /dev/null
+++ b/mmdet/models/utils/norm.py
@@ -0,0 +1,17 @@
+import torch.nn as nn
+
+norm_cfg = {'BN': nn.BatchNorm2d, 'SyncBN': None, 'GN': None}
+
+
+def build_norm_layer(cfg, num_features):
+    assert isinstance(cfg, dict) and 'type' in cfg
+    cfg_ = cfg.copy()
+    cfg_.setdefault('eps', 1e-5)
+    layer_type = cfg_.pop('type')
+
+    if layer_type not in norm_cfg:
+        raise KeyError('Unrecognized norm type {}'.format(layer_type))
+    elif norm_cfg[layer_type] is None:
+        raise NotImplementedError
+
+    return norm_cfg[layer_type](num_features, **cfg_)
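For example (sketch, not from the patch), building a BN layer from a config dict; only 'BN' is implemented so far, while 'SyncBN' and 'GN' raise NotImplementedError.

    import torch.nn as nn
    from mmdet.models.utils import build_norm_layer

    bn = build_norm_layer(dict(type='BN', eps=1e-4), 64)
    assert isinstance(bn, nn.BatchNorm2d) and bn.eps == 1e-4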
diff --git a/mmdet/models/utils/weight_init.py b/mmdet/models/utils/weight_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e9b13b4fbc17d6d1986da876108c1a813190c2d
--- /dev/null
+++ b/mmdet/models/utils/weight_init.py
@@ -0,0 +1,39 @@
+import torch.nn as nn
+
+
+def xavier_init(module, gain=1, bias=0, distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.xavier_uniform_(module.weight, gain=gain)
+    else:
+        nn.init.xavier_normal_(module.weight, gain=gain)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def normal_init(module, mean=0, std=1, bias=0):
+    nn.init.normal_(module.weight, mean, std)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def uniform_init(module, a=0, b=1, bias=0):
+    nn.init.uniform_(module.weight, a, b)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
+
+def kaiming_init(module,
+                 mode='fan_out',
+                 nonlinearity='relu',
+                 bias=0,
+                 distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.kaiming_uniform_(
+            module.weight, mode=mode, nonlinearity=nonlinearity)
+    else:
+        nn.init.kaiming_normal_(
+            module.weight, mode=mode, nonlinearity=nonlinearity)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
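Typical usage of these helpers (sketch, not from the patch):

    import torch.nn as nn
    from mmdet.models.utils import kaiming_init, normal_init, xavier_init

    conv = nn.Conv2d(16, 32, 3)
    fc = nn.Linear(128, 10)
    kaiming_init(conv)                       # fan_out / relu, zero bias
    normal_init(fc, std=0.01)                # as used for the RPN layers above
    xavier_init(fc, distribution='uniform')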
diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b63224c3476ad189445fe2f6ee2b7182aee661a
--- /dev/null
+++ b/mmdet/ops/__init__.py
@@ -0,0 +1,5 @@
+from .nms import nms, soft_nms
+from .roi_align import RoIAlign, roi_align
+from .roi_pool import RoIPool, roi_pool
+
+__all__ = ['nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool']
diff --git a/mmdet/ops/nms/.gitignore b/mmdet/ops/nms/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..ce1da4c53c0301615c1f0ba3b01a859ad68259cb
--- /dev/null
+++ b/mmdet/ops/nms/.gitignore
@@ -0,0 +1 @@
+*.cpp
diff --git a/mmdet/ops/nms/Makefile b/mmdet/ops/nms/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..39556dd28ba76300d0f491cd5e66d4a4d19fc8ee
--- /dev/null
+++ b/mmdet/ops/nms/Makefile
@@ -0,0 +1,8 @@
+PYTHON ?= python
+
+all:
+	echo "Compiling nms kernels..."
+	$(PYTHON) setup.py build_ext --inplace
+
+clean:
+	rm *.so
diff --git a/mmdet/ops/nms/__init__.py b/mmdet/ops/nms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4407041ad733d51eca3006b8aefa82e02bbfcde
--- /dev/null
+++ b/mmdet/ops/nms/__init__.py
@@ -0,0 +1,3 @@
+from .nms_wrapper import nms, soft_nms
+
+__all__ = ['nms', 'soft_nms']
diff --git a/mmdet/ops/nms/cpu_nms.pyx b/mmdet/ops/nms/cpu_nms.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..1d0bef3321d78fc73556906649ab61eaaea60d86
--- /dev/null
+++ b/mmdet/ops/nms/cpu_nms.pyx
@@ -0,0 +1,68 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+    return a if a >= b else b
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+    return a if a <= b else b
+
+def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
+
+    cdef int ndets = dets.shape[0]
+    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
+            np.zeros((ndets), dtype=np.int)
+
+    # nominal indices
+    cdef int _i, _j
+    # sorted indices
+    cdef int i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:
+                suppressed[j] = 1
+
+    return keep
diff --git a/mmdet/ops/nms/cpu_soft_nms.pyx b/mmdet/ops/nms/cpu_soft_nms.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..05ec5a5446221d3593a10edfd4d714bfa6192309
--- /dev/null
+++ b/mmdet/ops/nms/cpu_soft_nms.pyx
@@ -0,0 +1,123 @@
+# ----------------------------------------------------------
+# Soft-NMS: Improving Object Detection With One Line of Code
+# Copyright (c) University of Maryland, College Park
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Navaneeth Bodla and Bharat Singh
+# ----------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+    return a if a >= b else b
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+    return a if a <= b else b
+
+def cpu_soft_nms(
+    np.ndarray[float, ndim=2] boxes_in,
+    float sigma=0.5,
+    float Nt=0.3,
+    float threshold=0.001,
+    unsigned int method=0
+):
+    boxes = boxes_in.copy()
+    cdef unsigned int N = boxes.shape[0]
+    cdef float iw, ih, box_area
+    cdef float ua
+    cdef int pos = 0
+    cdef float maxscore = 0
+    cdef int maxpos = 0
+    cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov
+    inds = np.arange(N)
+
+    for i in range(N):
+        maxscore = boxes[i, 4]
+        maxpos = i
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+        ti = inds[i]
+
+        pos = i + 1
+        # get max box
+        while pos < N:
+            if maxscore < boxes[pos, 4]:
+                maxscore = boxes[pos, 4]
+                maxpos = pos
+            pos = pos + 1
+
+        # add max box as a detection
+        boxes[i,0] = boxes[maxpos,0]
+        boxes[i,1] = boxes[maxpos,1]
+        boxes[i,2] = boxes[maxpos,2]
+        boxes[i,3] = boxes[maxpos,3]
+        boxes[i,4] = boxes[maxpos,4]
+        inds[i] = inds[maxpos]
+
+        # swap ith box with position of max box
+        boxes[maxpos,0] = tx1
+        boxes[maxpos,1] = ty1
+        boxes[maxpos,2] = tx2
+        boxes[maxpos,3] = ty2
+        boxes[maxpos,4] = ts
+        inds[maxpos] = ti
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # NMS iterations, note that N changes if detection boxes fall below
+        # threshold
+        while pos < N:
+            x1 = boxes[pos, 0]
+            y1 = boxes[pos, 1]
+            x2 = boxes[pos, 2]
+            y2 = boxes[pos, 3]
+            s = boxes[pos, 4]
+
+            area = (x2 - x1 + 1) * (y2 - y1 + 1)
+            iw = (min(tx2, x2) - max(tx1, x1) + 1)
+            if iw > 0:
+                ih = (min(ty2, y2) - max(ty1, y1) + 1)
+                if ih > 0:
+                    ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
+                    ov = iw * ih / ua #iou between max box and detection box
+
+                    if method == 1: # linear
+                        if ov > Nt:
+                            weight = 1 - ov
+                        else:
+                            weight = 1
+                    elif method == 2: # gaussian
+                        weight = np.exp(-(ov * ov)/sigma)
+                    else: # original NMS
+                        if ov > Nt:
+                            weight = 0
+                        else:
+                            weight = 1
+
+                    boxes[pos, 4] = weight*boxes[pos, 4]
+
+                    # if box score falls below threshold, discard the box by
+                    # swapping with last box update N
+                    if boxes[pos, 4] < threshold:
+                        boxes[pos,0] = boxes[N-1, 0]
+                        boxes[pos,1] = boxes[N-1, 1]
+                        boxes[pos,2] = boxes[N-1, 2]
+                        boxes[pos,3] = boxes[N-1, 3]
+                        boxes[pos,4] = boxes[N-1, 4]
+                        inds[pos] = inds[N-1]
+                        N = N - 1
+                        pos = pos - 1
+
+            pos = pos + 1
+
+    return boxes[:N], inds[:N]
\ No newline at end of file
diff --git a/mmdet/ops/nms/gpu_nms.hpp b/mmdet/ops/nms/gpu_nms.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2d45e344aeb93c00262f98153dd3e1300a9adcce
--- /dev/null
+++ b/mmdet/ops/nms/gpu_nms.hpp
@@ -0,0 +1,3 @@
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
+size_t nms_Malloc();
diff --git a/mmdet/ops/nms/gpu_nms.pyx b/mmdet/ops/nms/gpu_nms.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..e5ae72578731c38150bf0c79866fcabfcb936ceb
--- /dev/null
+++ b/mmdet/ops/nms/gpu_nms.pyx
@@ -0,0 +1,43 @@
+# --------------------------------------------------------
+# Faster R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+assert sizeof(int) == sizeof(np.int32_t)
+
+cdef extern from "gpu_nms.hpp":
+    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
+    size_t nms_Malloc() nogil
+
+memory_pool = {}
+
+def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
+            np.int32_t device_id=0):
+    cdef int boxes_num = dets.shape[0]
+    cdef int boxes_dim = dets.shape[1]
+    cdef int num_out
+    cdef size_t base
+    cdef np.ndarray[np.int32_t, ndim=1] \
+        keep = np.zeros(boxes_num, dtype=np.int32)
+    cdef np.ndarray[np.float32_t, ndim=1] \
+        scores = dets[:, 4]
+    cdef np.ndarray[np.int_t, ndim=1] \
+        order = scores.argsort()[::-1]
+    cdef np.ndarray[np.float32_t, ndim=2] \
+        sorted_dets = dets[order, :]
+    cdef float cthresh = thresh
+    if device_id not in memory_pool:
+        with nogil:
+            base = nms_Malloc()
+        memory_pool[device_id] = base
+        # print "malloc", base
+    base = memory_pool[device_id]
+    with nogil:
+        _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
+    keep = keep[:num_out]
+    return list(order[keep])
diff --git a/mmdet/ops/nms/nms_kernel.cu b/mmdet/ops/nms/nms_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4c5f0ec5e1096260e57ff314074f9c36da0a4e72
--- /dev/null
+++ b/mmdet/ops/nms/nms_kernel.cu
@@ -0,0 +1,188 @@
+// ------------------------------------------------------------------
+// Faster R-CNN
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
+// Written by Shaoqing Ren
+// ------------------------------------------------------------------
+
+#include <stdio.h>
+#include <iostream>
+#include <vector>
+#include "gpu_nms.hpp"
+
+#define CUDA_CHECK(condition)                                    \
+    /* Code block avoids redefinition of cudaError_t error */    \
+    do {                                                         \
+        cudaError_t error = condition;                           \
+        if (error != cudaSuccess) {                              \
+            std::cout << cudaGetErrorString(error) << std::endl; \
+        }                                                        \
+    } while (0)
+
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+#define MULTIPLIER 16
+#define LONGLONG_SIZE 64
+
+int const threadsPerBlock =
+    sizeof(unsigned long long) * 8 *
+    MULTIPLIER;  // 64 bits per unsigned long long, times MULTIPLIER
+
+__device__ inline float devIoU(float const* const a, float const* const b) {
+    float left = max(a[0], b[0]), right = min(a[2], b[2]);
+    float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+    float width = max(right - left + 1, 0.f),
+          height = max(bottom - top + 1, 0.f);
+    float interS = width * height;
+    float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+    float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+    return interS / (Sa + Sb - interS);
+}
+
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float* dev_boxes,
+                           unsigned long long* dev_mask) {
+    const int row_start = blockIdx.y;
+    const int col_start = blockIdx.x;
+
+    // if (row_start > col_start) return;
+
+    const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+    const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+    __shared__ float block_boxes[threadsPerBlock * 5];
+    if (threadIdx.x < col_size) {
+        block_boxes[threadIdx.x * 5 + 0] =
+            dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+        block_boxes[threadIdx.x * 5 + 1] =
+            dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+        block_boxes[threadIdx.x * 5 + 2] =
+            dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+        block_boxes[threadIdx.x * 5 + 3] =
+            dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+        block_boxes[threadIdx.x * 5 + 4] =
+            dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+    }
+    __syncthreads();
+
+    unsigned long long ts[MULTIPLIER];
+
+    if (threadIdx.x < row_size) {
+#pragma unroll
+        for (int i = 0; i < MULTIPLIER; ++i) {
+            ts[i] = 0;
+        }
+        const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+        const float* cur_box = dev_boxes + cur_box_idx * 5;
+        int i = 0;
+        int start = 0;
+        if (row_start == col_start) {
+            start = threadIdx.x + 1;
+        }
+        for (i = start; i < col_size; i++) {
+            if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+                ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
+            }
+        }
+        const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
+
+#pragma unroll
+        for (int i = 0; i < MULTIPLIER; ++i) {
+            dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
+                ts[i];
+        }
+    }
+}
+
+void _set_device(int device_id) {
+    int current_device;
+    CUDA_CHECK(cudaGetDevice(&current_device));
+    if (current_device == device_id) {
+        return;
+    }
+    // The call to cudaSetDevice must come before any calls to Get, which
+    // may perform initialization using the GPU.
+    CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+const size_t MEMORY_SIZE = 500000000;
+size_t nms_Malloc() {
+    float* boxes_dev = NULL;
+    CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
+    return size_t(boxes_dev);
+}
+
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
+    _set_device(device_id);
+
+    float* boxes_dev = NULL;
+    unsigned long long* mask_dev = NULL;
+
+    const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+
+    if (base > 0) {
+        size_t require_mem =
+            boxes_num * boxes_dim * sizeof(float) +
+            boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
+        if (require_mem >= MEMORY_SIZE) {
+            std::cout << "require_mem: " << require_mem << std::endl;
+        }
+        boxes_dev = (float*)(base);
+        mask_dev =
+            (unsigned long long*)(base +
+                                  512 * ((unsigned long long)(boxes_num *
+                                                              boxes_dim *
+                                                              sizeof(float) /
+                                                              512) +
+                                         1));
+    } else {
+        CUDA_CHECK(
+            cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
+        CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
+                                             sizeof(unsigned long long)));
+    }
+    CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
+                          boxes_num * boxes_dim * sizeof(float),
+                          cudaMemcpyHostToDevice));
+
+    dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
+                DIVUP(boxes_num, threadsPerBlock));
+    dim3 threads(threadsPerBlock);
+    nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
+                                    mask_dev);
+
+    std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
+                                              MULTIPLIER);
+    CUDA_CHECK(cudaMemcpy(
+        &mask_host[0], mask_dev,
+        sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
+        cudaMemcpyDeviceToHost));
+
+    std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
+    memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
+
+    int num_to_keep = 0;
+    for (int i = 0; i < boxes_num; i++) {
+        int nblock = i / threadsPerBlock;
+        int inblock = i % threadsPerBlock;
+        int offset = inblock / LONGLONG_SIZE;
+        int bit_pos = inblock % LONGLONG_SIZE;
+
+        if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
+            keep_out[num_to_keep++] = i;
+            unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
+            for (int j = nblock * MULTIPLIER + offset;
+                 j < col_blocks * MULTIPLIER; j++) {
+                remv[j] |= p[j];
+            }
+        }
+    }
+    *num_out = num_to_keep;
+
+    if (!base) {
+        CUDA_CHECK(cudaFree(boxes_dev));
+        CUDA_CHECK(cudaFree(mask_dev));
+    }
+}
diff --git a/mmdet/ops/nms/nms_wrapper.py b/mmdet/ops/nms/nms_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d5e5c6e5c038467f2084d46d85b97bb2a943f1
--- /dev/null
+++ b/mmdet/ops/nms/nms_wrapper.py
@@ -0,0 +1,46 @@
+import numpy as np
+import torch
+
+from .gpu_nms import gpu_nms
+from .cpu_nms import cpu_nms
+from .cpu_soft_nms import cpu_soft_nms
+
+
+def nms(dets, thresh, device_id=None):
+    """Dispatch to either CPU or GPU NMS implementations."""
+
+    is_tensor = isinstance(dets, torch.Tensor)
+    if is_tensor:
+        if dets.is_cuda:
+            device_id = dets.get_device()
+        dets_np = dets.detach().cpu().numpy()
+    else:
+        dets_np = dets
+    assert isinstance(dets_np, np.ndarray)
+
+    if dets_np.shape[0] == 0:
+        inds = []
+    else:
+        inds = (gpu_nms(dets_np, thresh, device_id=device_id)
+                if device_id is not None else cpu_nms(dets_np, thresh))
+
+    # return indices in the same container type as the input
+    if is_tensor:
+        return dets.new_tensor(inds, dtype=torch.long)
+    else:
+        return np.array(inds, dtype=np.int)
+
+
+def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
+    if isinstance(dets, torch.Tensor):
+        _dets = dets.detach().cpu().numpy()
+    else:
+        _dets = dets.copy()
+    assert isinstance(_dets, np.ndarray)
+
+    new_dets, inds = cpu_soft_nms(
+        _dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
+
+    if isinstance(dets, torch.Tensor):
+        return dets.new_tensor(
+            inds, dtype=torch.long), dets.new_tensor(new_dets)
+    else:
+        return np.array(
+            inds, dtype=np.int), np.array(
+                new_dets, dtype=np.float32)
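A small numpy example of the two entry points (illustrative, not part of the patch); boxes are (x1, y1, x2, y2, score).

    import numpy as np
    from mmdet.ops import nms, soft_nms

    dets = np.array([
        [10, 10, 50, 50, 0.9],
        [12, 12, 52, 52, 0.8],     # heavy overlap with the first box
        [100, 100, 150, 150, 0.7],
    ], dtype=np.float32)

    keep = nms(dets, 0.5)          # CPU path since device_id is None
    # keep -> indices of the surviving boxes, e.g. array([0, 2])

    inds, new_dets = soft_nms(dets, Nt=0.5, method=2, sigma=0.5, min_score=0.05)
    # soft-NMS decays the overlapping score instead of dropping the box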
diff --git a/mmdet/ops/nms/setup.py b/mmdet/ops/nms/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..98bf57c8f135805927205ec638d865177b070d8c
--- /dev/null
+++ b/mmdet/ops/nms/setup.py
@@ -0,0 +1,91 @@
+import os
+from distutils.core import setup
+from distutils.extension import Extension
+
+import numpy as np
+from Cython.Build import cythonize
+from Cython.Distutils import build_ext
+
+CUDA_ROOT = '/usr/local/cuda'
+CUDA = {
+    "include": os.path.join(CUDA_ROOT, 'include'),
+    "lib": os.path.join(CUDA_ROOT, 'lib64'),
+    "nvcc": os.path.join(CUDA_ROOT, 'bin', "nvcc")
+}
+
+inc_dirs = [CUDA['include'], np.get_include()]
+
+lib_dirs = [CUDA['lib']]
+
+# extensions
+ext_args = dict(
+    include_dirs=inc_dirs,
+    library_dirs=lib_dirs,
+    language='c++',
+    libraries=['cudart'],
+    extra_compile_args={
+        "cc": ['-Wno-unused-function', '-Wno-write-strings'],
+        "nvcc": [
+            '-arch=sm_52', '--ptxas-options=-v', '-c', '--compiler-options',
+            '-fPIC'
+        ],
+    },
+)
+
+extensions = [
+    Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
+    Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
+    Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
+]
+
+
+def customize_compiler_for_nvcc(self):
+    """inject deep into distutils to customize how the dispatch
+    to cc/nvcc works.
+    If you subclass UnixCCompiler, it's not trivial to get your subclass
+    injected in, and still have the right customizations (i.e.
+    distutils.sysconfig.customize_compiler) run on it. So instead of going
+    the OO route, I have this. Note, it's kindof like a wierd functional
+    subclassing going on."""
+
+    # tell the compiler it can process .cu files
+    self.src_extensions.append('.cu')
+
+    # save references to the default compiler_so and _compile methods
+    default_compiler_so = self.compiler_so
+    super = self._compile
+
+    # now redefine the _compile method. This gets executed for each
+    # object but distutils doesn't have the ability to change compilers
+    # based on source extension: we add it.
+    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+        if os.path.splitext(src)[1] == '.cu':
+            # use the cuda for .cu files
+            self.set_executable('compiler_so', CUDA['nvcc'])
+            # use only a subset of the extra_postargs, which are 1-1 translated
+            # from the extra_compile_args in the Extension class
+            postargs = extra_postargs['nvcc']
+        else:
+            postargs = extra_postargs['cc']
+
+        super(obj, src, ext, cc_args, postargs, pp_opts)
+        # reset the default compiler_so, which we might have changed for cuda
+        self.compiler_so = default_compiler_so
+
+    # inject our redefined _compile method into the class
+    self._compile = _compile
+
+
+# run the customize_compiler
+class custom_build_ext(build_ext):
+
+    def build_extensions(self):
+        customize_compiler_for_nvcc(self.compiler)
+        build_ext.build_extensions(self)
+
+
+setup(
+    name='nms',
+    cmdclass={'build_ext': custom_build_ext},
+    ext_modules=cythonize(extensions),
+)
diff --git a/mmdet/ops/roi_align/__init__.py b/mmdet/ops/roi_align/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cb037904a24e613c4b15305cdf8ded6c0072a1b
--- /dev/null
+++ b/mmdet/ops/roi_align/__init__.py
@@ -0,0 +1,4 @@
+from .functions.roi_align import roi_align
+from .modules.roi_align import RoIAlign
+
+__all__ = ['roi_align', 'RoIAlign']
diff --git a/mmdet/ops/roi_align/functions/__init__.py b/mmdet/ops/roi_align/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmdet/ops/roi_align/functions/roi_align.py b/mmdet/ops/roi_align/functions/roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e546fe59527570a2331f6f79bb6113f1cc1abb9
--- /dev/null
+++ b/mmdet/ops/roi_align/functions/roi_align.py
@@ -0,0 +1,61 @@
+from torch.autograd import Function, Variable
+
+from .. import roi_align_cuda
+
+
+class RoIAlignFunction(Function):
+
+    @staticmethod
+    def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
+        if isinstance(out_size, int):
+            out_h = out_size
+            out_w = out_size
+        elif isinstance(out_size, tuple):
+            assert len(out_size) == 2
+            assert isinstance(out_size[0], int)
+            assert isinstance(out_size[1], int)
+            out_h, out_w = out_size
+        else:
+            raise TypeError(
+                '"out_size" must be an integer or tuple of integers')
+        ctx.spatial_scale = spatial_scale
+        ctx.sample_num = sample_num
+        ctx.save_for_backward(rois)
+        ctx.feature_size = features.size()
+
+        batch_size, num_channels, data_height, data_width = features.size()
+        num_rois = rois.size(0)
+
+        output = features.new_zeros(num_rois, num_channels, out_h, out_w)
+        if features.is_cuda:
+            roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale,
+                                   sample_num, output)
+        else:
+            raise NotImplementedError
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        feature_size = ctx.feature_size
+        spatial_scale = ctx.spatial_scale
+        sample_num = ctx.sample_num
+        rois = ctx.saved_tensors[0]
+        assert (feature_size is not None and grad_output.is_cuda)
+
+        batch_size, num_channels, data_height, data_width = feature_size
+        out_w = grad_output.size(3)
+        out_h = grad_output.size(2)
+
+        grad_input = grad_rois = None
+        if ctx.needs_input_grad[0]:
+            grad_input = Variable(
+                rois.new(batch_size, num_channels, data_height, data_width)
+                .zero_())
+            roi_align_cuda.backward(grad_output, rois, out_h, out_w,
+                                    spatial_scale, sample_num, grad_input)
+
+        return grad_input, grad_rois, None, None, None
+
+
+roi_align = RoIAlignFunction.apply
diff --git a/mmdet/ops/roi_align/gradcheck.py b/mmdet/ops/roi_align/gradcheck.py
new file mode 100644
index 0000000000000000000000000000000000000000..394cd69c5064e097becf12752755ee510045193b
--- /dev/null
+++ b/mmdet/ops/roi_align/gradcheck.py
@@ -0,0 +1,29 @@
+import numpy as np
+import torch
+from torch.autograd import gradcheck
+
+import os.path as osp
+import sys
+sys.path.append(osp.abspath(osp.join(__file__, '../../')))
+from roi_align import RoIAlign  # noqa: E402
+
+feat_size = 15
+spatial_scale = 1.0 / 8
+img_size = feat_size / spatial_scale
+num_imgs = 2
+num_rois = 20
+
+batch_ind = np.random.randint(num_imgs, size=(num_rois, 1))
+rois = np.random.rand(num_rois, 4) * img_size * 0.5
+rois[:, 2:] += img_size * 0.5
+rois = np.hstack((batch_ind, rois))
+
+feat = torch.randn(
+    num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0')
+rois = torch.from_numpy(rois).float().cuda()
+inputs = (feat, rois)
+print('Gradcheck for roi align...')
+test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3)
+print(test)
+test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3)
+print(test)
diff --git a/mmdet/ops/roi_align/modules/__init__.py b/mmdet/ops/roi_align/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmdet/ops/roi_align/modules/roi_align.py b/mmdet/ops/roi_align/modules/roi_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83b74e6b7c151eaf627c2b6d3530823ce8cda05
--- /dev/null
+++ b/mmdet/ops/roi_align/modules/roi_align.py
@@ -0,0 +1,16 @@
+from torch.nn.modules.module import Module
+from ..functions.roi_align import RoIAlignFunction
+
+
+class RoIAlign(Module):
+
+    def __init__(self, out_size, spatial_scale, sample_num=0):
+        super(RoIAlign, self).__init__()
+
+        self.out_size = out_size
+        self.spatial_scale = float(spatial_scale)
+        self.sample_num = int(sample_num)
+
+    def forward(self, features, rois):
+        return RoIAlignFunction.apply(features, rois, self.out_size,
+                                      self.spatial_scale, self.sample_num)
diff --git a/mmdet/ops/roi_align/setup.py b/mmdet/ops/roi_align/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..f02a5ea30d66f51761038c7802d948f039871c8c
--- /dev/null
+++ b/mmdet/ops/roi_align/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name='roi_align_cuda',
+    ext_modules=[
+        CUDAExtension('roi_align_cuda', [
+            'src/roi_align_cuda.cpp',
+            'src/roi_align_kernel.cu',
+        ]),
+    ],
+    cmdclass={'build_ext': BuildExtension})
diff --git a/mmdet/ops/roi_align/src/roi_align_cuda.cpp b/mmdet/ops/roi_align/src/roi_align_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8551bc5188800e46baf4cf64c6076520fed38581
--- /dev/null
+++ b/mmdet/ops/roi_align/src/roi_align_cuda.cpp
@@ -0,0 +1,85 @@
+#include <torch/torch.h>
+
+#include <cmath>
+#include <vector>
+
+int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
+                           const float spatial_scale, const int sample_num,
+                           const int channels, const int height,
+                           const int width, const int num_rois,
+                           const int pooled_height, const int pooled_width,
+                           at::Tensor output);
+
+int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
+                            const float spatial_scale, const int sample_num,
+                            const int channels, const int height,
+                            const int width, const int num_rois,
+                            const int pooled_height, const int pooled_width,
+                            at::Tensor bottom_grad);
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
+#define CHECK_CONTIGUOUS(x) \
+  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
+                           int pooled_height, int pooled_width,
+                           float spatial_scale, int sample_num,
+                           at::Tensor output) {
+  CHECK_INPUT(features);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(output);
+
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+
+  int num_channels = features.size(1);
+  int data_height = features.size(2);
+  int data_width = features.size(3);
+
+  ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num,
+                         num_channels, data_height, data_width, num_rois,
+                         pooled_height, pooled_width, output);
+
+  return 1;
+}
+
+int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
+                            int pooled_height, int pooled_width,
+                            float spatial_scale, int sample_num,
+                            at::Tensor bottom_grad) {
+  CHECK_INPUT(top_grad);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(bottom_grad);
+
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+
+  int num_channels = bottom_grad.size(1);
+  int data_height = bottom_grad.size(2);
+  int data_width = bottom_grad.size(3);
+
+  ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num,
+                          num_channels, data_height, data_width, num_rois,
+                          pooled_height, pooled_width, bottom_grad);
+
+  return 1;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
+  m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
+}
diff --git a/mmdet/ops/roi_align/src/roi_align_kernel.cu b/mmdet/ops/roi_align/src/roi_align_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..341d858de52a0999f7d9598ddb3c2f52d529bf17
--- /dev/null
+++ b/mmdet/ops/roi_align/src/roi_align_kernel.cu
@@ -0,0 +1,307 @@
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+
+using namespace at;  // temporary fix for pytorch<=0.4.1 (see #9848)
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+#define THREADS_PER_BLOCK 1024
+
+inline int GET_BLOCKS(const int N) {
+  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int max_block_num = 65000;
+  return min(optimal_block_num, max_block_num);
+}
+
+template <typename scalar_t>
+__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
+                                         const int height, const int width,
+                                         scalar_t y, scalar_t x) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    return 0;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (scalar_t)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (scalar_t)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  scalar_t ly = y - y_low;
+  scalar_t lx = x - x_low;
+  scalar_t hy = 1. - ly;
+  scalar_t hx = 1. - lx;
+  // do bilinear interpolation
+  scalar_t lt = bottom_data[y_low * width + x_low];
+  scalar_t rt = bottom_data[y_low * width + x_high];
+  scalar_t lb = bottom_data[y_high * width + x_low];
+  scalar_t rb = bottom_data[y_high * width + x_high];
+  scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb);
+
+  return val;
+}
+
+template <typename scalar_t>
+__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
+                                const scalar_t *bottom_rois,
+                                const scalar_t spatial_scale,
+                                const int sample_num, const int channels,
+                                const int height, const int width,
+                                const int pooled_height, const int pooled_width,
+                                scalar_t *top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the aligned output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
+    scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
+    scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
+
+    scalar_t bin_size_h = roi_height / pooled_height;
+    scalar_t bin_size_w = roi_width / pooled_width;
+
+    const scalar_t *offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    int sample_num_h = (sample_num > 0)
+                           ? sample_num
+                           : ceil(roi_height / pooled_height);  // e.g., = 2
+    int sample_num_w =
+        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
+
+    scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
+    scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
+
+    int hstart = fminf(floor(h), height - 2);
+    int wstart = fminf(floor(w), width - 2);
+
+    scalar_t output_val = 0;
+    for (int iy = 0; iy < sample_num_h; iy++) {
+      const scalar_t y = roi_start_h + ph * bin_size_h +
+                         (scalar_t)(iy + scalar_t(.5f)) * bin_size_h /
+                             (scalar_t)(sample_num_h);
+      for (int ix = 0; ix < sample_num_w; ix++) {
+        const scalar_t x = roi_start_w + pw * bin_size_w +
+                           (scalar_t)(ix + scalar_t(.5f)) * bin_size_w /
+                               (scalar_t)(sample_num_w);
+        scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data,
+                                                      height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= (sample_num_h * sample_num_w);
+    top_data[index] = output_val;
+  }
+}
+
+int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
+                           const float spatial_scale, const int sample_num,
+                           const int channels, const int height,
+                           const int width, const int num_rois,
+                           const int pooled_height, const int pooled_width,
+                           at::Tensor output) {
+  const int output_size = num_rois * pooled_height * pooled_width * channels;
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      features.type(), "ROIAlignLaucherForward", ([&] {
+        const scalar_t *bottom_data = features.data<scalar_t>();
+        const scalar_t *rois_data = rois.data<scalar_t>();
+        scalar_t *top_data = output.data<scalar_t>();
+
+        ROIAlignForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
+                sample_num, channels, height, width, pooled_height,
+                pooled_width, top_data);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  return 1;
+}
+
+template <typename scalar_t>
+__device__ void bilinear_interpolate_gradient(const int height, const int width,
+                                              scalar_t y, scalar_t x,
+                                              scalar_t &w1, scalar_t &w2,
+                                              scalar_t &w3, scalar_t &w4,
+                                              int &x_low, int &x_high,
+                                              int &y_low, int &y_high) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (scalar_t)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (scalar_t)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  scalar_t ly = y - y_low;
+  scalar_t lx = x - x_low;
+  scalar_t hy = 1. - ly;
+  scalar_t hx = 1. - lx;
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <typename scalar_t>
+__global__ void ROIAlignBackward(
+    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
+    const scalar_t spatial_scale, const int sample_num, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, scalar_t *bottom_diff) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the aligned output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale;
+    scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale;
+
+    // Force malformed ROIs to be 1x1
+    scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.);
+    scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.);
+
+    scalar_t bin_size_h = roi_height / pooled_height;
+    scalar_t bin_size_w = roi_width / pooled_width;
+
+    scalar_t *offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+    int offset_top = (n * channels + c) * pooled_height * pooled_width +
+                     ph * pooled_width + pw;
+    scalar_t offset_top_diff = top_diff[offset_top];
+
+    int sample_num_h = (sample_num > 0)
+                           ? sample_num
+                           : ceil(roi_height / pooled_height);  // e.g., = 2
+    int sample_num_w =
+        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);
+
+    const scalar_t count = (scalar_t)(sample_num_h * sample_num_w);
+
+    for (int iy = 0; iy < sample_num_h; iy++) {
+      const scalar_t y =
+          roi_start_h + ph * bin_size_h +
+          (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h);
+      for (int ix = 0; ix < sample_num_w; ix++) {
+        const scalar_t x =
+            roi_start_w + pw * bin_size_w +
+            (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w);
+        scalar_t w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient<scalar_t>(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+        scalar_t g1 = offset_top_diff * w1 / count;
+        scalar_t g2 = offset_top_diff * w2 / count;
+        scalar_t g3 = offset_top_diff * w3 / count;
+        scalar_t g4 = offset_top_diff * w4 / count;
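+        // scatter the gradient to the four neighbouring input cells; atomicAdd is
+        // required because several sampling points (and ROIs) may hit the same cell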
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
+          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
+          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
+          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
+        }
+      }
+    }
+  }
+}
+
+int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
+                            const float spatial_scale, const int sample_num,
+                            const int channels, const int height,
+                            const int width, const int num_rois,
+                            const int pooled_height, const int pooled_width,
+                            at::Tensor bottom_grad) {
+  const int output_size = num_rois * pooled_height * pooled_width * channels;
+
+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
+  AT_DISPATCH_FLOATING_TYPES(
+      top_grad.type(), "ROIAlignLaucherBackward", ([&] {
+        const scalar_t *top_diff = top_grad.data<scalar_t>();
+        const scalar_t *rois_data = rois.data<scalar_t>();
+        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
+        if (sizeof(scalar_t) == sizeof(double)) {
+          fprintf(stderr, "double is not supported\n");
+          exit(-1);
+        }
+
+        ROIAlignBackward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, top_diff, rois_data, spatial_scale, sample_num,
+                channels, height, width, pooled_height, pooled_width,
+                bottom_diff);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  return 1;
+}
diff --git a/mmdet/ops/roi_pool/__init__.py b/mmdet/ops/roi_pool/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2c57eabd6fa002c970c1f8d199d80d0a9b689c
--- /dev/null
+++ b/mmdet/ops/roi_pool/__init__.py
@@ -0,0 +1,4 @@
+from .functions.roi_pool import roi_pool
+from .modules.roi_pool import RoIPool
+
+__all__ = ['roi_pool', 'RoIPool']
diff --git a/mmdet/ops/roi_pool/functions/__init__.py b/mmdet/ops/roi_pool/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmdet/ops/roi_pool/functions/roi_pool.py b/mmdet/ops/roi_pool/functions/roi_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..78ba1395fb9b653673c3ad57d076def78887b5ff
--- /dev/null
+++ b/mmdet/ops/roi_pool/functions/roi_pool.py
@@ -0,0 +1,56 @@
+import torch
+from torch.autograd import Function
+
+from .. import roi_pool_cuda
+
+
+class RoIPoolFunction(Function):
+
+    @staticmethod
+    def forward(ctx, features, rois, out_size, spatial_scale):
+        if isinstance(out_size, int):
+            out_h = out_size
+            out_w = out_size
+        elif isinstance(out_size, tuple):
+            assert len(out_size) == 2
+            assert isinstance(out_size[0], int)
+            assert isinstance(out_size[1], int)
+            out_h, out_w = out_size
+        else:
+            raise TypeError(
+                '"out_size" must be an integer or tuple of integers')
+        assert features.is_cuda
+        ctx.save_for_backward(rois)
+        num_channels = features.size(1)
+        num_rois = rois.size(0)
+        out_size = (num_rois, num_channels, out_h, out_w)
+        output = features.new_zeros(*out_size)
+
+        argmax = features.new_zeros(*out_size, dtype=torch.int)
+        roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
+                              output, argmax)
+        ctx.spatial_scale = spatial_scale
+        ctx.feature_size = features.size()
+        ctx.argmax = argmax
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        assert grad_output.is_cuda
+        spatial_scale = ctx.spatial_scale
+        feature_size = ctx.feature_size
+        argmax = ctx.argmax
+        rois = ctx.saved_tensors[0]
+        assert feature_size is not None
+
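+        # only the feature map receives gradients; rois and the pooling
+        # hyper-parameters (out_size, spatial_scale) do not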
+        grad_input = grad_rois = None
+        if ctx.needs_input_grad[0]:
+            grad_input = grad_output.new(feature_size).zero_()
+            roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
+                                   grad_input)
+
+        return grad_input, grad_rois, None, None
+
+
+roi_pool = RoIPoolFunction.apply
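+
+# Illustrative usage (shapes and values below are assumptions, not from this repo):
+#   feats = torch.randn(2, 256, 64, 64, requires_grad=True).cuda()
+#   rois = torch.tensor([[0., 8., 8., 40., 40.]]).cuda()  # (batch_idx, x1, y1, x2, y2)
+#   out = roi_pool(feats, rois, 7, 1.0 / 16)  # -> (1, 256, 7, 7)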
diff --git a/mmdet/ops/roi_pool/gradcheck.py b/mmdet/ops/roi_pool/gradcheck.py
new file mode 100644
index 0000000000000000000000000000000000000000..c39616086a240cf57cf115d4264eb32b9cc9f7c7
--- /dev/null
+++ b/mmdet/ops/roi_pool/gradcheck.py
@@ -0,0 +1,15 @@
+import torch
+from torch.autograd import gradcheck
+
+import os.path as osp
+import sys
+sys.path.append(osp.abspath(osp.join(__file__, '../../')))
+from roi_pool import RoIPool  # noqa: E402
+
+feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
+rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
+                     [1, 67, 40, 110, 120]]).cuda()
+inputs = (feat, rois)
+print('Gradcheck for roi pooling...')
+test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3)
+print(test)
diff --git a/mmdet/ops/roi_pool/modules/__init__.py b/mmdet/ops/roi_pool/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmdet/ops/roi_pool/modules/roi_pool.py b/mmdet/ops/roi_pool/modules/roi_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7fffd08c656ee7301aeed5a8262714f4be4157d
--- /dev/null
+++ b/mmdet/ops/roi_pool/modules/roi_pool.py
@@ -0,0 +1,14 @@
+from torch.nn.modules.module import Module
+from ..functions.roi_pool import roi_pool
+
+
+class RoIPool(Module):
+
+    def __init__(self, out_size, spatial_scale):
+        super(RoIPool, self).__init__()
+
+        self.out_size = out_size
+        self.spatial_scale = float(spatial_scale)
+
+    def forward(self, features, rois):
+        return roi_pool(features, rois, self.out_size, self.spatial_scale)
diff --git a/mmdet/ops/roi_pool/setup.py b/mmdet/ops/roi_pool/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..16991b889220f9ae4c7763460033754c6ff38f77
--- /dev/null
+++ b/mmdet/ops/roi_pool/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name='roi_pool',
+    ext_modules=[
+        CUDAExtension('roi_pool_cuda', [
+            'src/roi_pool_cuda.cpp',
+            'src/roi_pool_kernel.cu',
+        ])
+    ],
+    cmdclass={'build_ext': BuildExtension})
diff --git a/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b05e870600fa80ea4b236bd85c03122ed1f49aba
--- /dev/null
+++ b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
@@ -0,0 +1,86 @@
+#include <torch/torch.h>
+
+#include <cmath>
+#include <vector>
+
+int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
+                          const float spatial_scale, const int channels,
+                          const int height, const int width, const int num_rois,
+                          const int pooled_h, const int pooled_w,
+                          at::Tensor output, at::Tensor argmax);
+
+int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
+                           const at::Tensor argmax, const float spatial_scale,
+                           const int batch_size, const int channels,
+                           const int height, const int width,
+                           const int num_rois, const int pooled_h,
+                           const int pooled_w, at::Tensor bottom_grad);
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
+#define CHECK_CONTIGUOUS(x) \
+  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois,
+                             int pooled_height, int pooled_width,
+                             float spatial_scale, at::Tensor output,
+                             at::Tensor argmax) {
+  CHECK_INPUT(features);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(output);
+  CHECK_INPUT(argmax);
+
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+
+  int channels = features.size(1);
+  int height = features.size(2);
+  int width = features.size(3);
+
+  ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width,
+                        num_rois, pooled_height, pooled_width, output, argmax);
+
+  return 1;
+}
+
+int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois,
+                              at::Tensor argmax, float spatial_scale,
+                              at::Tensor bottom_grad) {
+  CHECK_INPUT(top_grad);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(argmax);
+  CHECK_INPUT(bottom_grad);
+
+  int pooled_height = top_grad.size(2);
+  int pooled_width = top_grad.size(3);
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+  int batch_size = bottom_grad.size(0);
+  int channels = bottom_grad.size(1);
+  int height = bottom_grad.size(2);
+  int width = bottom_grad.size(3);
+
+  ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size,
+                         channels, height, width, num_rois, pooled_height,
+                         pooled_width, bottom_grad);
+
+  return 1;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)");
+  m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)");
+}
diff --git a/mmdet/ops/roi_pool/src/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d2cefa662f9ff9c961a261cef621f7f1d0e561fc
--- /dev/null
+++ b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
@@ -0,0 +1,168 @@
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+
+using namespace at;  // temporary fix for pytorch<=0.4.1 (see #9848)
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+#define THREADS_PER_BLOCK 1024
+
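+// Grid size for a 1-D launch: ceil(N / THREADS_PER_BLOCK), capped at 65000 so the
+// grid dimension stays within hardware limits; CUDA_1D_KERNEL_LOOP then covers any
+// remaining elements with a grid-stride loop.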
+inline int GET_BLOCKS(const int N) {
+  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int max_block_num = 65000;
+  return min(optimal_block_num, max_block_num);
+}
+
+template <typename scalar_t>
+__global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
+                               const scalar_t *rois,
+                               const scalar_t spatial_scale, const int channels,
+                               const int height, const int width,
+                               const int pooled_h, const int pooled_w,
+                               scalar_t *top_data, int *argmax_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_w;
+    int ph = (index / pooled_w) % pooled_h;
+    int c = (index / pooled_w / pooled_h) % channels;
+    int n = index / pooled_w / pooled_h / channels;
+
+    const scalar_t *offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+    // calculate the roi region on feature maps
+    scalar_t roi_x1 = offset_rois[1] * spatial_scale;
+    scalar_t roi_y1 = offset_rois[2] * spatial_scale;
+    scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale;
+    scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale;
+
+    // skip malformed rois (non-positive width or height)
+    scalar_t roi_w = roi_x2 - roi_x1;
+    scalar_t roi_h = roi_y2 - roi_y1;
+    if (roi_w <= 0 || roi_h <= 0) continue;
+
+    scalar_t bin_size_w = roi_w / static_cast<scalar_t>(pooled_w);
+    scalar_t bin_size_h = roi_h / static_cast<scalar_t>(pooled_h);
+
+    // the corresponding bin region
+    int bin_x1 = floor(static_cast<scalar_t>(pw) * bin_size_w + roi_x1);
+    int bin_y1 = floor(static_cast<scalar_t>(ph) * bin_size_h + roi_y1);
+    int bin_x2 = ceil(static_cast<scalar_t>(pw + 1) * bin_size_w + roi_x1);
+    int bin_y2 = ceil(static_cast<scalar_t>(ph + 1) * bin_size_h + roi_y1);
+
+    // add roi offsets and clip to input boundaries
+    bin_x1 = min(max(bin_x1, 0), width);
+    bin_y1 = min(max(bin_y1, 0), height);
+    bin_x2 = min(max(bin_x2, 0), width);
+    bin_y2 = min(max(bin_y2, 0), height);
+    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);
+
+    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+    int max_idx = -1;
+    bottom_data += (roi_batch_ind * channels + c) * height * width;
+
+    // Define an empty pooling region to be zero
+    scalar_t max_val = is_empty ? static_cast<scalar_t>(0)
+                                : bottom_data[bin_y1 * width + bin_x1] - 1;
+
+    for (int h = bin_y1; h < bin_y2; ++h) {
+      for (int w = bin_x1; w < bin_x2; ++w) {
+        int offset = h * width + w;
+        if (bottom_data[offset] > max_val) {
+          max_val = bottom_data[offset];
+          max_idx = offset;
+        }
+      }
+    }
+    top_data[index] = max_val;
+    if (argmax_data != NULL) argmax_data[index] = max_idx;
+  }
+}
+
+int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
+                          const float spatial_scale, const int channels,
+                          const int height, const int width, const int num_rois,
+                          const int pooled_h, const int pooled_w,
+                          at::Tensor output, at::Tensor argmax) {
+  const int output_size = num_rois * channels * pooled_h * pooled_w;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      features.type(), "ROIPoolLaucherForward", ([&] {
+        const scalar_t *bottom_data = features.data<scalar_t>();
+        const scalar_t *rois_data = rois.data<scalar_t>();
+        scalar_t *top_data = output.data<scalar_t>();
+        int *argmax_data = argmax.data<int>();
+
+        ROIPoolForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
+                channels, height, width, pooled_h, pooled_w, top_data,
+                argmax_data);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
+  return 1;
+}
+
+template <typename scalar_t>
+__global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff,
+                                const scalar_t *rois, const int *argmax_data,
+                                const scalar_t spatial_scale,
+                                const int channels, const int height,
+                                const int width, const int pooled_h,
+                                const int pooled_w, scalar_t *bottom_diff) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int pw = index % pooled_w;
+    int ph = (index / pooled_w) % pooled_h;
+    int c = (index / pooled_w / pooled_h) % channels;
+    int n = index / pooled_w / pooled_h / channels;
+
+    int roi_batch_ind = rois[n * 5];
+    int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w +
+                                   ph * pooled_w + pw];
+
+    atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width +
+                  bottom_index,
+              top_diff[index]);
+  }
+}
+
+int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
+                           const at::Tensor argmax, const float spatial_scale,
+                           const int batch_size, const int channels,
+                           const int height, const int width,
+                           const int num_rois, const int pooled_h,
+                           const int pooled_w, at::Tensor bottom_grad) {
+  const int output_size = num_rois * pooled_h * pooled_w * channels;
+
+  // TODO: use AT_DISPATCH_FLOATING_TYPES_AND_HALF when atomicAdd is resolved
+  AT_DISPATCH_FLOATING_TYPES(
+      top_grad.type(), "ROIPoolLaucherBackward", ([&] {
+        const scalar_t *top_diff = top_grad.data<scalar_t>();
+        const scalar_t *rois_data = rois.data<scalar_t>();
+        const int *argmax_data = argmax.data<int>();
+        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
+
+        if (sizeof(scalar_t) == sizeof(double)) {
+          fprintf(stderr, "double is not supported\n");
+          exit(-1);
+        }
+
+        ROIPoolBackward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, top_diff, rois_data, argmax_data,
+                scalar_t(spatial_scale), channels, height, width, pooled_h,
+                pooled_w, bottom_diff);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  return 1;
+}
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ebe317956ffb781d7d264393dca7bcc7af67c10
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,112 @@
+import os
+import subprocess
+import time
+from setuptools import find_packages, setup
+
+
+def readme():
+    with open('README.md') as f:
+        content = f.read()
+    return content
+
+
+MAJOR = 0
+MINOR = 5
+PATCH = 0
+SUFFIX = ''
+SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
+
+version_file = 'mmdet/version.py'
+
+
+def get_git_hash():
+
+    def _minimal_ext_cmd(cmd):
+        # construct minimal environment
+        env = {}
+        for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+            v = os.environ.get(k)
+            if v is not None:
+                env[k] = v
+        # LANGUAGE is used on win32
+        env['LANGUAGE'] = 'C'
+        env['LANG'] = 'C'
+        env['LC_ALL'] = 'C'
+        out = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
+        return out
+
+    try:
+        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
+        sha = out.strip().decode('ascii')
+    except OSError:
+        sha = 'unknown'
+
+    return sha
+
+
+def get_hash():
+    if os.path.exists('.git'):
+        sha = get_git_hash()[:7]
+    elif os.path.exists(version_file):
+        try:
+            from mmdet.version import __version__
+            sha = __version__.split('+')[-1]
+        except ImportError:
+            raise ImportError('Unable to get git version')
+    else:
+        sha = 'unknown'
+
+    return sha
+
+
+def write_version_py():
+    content = """# GENERATED VERSION FILE
+# TIME: {}
+
+__version__ = '{}'
+short_version = '{}'
+"""
+    sha = get_hash()
+    VERSION = SHORT_VERSION + '+' + sha
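+    # the full version looks like '<major>.<minor>.<patch>+<sha7>',
+    # e.g. '0.5.0+1a2b3c4' (the hash shown here is illustrative)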
+
+    with open(version_file, 'w') as f:
+        f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
+
+
+def get_version():
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+if __name__ == '__main__':
+    write_version_py()
+    setup(
+        name='mmdet',
+        version=get_version(),
+        description='Open MMLab Detection Toolbox',
+        long_description=readme(),
+        keywords='computer vision, object detection',
+        url='https://github.com/open-mmlab/mmdetection',
+        packages=find_packages(),
+        package_data={'mmdet.ops': ['*/*.so']},
+        classifiers=[
+            'Development Status :: 4 - Beta',
+            'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 2',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.4',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: 3.6',
+        ],
+        license='GPLv3',
+        setup_requires=['pytest-runner'],
+        tests_require=['pytest'],
+        install_requires=[
+            'mmcv', 'numpy', 'matplotlib', 'six', 'terminaltables',
+            'pycocotools'
+        ],
+        zip_safe=False)
diff --git a/tools/coco_eval.py b/tools/coco_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..65e114ca280578cd41848a631e419d70819a662f
--- /dev/null
+++ b/tools/coco_eval.py
@@ -0,0 +1,28 @@
+from argparse import ArgumentParser
+
+from mmdet.core import coco_eval
+
+
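+# Illustrative invocation (paths are assumptions, not from this repo):
+#   python tools/coco_eval.py results.pkl.json \
+#     --ann data/coco/annotations/instances_val2017.json --types bbox segm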
+def main():
+    parser = ArgumentParser(description='COCO Evaluation')
+    parser.add_argument('result', help='result file path')
+    parser.add_argument('--ann', help='annotation file path')
+    parser.add_argument(
+        '--types',
+        type=str,
+        nargs='+',
+        choices=['proposal_fast', 'proposal', 'bbox', 'segm', 'keypoint'],
+        default=['bbox'],
+        help='result types')
+    parser.add_argument(
+        '--max-dets',
+        type=int,
+        nargs='+',
+        default=[100, 300, 1000],
+        help='proposal numbers, only used for recall evaluation')
+    args = parser.parse_args()
+    coco_eval(args.result, args.types, args.ann, args.max_dets)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/dist_train.sh b/tools/dist_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fa68297226b874596a54b9c819f03584008093e6
--- /dev/null
+++ b/tools/dist_train.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+
+PYTHON=${PYTHON:-"python"}
+
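+# $1 is the config file, $2 the number of GPUs; remaining args are passed to train.py.
+# Illustrative invocation (config path and GPU count are assumptions):
+#   ./tools/dist_train.sh configs/fast_mask_rcnn_r50_fpn_1x.py 8 --validate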
+$PYTHON -m torch.distributed.launch --nproc_per_node=$2 $(dirname "$0")/train.py $1 --launcher pytorch ${@:3}
diff --git a/tools/test.py b/tools/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc8dc5e85ce415b5149227b0035cf1d88d70c677
--- /dev/null
+++ b/tools/test.py
@@ -0,0 +1,114 @@
+import argparse
+
+import torch
+import mmcv
+from mmcv.runner import load_checkpoint, parallel_test, obj_from_dict
+from mmcv.parallel import scatter, collate, MMDataParallel
+
+from mmdet import datasets
+from mmdet.core import results2json, coco_eval
+from mmdet.datasets import build_dataloader
+from mmdet.models import build_detector, detectors
+
+
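+# Illustrative invocation (paths are assumptions):
+#   python tools/test.py <config.py> <checkpoint.pth> --gpus 8 \
+#     --out results.pkl --eval bbox segm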
+def single_test(model, data_loader, show=False):
+    model.eval()
+    results = []
+    prog_bar = mmcv.ProgressBar(len(data_loader.dataset))
+    for i, data in enumerate(data_loader):
+        with torch.no_grad():
+            result = model(return_loss=False, rescale=not show, **data)
+        results.append(result)
+
+        if show:
+            model.module.show_result(data, result,
+                                     data_loader.dataset.img_norm_cfg)
+
+        batch_size = data['img'][0].size(0)
+        for _ in range(batch_size):
+            prog_bar.update()
+    return results
+
+
+def _data_func(data, device_id):
+    data = scatter(collate([data], samples_per_gpu=1), [device_id])[0]
+    return dict(return_loss=False, rescale=True, **data)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet test detector')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('checkpoint', help='checkpoint file')
+    parser.add_argument(
+        '--gpus', default=1, type=int, help='GPU number used for testing')
+    parser.add_argument(
+        '--proc_per_gpu',
+        default=1,
+        type=int,
+        help='Number of processes per GPU')
+    parser.add_argument('--out', help='output result file')
+    parser.add_argument(
+        '--eval',
+        type=str,
+        nargs='+',
+        choices=['proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'],
+        help='eval types')
+    parser.add_argument('--show', action='store_true', help='show results')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    if args.out is not None and not args.out.endswith(('.pkl', '.pickle')):
+        raise ValueError('The output file must be a pkl file.')
+
+    cfg = mmcv.Config.fromfile(args.config)
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))
+    if args.gpus == 1:
+        model = build_detector(
+            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
+        load_checkpoint(model, args.checkpoint)
+        model = MMDataParallel(model, device_ids=[0])
+
+        data_loader = build_dataloader(
+            dataset,
+            imgs_per_gpu=1,
+            workers_per_gpu=cfg.data.workers_per_gpu,
+            num_gpus=1,
+            dist=False,
+            shuffle=False)
+        outputs = single_test(model, data_loader, args.show)
+    else:
+        model_args = cfg.model.copy()
+        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
+        model_type = getattr(detectors, model_args.pop('type'))
+        outputs = parallel_test(
+            model_type,
+            model_args,
+            args.checkpoint,
+            dataset,
+            _data_func,
+            range(args.gpus),
+            workers_per_gpu=args.proc_per_gpu)
+
+    if args.out:
+        print('writing results to {}'.format(args.out))
+        mmcv.dump(outputs, args.out)
+        eval_types = args.eval
+        if eval_types:
+            print('Starting evaluation of {}'.format(' and '.join(eval_types)))
+            if eval_types == ['proposal_fast']:
+                result_file = args.out
+            else:
+                result_file = args.out + '.json'
+                results2json(dataset, outputs, result_file)
+            coco_eval(result_file, eval_types, dataset.coco)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/train.py b/tools/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e03628db5ea28d027ccdc3939c72bace482be93
--- /dev/null
+++ b/tools/train.py
@@ -0,0 +1,81 @@
+from __future__ import division
+
+import argparse
+from mmcv import Config
+from mmcv.runner import obj_from_dict
+
+from mmdet import datasets, __version__
+from mmdet.apis import (train_detector, init_dist, get_root_logger,
+                        set_random_seed)
+from mmdet.models import build_detector
+
+
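+# Illustrative invocation (paths are assumptions):
+#   python tools/train.py <config.py> --gpus 8 --validate \
+#     --work_dir work_dirs/my_experiment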
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a detector')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument('--work_dir', help='the dir to save logs and models')
+    parser.add_argument(
+        '--validate',
+        action='store_true',
+        help='whether to evaluate the checkpoint during training')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local_rank', type=int, default=0)
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    # update configs according to CLI args
+    if args.work_dir is not None:
+        cfg.work_dir = args.work_dir
+    cfg.gpus = args.gpus
+    if cfg.checkpoint_config is not None:
+        # save mmdet version in checkpoints as meta data
+        cfg.checkpoint_config.meta = dict(
+            mmdet_version=__version__, config=cfg.text)
+
+    # init distributed env first, since logger depends on the dist info.
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+
+    # init logger before other steps
+    logger = get_root_logger(cfg.log_level)
+    logger.info('Distributed training: {}'.format(distributed))
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info('Set random seed to {}'.format(args.seed))
+        set_random_seed(args.seed)
+
+    model = build_detector(
+        cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
+    train_dataset = obj_from_dict(cfg.data.train, datasets)
+    train_detector(
+        model,
+        train_dataset,
+        cfg,
+        distributed=distributed,
+        validate=args.validate,
+        logger=logger)
+
+
+if __name__ == '__main__':
+    main()