Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Showing changes with 113 additions and 217 deletions
import torch.nn as nn

from ..functions.masked_conv import masked_conv2d


class MaskedConv2d(nn.Conv2d):
    """A MaskedConv2d that inherits from the official Conv2d.

    The masked forward pass does not implement a backward function and
    currently only supports stride=1.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super(MaskedConv2d,
              self).__init__(in_channels, out_channels, kernel_size, stride,
                             padding, dilation, groups, bias)

    def forward(self, input, mask=None):
        if mask is None:  # fall back to the normal Conv2d
            return super(MaskedConv2d, self).forward(input)
        else:
            return masked_conv2d(input, mask, self.weight, self.bias,
                                 self.padding)
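For orientation, a minimal usage sketch of the module above (not part of the diff). The (1, H, W) mask shape is an assumption for illustration; the exact requirements are enforced inside masked_conv2d.

import torch

conv = MaskedConv2d(16, 32, kernel_size=3, padding=1).cuda()
x = torch.randn(1, 16, 50, 50).cuda()
out_plain = conv(x)  # mask=None falls back to the ordinary Conv2d forward
# hypothetical binary mask marking where the convolution is evaluated
mask = (torch.rand(1, 50, 50, device='cuda') > 0.5).float()
out_masked = conv(x, mask)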
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='masked_conv2d_cuda',
    ext_modules=[
        CUDAExtension('masked_conv2d_cuda', [
            'src/masked_conv2d_cuda.cpp',
            'src/masked_conv2d_kernel.cu',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension})
#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_1D_KERNEL_LOOP(i, n) \
@@ -18,9 +19,9 @@ __global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
                                    const int height, const int width,
                                    const int kernel_h, const int kernel_w,
                                    const int pad_h, const int pad_w,
-                                   const long *mask_h_idx,
-                                   const long *mask_w_idx, const int mask_cnt,
-                                   scalar_t *data_col) {
+                                   const int64_t *mask_h_idx,
+                                   const int64_t *mask_w_idx,
+                                   const int mask_cnt, scalar_t *data_col) {
  // mask_cnt * channels
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int m_index = index % mask_cnt;
@@ -57,13 +58,14 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
  const int output_size = mask_cnt * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      bottom_data.type(), "MaskedIm2colLaucherForward", ([&] {
+      bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const long *mask_h_idx_ = mask_h_idx.data<long>();
-        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
        scalar_t *top_data_ = top_data.data<scalar_t>();
        MaskedIm2colForward<scalar_t>
-            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
+               at::cuda::getCurrentCUDAStream()>>>(
                output_size, bottom_data_, height, width, kernel_h, kernel_w,
                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
      }));
@@ -74,16 +76,15 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
template <typename scalar_t>
__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
                                    const int height, const int width,
-                                    const int channels, const long *mask_h_idx,
-                                    const long *mask_w_idx, const int mask_cnt,
-                                    scalar_t *data_im) {
+                                    const int channels,
+                                    const int64_t *mask_h_idx,
+                                    const int64_t *mask_w_idx,
+                                    const int mask_cnt, scalar_t *data_im) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int m_index = index % mask_cnt;
    const int h_im = mask_h_idx[m_index];
    const int w_im = mask_w_idx[m_index];
    const int c_im = index / mask_cnt;
-    // int kernel_extent_w = (kernel_w - 1) + 1;
-    // int kernel_extent_h = (kernel_h - 1) + 1;
    // compute the start and end of the output
    data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
  }
@@ -97,14 +98,14 @@ int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height,
  const int output_size = mask_cnt * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      bottom_data.type(), "MaskedCol2imLaucherForward", ([&] {
+      bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const long *mask_h_idx_ = mask_h_idx.data<long>();
-        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
        scalar_t *top_data_ = top_data.data<scalar_t>();
        MaskedCol2imForward<scalar_t>
-            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
+               at::cuda::getCurrentCUDAStream()>>>(
                output_size, bottom_data_, height, width, channels, mask_h_idx_,
                mask_w_idx_, mask_cnt, top_data_);
      }));
......
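To make the kernel indexing above easier to follow, here is a NumPy sketch (illustrative only, stride 1; masked_im2col is a hypothetical helper, not part of the diff) of the gather that MaskedIm2colForward performs: one im2col column per masked output location.

import numpy as np

def masked_im2col(im, mask_h, mask_w, kh, kw, pad):
    # im: (C, H, W); mask_h/mask_w: row/col indices of the masked positions
    c = im.shape[0]
    padded = np.pad(im, ((0, 0), (pad, pad), (pad, pad)))
    cols = np.empty((c * kh * kw, len(mask_h)), dtype=im.dtype)
    for m, (y, x) in enumerate(zip(mask_h, mask_w)):
        # the kh x kw receptive field whose top-left corner, in padded
        # coordinates, corresponds to output position (y, x)
        cols[:, m] = padded[:, y:y + kh, x:x + kw].reshape(-1)
    return cols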
import numpy as np
import torch

-from . import nms_cuda, nms_cpu
+from . import nms_cpu, nms_cuda
from .soft_nms_cpu import soft_nms_cpu
@@ -21,6 +21,18 @@ def nms(dets, iou_thr, device_id=None):
    Returns:
        tuple: kept bboxes and indices, which are always of the same data
            type as the input.

+    Example:
+        >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9],
+        >>>                  [49.3, 32.9, 51.0, 35.3, 0.9],
+        >>>                  [49.2, 31.8, 51.0, 35.4, 0.5],
+        >>>                  [35.1, 11.5, 39.1, 15.7, 0.5],
+        >>>                  [35.6, 11.8, 39.3, 14.2, 0.5],
+        >>>                  [35.3, 11.5, 39.9, 14.5, 0.4],
+        >>>                  [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32)
+        >>> iou_thr = 0.7
+        >>> suppressed, inds = nms(dets, iou_thr)
+        >>> assert len(inds) == len(suppressed) == 3
    """
    # convert dets (tensor or numpy array) to tensor
    if isinstance(dets, torch.Tensor):
@@ -50,6 +62,18 @@ def nms(dets, iou_thr, device_id=None):
def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3):
    """
+    Example:
+        >>> dets = np.array([[4., 3., 5., 3., 0.9],
+        >>>                  [4., 3., 5., 4., 0.9],
+        >>>                  [3., 1., 3., 1., 0.5],
+        >>>                  [3., 1., 3., 1., 0.5],
+        >>>                  [3., 1., 3., 1., 0.4],
+        >>>                  [3., 1., 3., 1., 0.0]], dtype=np.float32)
+        >>> iou_thr = 0.7
+        >>> suppressed, inds = soft_nms(dets, iou_thr, sigma=0.5)
+        >>> assert len(inds) == len(suppressed) == 3
    """
    if isinstance(dets, torch.Tensor):
        is_tensor = True
        dets_np = dets.detach().cpu().numpy()
......
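As background for the doctests above, a self-contained NumPy sketch of greedy hard NMS (a reference implementation, not the compiled nms_cpu/nms_cuda code; the +1-pixel area convention used by the real kernels may differ):

import numpy as np

def nms_reference(dets, iou_thr):
    x1, y1, x2, y2, scores = dets.T
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]          # process boxes by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)                      # best remaining box always survives
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(xx2 - xx1, 0) * np.maximum(yy2 - yy1, 0)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_thr]   # discard heavy overlaps
    return dets[keep], np.array(keep, dtype=np.int64)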
import os.path as osp

from setuptools import setup, Extension

import numpy as np
from Cython.Build import cythonize
from Cython.Distutils import build_ext
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

ext_args = dict(
    include_dirs=[np.get_include()],
    language='c++',
    extra_compile_args={
        'cc': ['-Wno-unused-function', '-Wno-write-strings'],
        'nvcc': ['-c', '--compiler-options', '-fPIC'],
    },
)

extensions = [
    Extension('soft_nms_cpu', ['src/soft_nms_cpu.pyx'], **ext_args),
]


def customize_compiler_for_nvcc(self):
    """Inject deep into distutils to customize how the dispatch
    to cc/nvcc works.

    If you subclass UnixCCompiler, it's not trivial to get your subclass
    injected in, and still have the right customizations (i.e.
    distutils.sysconfig.customize_compiler) run on it. So instead of going
    the OO route, I have this. Note, it's kind of like a weird functional
    subclassing going on.
    """
    # tell the compiler it can process .cu source files
    self.src_extensions.append('.cu')

    # save references to the default compiler_so and _compile methods
    default_compiler_so = self.compiler_so
    super = self._compile

    # Now redefine the _compile method. This gets executed for each
    # object, but distutils doesn't have the ability to change compilers
    # based on source extension, so we add it.
    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        if osp.splitext(src)[1] == '.cu':
            # use nvcc for .cu files
            self.set_executable('compiler_so', 'nvcc')
            # use only a subset of the extra_postargs, which are 1-1
            # translated from the extra_compile_args in the Extension class
            postargs = extra_postargs['nvcc']
        else:
            postargs = extra_postargs['cc']
        super(obj, src, ext, cc_args, postargs, pp_opts)
        # reset the default compiler_so, which we might have changed for cuda
        self.compiler_so = default_compiler_so

    # inject our redefined _compile method into the class
    self._compile = _compile


class custom_build_ext(build_ext):

    def build_extensions(self):
        customize_compiler_for_nvcc(self.compiler)
        build_ext.build_extensions(self)


setup(
    name='soft_nms',
    cmdclass={'build_ext': custom_build_ext},
    ext_modules=cythonize(extensions),
)

setup(
    name='nms_cuda',
    ext_modules=[
        CUDAExtension('nms_cuda', [
            'src/nms_cuda.cpp',
            'src/nms_kernel.cu',
        ]),
        CUDAExtension('nms_cpu', [
            'src/nms_cpu.cpp',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension})
@@ -60,7 +60,7 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
at::Tensor nms(const at::Tensor& dets, const float threshold) {
  at::Tensor result;
-  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
+  AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
    result = nms_cpu_kernel<scalar_t>(dets, threshold);
  });
  return result;
......
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/DeviceGuard.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
@@ -68,6 +69,10 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+  // Ensure CUDA uses the input tensor device.
+  at::DeviceGuard guard(boxes.device());
+
  using scalar_t = float;
  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
  auto scores = boxes.select(1, 4);
@@ -91,16 +96,19 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
-  nms_kernel<<<blocks, threads>>>(boxes_num,
+  nms_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  THCudaCheck(cudaMemcpy(&mask_host[0],
-                         mask_dev,
-                         sizeof(unsigned long long) * boxes_num * col_blocks,
-                         cudaMemcpyDeviceToHost));
+  THCudaCheck(cudaMemcpyAsync(
+      &mask_host[0],
+      mask_dev,
+      sizeof(unsigned long long) * boxes_num * col_blocks,
+      cudaMemcpyDeviceToHost,
+      at::cuda::getCurrentCUDAStream()));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
@@ -128,4 +136,4 @@ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
      keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
          order_t.device(), keep.scalar_type())
  }).sort(0, false));
-}
\ No newline at end of file
+}
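The elided portion of nms_cuda consumes mask_host as a per-box bitmask: box i's row marks every box it suppresses, 64 boxes per unsigned long long. A toy Python sketch of that host-side sweep, with a hypothetical 4-box mask standing in for the values computed on the GPU:

boxes_num = 4
threads_per_block = 64                      # bits per unsigned long long
col_blocks = (boxes_num + threads_per_block - 1) // threads_per_block
# toy bitmask: box 0 suppresses boxes 1 and 2; boxes 1-3 suppress nothing
mask_host = [0b0110, 0b0000, 0b0000, 0b0000]
remv = [0] * col_blocks                     # bits of already-suppressed boxes
keep = []
for i in range(boxes_num):
    nblock, inblock = divmod(i, threads_per_block)
    if not (remv[nblock] >> inblock) & 1:   # box i survives if no kept box marked it
        keep.append(i)
        for j in range(col_blocks):         # fold in everything box i suppresses
            remv[j] |= mask_host[i * col_blocks + j]
print(keep)                                 # [0, 3]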
@@ -27,7 +27,7 @@ def soft_nms_cpu(
    float min_score=0.001,
):
    boxes = boxes_in.copy()
-    cdef unsigned int N = boxes.shape[0]
+    cdef int N = boxes.shape[0]
    cdef float iw, ih, box_area
    cdef float ua
    cdef int pos = 0
......
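A plausible motivation for the unsigned int to int change above: unsigned arithmetic wraps silently instead of going negative, which is hazardous in index math such as N - 1. A quick NumPy illustration:

import numpy as np

n = np.array([0], dtype=np.uint32)
print(n - 1)                     # [4294967295]: wrapped around
print(n.astype(np.int32) - 1)    # [-1]: what index arithmetic expects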
-from .functions.roi_align import roi_align
-from .modules.roi_align import RoIAlign
+from .roi_align import RoIAlign, roi_align

__all__ = ['roi_align', 'RoIAlign']
+import os.path as osp
+import sys
+
import numpy as np
import torch
from torch.autograd import gradcheck

-import os.path as osp
-import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-from roi_align import RoIAlign  # noqa: E402
+from roi_align import RoIAlign  # noqa: E402, isort:skip

feat_size = 15
spatial_scale = 1.0 / 8
......
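The body of the test is elided here; a minimal sketch of how such a gradcheck typically proceeds (the ROI values are hypothetical, and double precision is used because gradcheck's numerical Jacobian needs it):

feat = torch.randn(2, 16, feat_size, feat_size, dtype=torch.double,
                   device='cuda', requires_grad=True)
# one ROI per row: (batch_idx, x1, y1, x2, y2) in input-image coordinates
rois = torch.tensor([[0., 1., 1., 50., 60.], [1., 2., 3., 40., 50.]],
                    dtype=torch.double, device='cuda')
assert gradcheck(RoIAlign(3, spatial_scale), (feat, rois),
                 eps=1e-3, atol=1e-3)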
from torch.nn.modules.module import Module

from ..functions.roi_align import RoIAlignFunction


class RoIAlign(Module):

    def __init__(self, out_size, spatial_scale, sample_num=0):
        super(RoIAlign, self).__init__()
        self.out_size = out_size
        self.spatial_scale = float(spatial_scale)
        self.sample_num = int(sample_num)

    def forward(self, features, rois):
        return RoIAlignFunction.apply(features, rois, self.out_size,
                                      self.spatial_scale, self.sample_num)
+import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

-from .. import roi_align_cuda
+from . import roi_align_cuda


class RoIAlignFunction(Function):

    @staticmethod
    def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
-        if isinstance(out_size, int):
-            out_h = out_size
-            out_w = out_size
-        elif isinstance(out_size, tuple):
-            assert len(out_size) == 2
-            assert isinstance(out_size[0], int)
-            assert isinstance(out_size[1], int)
-            out_h, out_w = out_size
-        else:
-            raise TypeError(
-                '"out_size" must be an integer or tuple of integers')
+        out_h, out_w = _pair(out_size)
+        assert isinstance(out_h, int) and isinstance(out_w, int)
        ctx.spatial_scale = spatial_scale
        ctx.sample_num = sample_num
        ctx.save_for_backward(rois)
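The rewritten forward delegates the out_size normalization to torch.nn.modules.utils._pair, which duplicates a scalar into a 2-tuple and passes iterables through unchanged; for reference:

from torch.nn.modules.utils import _pair

assert _pair(7) == (7, 7)        # a bare int is duplicated
assert _pair((7, 3)) == (7, 3)   # a 2-tuple passes through unchanged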
@@ -36,6 +30,7 @@ class RoIAlignFunction(Function):
        return output

    @staticmethod
+    @once_differentiable
    def backward(ctx, grad_output):
        feature_size = ctx.feature_size
        spatial_scale = ctx.spatial_scale
@@ -59,3 +54,34 @@ class RoIAlignFunction(Function):
roi_align = RoIAlignFunction.apply
+
+
+class RoIAlign(nn.Module):
+
+    def __init__(self,
+                 out_size,
+                 spatial_scale,
+                 sample_num=0,
+                 use_torchvision=False):
+        super(RoIAlign, self).__init__()
+        self.out_size = _pair(out_size)
+        self.spatial_scale = float(spatial_scale)
+        self.sample_num = int(sample_num)
+        self.use_torchvision = use_torchvision
+
+    def forward(self, features, rois):
+        if self.use_torchvision:
+            from torchvision.ops import roi_align as tv_roi_align
+            return tv_roi_align(features, rois, self.out_size,
+                                self.spatial_scale, self.sample_num)
+        else:
+            return roi_align(features, rois, self.out_size,
+                             self.spatial_scale, self.sample_num)
+
+    def __repr__(self):
+        format_str = self.__class__.__name__
+        format_str += '(out_size={}, spatial_scale={}, sample_num={}'.format(
+            self.out_size, self.spatial_scale, self.sample_num)
+        format_str += ', use_torchvision={})'.format(self.use_torchvision)
+        return format_str
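A short usage sketch of the new module (not part of the diff; the feature and ROI values are made up). Setting use_torchvision=True routes the same call to torchvision.ops.roi_align instead of the compiled extension:

import torch

roi_layer = RoIAlign(out_size=7, spatial_scale=1.0 / 8, sample_num=2)
feats = torch.randn(2, 256, 100, 100).cuda()
# one ROI per row: (batch_idx, x1, y1, x2, y2) in input-image coordinates
rois = torch.tensor([[0., 16., 16., 320., 320.],
                     [1., 40., 8., 200., 160.]]).cuda()
pooled = roi_layer(feats, rois)   # -> shape (2, 256, 7, 7)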
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='roi_align_cuda',
    ext_modules=[
        CUDAExtension('roi_align_cuda', [
            'src/roi_align_cuda.cpp',
            'src/roi_align_kernel.cu',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension})
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <cmath>
#include <vector>
......
#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_1D_KERNEL_LOOP(i, n) \
@@ -98,12 +99,6 @@ __global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
    int sample_num_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

-    scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
-    scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
-    int hstart = fminf(floor(h), height - 2);
-    int wstart = fminf(floor(w), width - 2);
-
    scalar_t output_val = 0;
    for (int iy = 0; iy < sample_num_h; iy++) {
      const scalar_t y = roi_start_h + ph * bin_size_h +
@@ -131,13 +126,13 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
                           at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "ROIAlignLaucherForward", ([&] {
+      features.scalar_type(), "ROIAlignLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *top_data = output.data<scalar_t>();
        ROIAlignForward<scalar_t>
-            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
+               at::cuda::getCurrentCUDAStream()>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sample_num, channels, height, width, pooled_height,
                pooled_width, top_data);
@@ -231,12 +226,6 @@ __global__ void ROIAlignBackward(
    const scalar_t count = (scalar_t)(sample_num_h * sample_num_w);

-    scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h;
-    scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w;
-    int hstart = fminf(floor(h), height - 2);
-    int wstart = fminf(floor(w), width - 2);
-
    for (int iy = 0; iy < sample_num_h; iy++) {
      const scalar_t y =
          roi_start_h + ph * bin_size_h +
@@ -274,7 +263,7 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "ROIAlignLaucherBackward", ([&] {
+      top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data<scalar_t>();
        const scalar_t *rois_data = rois.data<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
@@ -284,7 +273,7 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
        }
        ROIAlignBackward<scalar_t>
-            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
+               at::cuda::getCurrentCUDAStream()>>>(
                output_size, top_diff, rois_data, spatial_scale, sample_num,
                channels, height, width, pooled_height, pooled_width,
                bottom_diff);
......
-from .functions.roi_pool import roi_pool
-from .modules.roi_pool import RoIPool
+from .roi_pool import RoIPool, roi_pool

__all__ = ['roi_pool', 'RoIPool']
+import os.path as osp
+import sys
+
import torch
from torch.autograd import gradcheck

-import os.path as osp
-import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
-from roi_pool import RoIPool  # noqa: E402
+from roi_pool import RoIPool  # noqa: E402, isort:skip

feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
......