Unverified commit f92f64c2, authored by Kai Chen, committed by GitHub

use int64_t instead of long in cuda kernels (#1131)

parent bc94b212
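The motivation: C++ long is platform-dependent (64-bit on LP64 Linux/macOS, but only 32-bit on LLP64 Windows), while PyTorch index tensors (torch.long, i.e. at::kLong) are always 64-bit. int64_t is therefore the portable element type for tensor.data<T>() on long tensors. A minimal illustration of the size mismatch this commit avoids (the static_asserts are illustrative, not part of the patch):

    #include <cstdint>

    static_assert(sizeof(int64_t) == 8, "int64_t is 64-bit on every platform");
    // On LLP64 Windows sizeof(long) == 4, so the assert below would fail there;
    // a kernel reading kLong tensor data through a `long *` would misread it.
    // static_assert(sizeof(long) == 8, "long is NOT 64-bit everywhere");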
@@ -18,9 +18,9 @@ __global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
                                      const int height, const int width,
                                      const int kernel_h, const int kernel_w,
                                      const int pad_h, const int pad_w,
-                                     const long *mask_h_idx,
-                                     const long *mask_w_idx, const int mask_cnt,
-                                     scalar_t *data_col) {
+                                     const int64_t *mask_h_idx,
+                                     const int64_t *mask_w_idx,
+                                     const int mask_cnt, scalar_t *data_col) {
   // mask_cnt * channels
   CUDA_1D_KERNEL_LOOP(index, n) {
     const int m_index = index % mask_cnt;
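CUDA_1D_KERNEL_LOOP is a grid-stride loop: each thread starts at its global index and strides by the total thread count, so a single launch covers any n. A typical definition, assumed here from the common Caffe2/Detectron convention rather than copied from this repository:

    #define CUDA_1D_KERNEL_LOOP(i, n)                                \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
           i += blockDim.x * gridDim.x)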
@@ -59,8 +59,8 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
         const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const long *mask_h_idx_ = mask_h_idx.data<long>();
-        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
         scalar_t *top_data_ = top_data.data<scalar_t>();
         MaskedIm2colForward<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
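GET_BLOCKS and THREADS_PER_BLOCK pick the launch configuration for that loop. A plausible definition (an assumption; the block size, and whether the block count is capped, vary by repository):

    constexpr int THREADS_PER_BLOCK = 1024;  // assumed value; 512 is also common

    inline int GET_BLOCKS(const int N) {
      // Ceiling division: enough blocks to give every element one thread.
      return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    }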
@@ -74,16 +74,15 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
 template <typename scalar_t>
 __global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
                                     const int height, const int width,
-                                    const int channels, const long *mask_h_idx,
-                                    const long *mask_w_idx, const int mask_cnt,
-                                    scalar_t *data_im) {
+                                    const int channels,
+                                    const int64_t *mask_h_idx,
+                                    const int64_t *mask_w_idx,
+                                    const int mask_cnt, scalar_t *data_im) {
   CUDA_1D_KERNEL_LOOP(index, n) {
     const int m_index = index % mask_cnt;
     const int h_im = mask_h_idx[m_index];
     const int w_im = mask_w_idx[m_index];
     const int c_im = index / mask_cnt;
-    // int kernel_extent_w = (kernel_w - 1) + 1;
-    // int kernel_extent_h = (kernel_h - 1) + 1;
     // compute the start and end of the output
     data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
   }
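The scatter at the end of MaskedCol2imForward linearizes (c_im, h_im, w_im) row-major: for example, with height = 4 and width = 5, the element (c_im = 2, h_im = 3, w_im = 1) lands at offset (2 * 4 + 3) * 5 + 1 = 56. The int64_t mask indices are narrowed to int for this arithmetic, which is safe as long as the coordinates fit in 32 bits, as feature-map coordinates do in practice.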
@@ -99,8 +98,8 @@ int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height,
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
         const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const long *mask_h_idx_ = mask_h_idx.data<long>();
-        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
         scalar_t *top_data_ = top_data.data<scalar_t>();
         MaskedCol2imForward<scalar_t>
The remaining hunks are from the commit's second file, the sigmoid focal loss CUDA kernels:
@@ -23,7 +23,7 @@
 template <typename scalar_t>
 __global__ void SigmoidFocalLossForward(const int nthreads,
                                         const scalar_t *logits,
-                                        const long *targets,
+                                        const int64_t *targets,
                                         const int num_classes,
                                         const float gamma, const float alpha,
                                         const int num, scalar_t *losses) {
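For reference, this kernel implements sigmoid focal loss from Lin et al., ICCV 2017: FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t), evaluated per class with p taken from a sigmoid over logits. targets holds integer class labels, which is why its element type must match PyTorch's 64-bit long tensors; only that dtype changes in this commit.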
@@ -60,7 +60,7 @@ __global__ void SigmoidFocalLossForward(const int nthreads,
 template <typename scalar_t>
 __global__ void SigmoidFocalLossBackward(
-    const int nthreads, const scalar_t *logits, const long *targets,
+    const int nthreads, const scalar_t *logits, const int64_t *targets,
     const scalar_t *d_losses, const int num_classes, const float gamma,
     const float alpha, const int num, scalar_t *d_logits) {
   CUDA_1D_KERNEL_LOOP(i, nthreads) {
@@ -109,7 +109,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits,
   auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
   auto losses_size = num_samples * logits.size(1);
-  dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L));
+  dim3 grid(
+      std::min(THCCeilDiv((int64_t)losses_size, (int64_t)512), (int64_t)4096));
   dim3 block(512);
   if (losses.numel() == 0) {
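THCCeilDiv is a ceiling-division helper template from the old THC backend; both of its arguments must deduce to the same type, which is why the literal 512L becomes (int64_t)512 alongside the recast size. What it computes, sketched:

    #include <cstdint>

    // Equivalent of THCCeilDiv<int64_t>: ceil(a / b) in integer arithmetic.
    // With `long`, this grid-size math was 32-bit on LLP64 Windows; casting
    // everything to int64_t keeps it 64-bit on every platform.
    inline int64_t ceil_div_i64(int64_t a, int64_t b) { return (a + b - 1) / b; }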
@@ -121,7 +122,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits,
       logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
         SigmoidFocalLossForward<scalar_t><<<grid, block>>>(
             losses_size, logits.contiguous().data<scalar_t>(),
-            targets.contiguous().data<long>(), num_classes, gamma, alpha,
+            targets.contiguous().data<int64_t>(), num_classes, gamma, alpha,
             num_samples, losses.data<scalar_t>());
       });
   THCudaCheck(cudaGetLastError());
@@ -147,7 +148,8 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits,
   auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
   auto d_logits_size = num_samples * logits.size(1);
-  dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L));
+  dim3 grid(std::min(THCCeilDiv((int64_t)d_logits_size, (int64_t)512),
+                     (int64_t)4096));
   dim3 block(512);
   if (d_logits.numel() == 0) {
@@ -159,7 +161,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits,
       logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
         SigmoidFocalLossBackward<scalar_t><<<grid, block>>>(
             d_logits_size, logits.contiguous().data<scalar_t>(),
-            targets.contiguous().data<long>(),
+            targets.contiguous().data<int64_t>(),
             d_losses.contiguous().data<scalar_t>(), num_classes, gamma, alpha,
             num_samples, d_logits.data<scalar_t>());
       });
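A side note for readers on newer PyTorch: the tensor.data<T>() accessor used throughout these hunks was later deprecated in favor of tensor.data_ptr<T>(), and the THC helpers (THCCeilDiv, THCudaCheck) were eventually removed; the int64_t typing introduced here carries over unchanged.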