[GitHub] piiswrong closed pull request #9560: Expand gpu-kernel-launch synchronous error checking.

GitBox Tue, 30 Jan 2018 10:45:42 -0800

piiswrong closed pull request #9560: Expand gpu-kernel-launch synchronous error 
checking.
URL: https://github.com/apache/incubator-mxnet/pull/9560


This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/common/random_generator.cu b/src/common/random_generator.cu
index f6f31cf5af..930e5e07b8 100644
--- a/src/common/random_generator.cu
+++ b/src/common/random_generator.cu
@@ -55,6 +55,7 @@ void RandGenerator<gpu, float>::Seed(mshadow::Stream<gpu> *s, 
uint32_t seed) {
           states_,
           RandGenerator<gpu, float>::kNumRandomStates,
           seed);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(rand_generator_seed_kernel);
   s->Wait();
 }
 
diff --git a/src/operator/contrib/count_sketch.cu 
b/src/operator/contrib/count_sketch.cu
index b849f4cf3e..373ff3eab6 100644
--- a/src/operator/contrib/count_sketch.cu
+++ b/src/operator/contrib/count_sketch.cu
@@ -129,6 +129,7 @@ inline void CountSketchForward(const Tensor<gpu, 2, DType> 
&out,
                                     nthreads, out_ptr+bstart*out_dim, h_ptr,
                                     s_ptr, in_ptr+bstart*in_dim, batchlen,
                                     in_dim, out_dim);
+    MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_forward_kernel);
     // cudaThreadSynchronize();
     bstart = (i+1)*batchlen;
   }
@@ -164,6 +165,7 @@ inline void CountSketchBackward(const Tensor<gpu, 2, DType> 
&in_grad,
                             nthreads, in_grad_ptr+bstart*in_dim, h_ptr,
                             s_ptr, out_grad_ptr+bstart*out_dim, batchlen,
                             in_dim, out_dim);
+    MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_backward_kernel);
     bstart = (i+1)*batchlen;
   }
 }
diff --git a/src/operator/contrib/ctc_include/detail/gpu_ctc.h 
b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
index c9cab80f34..c249046424 100644
--- a/src/operator/contrib/ctc_include/detail/gpu_ctc.h
+++ b/src/operator/contrib/ctc_include/detail/gpu_ctc.h
@@ -404,6 +404,10 @@ GpuCTC<ProbT>::compute_log_probs(const ProbT* const 
activations) {
         (ctc_helper::identity<ProbT>(), log_probs_,
          denoms_, out_dim_, num_elements);
 
+    cuda_status = cudaGetLastError();
+    if (cuda_status != cudaSuccess)
+        return CTC_STATUS_EXECUTION_FAILED;
+
     return CTC_STATUS_SUCCESS;
 }
 
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index b2a672ffd9..d1286170c2 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -641,6 +641,7 @@ void linalg_potri<gpu, DType>(const Tensor<gpu, 2, DType>& 
A, bool lower, Stream
                        static_cast<int>((A.MSize() + kBaseThreadNum - 1) / 
kBaseThreadNum)); \
   linalgInitIdentityGPU<<<ngrid, kBaseThreadNum, 0, 
mshadow::Stream<gpu>::GetStream(s)>>> \
     (static_cast<DType *>(buffer.dptr), A.MSize(), A.stride_, A.MSize());  \
+  MSHADOW_CUDA_POST_KERNEL_CHECK(linalgInitIdentityGPU); \
   Tensor<gpu, 2, DType> B((DType *)buffer.dptr, A.shape_, A.stride_, s); \
   linalg_trsm(A, B, DType(1.0), false, lower, !lower, s); \
   linalg_trsm(A, B, DType(1.0), false, lower, lower, s); \
@@ -664,6 +665,7 @@ void linalg_batch_potri<gpu, DType>(const Tensor<gpu, 3, 
DType>& A, bool lower,
                        static_cast<int>((A.MSize() + kBaseThreadNum - 1) / 
kBaseThreadNum)); \
   linalgInitIdentityGPU<<<ngrid, kBaseThreadNum, 0, 
mshadow::Stream<gpu>::GetStream(s)>>> \
     (static_cast<DType *>(buffer.dptr), A.size(1)*A.stride_, A.stride_, 
A.MSize()); \
+  MSHADOW_CUDA_POST_KERNEL_CHECK(linalgInitIdentityGPU); \
   Tensor<gpu, 3, DType> B((DType *)buffer.dptr, A.shape_, A.stride_, s); \
   linalg_batch_trsm(A, B, DType(1.0), false, lower, !lower, s); \
   linalg_batch_trsm(A, B, DType(1.0), false, lower, lower, s); \
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 5a36954510..cd52524612 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -563,6 +563,7 @@ struct Kernel<OP, gpu> {
     mxnet_generic_kernel<OP, Args...>
       <<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
         N, args...);
+    MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel);
   }
 
   template<typename ...Args>
@@ -572,6 +573,7 @@ struct Kernel<OP, gpu> {
     mxnet_generic_kernel_ex<OP, Args...>
       <<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
         N, args...);
+    MSHADOW_CUDA_POST_KERNEL_CHECK(mxnet_generic_kernel_ex);
   }
 };
 #endif  // __CUDACC__
diff --git a/src/operator/nn/sequence_mask-inl.h 
b/src/operator/nn/sequence_mask-inl.h
index a3b41f6ecf..df98116372 100644
--- a/src/operator/nn/sequence_mask-inl.h
+++ b/src/operator/nn/sequence_mask-inl.h
@@ -66,6 +66,7 @@ inline void SequenceMask(const mshadow::Tensor<gpu, 3, DType> 
&dst,
   CheckLaunchParam(dimGrid, dimBlock, "SequenceMask");
   cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
   SequenceMaskKernel<kBaseThreadBits, DType><<<dimGrid, dimBlock, 0, 
stream>>>(dst, lengths, value);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(SequenceMaskKernel);
 }
 #endif
 
diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h
index 2badecf3d0..080bc08852 100644
--- a/src/operator/nn/softmax-inl.h
+++ b/src/operator/nn/softmax-inl.h
@@ -173,6 +173,7 @@ inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
   softmax_compute_kernel<x_bits, OP, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       in, out, M, axis, sshape, stride);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel);
 }
 
 
@@ -216,6 +217,7 @@ inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType 
*ograd,
   softmax_gradient_kernel<x_bits, OP1, OP2, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       out, ograd, igrad, M, axis, sshape, stride);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_gradient_kernel);
 }
 #endif
 
diff --git a/src/operator/pad.cu b/src/operator/pad.cu
index 54242a485e..372683a2be 100644
--- a/src/operator/pad.cu
+++ b/src/operator/pad.cu
@@ -78,6 +78,7 @@ inline void image_pad_edge(Tensor<gpu, 4, DType> dst,
   image_2d_pad_edge_kernel<kBaseThreadBits,
                            DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src,
                                                                     padT, 
padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -119,6 +120,7 @@ inline void image_pad_edge_grad(Tensor<gpu, 4, DType> 
grad_in,
   image_2d_pad_edge_grad_kernel<kBaseThreadBits,
                                 DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_edge_grad_kernel);
 }
 
 // Case 2: Constant Padding
@@ -166,6 +168,7 @@ inline void image_pad_constant(Tensor<gpu, 4, DType> dst,
   image_2d_pad_constant_kernel<kBaseThreadBits,
                                DType><<<dimGrid, dimBlock, 0, stream>>>(
       dst, src, padT, padL, constant);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -202,6 +205,7 @@ inline void image_pad_constant_grad(Tensor<gpu, 4, DType> 
grad_in,
   image_2d_pad_constant_grad_kernel<kBaseThreadBits,
                                     DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_constant_grad_kernel);
 }
 
 
@@ -257,6 +261,7 @@ inline void image_pad_reflect(Tensor<gpu, 4, DType> dst,
   image_2d_pad_reflect_kernel<kBaseThreadBits,
                            DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src,
                                                                     padT, 
padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -307,6 +312,7 @@ inline void image_pad_reflect_grad(Tensor<gpu, 4, DType> 
grad_in,
   image_2d_pad_reflect_grad_kernel<kBaseThreadBits,
                                 DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_2d_pad_reflect_grad_kernel);
 }
 
 
@@ -365,6 +371,7 @@ inline void image_pad_edge(Tensor<gpu, 5, DType> dst,
   image_3d_pad_edge_kernel<kBaseThreadBits,
                            DType><<<dimGrid, dimBlock, 0, stream>>>(
       dst, src, padF, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -416,6 +423,7 @@ inline void image_pad_edge_grad(Tensor<gpu, 5, DType> 
grad_in,
   image_3d_pad_edge_grad_kernel<kBaseThreadBits,
                                 DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padF, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_edge_grad_kernel);
 }
 
 // Case 2: Constant Padding
@@ -473,6 +481,7 @@ inline void image_pad_constant(Tensor<gpu, 5, DType> dst,
   image_3d_pad_constant_kernel<kBaseThreadBits,
                                DType><<<dimGrid, dimBlock, 0, stream>>>(
       dst, src, padF, padT, padL, constant);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -515,6 +524,7 @@ inline void image_pad_constant_grad(Tensor<gpu, 5, DType> 
grad_in,
   image_3d_pad_constant_grad_kernel<kBaseThreadBits,
                                     DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padF, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_constant_grad_kernel);
 }
 
 // Case 3: Reflection Padding
@@ -578,6 +588,7 @@ inline void image_pad_reflect(Tensor<gpu, 5, DType> dst,
   image_3d_pad_reflect_kernel<kBaseThreadBits,
                            DType><<<dimGrid, dimBlock, 0, stream>>>(
       dst, src, padF, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_kernel);
 }
 
 template <int n_bits, typename DType>
@@ -670,6 +681,7 @@ inline void image_pad_reflect_grad(Tensor<gpu, 5, DType> 
grad_in,
   image_3d_pad_reflect_grad_kernel<kBaseThreadBits,
                                 DType><<<dimGrid, dimBlock, 0, stream>>>(
       grad_in, grad_out, padF, padT, padL);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(image_3d_pad_reflect_grad_kernel);
 }
 
 
////////////////////////////////////////////////////////////////////////////////
diff --git a/src/operator/roi_pooling.cu b/src/operator/roi_pooling.cu
index 0f637b0e1b..066c2ffd06 100644
--- a/src/operator/roi_pooling.cu
+++ b/src/operator/roi_pooling.cu
@@ -129,6 +129,7 @@ inline void ROIPoolForward(const Tensor<gpu, 4, Dtype> &out,
   ROIPoolForwardKernel<Dtype><<<dimGrid, dimBlock, 0, stream>>>(
       count, bottom_data, spatial_scale, channels, height, width,
       pooled_height, pooled_width, bottom_rois, top_data, argmax_data);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(ROIPoolForwardKernel);
 }
 
 template<typename Dtype>
@@ -232,6 +233,7 @@ inline void ROIPoolBackwardAcc(const Tensor<gpu, 4, Dtype> 
&in_grad,
   ROIPoolBackwardAccKernel<Dtype><<<dimGrid, dimBlock, 0, stream>>>(
       count, top_diff, argmax_data, num_rois, spatial_scale, channels, height, 
width,
       pooled_height, pooled_width, bottom_diff, bottom_rois);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(ROIPoolBackwardAccKernel);
 }
 
 }  // namespace cuda
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
index d9501071fd..fa11a6cfb4 100644
--- a/src/operator/svm_output.cu
+++ b/src/operator/svm_output.cu
@@ -62,6 +62,7 @@ inline void L1_SVM(const DType & margin,
   cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
   L1_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream 
>>>
     (margin, reg_coef, dst, label, src);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(L1_SVMKernel);
 }
 
 
@@ -98,6 +99,7 @@ inline void L2_SVM(const DType & margin,
   cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
   L2_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream 
>>>
     (margin, reg_coef, dst, label, src);
+  MSHADOW_CUDA_POST_KERNEL_CHECK(L2_SVMKernel);
 }
 }  // namespace mshadow
 
diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh 
b/src/operator/tensor/broadcast_reduce-inl.cuh
index f5d0152e08..630fef65a5 100644
--- a/src/operator/tensor/broadcast_reduce-inl.cuh
+++ b/src/operator/tensor/broadcast_reduce-inl.cuh
@@ -507,6 +507,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, 
const OpReqType req,
     <<< config.kernel_1.gridDim, config.kernel_1.blockDim, 0, stream >>>(
       config.N, req == kAddTo, big.dptr<DType>(), small.dptr<DType>(), 
big.shape_.get<ndim>(),
       small.shape_.get<ndim>());
+    MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_M1);
   } else {
 
     DType* small_dptr = small.dptr<DType>();
@@ -531,11 +532,13 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, 
const OpReqType req,
         small.shape_.get<ndim>(), config.rshape, config.rstride, config.Mnext,
         config.kernel_1.do_transpose);
     });
+    MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel);
 
     if (config.Mnext > 1) {
       reduce_lines_kernel<Reducer, DType>
       <<< config.kernel_2.gridSize, config.kernel_2.blockSize, 0, stream >>>
         (config.N, config.Mnext, req == kAddTo, config.N, small_dptr, 
small.dptr<DType>());
+      MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_lines_kernel);
     }
   }
 }
@@ -550,6 +553,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, 
const TBlob& lhs, const
       config.N, req == kAddTo, big.dptr<DType>(), lhs.dptr<DType>(), 
rhs.dptr<DType>(),
       small.dptr<DType>(), big.shape_.get<ndim>(), lhs.shape_.get<ndim>(),
       rhs.shape_.get<ndim>(), small.shape_.get<ndim>());
+    MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_M1);
   } else {
     DType* small_dptr = small.dptr<DType>();
     bool addto = (req == kAddTo);
@@ -574,12 +578,14 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, 
const TBlob& lhs, const
         rhs.shape_.get<ndim>(), small.shape_.get<ndim>(), config.rshape, 
config.lhs_shape,
         config.rhs_shape, config.rstride, config.lhs_stride, 
config.rhs_stride, config.Mnext,
         config.kernel_1.do_transpose);
+      MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel);
     });
 
     if (config.Mnext > 1) {
       reduce_lines_kernel<Reducer, DType>
       <<< config.kernel_2.gridSize, config.kernel_2.blockSize, 0, stream >>>
         (config.N, config.Mnext, req == kAddTo, config.N, small_dptr, 
small.dptr<DType>());
+      MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_lines_kernel);
     }
   }
 }
diff --git a/tests/python/gpu/test_operator_gpu.py 
b/tests/python/gpu/test_operator_gpu.py
index 52aca091d8..55bb30cc7d 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -15,9 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from __future__ import print_function
 import sys
 import os
 import time
+import multiprocessing as mp
 import unittest
 import mxnet as mx
 import numpy as np
@@ -1478,6 +1480,42 @@ def test_cross_device_autograd():
 
     assert_almost_equal(dx, x.grad.asnumpy())
 
+
+# The following 2 functions launch 0-thread kernels, an error that should be 
caught and signaled.
+def kernel_error_check_imperative():
+    os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
+    a = mx.nd.array([1,2,3],ctx=mx.gpu(0))
+    b = mx.nd.array([],ctx=mx.gpu(0))
+    c = (a / b).asnumpy()
+
+def kernel_error_check_symbolic():
+    os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
+    a = mx.sym.Variable('a')
+    b = mx.sym.Variable('b')
+    c = a / b
+    f = c.bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)),
+                            'b':mx.nd.array([],ctx=mx.gpu(0))})
+    f.forward()
+    g = f.outputs[0].asnumpy()
+
+def test_kernel_error_checking():
+    # Running tests that may throw exceptions out of worker threads will stop 
CI testing
+    # if not run in a separate process (with its own address space for CUDA 
compatibility).
+    try:
+        mpctx = mp.get_context('spawn')
+    except:
+        print('SKIP: python%s.%s lacks the required process fork-exec support 
... ' %
+              sys.version_info[0:2], file=sys.stderr, end='')
+    else:
+        with discard_stderr():
+            for f in [kernel_error_check_imperative, 
kernel_error_check_symbolic]:
+                p = mpctx.Process(target=f)
+                p.start()
+                p.join()
+                assert p.exitcode != 0,\
+                    "Expected a synchronous kernel error from %s(), none 
seen." % f.__name__
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] piiswrong closed pull request #9560: Expand gpu-kernel-launch synchronous error checking.

Reply via email to