reminisce opened a new issue #16338: Reduce op throws "too many resources requested for launch" URL: https://github.com/apache/incubator-mxnet/issues/16338 When MXNet is compiled in `debug` mode, reduce ops on GPUs may throw the error "too many resources requested for launch" at kernel launch. See https://github.com/apache/incubator-mxnet/pull/16294#issuecomment-536727538 for root cause. This issue is filed to track the progress of fixing the problem. Error message [example](https://github.com/apache/incubator-mxnet/pull/16294#issuecomment-536266618): ``` (base) ubuntu@ip-172-31-16-49:~/incubator-mxnet$ nosetests -s --verbose tests/python/gpu/test_operator_gpu.py:test_np_sum [INFO] Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=342263604 to reproduce. test_operator_gpu.test_np_sum ... [08:06:44] ../src/base.cc:84: Upgrade advisory: this mxnet has been built against cuDNN lib version 7401, which is older than the oldest version tested by CI (7600). Set MXNET_CUDNN_LIB_CHECKING=0 to quiet this warning. [INFO] Setting test np/mx/python random seeds, use MXNET_TEST_SEED=1216105730 to reproduce. ERROR ====================================================================== ERROR: test_operator_gpu.test_np_sum ---------------------------------------------------------------------- Traceback (most recent call last): File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/nose/case.py", line 197, in runTest self.test(*self.arg) File "/home/ubuntu/anaconda3/lib/python3.7/site-packages/nose/util.py", line 620, in newfunc return func(*arg, **kw) File "/home/ubuntu/incubator-mxnet/tests/python/gpu/../unittest/common.py", line 177, in test_new orig_test(*args, **kwargs) File "/home/ubuntu/incubator-mxnet/python/mxnet/util.py", line 307, in _with_np_shape return func(*args, **kwargs) File "/home/ubuntu/incubator-mxnet/python/mxnet/util.py", line 491, in _with_np_array return func(*args, **kwargs) File "/home/ubuntu/incubator-mxnet/tests/python/gpu/../unittest/test_numpy_op.py", line 264, in test_np_sum assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3 if dtype == 'float16' else 1e-3, File "/home/ubuntu/incubator-mxnet/python/mxnet/ndarray/ndarray.py", line 2504, in asnumpy ctypes.c_size_t(data.size))) File "/home/ubuntu/incubator-mxnet/python/mxnet/base.py", line 254, in check_call raise MXNetError(py_str(_LIB.MXGetLastError())) mxnet.base.MXNetError: [08:06:54] /home/ubuntu/incubator-mxnet/src/operator/nn/././../tensor/./broadcast_reduce-inl.cuh:528: Check failed: err == cudaSuccess (7 vs. 0) : Name: reduce_kernel_M1 ErrStr:too many resources requested for launch Stack trace: [bt] (0) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7f81a9b7fb82] [bt] (1) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::op::broadcast::ReduceImpl<mxnet::op::mshadow_op::sum, 2, float, mshadow::half::half_t, mshadow::half::half_t, mxnet::op::mshadow_op::identity>(CUstream_st*, mxnet::TBlob const&, mxnet::OpReqType, mxnet::TBlob const&, mshadow::Tensor<mshadow::gpu, 1, char> const&, mxnet::op::broadcast::ReduceImplConfig<2> const&)+0x820) [0x7f81aa184e10] [bt] (2) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::op::broadcast::Reduce<mxnet::op::mshadow_op::sum, 2, mshadow::half::half_t, mxnet::op::mshadow_op::identity, true>(mshadow::Stream<mshadow::gpu>*, mxnet::TBlob const&, mxnet::OpReqType, mshadow::Tensor<mshadow::gpu, 1, char> const&, mxnet::TBlob const&)+0x539) [0x7f81aa187eb9] [bt] (3) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::op::ReduceAxesComputeImpl<mshadow::gpu, mxnet::op::mshadow_op::sum, true, false, mxnet::op::mshadow_op::identity>(mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, mxnet::TShape const&)+0x13e9) [0x7f81aa868649] [bt] (4) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::op::NumpyReduceAxesCompute<mshadow::gpu, mxnet::op::mshadow_op::sum, true, false, mxnet::op::mshadow_op::identity>(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x4ac) [0x7f81aa97a26c] [bt] (5) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x2a6) [0x7f81ac1cdc16] [bt] (6) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x17) [0x7f81ac1cde67] [bt] (7) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(+0x39d3f4e) [0x7f81ac127f4e] [bt] (8) /home/ubuntu/incubator-mxnet/python/mxnet/../../build/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x5cf) [0x7f81ac13418f] -------------------- >> begin captured logging << -------------------- root: INFO: NumPy-shape semantics has been activated in your code. This is required for creating and manipulating scalar and zero-size tensors, which were not supported in MXNet before, as in the official NumPy library. Please DO NOT manually deactivate this semantics while using `mxnet.numpy` and `mxnet.numpy_extension` modules. common: INFO: Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=342263604 to reproduce. root: INFO: NumPy-shape semantics has been activated in your code. This is required for creating and manipulating scalar and zero-size tensors, which were not supported in MXNet before, as in the official NumPy library. Please DO NOT manually deactivate this semantics while using `mxnet.numpy` and `mxnet.numpy_extension` modules. common: INFO: Setting test np/mx/python random seeds, use MXNET_TEST_SEED=1216105730 to reproduce. --------------------- >> end captured logging << --------------------- ---------------------------------------------------------------------- Ran 1 test in 9.612s FAILED (errors=1) ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services