reminisce opened a new issue #15732: MXNET_GPU_TEMP_SPACE=4 cannot pass CI URL: https://github.com/apache/incubator-mxnet/issues/15732 After changing the default value of `MXNET_GPU_TEMP_SPACE` from 1 to 4, the CI cannot pass due to apparent memory corruption on GPUs. Test PR: https://github.com/apache/incubator-mxnet/pull/15704 The error message is shown below. ``` test_operator_gpu.test_concat_with_zero_size_tensor ... ok (0.0011s) test_operator_gpu.test_np_shape_decorator ... ok (0.0067s) test_operator_gpu.test_add_n ... ok (0.0019s) test_operator_gpu.test_get_all_registered_operators ... ok (0.0013s) test_operator_gpu.test_get_operator_arguments ... ok (0.0002s) [INFO] Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=1165643488 to reproduce. test_predictor.test_predictor_with_dtype ... ok (0.0118s) test_predictor.test_predictor_amp ... [06:50:18] src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version v0.8.0. Attempting to upgrade... [06:50:18] src/nnvm/legacy_json_util.cc:217: Symbol successfully upgraded! ok (0.3672s) test_tvm_bridge.test_tvm_bridge ... 
SKIP: test skip test_tvm_bridge ====================================================================== ERROR: test_operator_gpu.test_sparse_nd_elemwise_add ---------------------------------------------------------------------- Traceback (most recent call last): File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest self.test(*self.arg) File "/usr/local/lib/python2.7/dist-packages/nose/util.py", line 620, in newfunc return func(*arg, **kw) File "/work/mxnet/tests/python/gpu/../unittest/common.py", line 177, in test_new orig_test(*args, **kwargs) File "/work/mxnet/tests/python/gpu/../unittest/test_sparse_ndarray.py", line 58, in test_sparse_nd_elemwise_add check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g) File "/work/mxnet/tests/python/gpu/../unittest/test_sparse_ndarray.py", line 51, in check_sparse_nd_elemwise_binary assert_almost_equal(test.asnumpy(), g(nds[0].asnumpy(), nds[1].asnumpy())) File "/work/mxnet/python/mxnet/ndarray/ndarray.py", line 2092, in asnumpy ctypes.c_size_t(data.size))) File "/work/mxnet/python/mxnet/base.py", line 253, in check_call raise MXNetError(py_str(_LIB.MXGetLastError())) MXNetError: [06:11:14] src/operator/random/./sample_op.h:400: Check failed: param.scale > 0 (-1 vs. 
0) : scale parameter in gaussian has to be positive Stack trace: [bt] (0) /work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x3c) [0x7fd3c5df01bc] [bt] (1) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x8a59fff) [0x7fd3cc2b3fff] [bt] (2) /work/mxnet/python/mxnet/../../lib/libmxnet.so(void mxnet::op::Sample_<mshadow::gpu, mxnet::op::SampleNormalParam>(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x296) [0x7fd3cc2c38f6] [bt] (3) /work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&), void (*)(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)>::_M_invoke(std::_Any_data const&, nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x20) [0x7fd3c5dea780] [bt] (4) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, 
mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x2e2) [0x7fd3c91cd4e2] [bt] (5) /work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x1f) [0x7fd3c91cd83f] [bt] (6) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x63e37ee) [0x7fd3c9c3d7ee] [bt] (7) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x679) [0x7fd3c9c37809] [bt] (8) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(void mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context, bool, mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*, std::shared_ptr<dmlc::ManualEvent> const&)+0x18d) [0x7fd3c9c4f54d] ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]. With regards, Apache Git Services
