reminisce opened a new issue #15732: MXNET_GPU_TEMP_SPACE=4 cannot pass CI
URL: https://github.com/apache/incubator-mxnet/issues/15732
 
 
   After changing the default value of `MXNET_GPU_TEMP_SPACE` from 1 to 4, CI 
fails, apparently due to memory corruption on GPUs.
   
   Test PR: https://github.com/apache/incubator-mxnet/pull/15704
   
   The error message is shown below.
   
   ```
   test_operator_gpu.test_concat_with_zero_size_tensor ... ok (0.0011s)
   
   test_operator_gpu.test_np_shape_decorator ... ok (0.0067s)
   
   test_operator_gpu.test_add_n ... ok (0.0019s)
   
   test_operator_gpu.test_get_all_registered_operators ... ok (0.0013s)
   
   test_operator_gpu.test_get_operator_arguments ... ok (0.0002s)
   
   [INFO] Setting module np/mx/python random seeds, use 
MXNET_MODULE_SEED=1165643488 to reproduce.
   
   test_predictor.test_predictor_with_dtype ... ok (0.0118s)
   
   test_predictor.test_predictor_amp ... [06:50:18] 
src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version 
v0.8.0. Attempting to upgrade...
   
   [06:50:18] src/nnvm/legacy_json_util.cc:217: Symbol successfully upgraded!
   
   ok (0.3672s)
   
   test_tvm_bridge.test_tvm_bridge ... SKIP: test skip test_tvm_bridge
   
   
   
   ======================================================================
   
   ERROR: test_operator_gpu.test_sparse_nd_elemwise_add
   
   ----------------------------------------------------------------------
   
   Traceback (most recent call last):
   
     File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in 
runTest
   
       self.test(*self.arg)
   
     File "/usr/local/lib/python2.7/dist-packages/nose/util.py", line 620, in 
newfunc
   
       return func(*arg, **kw)
   
     File "/work/mxnet/tests/python/gpu/../unittest/common.py", line 177, in 
test_new
   
       orig_test(*args, **kwargs)
   
     File "/work/mxnet/tests/python/gpu/../unittest/test_sparse_ndarray.py", 
line 58, in test_sparse_nd_elemwise_add
   
       check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g)
   
     File "/work/mxnet/tests/python/gpu/../unittest/test_sparse_ndarray.py", 
line 51, in check_sparse_nd_elemwise_binary
   
       assert_almost_equal(test.asnumpy(), g(nds[0].asnumpy(), 
nds[1].asnumpy()))
   
     File "/work/mxnet/python/mxnet/ndarray/ndarray.py", line 2092, in asnumpy
   
       ctypes.c_size_t(data.size)))
   
     File "/work/mxnet/python/mxnet/base.py", line 253, in check_call
   
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   
   MXNetError: [06:11:14] src/operator/random/./sample_op.h:400: Check failed: 
param.scale > 0 (-1 vs. 0) : scale parameter in gaussian has to be positive
   
   Stack trace:
   
     [bt] (0) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x3c)
 [0x7fd3c5df01bc]
   
     [bt] (1) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x8a59fff) 
[0x7fd3cc2b3fff]
   
     [bt] (2) /work/mxnet/python/mxnet/../../lib/libmxnet.so(void 
mxnet::op::Sample_<mshadow::gpu, mxnet::op::SampleNormalParam>(nnvm::NodeAttrs 
const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)+0x296) [0x7fd3cc2c38f6]
   
     [bt] (3) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void 
(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&), void (*)(nnvm::NodeAttrs const&, 
mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> 
> const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > 
const&)>::_M_invoke(std::_Any_data const&, nnvm::NodeAttrs const&, 
mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> 
> const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x20) 
[0x7fd3c5dea780]
   
     [bt] (4) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void
 (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) 
const+0x2e2) [0x7fd3c91cd4e2]
   
     [bt] (5) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void 
(mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void 
(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, 
mxnet::RunContext&&)+0x1f) [0x7fd3c91cd83f]
   
     [bt] (6) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x63e37ee) 
[0x7fd3c9c3d7ee]
   
     [bt] (7) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext,
 mxnet::engine::OprBlock*)+0x679) [0x7fd3c9c37809]
   
     [bt] (8) /work/mxnet/python/mxnet/../../lib/libmxnet.so(void 
mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context,
 bool, 
mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*,
 std::shared_ptr<dmlc::ManualEvent> const&)+0x18d) [0x7fd3c9c4f54d]
   
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to