leezu commented on issue #18314:
URL: 
https://github.com/apache/incubator-mxnet/issues/18314#issuecomment-628821443


   The mutex in the above core dump is actually nonexistent. The problem here is 
that the FFI call to MXNDArraySyncCopyToCPU passes a handle whose Chunk has 
already been cleaned up. So the segfault is just a symptom of memory and 
reference management issues.
   
   The array:
   ```
   (gdb) p *this
   $6 = {ptr_ = std::shared_ptr<mxnet::NDArray::Chunk> (use count 1645762192, 
weak count 22028) = {get() = 0x560d62a70cb0}, shape_ = {<mxnet::Tuple<long>> = 
{static kStackCache = 4, ndim_ = 2,
         num_heap_allocated_ = 0, data_stack_ = {10, 10, -3617008641903833651, 
-3617008641903833651}, data_heap_ = 0x0}, <No data fields>}, byte_offset_ = 0, 
dtype_ = 0, reuse_ = false,
     storage_type_ = mxnet::kDefaultStorage, autograd_entry_ = {node = 
std::shared_ptr<nnvm::Node> (use count 1647685840, weak count 22028) = {get() = 
0x560d622fe220}, index = 0, version = 0},
     deferredcompute_entry_ = {node = std::shared_ptr<nnvm::Node> (empty) = 
{get() = 0x0}, index = 0, version = 0}, tblob_ = {dptr_ = 0x0, shape_ = 
{<mxnet::Tuple<long>> = {static kStackCache = 4,
           ndim_ = -1, num_heap_allocated_ = 0, data_stack_ = 
{-3617008641903833651, -2459565876494606899, -2459565876494606883, 
-2459565876494606883}, data_heap_ = 0x0}, <No data fields>}, type_flag_ = 0,
       dltensor_ = {data = 0x0, ctx = {device_type = kDLCPU, device_id = 0}, 
ndim = -1, dtype = {code = 2 '\002', bits = 32 ' ', lanes = 1}, shape = 
0x560d62188f78, strides = 0x0, byte_offset = 0}}}
   ```
   
   And the chunk:
   ```
   p *ptr_
   $4 = {shandle = {dptr = 0xdddddddddddddddd, size = 481, ctx = {dev_type = 
1655088448, dev_id = 22029, static kMaxDevType = 6, static kMaxDevID = 16}, 
shared_pid = 1645776592, shared_id = 22029,
       profiler_scope = Python Exception <class 'OverflowError'> int too big to 
convert:
   , Python Exception <class 'OverflowError'> int too big to convert:
   name = }, aux_handles = std::vector of length 0, capacity 0, mkl_mem_ = 
<error reading variable: Cannot access memory at address 0xdddddddddddddde5>, 
var = 0xdddddddddddddddd,
     static_data = 221, delay_alloc = 221, storage_type = -572662307, aux_types 
= std::vector of length 0, capacity 0, ctx = {dev_type = 3722304989, dev_id = 
-572662307, static kMaxDevType = 6,
       static kMaxDevID = 16}, storage_shape = {<mxnet::Tuple<long>> = {static 
kStackCache = 4, ndim_ = -572662307, num_heap_allocated_ = -572662307, 
data_stack_ = {-2459565876494606883,
           -2459565876494606883, -2459565876494606883, -2459565876494606883}, 
data_heap_ = 0xdddddddddddddddd}, <No data fields>}, aux_shapes = std::vector 
of length 0, capacity 0,
     storage_ref_ = <error reading variable: Cannot access memory at address 
0xdddddddddddddde5>, engine_ref_ = <error reading variable: Cannot access 
memory at address 0xdddddddddddddde5>}
   
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to