leezu commented on pull request #18690: URL: https://github.com/apache/incubator-mxnet/pull/18690#issuecomment-660321410
Backtrace of the issue below. I use the following patch to obtain the backtrace ``` diff modified src/c_api/c_api_ndarray.cc @@ -394,7 +394,6 @@ int MXAutogradBackwardEx(uint32_t num_output, NDArrayHandle **grad_handles, int **grad_stypes) { MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get(); - API_BEGIN(); std::vector<NDArray*> outputs, ograds, variables; outputs.reserve(num_output); @@ -430,7 +429,7 @@ int MXAutogradBackwardEx(uint32_t num_output, *grad_handles = dmlc::BeginPtr(ret->ret_handles); *grad_stypes = dmlc::BeginPtr(ret->out_types); } - API_END(); + return 0; } int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out) { ``` and build via `cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DLOG_FATAL_THROW=0 -DUSE_CUDA=0 ..; ninja` Backtrace: ``` Thread 1 "python3.8" received signal SIGABRT, Aborted. __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51 51 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory. (gdb) bt #0 0x00007ffff705ef47 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51 #1 0x00007ffff70608b1 in __GI_abort () at abort.c:79 #2 0x00007fff3737c257 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6 #3 0x00007fff37387606 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6 #4 0x00007fff37387671 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6 #5 0x00007fff37387905 in () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6 #6 0x00007fff3737e96b in std::__throw_bad_cast() () at /usr/lib/x86_64-linux-gnu/libstdc++.so.6 #7 0x00007fff3ba064f7 in __gnu_cxx::new_allocator<nnvm::NodeEntry>::allocate(unsigned long, void const*) (this=0x555555e73ea0, __n=18446744073709459446) at /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/ext/new_allocator.h:106 #8 0x00007fff3ba064b4 in std::allocator_traits<std::allocator<nnvm::NodeEntry> >::allocate(std::allocator<nnvm::NodeEntry>&, unsigned long) (__a=..., __n=18446744073709459446) at 
/usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/alloc_traits.h:460 #9 0x00007fff3ba06423 in std::_Vector_base<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::_M_allocate(unsigned long) (this=0x555555e73ea0, __n=18446744073709459446) at /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/stl_vector.h:346 #10 0x00007fff3ba060cc in std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::_M_allocate_and_copy<__gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > > >(unsigned long, __gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > >, __gnu_cxx::__normal_iterator<nnvm::NodeEntry const*, std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > >) (this=0x555555e73ea0, __n=18446744073709459446, __first={node = <error reading variable: Cannot access memory at address 0xf0000000a>, index = 15, version = 3707764736}, __last={node = <error reading variable: Cannot access memory at address 0x100000010>, index = 1470775728, version = 21845}) at /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/stl_vector.h:1511 #11 0x00007fff3ba04c07 in std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> >::operator=(std::vector<nnvm::NodeEntry, std::allocator<nnvm::NodeEntry> > const&) (this=0x555555e73ea0, __x=std::vector of length -92170, capacity 12802 = {...}) at /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/vector.tcc:226 #12 0x00007fff3bc74a44 in nnvm::Graph::operator=(nnvm::Graph const&) (this=0x555555e73ea0) at ../include/nnvm/graph.h:46 #13 0x00007fff3bc60133 in mxnet::CachedOp::DynamicBackward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) 
(this=0x555557aa41b0, retain_graph=false, op_state=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:853 #14 0x00007fff3bc6262b in mxnet::CachedOp::Backward(bool, mxnet::OpStatePtr const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&) (this=0x555557aa41b0, retain_graph=false, state=..., inputs=std::vector of length 3, capacity 3 = {...}, reqs=std::vector of length 1, capacity 1 = {...}, outputs=std::vector of length 1, capacity 1 = {...}) at ../src/imperative/cached_op.cc:1048 #15 0x00007fff3bcdd29d in (anonymous namespace)::InvokeOperator(nnvm::IndexedGraph const&, int, bool, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, mxnet::Context, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >*, std::vector<unsigned int, std::allocator<unsigned int> >*, std::function<void (mxnet::OpStatePtr const&)>) (idx=..., node_idx=5, retain_graph=false, arrays=std::vector of length 8, capacity 8 = {...}, ctx=..., p_states=0x7ffffffccfa8, ndinputs=std::vector of length 3, capacity 3 = {...}, ndoutputs=std::vector of length 1, capacity 1 = {...}, p_req=0x7ffffffcc328, p_ref_count=0x7ffffffccfc8, invoke=...) 
at ../src/imperative/imperative_utils.cc:91 #16 0x00007fff3bcdcaca in mxnet::imperative::RunGraph(bool, nnvm::IndexedGraph const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, unsigned long, unsigned long, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >&&, std::vector<unsigned int, std::allocator<unsigned int> >&&, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::DispatchMode, std::allocator<mxnet::DispatchMode> > const&, bool, std::vector<mxnet::TShape, std::allocator<mxnet::TShape> >*, std::function<void (char const*, char const*, void*)> const&, bool) (retain_graph=false, idx=..., arrays=std::vector of length 8, capacity 8 = {...}, node_start=4, node_end=7, array_reqs=..., ref_count=..., p_states=0x7ffffffccfa8, dispatch_modes=std::vector of length 7, capacity 7 = {...}, recording=false, shapes=0x0, callback=..., monitor_all=false) at ../src/imperative/imperative_utils.cc:165 #17 0x00007fff3bcbe53c in mxnet::Imperative::Backward(std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, bool, bool, bool) (this=0x7fff501e8e78 <mxnet::Imperative::Get()::inst>, outputs=std::vector of length 1, capacity 1 = {...}, ograds=std::vector of length 1, capacity 1 = {...}, variables=std::vector of length 0, capacity 0, is_train=true, retain_graph=false, create_graph=false) at ../src/imperative/imperative.cc:616 #18 0x00007fff3bab5a3f in MXAutogradBackwardEx(uint32_t, NDArrayHandle*, NDArrayHandle*, uint32_t, NDArrayHandle*, int, int, int, NDArrayHandle**, int**) (num_output=1, output_handles=0x7ffda003bbe0, ograd_handles=0x7ffda003bd20, num_variables=0, var_handles=0x0, retain_graph=0, create_graph=0, is_train=1, grad_handles=0x0, grad_stypes=0x0) at ../src/c_api/c_api_ndarray.cc:418 #19 0x00007ffff2f65dae in ffi_call_unix64 () at 
/usr/lib/x86_64-linux-gnu/libffi.so.6 #20 0x00007ffff2f6571f in ffi_call () at /usr/lib/x86_64-linux-gnu/libffi.so.6 #21 0x00007ffff317d415 in _call_function_pointer (flags=4353, pProc=0x7fff3bab57c0 <MXAutogradBackwardEx(uint32_t, NDArrayHandle*, NDArrayHandle*, uint32_t, NDArrayHandle*, int, int, int, NDArrayHandle**, int**)>, avalues=0x7ffffffcd9c0, atypes=0x7ffffffcd960, restype=0x7ffff35bb9f8, resmem=0x7ffffffcda20, argcount=10) at /tmp/python-build.20200514035455.63369/Python-3.8.2/Modules/_ctypes/callproc.c:871 #22 0x00007ffff317de19 in _ctypes_callproc (pProc=0x7fff3bab57c0 <MXAutogradBackwardEx(uint32_t, NDArrayHandle*, NDArrayHandle*, uint32_t, NDArrayHandle*, int, int, int, NDArrayHandle**, int**)>, argtuple=(1, <c_void_p_Array_1 at remote 0x7ffda003bb90>, <c_void_p_Array_1 at remote 0x7ffda003bcd0>, 0, <c_void_p at remote 0x7ffda003be10>, <c_int at remote 0x7ffda003beb0>, <c_int at remote 0x7ffda003bf50>, <c_int at remote 0x7ffdc19de050>, <c_void_p at remote 0x7ffdc19de0f0>, <c_void_p at remote 0x7ffdc19de190>), flags=4353, argtypes=0x0, restype=<_ctypes.PyCSimpleType at remote 0x555555e7bce0>, checker=0x0) at /tmp/python-build.20200514035455.63369/Python-3.8.2/Modules/_ctypes/callproc.c:1199 #23 0x00007ffff3178169 in PyCFuncPtr_call (self=0x7ffda004cc90, inargs=(1, <c_void_p_Array_1 at remote 0x7ffda003bb90>, <c_void_p_Array_1 at remote 0x7ffda003bcd0>, 0, <c_void_p at remote 0x7ffda003be10>, <c_int at remote 0x7ffda003beb0>, <c_int at remote 0x7ffda003bf50>, <c_int at remote 0x7ffdc19de050>, <c_void_p at remote 0x7ffdc19de0f0>, <c_void_p at remote 0x7ffdc19de190>), kwds=0x0) at /tmp/python-build.20200514035455.63369/Python-3.8.2/Modules/_ctypes/_ctypes.c:4201 ... 
``` Specifically note the `Cannot access memory at address 0xf0000000a`. To obtain the backtrace, run `gdb /path/to/python`, then `(gdb) run -m pytest --color=yes --verbose --exitfirst ./tests/python/unittest/test_dynamic_shape.py`, and then `bt` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org