RuRo commented on issue #18090:
URL: 
https://github.com/apache/incubator-mxnet/issues/18090#issuecomment-617348847


   While we are waiting for somebody to reproduce the issue, I've tried 
inspecting the stuck process in gdb.
   Here is the backtrace of all the threads after the process gets stuck.
   <details>
   
   ```c
   Thread 16 (Thread 0x7ffef27fc700 (LWP 1388714)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc64fa7d1 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d5d620) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #5  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #6  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 15 (Thread 0x7ffef2ffd700 (LWP 1388687)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc64fa7d1 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d6a8a0) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #5  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #6  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 14 (Thread 0x7ffef37fe700 (LWP 1388625)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc64fa7d1 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d3f910) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #5  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #6  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 13 (Thread 0x7ffef3fff700 (LWP 1388624)):
   #0  0x00007ffff79c401a in pthread_cond_timedwait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007ffff7b020c3 in PyEval_RestoreThread () at 
/usr/lib/libpython3.8.so.1.0
   #2  0x00007ffff7aa57d7 in  () at /usr/lib/libpython3.8.so.1.0
   #3  0x00007ffff72ff168 in  () at 
/usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so
   #4  0x00007ffff727d8c2 in  () at /usr/lib/libffi.so.7
   #5  0x00007ffff727dc20 in  () at /usr/lib/libffi.so.7
   #6  0x00007fffc6bf3782 in  () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #7  0x00007fffc64840b7 in 
std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #8  0x00007fffc6bf433e in  () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #9  0x00007fffc6bf8eca in  () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #10 0x00007fffc64fa794 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #11 0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557c4f8b0) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #12 0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #13 0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 12 (Thread 0x7fff10ff9700 (LWP 1388623)):
   #0  0x00007ffff79c401a in pthread_cond_timedwait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007ffff7b020c3 in PyEval_RestoreThread () at 
/usr/lib/libpython3.8.so.1.0
   #2  0x00007ffff7300f24 in  () at 
/usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so
   #3  0x00007ffff730570d in  () at 
/usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so
   #4  0x00007ffff7b0c3d2 in _PyObject_MakeTpCall () at 
/usr/lib/libpython3.8.so.1.0
   #5  0x00007ffff7bc9c51 in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #6  0x00007ffff7bb6a9d in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #7  0x00007ffff7bc558e in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #8  0x00007ffff7bb6a9d in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #9  0x00007ffff7b5acda in  () at /usr/lib/libpython3.8.so.1.0
   #10 0x00007ffff7b5bc78 in  () at /usr/lib/libpython3.8.so.1.0
   #11 0x00007ffff7bc5e1c in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #12 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #13 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #14 0x00007ffff7bc5f5a in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #15 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #16 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #17 0x00007ffff7b12508 in PyObject_Call () at /usr/lib/libpython3.8.so.1.0
   #18 0x00007ffff7bc70c4 in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #19 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #20 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #21 0x00007ffff7b12508 in PyObject_Call () at /usr/lib/libpython3.8.so.1.0
   #22 0x00007ffff7bc70c4 in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #23 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #24 0x00007ffff7bb72c2 in  () at /usr/lib/libpython3.8.so.1.0
   #25 0x00007ffff7bc5f5a in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #26 0x00007ffff7bb6154 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #27 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #28 0x00007ffff7b123fd in PyObject_Call () at /usr/lib/libpython3.8.so.1.0
   #29 0x00007ffff72ff2a0 in  () at 
/usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so
   #30 0x00007ffff727d8c2 in  () at /usr/lib/libffi.so.7
   #31 0x00007ffff727dc20 in  () at /usr/lib/libffi.so.7
   #32 0x00007fffc6bf3c74 in  () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #33 0x00007fffc6bfa4d6 in  () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #34 0x00007fffc64fa748 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #35 0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557bf09e0) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #36 0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #37 0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 11 (Thread 0x7fff117fa700 (LWP 1388622)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc64fa7d1 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d0b310) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #5  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #6  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 10 (Thread 0x7fff11ffb700 (LWP 1388621)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc64fa7d1 in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<mxnet::op::custom::CustomOperator::SetNumThreads(int)::{lambda()#1}>
 > >::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557b63970) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #5  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #6  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 9 (Thread 0x7fff127fc700 (LWP 1388620)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc65d9181 in std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, 
bool)::{lambda()#1}::operator()() 
const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65d76ba in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void 
(std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d093c0) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #6  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #7  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 8 (Thread 0x7fff12ffd700 (LWP 1388619)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc65db716 in 
dmlc::ConcurrentBlockingQueue<mxnet::engine::OprBlock*, 
(dmlc::ConcurrentQueueType)1>::Pop(mxnet::engine::OprBlock**) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65dbe99 in std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::Start()::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fffc65d76ba in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void 
(std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #6  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555556ca9940) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 7 (Thread 0x7fff137fe700 (LWP 1388618)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc65db716 in 
dmlc::ConcurrentBlockingQueue<mxnet::engine::OprBlock*, 
(dmlc::ConcurrentQueueType)1>::Pop(mxnet::engine::OprBlock**) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65dbe99 in std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::Start()::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fffc65d76ba in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void 
(std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #6  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555556ca9a50) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 6 (Thread 0x7fff13fff700 (LWP 1388617)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc65db716 in 
dmlc::ConcurrentBlockingQueue<mxnet::engine::OprBlock*, 
(dmlc::ConcurrentQueueType)1>::Pop(mxnet::engine::OprBlock**) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65dbe99 in std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::Start()::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fffc65d76ba in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void 
(std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #6  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557b9fea0) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 5 (Thread 0x7fff188cf700 (LWP 1388616)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007fff7e14ce71 in __gthread_cond_wait (__mutex=<optimized out>, 
__cond=<optimized out>) at 
/build/gcc/src/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/x86_64-pc-linux-gnu/bits/gthr-default.h:865
   #2  std::condition_variable::wait(std::unique_lock<std::mutex>&) 
(this=<optimized out>, __lock=...) at 
/build/gcc/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
   #3  0x00007fffc65db716 in 
dmlc::ConcurrentBlockingQueue<mxnet::engine::OprBlock*, 
(dmlc::ConcurrentQueueType)1>::Pop(mxnet::engine::OprBlock**) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65dbe99 in std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::Start()::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fffc65d76ba in 
std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void 
(std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run() () at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #6  0x00007fff7e152b24 in std::execute_native_thread_routine(void*) 
(__p=0x555557d34010) at /build/gcc/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 4 (Thread 0x7fff7a60c880 (LWP 1388611)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007ffff14570e1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #2  0x00007ffff138f9a1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #3  0x00007ffff139223d in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #4  0x00007ffff139926b in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #5  0x00007ffff13d5170 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #6  0x00007ffff145119c in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 3 (Thread 0x7fff7ae0e800 (LWP 1388610)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007ffff14570e1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #2  0x00007ffff138f9a1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #3  0x00007ffff139223d in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #4  0x00007ffff139926b in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #5  0x00007ffff13d5170 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #6  0x00007ffff145119c in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 2 (Thread 0x7fff7b610780 (LWP 1388609)):
   #0  0x00007ffff79c3cf5 in pthread_cond_wait@@GLIBC_2.3.2 () at 
/usr/lib/libpthread.so.0
   #1  0x00007ffff14570e1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #2  0x00007ffff138f9a1 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #3  0x00007ffff139223d in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #4  0x00007ffff139926b in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #5  0x00007ffff13d5170 in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #6  0x00007ffff145119c in  () at /opt/intel/mkl/lib/intel64/libiomp5.so
   #7  0x00007ffff79bd46f in start_thread () at /usr/lib/libpthread.so.0
   #8  0x00007ffff7e953d3 in clone () at /usr/lib/libc.so.6
   
   Thread 1 (Thread 0x7ffff785f740 (LWP 1388524)):
   #0  0x00007ffff79c74cf in __lll_lock_wait () at /usr/lib/libpthread.so.0
   #1  0x00007ffff79bfe03 in pthread_mutex_lock () at /usr/lib/libpthread.so.0
   #2  0x00007fffc6bf9b4e in mxnet::op::custom::AttrParser(nnvm::NodeAttrs*) () 
at /usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #3  0x00007fffc65053db in MXImperativeInvokeImpl(void*, int, void**, int*, 
void***, int, char const**, char const**) () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #4  0x00007fffc65061da in MXImperativeInvokeEx () at 
/usr/lib/python3.8/site-packages/mxnet/libmxnet.so
   #5  0x00007fff79dc143b in __pyx_pf_5mxnet_4_cy3_7ndarray_2_imperative_invoke 
(__pyx_self=<optimized out>, __pyx_v_output_is_list=<optimized out>, 
__pyx_v_is_np_op=<optimized out>, __pyx_v_out=<optimized out>, 
__pyx_v_vals=<optimized out>, __pyx_v_keys=<optimized out>, 
__pyx_v_ndargs=<optimized out>, __pyx_v_handle=<optimized out>) at 
/usr/include/c++/9.3.0/bits/stl_vector.h:915
   #6  __pyx_pw_5mxnet_4_cy3_7ndarray_3_imperative_invoke(PyObject*, PyObject*, 
PyObject*) (__pyx_self=<optimized out>, __pyx_args=<optimized out>, 
__pyx_kwds=<optimized out>) at mxnet/cython/ndarray.cpp:5334
   #7  0x00007ffff7b18ff6 in PyCFunction_Call () at /usr/lib/libpython3.8.so.1.0
   #8  0x00007ffff7b0c3d2 in _PyObject_MakeTpCall () at 
/usr/lib/libpython3.8.so.1.0
   #9  0x00007ffff7bc979c in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #10 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #11 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #12 0x00007ffff7b0ef60 in _PyObject_FastCallDict () at 
/usr/lib/libpython3.8.so.1.0
   #13 0x00007ffff7c2e63d in  () at /usr/lib/libpython3.8.so.1.0
   #14 0x00007ffff7b0c3d2 in _PyObject_MakeTpCall () at 
/usr/lib/libpython3.8.so.1.0
   #15 0x00007ffff7bc9e8b in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #16 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #17 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #18 0x00007ffff7bc5f5a in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #19 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #20 0x00007ffff7bb6c7b in _PyFunction_Vectorcall () at 
/usr/lib/libpython3.8.so.1.0
   #21 0x00007ffff7bc980a in _PyEval_EvalFrameDefault () at 
/usr/lib/libpython3.8.so.1.0
   #22 0x00007ffff7bb58f4 in _PyEval_EvalCodeWithName () at 
/usr/lib/libpython3.8.so.1.0
   #23 0x00007ffff7c3cd73 in PyEval_EvalCode () at /usr/lib/libpython3.8.so.1.0
   #24 0x00007ffff7c3cdc8 in  () at /usr/lib/libpython3.8.so.1.0
   #25 0x00007ffff7c41063 in  () at /usr/lib/libpython3.8.so.1.0
   #26 0x00007ffff7adbdf0 in PyRun_FileExFlags () at 
/usr/lib/libpython3.8.so.1.0
   #27 0x00007ffff7ae5aa4 in PyRun_SimpleFileExFlags () at 
/usr/lib/libpython3.8.so.1.0
   #28 0x00007ffff7c4d81e in Py_RunMain () at /usr/lib/libpython3.8.so.1.0
   #29 0x00007ffff7c4d909 in Py_BytesMain () at /usr/lib/libpython3.8.so.1.0
   #30 0x00007ffff7dbd023 in __libc_start_main () at /usr/lib/libc.so.6
   #31 0x000055555555505e in _start ()
   ```
   
   </details>
   
   As you can see, every thread is blocked in `pthread_cond_wait`, 
`pthread_cond_timedwait` or `pthread_mutex_lock`, so this is most likely a 
deadlock.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to