larroy edited a comment on issue #12994: Test failure and possible bug on GPU 
topology algorithm (test_device.test_device_pushpull)
URL: 
https://github.com/apache/incubator-mxnet/issues/12994#issuecomment-436368412
 
 
   I was able to reproduce this on a p3.16xlarge instance, compiling in release mode:
   
   ```
   cmake\
       -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
       -DCMAKE_C_COMPILER_LAUNCHER=ccache \
       -DUSE_CPP_PACKAGE=ON\
       -DUSE_CUDA=ON\
       -DUSE_OPENMP=ON\
       -DUSE_OPENCV=ON\
       -DCMAKE_BUILD_TYPE=Release\
       -GNinja ..
   ninja -v
   ```
   
   
   ```
   ======================================================================
   ERROR: test_device.test_device_pushpull
   ----------------------------------------------------------------------
   Traceback (most recent call last):
     File 
"/home/piotr/mxnet_other/mxnet_py3/lib/python3.5/site-packages/nose/case.py", 
line 198, in runTest
       self.test(*self.arg)
     File "/home/piotr/mxnet_other/tests/python/gpu/test_device.py", line 74, 
in test_device_pushpull
       check_dense_pushpull('device')
     File "/home/piotr/mxnet_other/tests/python/gpu/test_device.py", line 61, 
in check_dense_pushpull
       kv_device.push(cur_key, arr_list)
     File "/home/piotr/mxnet_other/python/mxnet/kvstore.py", line 234, in push
       self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority)))
     File "/home/piotr/mxnet_other/python/mxnet/base.py", line 252, in 
check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   mxnet.base.MXNetError: [18:57:12] ../src/kvstore/./././gpu_topology.h:1040: 
No valid binary tree found from root 2 using backtracking
   
   Stack trace returned 10 entries:
   [bt] (0) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x1bc)
 [0x7f080c46b2fc]
   [bt] (1) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x28)
 [0x7f080c46c6a8]
   [bt] (2) /home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(void 
mxnet::kvstore::ComputeTreesFromRoot<float>(std::vector<float, 
std::allocator<float> >*, int, int, float, bool, std::vector<unsigned long, 
std::allocator<unsigned long> >*, std::vector<unsigned long, 
std::allocator<unsigned long> >*)+0x1621) [0x7f080fd6e8e1]
   [bt] (3) /home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(void 
mxnet::kvstore::ComputeTrees<float>(std::vector<float, std::allocator<float> > 
const&, int, float,
    bool, std::vector<std::vector<unsigned long, std::allocator<unsigned long> 
>, std::allocator<std::vector<unsigned long, std::allocator<unsigned long> > > 
>*, std::vector<std::vector<unsigned long, std::allocator<unsigned long> >, 
std::allocator<std::vector<unsigned long, std::allocator<unsigned long> > > 
>*)+0x2f3) [0x7f080fd6ef13]
   [bt] (4) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(mxnet::kvstore::CommDeviceTree::QueryTopology()+0xefd)
 [0x7f080fd7126d]
   [bt] (5) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(mxnet::kvstore::CommDeviceTree::Reduce(int,
 std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray>
   > const&, int)+0xf70) [0x7f080fd726b0]
   [bt] (6) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(mxnet::kvstore::KVStoreLocal::PushImpl(std::vector<int,
 std::allocator<int> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, int)+0x1b8) 
[0x7f080fd73858]
   [bt] (7) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(mxnet::kvstore::KVStoreLocal::Push(std::vector<std::__cxx11::basic_string<char,
 std::char_traits<char>,
    std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > > > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, int)+0xc5) [0x7f080fd4efb5]
   [bt] (8) 
/home/piotr/mxnet_other/python/mxnet/../../build/libmxnet.so(MXKVStorePushEx+0x16d)
 [0x7f080ff1a7fd]
   [bt] (9) 
/home/piotr/mxnet_other/mxnet_py3/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c)
 [0x7f083369ae20]
   
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to