samskalicky edited a comment on pull request #19112:
URL: https://github.com/apache/incubator-mxnet/pull/19112#issuecomment-690810445


   @ZhennanQin @pengzhao-intel I'm debugging a test failure with this PR:
   ```
   [2020-09-10T19:49:11.883Z] 
======================================================================
   [2020-09-10T19:49:11.883Z] ERROR: test_subgraph.test_mobilenetv2_struct
   [2020-09-10T19:49:11.883Z] 
----------------------------------------------------------------------
   [2020-09-10T19:49:11.883Z] Traceback (most recent call last):
   [2020-09-10T19:49:11.883Z]   File 
"/usr/local/lib/python3.5/dist-packages/nose/case.py", line 198, in runTest
   [2020-09-10T19:49:11.883Z]     self.test(*self.arg)
   [2020-09-10T19:49:11.883Z]   File 
"/work/mxnet/tests/python/mkl/../unittest/common.py", line 215, in test_new
   [2020-09-10T19:49:11.883Z]     orig_test(*args, **kwargs)
   [2020-09-10T19:49:11.883Z]   File 
"/work/mxnet/tests/python/mkl/test_subgraph.py", line 815, in 
test_mobilenetv2_struct
   [2020-09-10T19:49:11.883Z]     check_fusion(net, data_shape, attrs, 
out_types=['int8', 'auto'])
   [2020-09-10T19:49:11.883Z]   File 
"/work/mxnet/tests/python/mkl/../unittest/common.py", line 215, in test_new
   [2020-09-10T19:49:11.883Z]     orig_test(*args, **kwargs)
   [2020-09-10T19:49:11.883Z]   File 
"/work/mxnet/tests/python/mkl/test_subgraph.py", line 271, in check_fusion
   [2020-09-10T19:49:11.883Z]     exe = sym.bind(ctx=mx.current_context(), 
args=arg_array, aux_states=aux_array, grad_req='null')
   [2020-09-10T19:49:11.883Z]   File 
"/work/mxnet/python/mxnet/symbol/symbol.py", line 2119, in bind
   [2020-09-10T19:49:11.883Z]     ctypes.byref(handle)))
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/python/mxnet/base.py", line 
246, in check_call
   [2020-09-10T19:49:11.883Z]     raise get_last_ffi_error()
   [2020-09-10T19:49:11.883Z] mxnet.base.MXNetError: Traceback (most recent 
call last):
   [2020-09-10T19:49:11.883Z]   [bt] (9) 
/usr/bin/python3(PyEval_EvalFrameEx+0x4eff) [0x53fe5f]
   [2020-09-10T19:49:11.883Z]   [bt] (8) /usr/bin/python3(PyObject_Call+0x47) 
[0x5c59d7]
   [2020-09-10T19:49:11.883Z]   [bt] (7) 
/usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(+0x9fcb) 
[0x7f7bac29efcb]
   [2020-09-10T19:49:11.883Z]   [bt] (6) 
/usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(_ctypes_callproc+0x49a)
 [0x7f7bac2ab01a]
   [2020-09-10T19:49:11.883Z]   [bt] (5) 
/usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call+0x2eb)
 [0x7f7bac2b088b]
   [2020-09-10T19:49:11.883Z]   [bt] (4) 
/usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c)
 [0x7f7bac2b0e20]
   [2020-09-10T19:49:11.883Z]   [bt] (3) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(MXExecutorBindEX+0xdcb) 
[0x7f7b0675858b]
   [2020-09-10T19:49:11.883Z]   [bt] (2) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::Executor::Bind(nnvm::Symbol,
 mxnet::Context const&, std::map<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> >, mxnet::Context, 
std::less<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, 
std::allocator<std::pair<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > 
const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
mxnet::Executor*)+0x39b) [0x7f7b05d9d67b]
   [2020-09-10T19:49:11.883Z]   [bt] (1) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x58f27f1) [0x7f7b05d9b7f1]
   [2020-09-10T19:49:11.883Z]   [bt] (0) 
/work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x61)
 [0x7f7b01151441]
   [2020-09-10T19:49:11.883Z]   File "src/executor/graph_executor.cc", line 1892
   [2020-09-10T19:49:11.883Z] MXNetError: Check failed: arg_names.size() == 
in_args_map.size() (8 vs. 7) : 
   ```
   
https://jenkins.mxnet-ci.amazon-ml.com/blue/rest/organizations/jenkins/pipelines/mxnet-validation/pipelines/unix-cpu/branches/PR-19112/runs/1/nodes/296/steps/781/log/?start=0
   And I think I narrowed it down to this part of the mkldnn conv subgraph:
   
https://github.com/apache/incubator-mxnet/blob/2d077db1c92dbf7db979e15609fddf5f371277c0/src/operator/subgraph/mkldnn/mkldnn_conv_property.h#L273-L277
   Commenting out the rotation seems to resolve the issue. 
   
   But now I'm getting a segfault:
   ```
   Thread 1 "python" received signal SIGSEGV, Segmentation fault.
   0x00007fffe60e4bc9 in nnvm::pass::(anonymous 
namespace)::MXAllocMemory(nnvm::Graph const&, nnvm::IndexedGraph const&, 
std::pair<unsigned int, unsigned int> const&, std::vector<int, 
std::allocator<int> >*, std::vector<int, std::allocator<int> >*, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
nnvm::pass::(anonymous namespace)::MXGraphAllocator*) () from 
/home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   (gdb) bt
   #0  0x00007fffe60e4bc9 in nnvm::pass::(anonymous 
namespace)::MXAllocMemory(nnvm::Graph const&, nnvm::IndexedGraph const&, 
std::pair<unsigned int, unsigned int> const&, std::vector<int, 
std::allocator<int> >*, std::vector<int, std::allocator<int> >*, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
nnvm::pass::(anonymous namespace)::MXGraphAllocator*) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #1  0x00007fffe60e6884 in nnvm::pass::(anonymous 
namespace)::MXPlanMemory(nnvm::Graph) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #2  0x00007fffe60ac9bc in std::_Function_handler<nnvm::Graph (nnvm::Graph), 
nnvm::Graph (*)(nnvm::Graph)>::_M_invoke(std::_Any_data const&, nnvm::Graph&&) 
() from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #3  0x00007fffe7f8a4ce in nnvm::ApplyPasses(nnvm::Graph, 
std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > > > const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #4  0x00007fffe61318ad in nnvm::ApplyPass(nnvm::Graph, 
std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > 
const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #5  0x00007fffe69e97c9 in 
mxnet::exec::GraphExecutor::FinishInitGraph(nnvm::Symbol, nnvm::Graph, 
mxnet::Executor*, std::unordered_map<nnvm::NodeEntry, mxnet::NDArray, 
nnvm::NodeEntryHash, nnvm::NodeEntryEqual, 
std::allocator<std::pair<nnvm::NodeEntry const, mxnet::NDArray> > > const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #6  0x00007fffe69eb1c6 in mxnet::exec::GraphExecutor::Init(nnvm::Symbol, 
mxnet::Context const&, std::map<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> >, mxnet::Context, 
std::less<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, 
std::allocator<std::pair<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > 
const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
mxnet::Executor*, std::unordered_map<nnvm::NodeEntry, mxnet::NDArray, 
nnvm::NodeEntryHash, nnvm::NodeEntryEqual, 
std::allocator<std::pair<nnvm::NodeEntry const, mxnet::NDArray> > > const&) () 
from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #7  0x00007fffe69f7f86 in mxnet::Executor::Bind(nnvm::Symbol, mxnet::Context 
const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> >, mxnet::Context, 
std::less<std::__cxx11::basic_string<char, std::char_traits<char>, 
std::allocator<char> > >, 
std::allocator<std::pair<std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > 
const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, 
mxnet::Executor*) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   ```
   which reminds me vaguely of @DickJC123's issue: 
https://github.com/apache/incubator-mxnet/issues/16685
   
   But this isn't failing on master. Were there other fixes you guys made on 
master that I should also backport to resolve this?
   
   Also, here's the graph after partitioning:
   ```
   {
     "nodes": [
       {
         "op": "null", 
         "name": "data", 
         "attrs": {
           "__dtype__": "0", 
           "__shape__": "(64, 4, 10, 10)"
         }, 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "conv1_weight", 
         "attrs": {"__dtype__": "0"}, 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn1_gamma", 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn1_beta", 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn1_moving_mean", 
         "attrs": {"__init__": "[\"zero\", {}]"}, 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn1_moving_var", 
         "attrs": {"__init__": "[\"one\", {}]"}, 
         "inputs": []
       }, 
       {
         "op": "_sg_mkldnn_conv", 
         "name": "sg_mkldnn_conv_bn_0", 
         "attrs": {"with_bn": "true"}, 
         "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 1], [5, 
0, 1]], 
         "subgraphs": [
           {
             "nodes": [
               {
                 "op": "null", 
                 "name": "data0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "conv1_weight0", 
                 "inputs": []
               }, 
               {
                 "op": "Convolution", 
                 "name": "conv1", 
                 "attrs": {
                   "kernel": "(1, 1)", 
                   "no_bias": "True", 
                   "num_filter": "64", 
                   "stride": "(1, 1)"
                 }, 
                 "inputs": [[0, 0, 0], [1, 0, 0]]
               }, 
               {
                 "op": "null", 
                 "name": "bn1_gamma0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn1_beta0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn1_moving_mean0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn1_moving_var0", 
                 "inputs": []
               }, 
               {
                 "op": "BatchNorm", 
                 "name": "bn1", 
                 "inputs": [[2, 0, 0], [3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 
0]]
               }
             ], 
             "arg_nodes": [0, 1, 3, 4, 5, 6], 
             "node_row_ptr": [0, 1, 2, 3, 4, 5, 6, 7, 10], 
             "heads": [[7, 0, 0]]
           }
         ]
       }, 
       {
         "op": "null", 
         "name": "conv2_weight", 
         "attrs": {"__dtype__": "0"}, 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn2_gamma", 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn2_beta", 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn2_moving_mean", 
         "attrs": {"__init__": "[\"zero\", {}]"}, 
         "inputs": []
       }, 
       {
         "op": "null", 
         "name": "bn2_moving_var", 
         "attrs": {"__init__": "[\"one\", {}]"}, 
         "inputs": []
       }, 
       {
         "op": "_sg_mkldnn_conv", 
         "name": "sg_mkldnn_conv_bn_add_1", 
         "attrs": {
           "with_bn": "true", 
           "with_sum": "true"
         }, 
         "inputs": [[6, 0, 0], [7, 0, 0], [8, 0, 0], [9, 0, 0], [10, 0, 1], 
[11, 0, 1]], 
         "subgraphs": [
           {
             "nodes": [
               {
                 "op": "null", 
                 "name": "sg_mkldnn_conv_bn_0_output0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "conv2_weight0", 
                 "inputs": []
               }, 
               {
                 "op": "Convolution", 
                 "name": "conv2", 
                 "attrs": {
                   "kernel": "(1, 1)", 
                   "no_bias": "True", 
                   "num_filter": "64", 
                   "stride": "(1, 1)"
                 }, 
                 "inputs": [[0, 0, 0], [1, 0, 0]]
               }, 
               {
                 "op": "null", 
                 "name": "bn2_gamma0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn2_beta0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn2_moving_mean0", 
                 "inputs": []
               }, 
               {
                 "op": "null", 
                 "name": "bn2_moving_var0", 
                 "inputs": []
               }, 
               {
                 "op": "BatchNorm", 
                 "name": "bn2", 
                 "inputs": [[2, 0, 0], [3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 
0]]
               }, 
               {
                 "op": "elemwise_add", 
                 "name": "_plus0", 
                 "inputs": [[0, 0, 0], [7, 0, 0]]
               }
             ], 
             "arg_nodes": [0, 1, 3, 4, 5, 6], 
             "node_row_ptr": [0, 1, 2, 3, 4, 5, 6, 7, 10, 11], 
             "heads": [[8, 0, 0]]
           }
         ]
       }
     ], 
     "arg_nodes": [
       0, 
       1, 
       2, 
       3, 
       4, 
       5, 
       7, 
       8, 
       9, 
       10, 
       11
     ], 
     "node_row_ptr": [
       0, 
       1, 
       2, 
       3, 
       4, 
       5, 
       6, 
       7, 
       8, 
       9, 
       10, 
       11, 
       12, 
       13
     ], 
     "heads": [[12, 0, 0]]
   }
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to