zheng-da opened a new issue #10994: MKLDNN fails in the backward computation 
when forward runs with is_train=False
URL: https://github.com/apache/incubator-mxnet/issues/10994
 
 
   This is a pretty special case. When forward runs with is_train=False and 
MKLDNN is enabled, the subsequent backward computation fails with a memory 
error. The test below reproduces it. @ashokei @pengzhao-intel @TaoLv 
   
   ```python
   def test_hybrid_static_memory():
       """Reproduce the backward-pass failure reported in this issue.

       Builds two ResNet models with identical parameters, runs forward and
       backward on each inside ``mx.autograd.record(False)``, then compares
       their outputs and parameter gradients.
       """
       # Random input batch of shape (2, 3, 32, 32) with a gradient buffer attached.
       x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
       x.attach_grad()
   
       # Two pretrained models created with the same arguments and the same prefix.
       # NOTE(review): the positional args (1, 18) presumably select ResNet v1
       # with 18 layers -- confirm against get_resnet's signature.
       net1 = gluon.model_zoo.vision.get_resnet(
           1, 18, pretrained=True, prefix='net_', 
ctx=mx.context.current_context())
       net2 = gluon.model_zoo.vision.get_resnet(
           1, 18, pretrained=True, prefix='net_', 
ctx=mx.context.current_context())
       # One forward call per network before the parameters are saved/loaded.
       net1(x)
       net2(x)
   
       # Round-trip net1's parameters through a file into net2 so both networks
       # hold the same values.
       net1.save_params('test.params')
       net2.load_params('test.params')
   
       def test(net, x):
           # record(False): the issue text describes this as running forward
           # with is_train=False. backward() is called inside the same scope.
           with mx.autograd.record(False):
               y = net(x) + net(x)
               y.backward()
   
           # Collect gradients only for parameters whose grad_req is not 'null'.
           grads = {k: v.grad() for k, v in net.collect_params().items() if 
v.grad_req != 'null'}
   
           return y, grads
   
       y1, grads1 = test(net1, x)
       y2, grads2 = test(net2, x)
   
       # Outputs of the two networks must agree within tolerance.
       assert_almost_equal(y1.asnumpy(), y2.asnumpy(), rtol=1e-3, atol=1e-5)
       # Gradient mismatches are printed rather than raised, so every parameter
       # key is checked even after an earlier one fails.
       for key in grads1:
           print(key)
           try:
               assert_almost_equal(grads1[key].asnumpy(), 
grads2[key].asnumpy(), rtol=1e-3, atol=1e-5)
           except Exception as e:
               print(e)
   ```
   
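   The memory error looks something like this (glibc aborts while a 
`mkldnn::pooling_backward::primitive_desc` is destroyed inside 
`MKLDNNPoolingGradCompute`):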
   ```
   *** Error in `/usr/bin/python': corrupted double-linked list: 
0x00007f426ee97880 ***
   ======= Backtrace: =========
   /lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7f4314aa77e5]
   /lib/x86_64-linux-gnu/libc.so.6(+0x80baf)[0x7f4314ab0baf]
   /lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f4314ab453c]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmkldnn.so.0(mkldnn_primitive_desc_destroy+0xf)[0x7f4308f39bcf]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt19_Sp_counted_deleterIP21mkldnn_primitive_descPF15mkldnn_status_tS1_ESaIvELN9__gnu_cxx12_Lock_policyE2EE10_M_disposeEv+0x2c)[0x7f42db233e4c]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE10_M_releaseEv+0x42)[0x7f42db22c9a2]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt14__shared_countILN9__gnu_cxx12_Lock_policyE2EED1Ev+0x27)[0x7f42db22a8ad]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12__shared_ptrI21mkldnn_primitive_descLN9__gnu_cxx12_Lock_policyE2EED1Ev+0x1c)[0x7f42db2241ba]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt10shared_ptrI21mkldnn_primitive_descED1Ev+0x18)[0x7f42db2241d6]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN6mkldnn6handleIP21mkldnn_primitive_descNS_13handle_traitsIS2_EEED1Ev+0x18)[0x7f42db2241f2]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN6mkldnn16pooling_backward14primitive_descD1Ev+0x18)[0x7f42db266438]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet2op24MKLDNNPoolingGradComputeERKNS_9OpContextERKNS0_12PoolingParamERKNS_7NDArrayES9_PS8_NS_9OpReqTypeES9_+0xa07)[0x7f42db2643f2]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet2op23PoolingGradComputeExCPUERKN4nnvm9NodeAttrsERKNS_9OpContextERKSt6vectorINS_7NDArrayESaIS9_EERKS8_INS_9OpReqTypeESaISE_EESD_+0x401)[0x7f42dd262e2d]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvRKN4nnvm9NodeAttrsERKN5mxnet9OpContextERKSt6vectorINS4_7NDArrayESaIS9_EERKS8_INS4_9OpReqTypeESaISE_EESD_EPSJ_E9_M_invokeERKSt9_Any_dataS3_S7_SD_SI_SD_+0x91)[0x7f42db3727e4]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvRKN4nnvm9NodeAttrsERKN5mxnet9OpContextERKSt6vectorINS4_7NDArrayESaIS9_EERKS8_INS4_9OpReqTypeESaISE_EESD_EEclES3_S7_SD_SI_SD_+0xa6)[0x7f42dd5a5940]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet10imperative14PushFComputeExERKSt8functionIFvRKN4nnvm9NodeAttrsERKNS_9OpContextERKSt6vectorINS_7NDArrayESaISA_EERKS9_INS_9OpReqTypeESaISF_EESE_EEPKNS2_2OpES5_RKNS_7ContextERKS9_IPNS_6engine3VarESaISW_EES10_RKS9_INS_8ResourceESaIS11_EERKS9_IPSA_SaIS16_EES1A_SJ_ENKUlNS_10RunContextEE_clES1B_+0xf7)[0x7f42dd59f493]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextEEZNS0_10imperative14PushFComputeExERKSt8functionIFvRKN4nnvm9NodeAttrsERKNS0_9OpContextERKSt6vectorINS0_7NDArrayESaISD_EERKSC_INS0_9OpReqTypeESaISI_EESH_EEPKNS5_2OpES8_RKNS0_7ContextERKSC_IPNS0_6engine3VarESaISZ_EES13_RKSC_INS0_8ResourceESaIS14_EERKSC_IPSD_SaIS19_EES1D_SM_EUlS1_E_E9_M_invokeERKSt9_Any_dataOS1_+0x44)[0x7f42dd5aa81f]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextEEEclES1_+0x56)[0x7f42ddc2e03c]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet6engine14ThreadedEngine10BulkAppendESt8functionIFvNS_10RunContextEEENS_7ContextERKSt6vectorIPNS0_3VarESaIS9_EESD_ENKUlS3_E_clES3_+0x61)[0x7f42ddc4124b]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextEEZNS0_6engine14ThreadedEngine10BulkAppendESt8functionIS2_ENS0_7ContextERKSt6vectorIPNS3_3VarESaISA_EESE_EUlS1_E_E9_M_invokeERKSt9_Any_dataOS1_+0x44)[0x7f42ddc44329]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextEEEclES1_+0x56)[0x7f42ddc2e03c]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet6engine14ThreadedEngine9BulkFlushEvENKUlNS_10RunContextENS0_18CallbackOnCompleteEE_clES2_S3_+0x43)[0x7f42ddc35651]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextENS0_6engine18CallbackOnCompleteEEZNS2_14ThreadedEngine9BulkFlushEvEUlS1_S3_E_E9_M_invokeERKSt9_Any_dataOS1_OS3_+0x67)[0x7f42ddc388fa]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextENS0_6engine18CallbackOnCompleteEEEclES1_S3_+0x67)[0x7f42ddc2ef89]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet6engine14ThreadedEngine15ExecuteOprBlockENS_10RunContextEPNS0_8OprBlockE+0x39f)[0x7f42ddc34f6f]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet6engine23ThreadedEnginePerDevice9CPUWorkerILN4dmlc19ConcurrentQueueTypeE0EEEvNS_7ContextEPNS1_17ThreadWorkerBlockIXT_EEERKSt10shared_ptrINS3_11ManualEventEE+0xaf)[0x7f42ddc495d1]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZZN5mxnet6engine23ThreadedEnginePerDevice13PushToExecuteEPNS0_8OprBlockEbENKUlvE_clEvENKUlSt10shared_ptrIN4dmlc11ManualEventEEE_clES8_+0x42)[0x7f42ddc47868]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvSt10shared_ptrIN4dmlc11ManualEventEEEZZN5mxnet6engine23ThreadedEnginePerDevice13PushToExecuteEPNS6_8OprBlockEbENKUlvE_clEvEUlS3_E_E9_M_invokeERKSt9_Any_dataOS3_+0x5c)[0x7f42ddc4bf74]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEEclES3_+0x49)[0x7f42ddc5134b]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES4_EE9_M_invokeIILm0EEEEvSt12_Index_tupleIIXspT_EEE+0x68)[0x7f42ddc512be]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES4_EEclEv+0x2c)[0x7f42ddc511a0]
   
/home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt6thread5_ImplISt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES6_EEE6_M_runEv+0x1c)[0x7f42ddc51130]
   /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb8c80)[0x7f42ff8abc80]
   /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba)[0x7f4314e016ba]
   /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x7f4314b3741d]
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to