khui commented on issue #14799: training model failed after one epoch on GPU: e 
== CUDNN_STATUS_SUCCESS (8 vs. 0) cuDNN: CUDNN_STATUS_EXECUTION_FAILED
URL: 
https://github.com/apache/incubator-mxnet/issues/14799#issuecomment-487284235
 
 
   In addition, I tried using the naive engine by setting 
`MXNET_ENGINE_TYPE=NaiveEngine`, and got the following more specific errors:
   
   Traceback (most recent call last):
     File "/workdir/code/src/project_main.py", line 154, in <module>
       main(args)
     File "/workdir/code/src/project_main.py", line 138, in main
       do_offline_evaluation=args.do_offline_evaluation)
     File "/workdir/code/src/project/estimator/train_pred_eval.py", line 131, 
in train
       ctx=ctx)
     File "/workdir/code/src/project/estimator/train_pred_eval.py", line 343, 
in model_fn
       bn_start_logit, bn_end_logit = model(bn_question_tokens, 
bn_context_tokens)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/block.py", line 
540, in __call__
       out = self.forward(*args)
     File "/workdir/code/src/project/estimator/models/model.py", line 90, in 
forward
       att_f_q, att_f_c = self.model.forward(bn_questions, bn_contexts)
     File "/workdir/code/src/project/estimator/models/model.py", line 212, in 
forward
       f_q = self.bilstm_q(em_q)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/block.py", line 
540, in __call__
       out = self.forward(*args)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/block.py", line 
917, in forward
       return self.hybrid_forward(ndarray, x, *args, **params)
     File 
"/usr/local/lib/python3.6/dist-packages/mxnet/gluon/rnn/rnn_layer.py", line 
234, in hybrid_forward
       out = self._forward_kernel(F, inputs, states, **kwargs)
     File 
"/usr/local/lib/python3.6/dist-packages/mxnet/gluon/rnn/rnn_layer.py", line 
265, in _forward_kernel
       lstm_state_clip_nan=self._lstm_state_clip_nan)
     File "<string>", line 145, in RNN
     File "/usr/local/lib/python3.6/dist-packages/mxnet/_ctypes/ndarray.py", 
line 92, in _imperative_invoke
       ctypes.byref(out_stypes)))
     File "/usr/local/lib/python3.6/dist-packages/mxnet/base.py", line 252, in 
check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   mxnet.base.MXNetError: [09:32:14] src/operator/./cudnn_rnn-inl.h:710: Check 
failed: e == CUDNN_STATUS_SUCCESS (8 vs. 0) cuDNN: CUDNN_STATUS_EXECUTION_FAILED
   
   
   
   Stack trace returned 10 entries:
   [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3d9c92) 
[0x7f6a25da7c92]
   [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3da268) 
[0x7f6a25da8268]
   [bt] (2) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x5c50cb4) 
[0x7f6a2b61ecb4]
   [bt] (3) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x5c52af6) 
[0x7f6a2b620af6]
   [bt] (4) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x33f2924) 
[0x7f6a28dc0924]
   [bt] (5) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::PushOperator(mxnet::OpStatePtr
 const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete)#3}::operator()(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete) const+0x361) [0x7f6a28b9d791]
   [bt] (6) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(std::_Function_handler<void
 (mxnet::RunContext), mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, 
nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext)#4}>::_M_invoke(std::_Any_data 
const&, mxnet::RunContext)+0x26) [0x7f6a28b9dde6]
   [bt] (7) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3121ef3) 
[0x7f6a28aefef3]
   [bt] (8) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3125ae5) 
[0x7f6a28af3ae5]
   [bt] (9) 
/usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x31246d9) 
[0x7f6a28af26d9]
   
   
   [09:32:14] src/engine/naive_engine.cc:69: Engine shutdown
   (END)
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to