khui edited a comment on issue #14799: training model failed after one epoch on GPU: e == CUDNN_STATUS_SUCCESS (8 vs. 0) cuDNN: CUDNN_STATUS_EXECUTION_FAILED URL: https://github.com/apache/incubator-mxnet/issues/14799#issuecomment-487675104 As a note, when reproducing the errors in a jupyter notebook, I got following errors when trying to print out the loss and compute the mean. @lanking520 Could you help to check the following error messages? Please let me know if they provide you any hints. Thanks!! The loss is. ``` [ 2.8901496 8.305076 13.280055 4.652643 9.613869 4.837726 5.949163 4.6820254 7.0052347 9.829151 6.4464464 5.3237095 6.1686893 7.799595 10.966969 5.2151794 5.0370407 6.5768747 8.265556 11.412268 6.8640356 5.128555 5.1864567 6.8858347 6.894717 2.467805 8.098482 5.589046 6.557484 11.86685 4.3043194 5.3515797 6.1470346 8.024975 3.422638 16.160294 6.2304115 1.178197 2.866407 3.984875 3.7100368 13.471437 7.4196377 8.543673 8.974239 11.460396 7.1255684 7.1223545 5.4278336 10.207495 5.3622923 7.626067 7.2586136 9.395147 4.973973 7.6694055 10.879036 10.221865 5.520145 11.152739 5.0953455 8.80431 4.323547 7.823736 ] <NDArray 64 @gpu(0)> ``` `loss.mean()` I got: ``` --------------------------------------------------------------------------- MXNetError Traceback (most recent call last) ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj) 700 type_pprinters=self.type_printers, 701 deferred_pprinters=self.deferred_printers) --> 702 printer.pretty(obj) 703 printer.flush() 704 return stream.getvalue() ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj) 381 if cls in self.type_pprinters: 382 # printer registered in self.type_pprinters --> 383 return self.type_pprinters[cls](obj, self, cycle) 384 else: 385 # deferred printer ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py in inner(obj, p, cycle) 559 p.text(',') 560 p.breakable() --> 561 
p.pretty(x) 562 if len(obj) == 1 and type(obj) is tuple: 563 # Special case for 1-item tuples. ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj) 398 if cls is not object \ 399 and callable(cls.__dict__.get('__repr__')): --> 400 return _repr_pprint(obj, self, cycle) 401 402 return _default_pprint(obj, self, cycle) ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle) 693 """A pprint that just redirects to the normal repr function.""" 694 # Find newlines and replace them with p.break_() --> 695 output = repr(obj) 696 for idx,output_line in enumerate(output.splitlines()): 697 if idx: ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py in __repr__(self) 187 """Returns a string representation of the array.""" 188 shape_info = 'x'.join(['%d' % x for x in self.shape]) --> 189 return '\n%s\n<%s %s @%s>' % (str(self.asnumpy()), 190 self.__class__.__name__, 191 shape_info, self.context) ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py in asnumpy(self) 1978 self.handle, 1979 data.ctypes.data_as(ctypes.c_void_p), -> 1980 ctypes.c_size_t(data.size))) 1981 return data 1982 ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/base.py in check_call(ret) 250 """ 251 if ret != 0: --> 252 raise MXNetError(py_str(_LIB.MXGetLastError())) 253 254 MXNetError: [17:08:34] src/nnvm/legacy_op_util.cc:134: Check failed: fwd_init_ Stack trace returned 10 entries: [bt] (0) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40123a) [0x7f7dfd0b623a] [bt] (1) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x401851) [0x7f7dfd0b6851] [bt] (2) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2f786d2) [0x7f7dffc2d6d2] [bt] (3) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, mxnet::DispatchMode)::{lambda(mxnet::RunContext, mxnet::engine::CallbackOnComplete)#3}::operator()(mxnet::RunContext, mxnet::engine::CallbackOnComplete) const+0x2f0) [0x7f7dffa14bd0] [bt] (4) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, mxnet::DispatchMode)::{lambda(mxnet::RunContext)#4}>::_M_invoke(std::_Any_data const&, mxnet::RunContext)+0x26) [0x7f7dffa15246] [bt] (5) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7) [0x7f7dff9656d7] [bt] (6) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7) [0x7f7dff9656d7] [bt] (7) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7) [0x7f7dff9656d7] [bt] (8) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7) [0x7f7dff9656d7] [bt] (9) /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7) [0x7f7dff9656d7] ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org With regards, Apache Git Services
