khui edited a comment on issue #14799: training model failed after one epoch on 
GPU: e == CUDNN_STATUS_SUCCESS (8 vs. 0) cuDNN: CUDNN_STATUS_EXECUTION_FAILED
URL: 
https://github.com/apache/incubator-mxnet/issues/14799#issuecomment-487675104
 
 
   As a note, when reproducing the errors in a Jupyter notebook, I got the 
following errors when trying to print out the loss and compute the mean (after 
getting the errors described earlier).
   
   @lanking520 Could you help check the following error messages? Please let 
me know if they give you any hints. Thanks!!
   
   The loss is:
   ```
   [ 2.8901496  8.305076  13.280055   4.652643   9.613869   4.837726
     5.949163   4.6820254  7.0052347  9.829151   6.4464464  5.3237095
     6.1686893  7.799595  10.966969   5.2151794  5.0370407  6.5768747
     8.265556  11.412268   6.8640356  5.128555   5.1864567  6.8858347
     6.894717   2.467805   8.098482   5.589046   6.557484  11.86685
     4.3043194  5.3515797  6.1470346  8.024975   3.422638  16.160294
     6.2304115  1.178197   2.866407   3.984875   3.7100368 13.471437
     7.4196377  8.543673   8.974239  11.460396   7.1255684  7.1223545
     5.4278336 10.207495   5.3622923  7.626067   7.2586136  9.395147
     4.973973   7.6694055 10.879036  10.221865   5.520145  11.152739
     5.0953455  8.80431    4.323547   7.823736 ]
   <NDArray 64 @gpu(0)>
   ```
   
   When calling `loss.mean()`, I got:
   
   ```
   ---------------------------------------------------------------------------
   MXNetError                                Traceback (most recent call last)
   
~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/core/formatters.py
 in __call__(self, obj)
       700                 type_pprinters=self.type_printers,
       701                 deferred_pprinters=self.deferred_printers)
   --> 702             printer.pretty(obj)
       703             printer.flush()
       704             return stream.getvalue()
   
   ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py 
in pretty(self, obj)
       381                 if cls in self.type_pprinters:
       382                     # printer registered in self.type_pprinters
   --> 383                     return self.type_pprinters[cls](obj, self, cycle)
       384                 else:
       385                     # deferred printer
   
   ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py 
in inner(obj, p, cycle)
       559                 p.text(',')
       560                 p.breakable()
   --> 561             p.pretty(x)
       562         if len(obj) == 1 and type(obj) is tuple:
       563             # Special case for 1-item tuples.
   
   ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py 
in pretty(self, obj)
       398                         if cls is not object \
       399                                 and 
callable(cls.__dict__.get('__repr__')):
   --> 400                             return _repr_pprint(obj, self, cycle)
       401 
       402             return _default_pprint(obj, self, cycle)
   
   ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/IPython/lib/pretty.py 
in _repr_pprint(obj, p, cycle)
       693     """A pprint that just redirects to the normal repr function."""
       694     # Find newlines and replace them with p.break_()
   --> 695     output = repr(obj)
       696     for idx,output_line in enumerate(output.splitlines()):
       697         if idx:
   
   
~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py 
in __repr__(self)
       187         """Returns a string representation of the array."""
       188         shape_info = 'x'.join(['%d' % x for x in self.shape])
   --> 189         return '\n%s\n<%s %s @%s>' % (str(self.asnumpy()),
       190                                       self.__class__.__name__,
       191                                       shape_info, self.context)
   
   
~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py 
in asnumpy(self)
      1978             self.handle,
      1979             data.ctypes.data_as(ctypes.c_void_p),
   -> 1980             ctypes.c_size_t(data.size)))
      1981         return data
      1982 
   
   ~/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/base.py in 
check_call(ret)
       250     """
       251     if ret != 0:
   --> 252         raise MXNetError(py_str(_LIB.MXGetLastError()))
       253 
       254 
   
   MXNetError: [17:08:34] src/nnvm/legacy_op_util.cc:134: Check failed: 
fwd_init_ 
   
   Stack trace returned 10 entries:
   [bt] (0) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x40123a)
 [0x7f7dfd0b623a]
   [bt] (1) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x401851)
 [0x7f7dfd0b6851]
   [bt] (2) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2f786d2)
 [0x7f7dffc2d6d2]
   [bt] (3) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::PushOperator(mxnet::OpStatePtr
 const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete)#3}::operator()(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete) const+0x2f0) [0x7f7dffa14bd0]
   [bt] (4) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(std::_Function_handler<void
 (mxnet::RunContext), mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, 
nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext)#4}>::_M_invoke(std::_Any_data 
const&, mxnet::RunContext)+0x26) [0x7f7dffa15246]
   [bt] (5) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7)
 [0x7f7dff9656d7]
   [bt] (6) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7)
 [0x7f7dff9656d7]
   [bt] (7) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7)
 [0x7f7dff9656d7]
   [bt] (8) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7)
 [0x7f7dff9656d7]
   [bt] (9) 
/home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2cb06d7)
 [0x7f7dff9656d7]
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to