perdasilva opened a new issue #14652: CUDA 10 w/ cuDNN 7.5 Support
URL: https://github.com/apache/incubator-mxnet/issues/14652
 
 
   ## Description
   
   Currently, the CI tests fail when running MXNet on top of CUDA 10 and cuDNN 
7.5, as demonstrated in 
[this](https://github.com/apache/incubator-mxnet/pull/14611) PR.
   
   The tests pass when using CUDA 10 and cuDNN 7.3.1.20, as demonstrated in 
[this](https://github.com/apache/incubator-mxnet/pull/14513) PR.
   
   
   ## Environment info (Required)
   
   A g3.8xlarge instance with CUDA 10 and NVIDIA driver 410.73 installed.
   The code is running inside the [CI GPU 
container](https://github.com/perdasilva/incubator-mxnet/blob/ci_cuda_10/ci/docker/Dockerfile.build.ubuntu_gpu#L21)
 based on `nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04`.
   
   ## Error Message:
   
   Usually: `src/operator/./cudnn_rnn-inl.h:759: Check failed: e == 
CUDNN_STATUS_SUCCESS (6 vs. 0) cuDNN: CUDNN_STATUS_ARCH_MISMATCH`
   
   Here are some example logs:
   
   
http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Fcentos-gpu/detail/PR-14611/1/pipeline/
   
   
http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Funix-gpu/detail/PR-14611/12/pipeline/
   
   ## Steps to reproduce
   
   ```
   # Launch g3.8xlarge instance with ubuntu 16.04
   
   # ==-_-==-_-== Environment Setup ==-_-==-_-==
   
   sudo apt update
   sudo apt-get install -y \
       apt-transport-https \
       build-essential \
       ca-certificates \
       curl \
       git \
       libatlas-base-dev \
       libcurl4-openssl-dev \
       libjemalloc-dev \
       libhdf5-dev \
       liblapack-dev \
       libopenblas-dev \
       libopencv-dev \
       libturbojpeg \
       libzmq3-dev \
       ninja-build \
       software-properties-common \
       sudo \
       unzip \
       wget
   
   sudo apt-get install -y python-dev python3-dev virtualenv wget
   
   # the version of pip shipped with Ubuntu may be too old; install a 
recent version here
   wget -nv https://bootstrap.pypa.io/get-pip.py
   sudo python3 get-pip.py
   sudo python2 get-pip.py
   
   pip2 install --user nose cpplint==1.3.0 pylint==1.9.3 
'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 
scipy==1.0.1 boto3
   pip3 install --user nose cpplint==1.3.0 pylint==2.1.1 
'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 
scipy==1.0.1 boto3
   
   # ==-_-==-_-== CUDA Installation ==-_-==-_-==
   
   wget 
https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux
   chmod +x cuda_10.0.130_410.48_linux && sudo ./cuda_10.0.130_410.48_linux
   
   # Installation excerpt:
   # Install NVIDIA Accelerated Graphics Driver for Linux-x86_64 410.48?
   # (y)es/(n)o/(q)uit: y
   # 
   # Do you want to install the OpenGL libraries?
   # (y)es/(n)o/(q)uit [ default is yes ]:
   #
   # Do you want to run nvidia-xconfig?
   # This will update the system X configuration file so that the NVIDIA X 
driver
   # is used. The pre-existing X configuration file will be backed up.
   # This option should not be used on systems that require a custom
   # X configuration, such as systems with multiple GPU vendors.
   # (y)es/(n)o/(q)uit [ default is no ]:
   # 
   # Install the CUDA 10.0 Toolkit?
   # (y)es/(n)o/(q)uit: y
   #
   # Enter Toolkit Location
   # [ default is /usr/local/cuda-10.0 ]:
   #
   # Do you want to install a symbolic link at /usr/local/cuda?
   # (y)es/(n)o/(q)uit: y
   #
   # Install the CUDA 10.0 Samples?
   # (y)es/(n)o/(q)uit: n
   
   # Set LD_LIBRARY_PATH
   export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
   
   # Check installation
   nvidia-smi
   
   # 
+-----------------------------------------------------------------------------+
   # | NVIDIA-SMI 410.48                 Driver Version: 410.48                 
   |
   # 
|-------------------------------+----------------------+----------------------+
   # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. 
ECC |
   # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute 
M. |
   # 
|===============================+======================+======================|
   # |   0  Tesla M60           Off  | 00000000:00:1D.0 Off |                   
 0 |
   # | N/A   31C    P0    43W / 150W |      0MiB /  7618MiB |      0%      
Default |
   # 
+-------------------------------+----------------------+----------------------+
   # |   1  Tesla M60           Off  | 00000000:00:1E.0 Off |                   
 0 |
   # | N/A   34C    P0    41W / 150W |      0MiB /  7618MiB |     99%      
Default |
   # 
+-------------------------------+----------------------+----------------------+
   #
   # 
+-----------------------------------------------------------------------------+
   # | Processes:                                                       GPU 
Memory |
   # |  GPU       PID   Type   Process name                             Usage   
   |
   # 
|=============================================================================|
   # |  No running processes found                                              
   |
   # 
+-----------------------------------------------------------------------------+
   
   # ==-_-==-_-== Setup cuDNN ==-_-==-_-==
   
   # https://developer.nvidia.com/rdp/cudnn-download
   # Register with NVIDIA and download cudnn-10.0-linux-x64-v7.5.0.56.tgz
   # scp it to your instance
   # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html
   tar -xvzf cudnn-10.0-linux-x64-v7.5.0.56.tgz
   sudo cp cuda/include/cudnn.h /usr/local/cuda/include
   sudo cp cuda/lib64/libcudnn* /usr/local/cuda/lib64
   sudo chmod a+r /usr/local/cuda/include/cudnn.h 
/usr/local/cuda/lib64/libcudnn*
   
   # ==-_-==-_-== Clone MXNet Repo. ==-_-==-_-==
   mkdir -p repositories/apache && cd repositories/apache
   git clone --recursive https://github.com/apache/incubator-mxnet.git
   cd incubator-mxnet
   
   # ==-_-==-_-== Compile MXNet ==-_-==-_-==
   make \
           DEV=1                                     \
           ENABLE_TESTCOVERAGE=1                     \
           USE_BLAS=openblas                         \
           USE_MKLDNN=0                              \
           USE_CUDA=1                                \
           USE_CUDA_PATH=/usr/local/cuda             \
           USE_CUDNN=1                               \
           USE_CPP_PACKAGE=0                         \
           USE_DIST_KVSTORE=1                        \
           USE_SIGNAL_HANDLER=1                      \
           -j$(nproc)
   
   # ==-_-==-_-== Run failing test ==-_-==-_-==
   export PYTHONPATH=./python/                                                  
                                      
   nosetests-3.4 --verbose 
tests/python/gpu/test_gluon_gpu.py:test_rnn_layers_fp16
   
   # Error excerpt:
   # ======================================================================
   # ERROR: test_gluon_gpu.test_rnn_layers_fp16
   # ----------------------------------------------------------------------
   # Traceback (most recent call last):
   #   File "/usr/local/lib/python3.5/dist-packages/nose/case.py", line 198, in 
runTest
   #     self.test(*self.arg)
   #   File "/usr/local/lib/python3.5/dist-packages/nose/util.py", line 620, in 
newfunc
   #     return func(*arg, **kw)
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/tests/python/gpu/../unittest/common.py",
 line 110, in test_new
   #     orig_test(*args, **kwargs)
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/tests/python/gpu/../unittest/test_gluon_rnn.py",
 line 545, in test_rnn_layers_fp16
   #     run_rnn_layers('float16', 'float32', mx.gpu())
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/tests/python/gpu/../unittest/test_gluon_rnn.py",
 line 479, in run_rnn_layers
   #     check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), 
mx.nd.ones((8, 3, 20), dtype=dtype), ctx=ctx)
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/tests/python/gpu/../unittest/test_gluon_rnn.py",
 line 451, in check_rnn_layer_forward
   #     np_out = out.asnumpy()
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/ndarray/ndarray.py",
 line 1995, in asnumpy
   #     ctypes.c_size_t(data.size)))
   #   File 
"/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/base.py", line 
252, in check_call
   #     raise MXNetError(py_str(_LIB.MXGetLastError()))
   # mxnet.base.MXNetError: [07:41:30] src/operator/./cudnn_rnn-inl.h:759: 
Check failed: e == CUDNN_STATUS_SUCCESS (6 vs. 0) cuDNN: 
CUDNN_STATUS_ARCH_MISMATCH
   # 
   # Stack trace returned 10 entries:
   # [bt] (0) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x1c7)
 [0x7fe8ec2eebd7]
   # [bt] (1) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32)
 [0x7fe8ec2ef082]
   # [bt] (2) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::op::CuDNNRNNOp<mshadow::half::half_t>::Init(mshadow::Stream<mshadow::gpu>*,
 std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x333c) 
[0x7fe8f36f8afc]
   # [bt] (3) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::op::CuDNNRNNOp<mshadow::half::half_t>::Forward(mxnet::OpContext
 const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x1501) 
[0x7fe8f3700c61]
   # [bt] (4) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::op::OperatorState::Forward(mxnet::OpContext
 const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x48b) 
[0x7fe8ef82dd5b]
   # [bt] (5) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::op::LegacyOpForward(mxnet::OpStatePtr
 const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)+0x18) [0x7fe8ef820838]
   # [bt] (6) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void
 (mxnet::OpStatePtr const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&), void (*)(mxnet::OpStatePtr const&, 
mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> 
> const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > 
const&)>::_M_invoke(std::_Any_data const&, mxnet::OpStatePtr const&, 
mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> 
> const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x20) 
[0x7fe8ef5d9250]
   # [bt] (7) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushOperator(mxnet::OpStatePtr
 const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete)#3}::operator()(mxnet::RunContext, 
mxnet::engine::CallbackOnComplete) const+0x2e8) [0x7fe8ef8d7e88]
   # [bt] (8) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void
 (mxnet::RunContext), mxnet::imperative::PushOperator(mxnet::OpStatePtr const&, 
nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, 
std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, 
std::vector<unsigned int, std::allocator<unsigned int> > const&, 
std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, 
mxnet::DispatchMode)::{lambda(mxnet::RunContext)#4}>::_M_invoke(std::_Any_data 
const&, # mxnet::RunContext&&)+0x25) [0x7fe8ef8d8215]
   # [bt] (9) 
/home/ubuntu/repositories/apache/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(+0x5f9056e)
 [0x7fe8f02a656e]
   # 
   # 
   # -------------------- >> begin captured logging << --------------------
   # common: INFO: Setting module np/mx/python random seeds, use 
MXNET_MODULE_SEED=1716277661 to reproduce.
   # --------------------- >> end captured logging << ---------------------
   # 
   # ----------------------------------------------------------------------
   ```
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to