[GitHub] [incubator-mxnet] zixuanweeei opened a new issue #15638: Convolution may have some precision problem with autotuned cudnn

GitBox Tue, 23 Jul 2019 00:36:11 -0700

zixuanweeei opened a new issue #15638: Convolution may have some precision 
problem with autotuned cudnn
URL: https://github.com/apache/incubator-mxnet/issues/15638
 
 
   ## Description
   The convolution outputs of autotuned and non-autotuned cudnn are 
inconsistent when `grad_req` is set to `{"x": "null", "w": "null"}`. 
   
   ## Environment info
   
   Test script. MXNet was built from source. Build details are listed below.
   
   ``` Python
   from __future__ import print_function
   from __future__ import division
   import numpy as np
   import mxnet as mx
   import copy
   import itertools
   from numpy.testing import assert_allclose
   from mxnet.test_utils import default_context
   import os
   import traceback
   
   
   # Make GPU 0 the default test context; conv_gen() reads it back through
   # default_context() when binding the executor.
   mx.test_utils.set_default_context(mx.gpu(0))
   def conv_gen(kernel, stride, pad, num_filter, no_bias, x_shape, w_shape,
           args, grad, grad_req, autotune=0):
       """Bind a Convolution symbol and run one forward/backward pass.

       ``MXNET_CUDNN_AUTOTUNE_DEFAULT`` is set to ``str(autotune)`` in the
       process environment before the symbol is built and bound.  ``args``,
       ``grad`` and ``grad_req`` are deep-copied first, so the caller's
       objects are not the ones handed to the executor.

       ``x_shape`` and ``w_shape`` are accepted but not used by this function.

       Returns a tuple ``(args_copy, grad_copy, outputs)`` where ``outputs`` is
       the executor's output list after ``mx.nd.waitall()`` has returned.
       """
       os.environ["MXNET_CUDNN_AUTOTUNE_DEFAULT"] = str(autotune)

       # Private copies of the caller-supplied inputs for this run.
       local_args = copy.deepcopy(args)
       local_grad = copy.deepcopy(grad)
       local_grad_req = copy.deepcopy(grad_req)

       # Symbol definition; the bias variable only exists when bias is enabled.
       data_sym = mx.sym.Variable('x')
       weight_sym = mx.sym.Variable('w')
       bias_sym = None if no_bias else mx.sym.Variable('b')
       conv_sym = mx.sym.Convolution(
           data_sym, weight_sym, bias_sym,
           num_filter=num_filter,
           kernel=kernel,
           stride=stride,
           pad=pad,
           no_bias=no_bias)

       executor = conv_sym.bind(
           default_context(),
           local_args,
           args_grad=local_grad,
           grad_req=local_grad_req)
       executor.forward(is_train=True)
       # The forward output itself is fed back as the head gradient.
       executor.backward(executor.outputs[0])
       mx.nd.waitall()
       return local_args, local_grad, executor.outputs
       
   
   def test_convolution_independent_gradients():
       """Compare Convolution outputs for cuDNN autotune settings 0, 1 and 2.

       Every combination of ``grad_req`` ("null", "write", "add") for ``x``,
       ``w`` and ``b`` is run both with and without bias.  For each combination
       the same inputs are passed to ``conv_gen`` three times, once per
       autotune value, and the outputs are compared pairwise with
       ``assert_allclose`` (``atol = rtol = 1e-3``).

       A failing combination does not stop the sweep: its traceback and the
       offending ``grad_req`` values are printed between two separator lines
       and the loop moves on to the next combination.
       """
       reqs = ["null", "write", "add"]
       # Only referenced by the disabled per-variable checks further down.
       var_names = ["x", "w", "b"]
       separator = "=========================================================================================="
       # Prepare params shape
       kernel = (5, 5)
       stride = (1, 1)
       pad = (1, 1)
       num_filter = 64
       x_shape = (2, 64, 7, 7)
       w_shape = (64, 64, 5, 5)

       for x_req, w_req, b_req in itertools.product(reqs, repeat=3):
           for no_bias in [False, True]:
               # Binding args for conv with possible dependent gradients
               base_args = {
                   'x': mx.nd.random.normal(shape=x_shape),
                   'w': mx.nd.random.normal(shape=w_shape),
                   'b': (mx.nd.random.normal(shape=(num_filter, ))
                         if not no_bias else None)}
               grad = {
                   'x': mx.nd.zeros(shape=x_shape),
                   'w': mx.nd.zeros(shape=w_shape),
                   'b': (mx.nd.zeros(shape=(num_filter, ))
                         if not no_bias else None)}
               grad_req = {"x": x_req, "w": w_req, "b": b_req}

               try:
                   args0, grad0, out0 = conv_gen(
                       kernel, stride, pad, num_filter, no_bias,
                       x_shape, w_shape, base_args, grad, grad_req, 0)
                   args1, grad1, out1 = conv_gen(
                       kernel, stride, pad, num_filter, no_bias,
                       x_shape, w_shape, base_args, grad, grad_req, 1)
                   args2, grad2, out2 = conv_gen(
                       kernel, stride, pad, num_filter, no_bias,
                       x_shape, w_shape, base_args, grad, grad_req, 2)

                   # Disabled checks on the arguments and gradients:
                   # for var_name in var_names:
                   #     if var_name == "b" and no_bias:
                   #         continue
                   #     assert_allclose(args0[var_name].asnumpy(),
                   #         args1[var_name].asnumpy(), rtol=1.0e-3, atol=1.0e-3)
                   #     assert_allclose(args1[var_name].asnumpy(),
                   #         args2[var_name].asnumpy(), rtol=1.0e-3, atol=1.0e-3)
                   #     assert_allclose(grad0[var_name].asnumpy(),
                   #         grad1[var_name].asnumpy(), rtol=1.0e-3, atol=1.0e-3)
                   #     assert_allclose(grad1[var_name].asnumpy(),
                   #         grad2[var_name].asnumpy(), rtol=1.0e-3, atol=1.0e-3)

                   for m0, m1, m2 in zip(out0, out1, out2):
                       assert_allclose(m0.asnumpy(), m1.asnumpy(),
                                       atol=1.0e-3, rtol=1.0e-3)
                       assert_allclose(m1.asnumpy(), m2.asnumpy(),
                                       atol=1.0e-3, rtol=1.0e-3)
               except Exception:
                   # Exception (not a bare except) so Ctrl-C still aborts the
                   # sweep; assertion and runtime failures are reported below.
                   print(separator)
                   # format_exc() returns the traceback text; print_exc()
                   # returns None, which used to be printed as a stray "None".
                   print(traceback.format_exc(), end="")
                   print("x_req: {}, w_req: {}, b_req: {}".format(
                       x_req, w_req, "no_bias" if no_bias else b_req))
                   print(separator)
   
   
   # Run the full grad_req / bias sweep when this file is executed as a script.
   if __name__ == "__main__":
       test_convolution_independent_gradients()
   
   ```
   
   ## Build info
   
   Compiler gcc: gcc version 5.3.1 20160406 (Red Hat 5.3.1-6) (GCC)
   
   MXNet commit hash: 77254f2
   
   Build config:
   ``` Makefile
   make -j50 USE_PROFILER=0 USE_CUDA=1 USE_CUDNN=1 USE_MKLDNN=1 USE_BLAS=mkl 
USE_INTEL_PATH=/opt/intel USE_CUDA_PATH=/path/to/cuda-9.0 
USE_CUDNN_PATH=/path/to/cudnn/cudnn-9.0-linux-x64-v7.1.2
   ```
   
   ## Error Message:
   ``` Python
   Traceback (most recent call last):
     File "test_gpu_case_issue.py", line 77, in 
test_convolution_independent_gradients
       assert_allclose(m0.asnumpy(), m1.asnumpy(), atol=1.0e-3, rtol=1.0e-3)
     File 
"/home/zixuanwe/miniconda3/lib/python3.7/site-packages/numpy/testing/nose_tools/utils.py",
 line 1398, in assert_allclose
       verbose=verbose, header=header, equal_nan=equal_nan)
     File 
"/home/zixuanwe/miniconda3/lib/python3.7/site-packages/numpy/testing/nose_tools/utils.py",
 line 781, in assert_array_compare
       raise AssertionError(msg)
   AssertionError: 
   Not equal to tolerance rtol=0.001, atol=0.001
   
   (mismatch 9.0625%)
    x: array([[[[ 2.309166e+01,  1.388979e+01, -2.442778e+01, -1.903630e+01,
             -2.764324e+01],
            [ 3.716447e+00,  4.774844e+01, -5.421930e+01,  4.471700e+01,...
    y: array([[[[ 2.306942e+01,  1.390011e+01, -2.442946e+01, -1.903965e+01,
             -2.763445e+01],
            [ 3.733934e+00,  4.774546e+01, -5.421634e+01,  4.471697e+01,...
   None
   x_req: null, w_req: null, b_req: null
   ```
   
   ## Steps to reproduce
   Build MXNet from source and just run the script above.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] [incubator-mxnet] zixuanweeei opened a new issue #15638: Convolution may have some precision problem with autotuned cudnn

Reply via email to