szha closed pull request #11780: fix flaky test 
test_operator_gpu.test_countsketch
URL: https://github.com/apache/incubator-mxnet/pull/11780
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/operator/contrib/count_sketch.cu 
b/src/operator/contrib/count_sketch.cu
index 373ff3eab62..68dede377fc 100644
--- a/src/operator/contrib/count_sketch.cu
+++ b/src/operator/contrib/count_sketch.cu
@@ -129,8 +129,8 @@ inline void CountSketchForward(const Tensor<gpu, 2, DType> 
&out,
                                     nthreads, out_ptr+bstart*out_dim, h_ptr,
                                     s_ptr, in_ptr+bstart*in_dim, batchlen,
                                     in_dim, out_dim);
-    MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_forward_kernel);
-    // cudaThreadSynchronize();
+    cudaError_t err = cudaDeviceSynchronize();
+    CHECK_EQ(err, cudaSuccess) << "Error occured! CUDA: " << 
cudaGetErrorString(err);
     bstart = (i+1)*batchlen;
   }
 }
@@ -153,7 +153,7 @@ inline void CountSketchBackward(const Tensor<gpu, 2, DType> 
&in_grad,
     upper_bound = upper_bound-1;
   }
   // guarantee there are at least one iteration
-  upper_bound = upper_bound > 0? upper_bound:0;
+  upper_bound = upper_bound > 0 ? upper_bound : 0;
   int bstart = 0;
   for ( int i = 0; i <= upper_bound; i++ ) {
     const int batchlen = min(processing_batch_size, n_samples - bstart);
@@ -165,7 +165,8 @@ inline void CountSketchBackward(const Tensor<gpu, 2, DType> 
&in_grad,
                             nthreads, in_grad_ptr+bstart*in_dim, h_ptr,
                             s_ptr, out_grad_ptr+bstart*out_dim, batchlen,
                             in_dim, out_dim);
-    MSHADOW_CUDA_POST_KERNEL_CHECK(sketch_backward_kernel);
+    cudaError_t err = cudaDeviceSynchronize();
+    CHECK_EQ(err, cudaSuccess) << "Error occured! CUDA: " << 
cudaGetErrorString(err);
     bstart = (i+1)*batchlen;
   }
 }
diff --git a/tests/python/gpu/test_operator_gpu.py 
b/tests/python/gpu/test_operator_gpu.py
index 9e9cc608b29..458028b9e53 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -51,7 +51,10 @@
 
 
 def check_countsketch(in_dim,out_dim,n):
-    sym = mx.sym.contrib.count_sketch(name='countsketch',out_dim = out_dim)
+    data = mx.sym.Variable("data")
+    h = mx.sym.Variable("h")
+    s = mx.sym.Variable("s")
+    sym = mx.sym.contrib.count_sketch(data=data, h=h, s=s, 
name='countsketch',out_dim = out_dim)
     shape = [(n,in_dim), (1,in_dim),(1,in_dim)]     #shape of input x, hash h 
and hash s
 
     arr = [mx.nd.empty(shape[i]) for i in range(3)]
@@ -62,46 +65,33 @@ def check_countsketch(in_dim,out_dim,n):
     arr[1][:] = h                                 #hash h
     s = np.random.randint(0, 2, shape[2])*2-np.ones(shape[2])
     arr[2][:] = s                                 #hash s
-    # forward
-    exe_list = [sym.bind(mx.gpu(0), arr, arr_grad)]
-    for exe in exe_list:
-        exe.forward(is_train= True)
-    out1 = [exe.outputs[0].asnumpy() for exe in exe_list]
-
+    locations = {"data": x, "h": h, "s": s}
     a = np.zeros((n,out_dim))
     temp = np.multiply(x, s)
     for num_sample in np.arange(0,n):
         for idx in np.arange(0,in_dim):
             a[num_sample][h[0][idx]] += temp[num_sample][idx]
-    assert_almost_equal(a,out1[0],rtol=1e-3, atol=1e-12)
-
-    # backward
+    check_symbolic_forward(sym, locations, [a], rtol=1e-3, atol=1e-5, 
ctx=mx.gpu(0))
     out_grad = mx.nd.empty((n,out_dim))
     out_grad[:] = np.random.normal(-3, 3, (n,out_dim))
-    for exe in exe_list:
-        exe.backward([out_grad])
-
-        a = np.zeros((n,in_dim))
-        for j in np.arange(0,n):
-            for i in np.arange(0,in_dim):
-                a[j,i] = out_grad.asnumpy()[j, h[0,i]] * s[0,i]
-    assert_almost_equal(a,arr_grad[0].asnumpy(),rtol=1e-3, atol=1e-12)
+    a = np.zeros((n,in_dim))
+    for j in np.arange(0,n):
+        for i in np.arange(0,in_dim):
+            a[j,i] = out_grad.asnumpy()[j, h[0,i]] * s[0,i]
+    check_symbolic_backward(sym, locations, [out_grad], [a], rtol=1e-3, 
atol=1e-5, ctx=mx.gpu(0))
 
 
-@unittest.skip("test fails intermittently. temporarily disabled till it gets 
fixed. tracked at https://github.com/apache/incubator-mxnet/issues/10988")
-@with_seed(0)
+@with_seed()
 def test_countsketch():
-    nrepeat = 2
     minindim = 40
     maxindim = 100
     minoutdim = 5
     maxoutdim = 30
     maxn = 200
-    for repeat in range(nrepeat):
-        in_dim = np.random.randint(minindim, maxindim)
-        out_dim = np.random.randint(minoutdim, maxoutdim)
-        n = np.random.randint(1,maxn)
-        check_countsketch(in_dim, out_dim, n)
+    in_dim = np.random.randint(minindim, maxindim)
+    out_dim = np.random.randint(minoutdim, maxoutdim)
+    n = np.random.randint(1, maxn)
+    check_countsketch(in_dim, out_dim, n)
 
 
 def check_ifft(shape):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

Reply via email to