[tvm] branch main updated: [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)

syfeng Fri, 15 Sep 2023 02:21:09 -0700

This is an automated email from the ASF dual-hosted git repository.

syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 64ab31ec3e [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)
64ab31ec3e is described below

commit 64ab31ec3e945c515e08dc528318003ebb634360
Author: Eric Lunderberg <[email protected]>
AuthorDate: Fri Sep 15 04:20:57 2023 -0500

    [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)
    
    * [UnitTest][Metal] Parametrize allreduce GPU tests
    
    As a first step to addressing the Metal codegen errors that required
    the reversion in https://github.com/apache/tvm/pull/15725,
    parametrizing the unit tests for `allreduce`.  While these tests are
    parametrized with `@tvm.testing.parametrize_targets("cuda", "metal")`,
    the automatic `tvm.testing.requires_metal` marker inserted for the
    metal parametrization will cause them to be skipped if the metal
    runtime is unavailable, which includes the current CI.
    
    * Updated filename, device used when testing on metal
---
 .../{test_allreduce_cuda.py => test_allreduce.py}  | 109 +++++++++++----------
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/tests/python/unittest/test_allreduce_cuda.py 
b/tests/python/unittest/test_allreduce.py
similarity index 50%
rename from tests/python/unittest/test_allreduce_cuda.py
rename to tests/python/unittest/test_allreduce.py
index e9a8ef81cf..708384daf0 100644
--- a/tests/python/unittest/test_allreduce_cuda.py
+++ b/tests/python/unittest/test_allreduce.py
@@ -46,61 +46,66 @@ def reduce_max(a: T.handle, b: T.handle, d1: T.int32, d2: 
T.int32, d3: T.int32)
             B[vi, vj, vk] = T.max(B[vi, vj, vk], A[vi, vj, vk, vl])
 
 
[email protected]_gpu
[email protected]_cuda
-def test_allreduce_cuda():
-    def check_sum(d1: int, d2: int, d3: int):
-        _, _, _d1, _d2, _d3 = reduce.params
-        mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
-        sch = tvm.tir.Schedule(mod)
-        blk = sch.get_block("reduce")
-        i, j, k, l = sch.get_loops(blk)
-        sch.bind(i, "blockIdx.x")
-        sch.bind(j, "threadIdx.z")
-        sch.bind(k, "threadIdx.y")
-        sch.bind(l, "threadIdx.x")
-        f = tvm.build(sch.mod["main"], target="cuda")
-
-        # prepare input and output array
-        a_np = np.random.rand(1, d1, d2, d3).astype("float32")
-        b_np = a_np.sum(axis=-1).astype("float32")
-        a = tvm.nd.array(a_np, tvm.cuda(0))
-        b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
-        # launch kernel
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
-    def check_max(d1: int, d2: int, d3: int):
-        _, _, _d1, _d2, _d3 = reduce_max.params
-        mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
-        sch = tvm.tir.Schedule(mod)
-        blk = sch.get_block("reduce")
-        i, j, k, l = sch.get_loops(blk)
-        sch.bind(i, "blockIdx.x")
-        sch.bind(j, "threadIdx.z")
-        sch.bind(k, "threadIdx.y")
-        sch.bind(l, "threadIdx.x")
-        f = tvm.build(sch.mod["main"], target="cuda")
-
-        # prepare input and output array
-        a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
-        b_np = a_np.max(axis=-1).astype("float32")
-        a = tvm.nd.array(a_np, tvm.cuda(0))
-        b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
-        # launch kernel
-        f(a, b)
-        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
+def generate_param_sets():
     for d1 in range(1, 5):
         for d2 in range(1, 5):
             for d3 in [2, 4, 8, 12, 16, 32, 48, 64, 100, 128, 201, 256, 512, 
1024]:
-                if d1 * d2 * d3 > 1024:
-                    continue
-                check_sum(d1, d2, d3)
-                check_max(d1, d2, d3)
+                if d1 * d2 * d3 < 1024:
+                    yield (d1, d2, d3)
+
+
+dims = tvm.testing.parameter(*generate_param_sets())
+
+
[email protected]_targets("cuda", "metal")
+def test_allreduce_sum(dims, target, dev):
+    d1, d2, d3 = dims
+    _, _, _d1, _d2, _d3 = reduce.params
+    mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("reduce")
+    i, j, k, l = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.z")
+    sch.bind(k, "threadIdx.y")
+    sch.bind(l, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target=target)
+
+    # prepare input and output array
+    a_np = np.random.rand(1, d1, d2, d3).astype("float32")
+    b_np = a_np.sum(axis=-1).astype("float32")
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+    # launch kernel
+    f(a, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
+
+
[email protected]_targets("cuda", "metal")
+def test_allreduce_max(dims, target, dev):
+    d1, d2, d3 = dims
+    _, _, _d1, _d2, _d3 = reduce_max.params
+    mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("reduce")
+    i, j, k, l = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.z")
+    sch.bind(k, "threadIdx.y")
+    sch.bind(l, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target=target)
+
+    # prepare input and output array
+    a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
+    b_np = a_np.max(axis=-1).astype("float32")
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+    # launch kernel
+    f(a, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
 
 
 if __name__ == "__main__":
-    test_allreduce_cuda()
+    tvm.testing.main()

[tvm] branch main updated: [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)

Reply via email to