This is an automated email from the ASF dual-hosted git repository.
syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 64ab31ec3e [UnitTest][Metal] Parametrize allreduce GPU tests (#15749)
64ab31ec3e is described below
commit 64ab31ec3e945c515e08dc528318003ebb634360
Author: Eric Lunderberg <[email protected]>
AuthorDate: Fri Sep 15 04:20:57 2023 -0500
[UnitTest][Metal] Parametrize allreduce GPU tests (#15749)
* [UnitTest][Metal] Parametrize allreduce GPU tests
As a first step toward addressing the Metal codegen errors that required
the reversion in https://github.com/apache/tvm/pull/15725, this commit
parametrizes the unit tests for `allreduce`. Although these tests are
parametrized with `@tvm.testing.parametrize_targets("cuda", "metal")`,
the `tvm.testing.requires_metal` marker that is automatically inserted
for the metal parametrization will cause them to be skipped if the metal
runtime is unavailable, as is the case in the current CI.
* Updated the filename and the device used when testing on metal
---
.../{test_allreduce_cuda.py => test_allreduce.py} | 109 +++++++++++----------
1 file changed, 57 insertions(+), 52 deletions(-)
diff --git a/tests/python/unittest/test_allreduce_cuda.py
b/tests/python/unittest/test_allreduce.py
similarity index 50%
rename from tests/python/unittest/test_allreduce_cuda.py
rename to tests/python/unittest/test_allreduce.py
index e9a8ef81cf..708384daf0 100644
--- a/tests/python/unittest/test_allreduce_cuda.py
+++ b/tests/python/unittest/test_allreduce.py
@@ -46,61 +46,66 @@ def reduce_max(a: T.handle, b: T.handle, d1: T.int32, d2:
T.int32, d3: T.int32)
B[vi, vj, vk] = T.max(B[vi, vj, vk], A[vi, vj, vk, vl])
-@tvm.testing.requires_gpu
-@tvm.testing.requires_cuda
-def test_allreduce_cuda():
- def check_sum(d1: int, d2: int, d3: int):
- _, _, _d1, _d2, _d3 = reduce.params
- mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
- sch = tvm.tir.Schedule(mod)
- blk = sch.get_block("reduce")
- i, j, k, l = sch.get_loops(blk)
- sch.bind(i, "blockIdx.x")
- sch.bind(j, "threadIdx.z")
- sch.bind(k, "threadIdx.y")
- sch.bind(l, "threadIdx.x")
- f = tvm.build(sch.mod["main"], target="cuda")
-
- # prepare input and output array
- a_np = np.random.rand(1, d1, d2, d3).astype("float32")
- b_np = a_np.sum(axis=-1).astype("float32")
- a = tvm.nd.array(a_np, tvm.cuda(0))
- b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
- # launch kernel
- f(a, b)
- tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
- def check_max(d1: int, d2: int, d3: int):
- _, _, _d1, _d2, _d3 = reduce_max.params
- mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
- sch = tvm.tir.Schedule(mod)
- blk = sch.get_block("reduce")
- i, j, k, l = sch.get_loops(blk)
- sch.bind(i, "blockIdx.x")
- sch.bind(j, "threadIdx.z")
- sch.bind(k, "threadIdx.y")
- sch.bind(l, "threadIdx.x")
- f = tvm.build(sch.mod["main"], target="cuda")
-
- # prepare input and output array
- a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
- b_np = a_np.max(axis=-1).astype("float32")
- a = tvm.nd.array(a_np, tvm.cuda(0))
- b = tvm.nd.array(np.zeros_like(b_np), tvm.cuda(0))
-
- # launch kernel
- f(a, b)
- tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
-
+def generate_param_sets():
for d1 in range(1, 5):
for d2 in range(1, 5):
for d3 in [2, 4, 8, 12, 16, 32, 48, 64, 100, 128, 201, 256, 512,
1024]:
- if d1 * d2 * d3 > 1024:
- continue
- check_sum(d1, d2, d3)
- check_max(d1, d2, d3)
+ if d1 * d2 * d3 < 1024:
+ yield (d1, d2, d3)
+
+
+dims = tvm.testing.parameter(*generate_param_sets())
+
+
+@tvm.testing.parametrize_targets("cuda", "metal")
+def test_allreduce_sum(dims, target, dev):
+ d1, d2, d3 = dims
+ _, _, _d1, _d2, _d3 = reduce.params
+ mod = reduce.specialize({_d1: d1, _d2: d2, _d3: d3})
+ sch = tvm.tir.Schedule(mod)
+ blk = sch.get_block("reduce")
+ i, j, k, l = sch.get_loops(blk)
+ sch.bind(i, "blockIdx.x")
+ sch.bind(j, "threadIdx.z")
+ sch.bind(k, "threadIdx.y")
+ sch.bind(l, "threadIdx.x")
+ f = tvm.build(sch.mod["main"], target=target)
+
+ # prepare input and output array
+ a_np = np.random.rand(1, d1, d2, d3).astype("float32")
+ b_np = a_np.sum(axis=-1).astype("float32")
+ a = tvm.nd.array(a_np, dev)
+ b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+ # launch kernel
+ f(a, b)
+ tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
+
+
+@tvm.testing.parametrize_targets("cuda", "metal")
+def test_allreduce_max(dims, target, dev):
+ d1, d2, d3 = dims
+ _, _, _d1, _d2, _d3 = reduce_max.params
+ mod = reduce_max.specialize({_d1: d1, _d2: d2, _d3: d3})
+ sch = tvm.tir.Schedule(mod)
+ blk = sch.get_block("reduce")
+ i, j, k, l = sch.get_loops(blk)
+ sch.bind(i, "blockIdx.x")
+ sch.bind(j, "threadIdx.z")
+ sch.bind(k, "threadIdx.y")
+ sch.bind(l, "threadIdx.x")
+ f = tvm.build(sch.mod["main"], target=target)
+
+ # prepare input and output array
+ a_np = -np.random.rand(1, d1, d2, d3).astype("float32")
+ b_np = a_np.max(axis=-1).astype("float32")
+ a = tvm.nd.array(a_np, dev)
+ b = tvm.nd.array(np.zeros_like(b_np), dev)
+
+ # launch kernel
+ f(a, b)
+ tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-6, atol=1e-6)
if __name__ == "__main__":
- test_allreduce_cuda()
+ tvm.testing.main()