This is an automated email from the ASF dual-hosted git repository.
junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 4d152fe7c2 [Unittest] merge test_cp_async_in_if_then_else into
test_tir_transform_inject_ptx_async_copy (#14138)
4d152fe7c2 is described below
commit 4d152fe7c252152771178a3eadb9816892516c4e
Author: Tian Xia <[email protected]>
AuthorDate: Tue Feb 28 04:42:15 2023 +0800
[Unittest] merge test_cp_async_in_if_then_else into
test_tir_transform_inject_ptx_async_copy (#14138)
This PR merges two related unit tests into one.
---
.../unittest/test_cp_async_in_if_then_else.py | 238 ---------------------
.../test_tir_transform_inject_ptx_async_copy.py | 214 ++++++++++++++++++
2 files changed, 214 insertions(+), 238 deletions(-)
diff --git a/tests/python/unittest/test_cp_async_in_if_then_else.py
b/tests/python/unittest/test_cp_async_in_if_then_else.py
deleted file mode 100644
index 08de5ba34d..0000000000
--- a/tests/python/unittest/test_cp_async_in_if_then_else.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""test the correctness of inject async memory copy from an if_then_else
load"""
-import tvm
-import numpy as np
-
-from tvm.script import tir as T
-import tvm.testing
-
-expected_cuda_script = r"""
-#ifdef _WIN32
- using uint = unsigned int;
- using uchar = unsigned char;
- using ushort = unsigned short;
- using int64_t = long long;
- using uint64_t = unsigned long long;
-#else
- #define uint unsigned int
- #define uchar unsigned char
- #define ushort unsigned short
- #define int64_t long long
- #define uint64_t unsigned long long
-#endif
-extern "C" __global__ void __launch_bounds__(16) main_kernel0(float*
__restrict__ A, float* __restrict__ B, float* __restrict__ C) {
- __shared__ float A_shared[64];
- __shared__ float B_shared[64];
- A_shared[((int)threadIdx.x)] = 0.000000e+00f;
- B_shared[((int)threadIdx.x)] = 0.000000e+00f;
-__asm__ __volatile__("cp.async.commit_group;");
-
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(A_shared + (((int)threadIdx.x) + 16)))
- );
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2;"
- :: "r"(addr), "l"((void*)(A + (((int)threadIdx.x) * 14))), "n"(4)
- );
- }
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(B_shared + (((int)threadIdx.x) + 16)))
- );
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2;"
- :: "r"(addr), "l"((void*)(B + (((int)threadIdx.x) * 14))), "n"(4)
- );
- }
-__asm__ __volatile__("cp.async.commit_group;");
-
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(A_shared + (((int)threadIdx.x) + 32)))
- );
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2;"
- :: "r"(addr), "l"((void*)(A + ((((int)threadIdx.x) * 14) + 1))), "n"(4)
- );
- }
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(B_shared + (((int)threadIdx.x) + 32)))
- );
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2;"
- :: "r"(addr), "l"((void*)(B + ((((int)threadIdx.x) * 14) + 1))), "n"(4)
- );
- }
-__asm__ __volatile__("cp.async.commit_group;");
-
- for (int i = 0; i < 13; ++i) {
- bool cse_var_1 = (i < 12);
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(A_shared + ((((i + 3) & 3) * 16) + ((int)threadIdx.x))))
- );
- int src_bytes = cse_var_1 ? 4 : 0;
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2, %3;"
- :: "r"(addr), "l"((void*)(A + (((((int)threadIdx.x) * 14) + i) + 2))),
"n"(4), "r"(src_bytes)
- );
- }
-__asm__ __volatile__("cp.async.commit_group;");
-
-__asm__ __volatile__("cp.async.wait_group 5;");
-
- __syncthreads();
- C[((((int)threadIdx.x) * 16) + i)] = (A_shared[(((i & 3) * 16) +
((int)threadIdx.x))] + B_shared[(((i & 3) * 16) + ((int)threadIdx.x))]);
- __syncthreads();
-
- {
- unsigned int addr;
- __asm__ __volatile__(
- "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
- : "=r"(addr)
- : "l"((void *)(B_shared + ((((i + 3) & 3) * 16) + ((int)threadIdx.x))))
- );
- int src_bytes = cse_var_1 ? 4 : 0;
- __asm__ __volatile__(
- "cp.async.ca.shared.global [%0], [%1], %2, %3;"
- :: "r"(addr), "l"((void*)(B + (((((int)threadIdx.x) * 14) + i) + 2))),
"n"(4), "r"(src_bytes)
- );
- }
-__asm__ __volatile__("cp.async.commit_group;");
-
- }
-__asm__ __volatile__("cp.async.wait_group 2;");
-
- __syncthreads();
- C[((((int)threadIdx.x) * 16) + 13)] = (A_shared[(((int)threadIdx.x) + 16)] +
B_shared[(((int)threadIdx.x) + 16)]);
-__asm__ __volatile__("cp.async.wait_group 1;");
-
- __syncthreads();
- C[((((int)threadIdx.x) * 16) + 14)] = (A_shared[(((int)threadIdx.x) + 32)] +
B_shared[(((int)threadIdx.x) + 32)]);
-__asm__ __volatile__("cp.async.wait_group 0;");
-
- __syncthreads();
- C[((((int)threadIdx.x) * 16) + 15)] = (A_shared[(((int)threadIdx.x) + 48)] +
B_shared[(((int)threadIdx.x) + 48)]);
-}
-
-"""
-
-
-generated_code = ""
-support_async = True
-
-
[email protected]_func
-def tvm_callback_cuda_postproc(code):
- global generated_code
- global support_async
- generated_code = code
- # return a dummy code so that device < sm80 could build correctly
- if not support_async:
- ret = ""
- for line in code.split("\n"):
- ret += line + "\n"
- if line.startswith('extern "C" __global__'):
- break
- ret += "}"
- return ret
- return code
-
-
[email protected]_cuda
-def test_cp_async_in_if_then_else():
- global support_async
- arch = tvm.contrib.nvcc.get_target_compute_version()
- major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
- if major < 8:
- # At least sm80 is required
- support_async = False
-
- @T.prim_func
- def simple_compute(
- A: T.Buffer((16, 14), "float32"),
- B: T.Buffer((16, 14), "float32"),
- C: T.Buffer((16, 16), "float32"),
- ):
- T.func_attr({"global_symbol": "main", "tir.noalias": True})
- for tx in T.thread_binding(0, 16, thread="threadIdx.x"):
- for i in T.serial(
- 16,
- annotations={
- "software_pipeline_stage": [0, 0, 3],
- "software_pipeline_order": [0, 2, 1],
- "software_pipeline_async_stages": [0],
- },
- ):
- with T.block("compute"):
- T.reads(A[tx, i])
- T.writes(C[tx, i])
- A_shared = T.alloc_buffer((16, 1), dtype="float32",
scope="shared")
- B_shared = T.alloc_buffer((16, 1), dtype="float32",
scope="shared")
- with T.block():
- T.reads(A[tx, i])
- T.writes(A_shared[tx, 0])
- A_shared[tx, 0] = T.if_then_else(
- 1 <= i and i < 15, A[tx, i - 1], T.float32(0),
dtype="float32"
- )
- with T.block():
- T.reads(B[tx, i])
- T.writes(B_shared[tx, 0])
- B_shared[tx, 0] = T.if_then_else(
- 1 <= i and i < 15, B[tx, i - 1], T.float32(0),
dtype="float32"
- )
- with T.block():
- T.reads(A_shared[tx, 0], B_shared[tx, 0])
- T.writes(C[tx, i])
- C[tx, i] = A_shared[tx, 0] + B_shared[tx, 0]
-
- mod = tvm.IRModule.from_expr(simple_compute)
- with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
- tvm.build(mod, target="cuda")
-
- assert generated_code == expected_cuda_script
-
- if not support_async:
- # avoid return dummy code to other tests
- support_async = True
-
-
-if __name__ == "__main__":
- test_cp_async_in_if_then_else()
diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
index fca88594c0..3d779bc7d1 100644
--- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
+++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -180,6 +180,220 @@ def test_inject_async_copy_shared_dyn():
tvm.testing.assert_allclose(C_nd.numpy(), A_np + B_np)
+expected_cuda_script = r"""
+#ifdef _WIN32
+ using uint = unsigned int;
+ using uchar = unsigned char;
+ using ushort = unsigned short;
+ using int64_t = long long;
+ using uint64_t = unsigned long long;
+#else
+ #define uint unsigned int
+ #define uchar unsigned char
+ #define ushort unsigned short
+ #define int64_t long long
+ #define uint64_t unsigned long long
+#endif
+extern "C" __global__ void __launch_bounds__(16) main_kernel0(float*
__restrict__ A, float* __restrict__ B, float* __restrict__ C) {
+ __shared__ float A_shared[64];
+ __shared__ float B_shared[64];
+ A_shared[((int)threadIdx.x)] = 0.000000e+00f;
+ B_shared[((int)threadIdx.x)] = 0.000000e+00f;
+__asm__ __volatile__("cp.async.commit_group;");
+
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(A_shared + (((int)threadIdx.x) + 16)))
+ );
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2;"
+ :: "r"(addr), "l"((void*)(A + (((int)threadIdx.x) * 14))), "n"(4)
+ );
+ }
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(B_shared + (((int)threadIdx.x) + 16)))
+ );
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2;"
+ :: "r"(addr), "l"((void*)(B + (((int)threadIdx.x) * 14))), "n"(4)
+ );
+ }
+__asm__ __volatile__("cp.async.commit_group;");
+
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(A_shared + (((int)threadIdx.x) + 32)))
+ );
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2;"
+ :: "r"(addr), "l"((void*)(A + ((((int)threadIdx.x) * 14) + 1))), "n"(4)
+ );
+ }
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(B_shared + (((int)threadIdx.x) + 32)))
+ );
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2;"
+ :: "r"(addr), "l"((void*)(B + ((((int)threadIdx.x) * 14) + 1))), "n"(4)
+ );
+ }
+__asm__ __volatile__("cp.async.commit_group;");
+
+ for (int i = 0; i < 13; ++i) {
+ bool cse_var_1 = (i < 12);
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(A_shared + ((((i + 3) & 3) * 16) + ((int)threadIdx.x))))
+ );
+ int src_bytes = cse_var_1 ? 4 : 0;
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2, %3;"
+ :: "r"(addr), "l"((void*)(A + (((((int)threadIdx.x) * 14) + i) + 2))),
"n"(4), "r"(src_bytes)
+ );
+ }
+__asm__ __volatile__("cp.async.commit_group;");
+
+__asm__ __volatile__("cp.async.wait_group 5;");
+
+ __syncthreads();
+ C[((((int)threadIdx.x) * 16) + i)] = (A_shared[(((i & 3) * 16) +
((int)threadIdx.x))] + B_shared[(((i & 3) * 16) + ((int)threadIdx.x))]);
+ __syncthreads();
+
+ {
+ unsigned int addr;
+ __asm__ __volatile__(
+ "{ .reg .u64 addr; cvta.to.shared.u64 addr, %1; cvt.u32.u64 %0, addr;
}\n"
+ : "=r"(addr)
+ : "l"((void *)(B_shared + ((((i + 3) & 3) * 16) + ((int)threadIdx.x))))
+ );
+ int src_bytes = cse_var_1 ? 4 : 0;
+ __asm__ __volatile__(
+ "cp.async.ca.shared.global [%0], [%1], %2, %3;"
+ :: "r"(addr), "l"((void*)(B + (((((int)threadIdx.x) * 14) + i) + 2))),
"n"(4), "r"(src_bytes)
+ );
+ }
+__asm__ __volatile__("cp.async.commit_group;");
+
+ }
+__asm__ __volatile__("cp.async.wait_group 2;");
+
+ __syncthreads();
+ C[((((int)threadIdx.x) * 16) + 13)] = (A_shared[(((int)threadIdx.x) + 16)] +
B_shared[(((int)threadIdx.x) + 16)]);
+__asm__ __volatile__("cp.async.wait_group 1;");
+
+ __syncthreads();
+ C[((((int)threadIdx.x) * 16) + 14)] = (A_shared[(((int)threadIdx.x) + 32)] +
B_shared[(((int)threadIdx.x) + 32)]);
+__asm__ __volatile__("cp.async.wait_group 0;");
+
+ __syncthreads();
+ C[((((int)threadIdx.x) * 16) + 15)] = (A_shared[(((int)threadIdx.x) + 48)] +
B_shared[(((int)threadIdx.x) + 48)]);
+}
+
+"""
+
+
+generated_code = ""
+support_async = True
+
+
[email protected]_func
+def tvm_callback_cuda_postproc(code):
+ global generated_code
+ global support_async
+ generated_code = code
+ # return a dummy code so that device < sm80 could build correctly
+ if not support_async:
+ ret = ""
+ for line in code.split("\n"):
+ ret += line + "\n"
+ if line.startswith('extern "C" __global__'):
+ break
+ ret += "}"
+ return ret
+ return code
+
+
[email protected]_cuda
+def test_cp_async_in_if_then_else():
+ global support_async
+ arch = tvm.contrib.nvcc.get_target_compute_version()
+ major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+ if major < 8:
+ # At least sm80 is required
+ support_async = False
+
+ @T.prim_func
+ def simple_compute(
+ A: T.Buffer((16, 14), "float32"),
+ B: T.Buffer((16, 14), "float32"),
+ C: T.Buffer((16, 16), "float32"),
+ ):
+ T.func_attr({"global_symbol": "main", "tir.noalias": True})
+ for tx in T.thread_binding(0, 16, thread="threadIdx.x"):
+ for i in T.serial(
+ 16,
+ annotations={
+ "software_pipeline_stage": [0, 0, 3],
+ "software_pipeline_order": [0, 2, 1],
+ "software_pipeline_async_stages": [0],
+ },
+ ):
+ with T.block("compute"):
+ T.reads(A[tx, i])
+ T.writes(C[tx, i])
+ A_shared = T.alloc_buffer((16, 1), dtype="float32",
scope="shared")
+ B_shared = T.alloc_buffer((16, 1), dtype="float32",
scope="shared")
+ with T.block():
+ T.reads(A[tx, i])
+ T.writes(A_shared[tx, 0])
+ A_shared[tx, 0] = T.if_then_else(
+ 1 <= i and i < 15, A[tx, i - 1], T.float32(0),
dtype="float32"
+ )
+ with T.block():
+ T.reads(B[tx, i])
+ T.writes(B_shared[tx, 0])
+ B_shared[tx, 0] = T.if_then_else(
+ 1 <= i and i < 15, B[tx, i - 1], T.float32(0),
dtype="float32"
+ )
+ with T.block():
+ T.reads(A_shared[tx, 0], B_shared[tx, 0])
+ T.writes(C[tx, i])
+ C[tx, i] = A_shared[tx, 0] + B_shared[tx, 0]
+
+ mod = tvm.IRModule.from_expr(simple_compute)
+ with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
+ tvm.build(mod, target="cuda")
+
+ assert generated_code == expected_cuda_script
+
+ if not support_async:
+ # avoid return dummy code to other tests
+ support_async = True
+
+
if __name__ == "__main__":
test_inject_async_copy()
test_inject_async_copy_shared_dyn()
+ test_cp_async_in_if_then_else()