This is an automated email from the ASF dual-hosted git repository.
syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 94866f769a [VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)
94866f769a is described below
commit 94866f769acfc4582607a2a0e818de263c9a1a60
Author: Abhikrant Sharma <[email protected]>
AuthorDate: Fri Mar 15 13:36:17 2024 +0530
[VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)
* [VM] [Hexagon] Add buffers to dma_wait builtin
While introducing dma operations at graph level, relax KillAfterLastUse
pass introduces kill_tensor operation after dma_copy.
This leads to memory being deallocated when asynchronous copy operation is
in progress.
Hence, moving the input/output buffers to dma_wait to ensure kill_tensor is
introduced after dma_wait at the graph level.
Also, the logic for size calculation is updated to use GetDataSize function.
The test case is updated to use offsets instead of allocating different
storage in VTCM.
* Fix review comments
---
src/runtime/relax_vm/hexagon/builtin.cc | 12 +--
.../contrib/test_hexagon/test_dma_builtin.py | 86 +++++++++-------------
2 files changed, 39 insertions(+), 59 deletions(-)
diff --git a/src/runtime/relax_vm/hexagon/builtin.cc
b/src/runtime/relax_vm/hexagon/builtin.cc
index d18c434193..b32d0e14aa 100644
--- a/src/runtime/relax_vm/hexagon/builtin.cc
+++ b/src/runtime/relax_vm/hexagon/builtin.cc
@@ -22,6 +22,7 @@
* \brief The hexagon graph related builtin functions for Relax virtual
machine.
*/
+#include <tvm/runtime/device_api.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>
#include <tvm/runtime/relax_vm/vm.h>
@@ -38,12 +39,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_copy")
const DLTensor* sptr = src_arr.operator->();
void* dst = dptr->data;
void* src = sptr->data;
- uint32_t size = 1;
int ret = DMA_RETRY;
- for (int i = 0; i < dptr->ndim; i++) {
- size = size * dptr->shape[i];
- }
- size = size * sizeof(dptr->dtype);
+
+ CHECK_EQ(GetDataSize(*dptr), GetDataSize(*sptr));
+ auto size = GetDataSize(*dptr);
ICHECK(size > 0);
do {
ret =
tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Copy(
@@ -53,7 +52,8 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_copy")
});
TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_wait")
- .set_body_typed([](TVMArgValue vm_ptr, int queue_id, int inflight_dma) {
+ .set_body_typed([](TVMArgValue vm_ptr, int queue_id, int inflight_dma,
+ [[maybe_unused]] NDArray src_arr, [[maybe_unused]]
NDArray dst_arr) {
ICHECK(inflight_dma >= 0);
tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id,
inflight_dma);
});
diff --git a/tests/python/contrib/test_hexagon/test_dma_builtin.py
b/tests/python/contrib/test_hexagon/test_dma_builtin.py
index 11f4d2d540..af82c2b55a 100644
--- a/tests/python/contrib/test_hexagon/test_dma_builtin.py
+++ b/tests/python/contrib/test_hexagon/test_dma_builtin.py
@@ -31,15 +31,17 @@ import tvm.testing
# pylint: disable=invalid-name, missing-class-docstring,
missing-function-docstring, no-self-argument
+data_type = "int32"
+
@I.ir_module
class Module_1D:
@T.prim_func
def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None:
m = T.int32()
- A = T.match_buffer(a, (m,), "int32", scope="global.vtcm")
- B = T.match_buffer(b, (m,), "int32", scope="global.vtcm")
- C = T.match_buffer(c, (m,), "int32", scope="global.vtcm")
+ A = T.match_buffer(a, (m,), data_type, scope="global.vtcm")
+ B = T.match_buffer(b, (m,), data_type, scope="global.vtcm")
+ C = T.match_buffer(c, (m,), data_type, scope="global.vtcm")
for ax0 in T.grid(m):
with T.block("T_add"):
v_ax0 = T.axis.remap("S", [ax0])
@@ -49,98 +51,78 @@ class Module_1D:
@R.function
def main(
- x: R.Tensor((12800,), "int32"),
- y: R.Tensor((12800,), "int32"),
- ) -> R.Tensor((12800,), "int32"):
+ x: R.Tensor((12800,), data_type),
+ y: R.Tensor((12800,), data_type),
+ ) -> R.Tensor((12800,), data_type):
cls = Module_1D
- vtcm_obj_a: R.Object = R.vm.alloc_storage(
+ vtcm_obj: R.Object = R.vm.alloc_storage(
R.shape(
[
- 12800,
+ 3 * 12800, # 3 = 2 inputs + 1 output
]
),
runtime_device_index=0,
- dtype="int32",
+ dtype=data_type,
storage_scope="global.vtcm",
)
- a: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
- vtcm_obj_a,
+ a: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+ vtcm_obj,
offset=0,
shape=R.shape(
[
12800,
]
),
- dtype="int32",
+ dtype=data_type,
)
__: R.Tuple = R.call_builtin_with_ctx(
"vm.builtin.hexagon.dma_copy",
[x, a, 0, True],
sinfo_args=[],
)
- vtcm_obj_b: R.Object = R.vm.alloc_storage(
- R.shape(
- [
- 12800,
- ]
- ),
- runtime_device_index=0,
- dtype="int32",
- storage_scope="global.vtcm",
- )
- b: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
- vtcm_obj_b,
- offset=0,
+ b: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+ vtcm_obj,
+ offset=12800 * 4,
shape=R.shape(
[
12800,
]
),
- dtype="int32",
+ dtype=data_type,
)
__: R.Tuple = R.call_builtin_with_ctx(
"vm.builtin.hexagon.dma_copy",
[y, b, 1, True],
sinfo_args=[],
)
- vtcm_obj_c: R.Object = R.vm.alloc_storage(
- R.shape(
- [
- 12800,
- ]
- ),
- runtime_device_index=0,
- dtype="int32",
- storage_scope="global.vtcm",
- )
- c: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
- vtcm_obj_c,
- offset=0,
+ c: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+ vtcm_obj,
+ offset=2 * 12800 * 4,
shape=R.shape(
[
12800,
]
),
- dtype="int32",
+ dtype=data_type,
)
__: R.Tuple = R.call_builtin_with_ctx(
"vm.builtin.hexagon.dma_wait",
- [0, 2],
+ [0, 2, x, a],
sinfo_args=[],
)
__: R.Tuple = R.call_builtin_with_ctx(
"vm.builtin.hexagon.dma_wait",
- [1, 1],
+ [1, 1, y, b],
sinfo_args=[],
)
___: R.Tuple = cls.compute_add_in_vtcm(a, b, c)
- ret_val: R.Tensor((12800,), dtype="int32") = R.builtin.alloc_tensor(
+ ret_val: R.Tensor((12800,), dtype=data_type) = R.builtin.alloc_tensor(
R.shape(
[
12800,
]
),
- R.dtype("int32"),
+ R.dtype(data_type),
R.prim_value(0),
)
__: R.Tuple = R.call_builtin_with_ctx(
@@ -148,18 +130,16 @@ class Module_1D:
[c, ret_val, 0, True],
sinfo_args=[],
)
- _t3: R.Tuple = R.vm.kill_object(vtcm_obj_a)
- _t4: R.Tuple = R.vm.kill_object(vtcm_obj_b)
- _t6: R.Tuple = R.vm.kill_object(a)
- _t7: R.Tuple = R.vm.kill_object(b)
__: R.Tuple = R.call_builtin_with_ctx(
"vm.builtin.hexagon.dma_wait",
- [0, 1],
+ [0, 1, c, ret_val],
sinfo_args=[],
)
- _t5: R.Tuple = R.vm.kill_object(vtcm_obj_c)
+ _t3: R.Tuple = R.vm.kill_object(vtcm_obj)
+ _t6: R.Tuple = R.vm.kill_object(a)
+ _t7: R.Tuple = R.vm.kill_object(b)
_t8: R.Tuple = R.vm.kill_object(c)
- lv: R.Tensor((12800,), dtype="int32") = ret_val
+ lv: R.Tensor((12800,), dtype=data_type) = ret_val
return lv
@@ -177,8 +157,8 @@ class TestDMACopyWait:
ex = relax.build(mod=module, target=target, exec_mode=mode)
with hexagon_launcher.create_session() as session:
dev = session.device
- input_arg0_data = np.random.randint(0, 9, size=(12800,),
dtype="int32")
- input_arg1_data = np.random.randint(0, 9, size=(12800,),
dtype="int32")
+ input_arg0_data = np.random.randint(0, 9, size=(12800,),
dtype=data_type)
+ input_arg1_data = np.random.randint(0, 9, size=(12800,),
dtype=data_type)
output_data = np.add(input_arg0_data, input_arg1_data)
vm_mod = session.get_executor_from_factory(ex)
vm_rt = relax.VirtualMachine(