(tvm) branch main updated: [VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)

syfeng Fri, 15 Mar 2024 01:08:48 -0700

This is an automated email from the ASF dual-hosted git repository.

syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 94866f769a [VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)
94866f769a is described below

commit 94866f769acfc4582607a2a0e818de263c9a1a60
Author: Abhikrant Sharma <[email protected]>
AuthorDate: Fri Mar 15 13:36:17 2024 +0530

    [VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)
    
    * [VM] [Hexagon] Add buffers to dma_wait builtin
    While introducing dma operations at graph level, relax KillAfterLastUse 
pass introduces kill_tensor operation after dma_copy.
    This leads to memory being deallocated when asynchronous copy operation is 
in progress.
    Hence, moving the input/output buffers to dma_wait to ensure kill_tensor is
    introduced after dma_wait at the graph level.
    Also, the logic for size calculation is updated to use GetDataSize function.
    The test case is updated to use offsets instead of allocating different 
storage in VTCM.
    
    * Fix review comments
---
 src/runtime/relax_vm/hexagon/builtin.cc            | 12 +--
 .../contrib/test_hexagon/test_dma_builtin.py       | 86 +++++++++-------------
 2 files changed, 39 insertions(+), 59 deletions(-)

diff --git a/src/runtime/relax_vm/hexagon/builtin.cc 
b/src/runtime/relax_vm/hexagon/builtin.cc
index d18c434193..b32d0e14aa 100644
--- a/src/runtime/relax_vm/hexagon/builtin.cc
+++ b/src/runtime/relax_vm/hexagon/builtin.cc
@@ -22,6 +22,7 @@
  * \brief The hexagon graph related builtin functions for Relax virtual 
machine.
  */
 
+#include <tvm/runtime/device_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/relax_vm/vm.h>
@@ -38,12 +39,10 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_copy")
       const DLTensor* sptr = src_arr.operator->();
       void* dst = dptr->data;
       void* src = sptr->data;
-      uint32_t size = 1;
       int ret = DMA_RETRY;
-      for (int i = 0; i < dptr->ndim; i++) {
-        size = size * dptr->shape[i];
-      }
-      size = size * sizeof(dptr->dtype);
+
+      CHECK_EQ(GetDataSize(*dptr), GetDataSize(*sptr));
+      auto size = GetDataSize(*dptr);
       ICHECK(size > 0);
       do {
         ret = 
tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Copy(
@@ -53,7 +52,8 @@ TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_copy")
     });
 
 TVM_REGISTER_GLOBAL("vm.builtin.hexagon.dma_wait")
-    .set_body_typed([](TVMArgValue vm_ptr, int queue_id, int inflight_dma) {
+    .set_body_typed([](TVMArgValue vm_ptr, int queue_id, int inflight_dma,
+                       [[maybe_unused]] NDArray src_arr, [[maybe_unused]] 
NDArray dst_arr) {
       ICHECK(inflight_dma >= 0);
       
tvm::runtime::hexagon::HexagonDeviceAPI::Global()->UserDMA()->Wait(queue_id, 
inflight_dma);
     });
diff --git a/tests/python/contrib/test_hexagon/test_dma_builtin.py 
b/tests/python/contrib/test_hexagon/test_dma_builtin.py
index 11f4d2d540..af82c2b55a 100644
--- a/tests/python/contrib/test_hexagon/test_dma_builtin.py
+++ b/tests/python/contrib/test_hexagon/test_dma_builtin.py
@@ -31,15 +31,17 @@ import tvm.testing
 
 # pylint: disable=invalid-name, missing-class-docstring, 
missing-function-docstring, no-self-argument
 
+data_type = "int32"
+
 
 @I.ir_module
 class Module_1D:
     @T.prim_func
     def compute_add_in_vtcm(a: T.handle, b: T.handle, c: T.handle) -> None:
         m = T.int32()
-        A = T.match_buffer(a, (m,), "int32", scope="global.vtcm")
-        B = T.match_buffer(b, (m,), "int32", scope="global.vtcm")
-        C = T.match_buffer(c, (m,), "int32", scope="global.vtcm")
+        A = T.match_buffer(a, (m,), data_type, scope="global.vtcm")
+        B = T.match_buffer(b, (m,), data_type, scope="global.vtcm")
+        C = T.match_buffer(c, (m,), data_type, scope="global.vtcm")
         for ax0 in T.grid(m):
             with T.block("T_add"):
                 v_ax0 = T.axis.remap("S", [ax0])
@@ -49,98 +51,78 @@ class Module_1D:
 
     @R.function
     def main(
-        x: R.Tensor((12800,), "int32"),
-        y: R.Tensor((12800,), "int32"),
-    ) -> R.Tensor((12800,), "int32"):
+        x: R.Tensor((12800,), data_type),
+        y: R.Tensor((12800,), data_type),
+    ) -> R.Tensor((12800,), data_type):
         cls = Module_1D
-        vtcm_obj_a: R.Object = R.vm.alloc_storage(
+        vtcm_obj: R.Object = R.vm.alloc_storage(
             R.shape(
                 [
-                    12800,
+                    3 * 12800,  # 3 = 2 inputs + 1 output
                 ]
             ),
             runtime_device_index=0,
-            dtype="int32",
+            dtype=data_type,
             storage_scope="global.vtcm",
         )
-        a: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
-            vtcm_obj_a,
+        a: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+            vtcm_obj,
             offset=0,
             shape=R.shape(
                 [
                     12800,
                 ]
             ),
-            dtype="int32",
+            dtype=data_type,
         )
         __: R.Tuple = R.call_builtin_with_ctx(
             "vm.builtin.hexagon.dma_copy",
             [x, a, 0, True],
             sinfo_args=[],
         )
-        vtcm_obj_b: R.Object = R.vm.alloc_storage(
-            R.shape(
-                [
-                    12800,
-                ]
-            ),
-            runtime_device_index=0,
-            dtype="int32",
-            storage_scope="global.vtcm",
-        )
-        b: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
-            vtcm_obj_b,
-            offset=0,
+        b: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+            vtcm_obj,
+            offset=12800 * 4,
             shape=R.shape(
                 [
                     12800,
                 ]
             ),
-            dtype="int32",
+            dtype=data_type,
         )
         __: R.Tuple = R.call_builtin_with_ctx(
             "vm.builtin.hexagon.dma_copy",
             [y, b, 1, True],
             sinfo_args=[],
         )
-        vtcm_obj_c: R.Object = R.vm.alloc_storage(
-            R.shape(
-                [
-                    12800,
-                ]
-            ),
-            runtime_device_index=0,
-            dtype="int32",
-            storage_scope="global.vtcm",
-        )
-        c: R.Tensor([12800,], dtype="int32") = R.vm.alloc_tensor(
-            vtcm_obj_c,
-            offset=0,
+        c: R.Tensor([12800,], dtype=data_type) = R.vm.alloc_tensor(
+            vtcm_obj,
+            offset=2 * 12800 * 4,
             shape=R.shape(
                 [
                     12800,
                 ]
             ),
-            dtype="int32",
+            dtype=data_type,
         )
         __: R.Tuple = R.call_builtin_with_ctx(
             "vm.builtin.hexagon.dma_wait",
-            [0, 2],
+            [0, 2, x, a],
             sinfo_args=[],
         )
         __: R.Tuple = R.call_builtin_with_ctx(
             "vm.builtin.hexagon.dma_wait",
-            [1, 1],
+            [1, 1, y, b],
             sinfo_args=[],
         )
         ___: R.Tuple = cls.compute_add_in_vtcm(a, b, c)
-        ret_val: R.Tensor((12800,), dtype="int32") = R.builtin.alloc_tensor(
+        ret_val: R.Tensor((12800,), dtype=data_type) = R.builtin.alloc_tensor(
             R.shape(
                 [
                     12800,
                 ]
             ),
-            R.dtype("int32"),
+            R.dtype(data_type),
             R.prim_value(0),
         )
         __: R.Tuple = R.call_builtin_with_ctx(
@@ -148,18 +130,16 @@ class Module_1D:
             [c, ret_val, 0, True],
             sinfo_args=[],
         )
-        _t3: R.Tuple = R.vm.kill_object(vtcm_obj_a)
-        _t4: R.Tuple = R.vm.kill_object(vtcm_obj_b)
-        _t6: R.Tuple = R.vm.kill_object(a)
-        _t7: R.Tuple = R.vm.kill_object(b)
         __: R.Tuple = R.call_builtin_with_ctx(
             "vm.builtin.hexagon.dma_wait",
-            [0, 1],
+            [0, 1, c, ret_val],
             sinfo_args=[],
         )
-        _t5: R.Tuple = R.vm.kill_object(vtcm_obj_c)
+        _t3: R.Tuple = R.vm.kill_object(vtcm_obj)
+        _t6: R.Tuple = R.vm.kill_object(a)
+        _t7: R.Tuple = R.vm.kill_object(b)
         _t8: R.Tuple = R.vm.kill_object(c)
-        lv: R.Tensor((12800,), dtype="int32") = ret_val
+        lv: R.Tensor((12800,), dtype=data_type) = ret_val
         return lv
 
 
@@ -177,8 +157,8 @@ class TestDMACopyWait:
             ex = relax.build(mod=module, target=target, exec_mode=mode)
         with hexagon_launcher.create_session() as session:
             dev = session.device
-            input_arg0_data = np.random.randint(0, 9, size=(12800,), 
dtype="int32")
-            input_arg1_data = np.random.randint(0, 9, size=(12800,), 
dtype="int32")
+            input_arg0_data = np.random.randint(0, 9, size=(12800,), 
dtype=data_type)
+            input_arg1_data = np.random.randint(0, 9, size=(12800,), 
dtype=data_type)
             output_data = np.add(input_arg0_data, input_arg1_data)
             vm_mod = session.get_executor_from_factory(ex)
             vm_rt = relax.VirtualMachine(

(tvm) branch main updated: [VM] [Hexagon] Add buffers to `dma_wait` builtin (#16706)

Reply via email to