This is an automated email from the ASF dual-hosted git repository.

csullivan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new d87fa854b8 [Hexagon] Initial support for meta schedule tuning (#12587)
d87fa854b8 is described below

commit d87fa854b8eb0c8f603d8dc459121eaa1a365e12
Author: masahi <[email protected]>
AuthorDate: Sat Aug 27 02:01:24 2022 +0900

    [Hexagon] Initial support for meta schedule tuning (#12587)
    
    Enables AutoTVM-style, template-based tuning for Hexagon.
    
    To run compiled code on Hexagon, we need to use Hexagon `Session` object 
https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35
 in the metaschedule `RPCRunner`. But for RPC "session", `RPCRunner` expects an 
instance of `RPCSession`, 
https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32,
  to be created and used by various customizable functions.
    
    Since `RPCSession` and Hexagon `Session` have slightly different APIs, we
    cannot use `RPCRunner` with customizable functions directly. So I introduced
    an alternative implementation of `RPCRunner` for Hexagon.
    
    The test is disabled for simulator since `HexagonLauncherSimulator` is not 
pickle-able due to its `multiprocessing.Process` attribute: 
https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614
    
    
    Output log from tuning `vrmpy` dense (included in the test)
    
    ```
     ID | Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated
    --------------------------------------------------------------------------------------------------------------
      0 | main | 150994944 |      1 |       380.3399 |     397.0000 |              397.0000 |     32 |
    --------------------------------------------------------------------------------------------------------------
    ```
---
 apps/hexagon_api/CMakeLists.txt                    |   2 +
 python/tvm/contrib/hexagon/meta_schedule.py        | 166 ++++++++++++++++
 python/tvm/contrib/hexagon/session.py              |   8 +-
 python/tvm/contrib/hexagon/tools.py                |   7 +
 python/tvm/meta_schedule/default_config.py         |   6 +-
 python/tvm/target/target.py                        |   5 +
 python/tvm/tir/tensor_intrin/__init__.py           |   2 +-
 python/tvm/tir/tensor_intrin/hexagon.py            |  71 +++++++
 src/target/target_kind.cc                          |   1 +
 .../contrib/test_hexagon/test_meta_schedule.py     | 211 +++++++++++++++++++++
 10 files changed, 472 insertions(+), 7 deletions(-)

diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index aa971c8753..9a05cf3675 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
     "-DUSE_HEXAGON_RPC=ON"
     "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
     "-DUSE_ALTERNATIVE_LINKER=OFF"
+    "-DUSE_RANDOM=ON"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
 )
@@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
     "-DUSE_ALTERNATIVE_LINKER=OFF"
     "-DUSE_CUSTOM_LOGGING=ON"
     "-DUSE_HEXAGON_QHL=ON"
+    "-DUSE_RANDOM=ON"
     "${GTEST_FLAG}"
   INSTALL_COMMAND ""
   BUILD_ALWAYS ON
diff --git a/python/tvm/contrib/hexagon/meta_schedule.py 
b/python/tvm/contrib/hexagon/meta_schedule.py
new file mode 100644
index 0000000000..8a4de74b61
--- /dev/null
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta schedule tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+    EvaluatorConfig,
+    RunnerInput,
+    RunnerFuture,
+    PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+    default_alloc_argument,
+    default_run_evaluator,
+    RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
@derived_object
class HexagonRPCRunner(PyRunner):
    """A meta schedule runner that measures candidates on Hexagon hardware.

    Unlike the generic ``RPCRunner``, this runner reaches the device through a
    ``HexagonLauncherRPC``, which creates a Hexagon ``Session`` inside each
    worker process. See the documentation of ``RPCRunner`` for more details on
    the runner protocol.
    """

    def __init__(
        self,
        hexagon_launcher: HexagonLauncherRPC,
        evaluator_config: Optional[EvaluatorConfig] = None,
        cooldown_sec: float = 0.0,
        alloc_repeat: int = 1,
        max_workers: Optional[int] = None,
        initializer: Optional[Callable[[], None]] = None,
    ):
        """
        Parameters
        ----------
        hexagon_launcher : HexagonLauncherRPC
            The RPC launcher for Hexagon. It is needed for creating a
            hexagon.Session object inside the worker function.
        evaluator_config : EvaluatorConfig
            The evaluator configuration.
        cooldown_sec : float
            The cooldown in seconds.
        alloc_repeat : int
            The number of times to random fill the allocation.
        max_workers : Optional[int]
            The maximum number of connections. Defaults to the number of
            logical CPU cores.
        initializer : Optional[Callable[[], None]]
            The initializer function run in each worker process.
        """
        super().__init__()
        self.hexagon_launcher = hexagon_launcher
        self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
        self.cooldown_sec = cooldown_sec
        self.alloc_repeat = alloc_repeat
        self.pool = PopenPoolExecutor(
            max_workers=cpu_count(logical=True) if max_workers is None else max_workers,
            timeout=100,
            initializer=initializer,
        )

    def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
        """Submit one measurement job per runner input and return the futures."""
        futures = []
        for item in runner_inputs:
            # Argument metadata must be JSON-serializable to cross the
            # process boundary of the popen pool.
            args_json = tuple(info.as_json() for info in item.args_info)
            job = self.pool.submit(
                _worker_func,
                self.hexagon_launcher,
                self.evaluator_config,
                self.alloc_repeat,
                str(item.artifact_path),
                args_json,
            )
            futures.append(RPCRunnerFuture(future=job, timeout_sec=100))
        return futures
+
+
def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
    """Measure one build artifact on Hexagon; runs inside a popen-pool worker.

    Opens a fresh Hexagon session, uploads and loads the artifact, allocates
    the arguments, and returns the measured costs from the evaluator.
    """
    with hexagon_launcher.start_session() as session:
        dev = session.device
        # Keep only the file name for the remote workspace path.
        remote_name = os.path.basename(artifact_path)
        remote_binary = session.upload(artifact_path, remote_name)
        loaded_mod = session.load_module(remote_binary)
        repeated_args = default_alloc_argument(session, dev, args_info, alloc_repeat)
        costs = default_run_evaluator(session, loaded_mod, dev, evaluator_config, repeated_args)
    return costs
+
+
def get_hexagon_local_builder():
    """Return a Hexagon-compatible ``LocalBuilder`` for meta schedule.

    The builder exports each built module into a fresh temporary directory so
    the artifact path stays valid after the builder process exits.
    """

    def export_func(mod):
        return str(export_module(mod, tempfile.mkdtemp()))

    return LocalBuilder(f_export=export_func)
+
+
def get_hexagon_rpc_runner(
    hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100
):
    """Return a Hexagon-compatible RPC runner for meta schedule.

    Parameters
    ----------
    hexagon_launcher : HexagonLauncherRPC
        The RPC launcher for Hexagon.
    number : int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.
    repeat : int
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.
    min_repeat_ms : int
        Minimum repeat time in ms. If the execution latency is too short,
        increase the number of runs to the given time (in ms) to reduce the
        measurement error.
    """
    return HexagonRPCRunner(
        hexagon_launcher,
        EvaluatorConfig(
            number=number,
            repeat=repeat,
            min_repeat_ms=min_repeat_ms,
            # Cache flushing between runs is not used on Hexagon.
            enable_cpu_cache_flush=False,
        ),
    )
diff --git a/python/tvm/contrib/hexagon/session.py 
b/python/tvm/contrib/hexagon/session.py
index 0c0bf296df..9308e396b2 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -30,6 +30,7 @@ from tvm.relay.backend.executor_factory import (
     AOTExecutorFactoryModule,
     GraphExecutorFactoryModule,
 )
+from .tools import export_module
 
 
 class Session:
@@ -110,6 +111,9 @@ class Session:
 
         return self._device
 
    def get_function(self, name):
        """Look up a packed function by name on the underlying RPC session.

        Parameters
        ----------
        name : str
            The name of the remote function to fetch.

        Returns
        -------
        The packed function retrieved from the RPC endpoint.
        """
        return self._rpc.get_function(name)
     def upload(self, local_path: Union[str, pathlib.Path], remote_filename: 
str) -> pathlib.Path:
         """Upload a local file to the remote workspace.
 
@@ -154,10 +158,8 @@ class Session:
 
         if isinstance(module, tvm.runtime.Module):
             with tempfile.TemporaryDirectory() as temp_dir:
-                temp_dir = pathlib.Path(temp_dir)
                 binary_name = "test_binary.so"
-                binary_path = temp_dir / binary_name
-                module.save(str(binary_path))
+                binary_path = export_module(module, temp_dir, binary_name)
                 remote_file_path = self.upload(binary_path, binary_name)
         else:
             remote_file_path = module
diff --git a/python/tvm/contrib/hexagon/tools.py 
b/python/tvm/contrib/hexagon/tools.py
index 1aec8c7d56..3f4adb90f6 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], 
files, hexagon_arch: st
     cross_compile.output_format = "o"
     c_files = [str(file) for file in files]
     cross_compile(str(so_name), c_files, options=compile_options + options)
+
+
def export_module(module, out_dir, binary_name="test_binary.so"):
    """Save a Hexagon shared object under ``out_dir`` and return its path.

    Parameters
    ----------
    module
        A runtime module exposing ``save(path)``.
    out_dir : str or pathlib.Path
        Directory in which to place the binary.
    binary_name : str
        File name for the saved binary.

    Returns
    -------
    pathlib.Path
        The full path of the written binary.
    """
    out_path = pathlib.Path(out_dir) / binary_name
    module.save(str(out_path))
    return out_path
diff --git a/python/tvm/meta_schedule/default_config.py 
b/python/tvm/meta_schedule/default_config.py
index 0f1f7d3c2c..97cbfc58a6 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -178,7 +178,7 @@ def schedule_rules(  # pylint: disable=redefined-outer-name
         return sch_rules()
     if sch_rules is not None:
         raise TypeError(f"Expected `sch_rules` to be None or callable, but 
gets: {sch_rules}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.schedule_rules()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.schedule_rules()
@@ -194,7 +194,7 @@ def postproc(  # pylint: disable=redefined-outer-name
         return postproc()
     if postproc is not None:
         raise TypeError(f"Expected `postproc` to be None or callable, but 
gets: {postproc}")
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.postprocs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.postprocs()
@@ -212,7 +212,7 @@ def mutator_probs(  # pylint: disable=redefined-outer-name
         raise TypeError(
             f"Expected `mutator_probs` to be None or callable, but gets: 
{mutator_probs}"
         )
-    if target.kind.name == "llvm":
+    if target.kind.name in ["llvm", "hexagon"]:
         return _DefaultLLVM.mutator_probs()
     if target.kind.name in ["cuda", "rocm", "vulkan"]:
         return _DefaultCUDA.mutator_probs()
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index a558fcbeaf..1e9e2e698c 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs):
         Whether to use QFloat HVX instructions.
     use_ieee_fp : bool (default: False)
         Whether to use IEEE HVX instructions
+    num_cores : int (default: 4)
+        The number of HVX threads. This attribute is required by meta 
scheduler.
 
     Note: Floating point support in HVX requires LLVM 14+.
     """
@@ -740,6 +742,9 @@ def hexagon(cpu_ver="v66", **kwargs):
 
     args_list = target_str.split() + llvm_str.split()
 
+    num_cores = config["num_cores"] if "num_cores" in kwargs else 4
+    args_list.append("--num-cores=%d" % num_cores)
+
     return Target(" ".join(["hexagon"] + args_list))
 
 
diff --git a/python/tvm/tir/tensor_intrin/__init__.py 
b/python/tvm/tir/tensor_intrin/__init__.py
index f0725b666e..7e5a26bdeb 100644
--- a/python/tvm/tir/tensor_intrin/__init__.py
+++ b/python/tvm/tir/tensor_intrin/__init__.py
@@ -16,4 +16,4 @@
 # under the License.
 # pylint: disable=unused-import
 """Intrinsics for tensorization."""
-from . import arm_cpu, cuda, rocm, x86
+from . import arm_cpu, cuda, rocm, x86, hexagon
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py 
b/python/tvm/tir/tensor_intrin/hexagon.py
new file mode 100644
index 0000000000..0227312d63
--- /dev/null
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring
+"""Intrinsics for Hexagon tensorization."""
+from tvm.script import tir as T
+from .. import TensorIntrin
+
+
# Tensor intrinsic *description*: the canonical TIR semantics that the vrmpy
# implementation below must match. For each of the 32 output lanes it computes
# a dot product of the 4-element vector A with row i of B, accumulated into
# the int32 vector C: C[i] += sum_k A[k] * B[i, k].
@T.prim_func
def dot_product_32x4_u8u8i32_desc(
    A: T.Buffer((4,), "uint8", offset_factor=1),
    B: T.Buffer((32, 4), "uint8", offset_factor=1),
    C: T.Buffer((32,), "int32", offset_factor=1),
) -> None:
    with T.block("root"):
        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
        T.writes(C[0:32])
        for i in T.serial(0, 32):
            with T.init():
                C[i] = T.int32(0)
            for k in T.serial(0, 4):
                with T.block("update"):
                    vi, vk = T.axis.remap("SR", [i, k])
                    # Widen the uint8 operands to int32 before multiply-accumulate.
                    C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
+
+
# Tensor intrinsic *implementation*: realizes the same 32x4 dot product with a
# single HVX vrmpy accumulate instruction (llvm.hexagon.V6.vrmpyub.acc.128B).
@T.prim_func
def dot_product_32x4_u8u8i32_vrmpy(
    A: T.Buffer((4,), "uint8", offset_factor=1),
    B: T.Buffer((32, 4), "uint8", offset_factor=1),
    C: T.Buffer((32,), "int32", offset_factor=1),
) -> None:
    with T.block("root"):
        T.reads(C[0:32], A[0:4], B[0:32, 0:4])
        T.writes(C[0:32])

        # Bit-cast the four uint8 activations into a single int32 scalar.
        A_u8x4 = A.vload([0], "uint8x4")
        A_i32 = T.reinterpret(A_u8x4, dtype="int32")

        # Bit-cast the 32x4 uint8 weight tile into a 32-lane int32 vector.
        B_i8x128 = B.vload([0, 0], dtype="uint8x128")
        B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")

        # Accumulating intrinsic call: C[0:32] += vrmpy(B, A). The T.uint32(3)
        # argument gives the number of intrinsic operands that follow
        # (accumulator, weights, activations).
        C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
            T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
            T.uint32(3),
            C[T.ramp(T.int32(0), 1, 32)],
            B_i32x32,
            A_i32,
            dtype="int32x32",
        )
+
+
# Public name under which the (description, implementation) pair is registered;
# schedules pass this string to sch.tensorize.
VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"

TensorIntrin.register(
    VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy
)
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index e3b2d7b096..a95f55357f 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
     .add_attr_option<String>("mcpu")
     .add_attr_option<String>("mtriple")
     .add_attr_option<Array<String>>("llvm-options")
+    .add_attr_option<Integer>("num-cores")
     .set_default_keys({"hexagon"});
 
 TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py 
b/tests/python/contrib/test_hexagon/test_meta_schedule.py
new file mode 100644
index 0000000000..96d18c9b30
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, 
get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
# Minimal 16x16x16 float32 matmul module used as the build/run smoke-test
# workload in test_builder_runner below.
@tvm.script.ir_module
class MatmulModule:
    @T.prim_func
    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        A = T.match_buffer(a, (16, 16), "float32")
        B = T.match_buffer(b, (16, 16), "float32")
        C = T.match_buffer(c, (16, 16), "float32")
        for i, j, k in T.grid(16, 16, 16):
            with T.block("matmul"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = 0.0
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
@tvm.testing.requires_hexagon
def test_builder_runner(hexagon_launcher):
    """End-to-end smoke test of the Hexagon builder/runner pair.

    Builds ``MatmulModule`` with the local builder, runs the produced artifact
    on the device through ``HexagonRPCRunner``, and checks that timing results
    come back without errors.
    """
    # NOTE(review): relies on the launcher fixture's private _serial_number
    # attribute to detect the simulator.
    if hexagon_launcher._serial_number == "simulator":
        pytest.skip(msg="Tuning on simulator not supported.")

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    target = tvm.target.Target(target_hexagon, host=target_hexagon)
    mod = MatmulModule

    builder = get_hexagon_local_builder()
    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)

    # Build must succeed and produce an artifact on disk.
    (builder_result,) = builder.build([BuilderInput(mod, target)])
    assert builder_result.artifact_path is not None
    assert builder_result.error_msg is None

    runner_input = RunnerInput(
        builder_result.artifact_path,
        "llvm",  # presumably the device-type tag RunnerInput expects; execution still goes through the Hexagon session — confirm
        [
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
        ],
    )

    # Run the module and block on the single resulting future.
    (runner_future,) = runner.run([runner_input])
    runner_result = runner_future.result()

    # Every measured cost must be a non-negative float (FloatImm is unwrapped).
    assert runner_result.error_msg is None
    for result in runner_result.run_secs:
        if isinstance(result, FloatImm):
            result = result.value
        assert isinstance(result, float)
        assert result >= 0.0
+
+
def dense(m, n, k):
    """Uint8 dense compute with pre-packed weights for vrmpy tensorization.

    The weight tensor is laid out as (n // 32, k // 4, 32, 4) so that each
    (32, 4) tile matches the vrmpy intrinsic's operand shape. Returns the
    placeholder and output tensors ``[X, packedW, out]``.
    """
    X = te.placeholder((m, k), name="X", dtype="uint8")
    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")

    ak = te.reduce_axis((0, k), name="k")

    def _compute(i, j):
        # packedW[j // 32, ak // 4, j % 32, ak % 4] holds the original W[j, ak].
        w_elem = packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4]
        return te.sum(X[i, ak].astype("int32") * w_elem.astype("int32"), axis=ak)

    out = te.compute((m, n), _compute, name="compute")
    return [X, packedW, out]
+
+
def schedule_dense(sch, block, M, do_tune):
    """Schedule the dense ``compute`` block for Hexagon vrmpy tensorization.

    Parameters
    ----------
    sch : tir.Schedule
        The schedule being mutated in place.
    block
        The block handle for the "compute" block.
    M
        Row count of the output; only consulted when ``do_tune`` is False.
        May be None on the tuning path.
    do_tune : bool
        When True, the row split is left as a tunable ``sample_perfect_tile``
        decision; when False, a fixed split of min(M, 32) is applied.
    """
    a_y, a_x, _ = sch.get_loops(block)[-3:]

    if do_tune:
        # Let meta schedule sample the two-way split of the row dimension.
        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
        a_yo, a_yi = sch.split(a_y, factors=y_factors)
    else:
        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])

    # Split columns by 32 to match the 32-lane vrmpy output.
    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
    sch.reorder(a_yo, a_xo, a_yi, a_xi)

    # Re-fetch loop handles after the splits, then split the reduction by 4
    # (the vrmpy reduction width) and move it outside the inner spatial loop.
    a_xi, a_k = sch.get_loops(block)[-2:]
    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
    sch.reorder(a_ko, a_xi, a_ki)

    fused = sch.fuse(a_yo, a_xo)

    sch.parallel(fused)

    # Split the init from the update so the (32, 4) update nest is a clean
    # match for the intrinsic, and vectorize the standalone init loop.
    dec = sch.decompose_reduction(block, a_ko)

    init_loop = sch.get_loops(dec)[-1]
    sch.vectorize(init_loop)

    # Map the inner 32x4 update nest onto the vrmpy tensor intrinsic.
    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
def verify_dense(sch, target, M, N, K, hexagon_session):
    """Build the scheduled dense workload, check it against numpy, and time it.

    Parameters
    ----------
    sch : tir.Schedule
        The finished schedule whose ``main`` function is built.
    target : tvm.target.Target
        The Hexagon build target.
    M, N, K : int
        Dense dimensions; N and K must be multiples of 32 and 4 respectively
        to match the packed-weight layout.
    hexagon_session
        An open Hexagon session used to load and execute the module.
    """
    f = tvm.build(sch.mod["main"], target=target, name="dense")
    mod = hexagon_session.load_module(f)
    dev = hexagon_session.device

    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
    # int32 reference result computed on the host.
    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))

    # Pack b_np into the (N // 32, K // 4, 32, 4) layout expected by the
    # kernel. Vectorized equivalent of the elementwise definition
    #   packW[r, ko, s, t] = b_np[r * 32 + s, ko * 4 + t]
    # (the original filled a random buffer and then overwrote every entry
    # with four nested Python loops).
    packW = b_np.reshape(N // 32, 32, K // 4, 4).transpose(0, 2, 1, 3).copy()

    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(packW, dev)
    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)

    mod(a, b, c)
    np.testing.assert_equal(c.numpy(), c_np)

    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
    gflops = (N * M * K) * 2 / 1e9
    time_ms = evaluator(a, b, c).mean * 1e3
    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
@pytest.mark.skip(reason="xgboost not installed on CI")
@tvm.testing.requires_hexagon
def test_vrmpy_dense(hexagon_launcher):
    """Tune the vrmpy dense workload with meta schedule and verify on device.

    With ``do_tune`` set, runs replay-trace tuning over the tunable split in
    ``schedule_dense``; otherwise applies the manual schedule directly. Either
    way, the resulting schedule is verified against numpy on hardware.
    """
    if hexagon_launcher._serial_number == "simulator":
        pytest.skip(msg="Tuning on simulator not supported.")

    do_tune = True
    target_hexagon = tvm.target.hexagon("v68")
    target = tvm.target.Target(target_hexagon, host=target_hexagon)

    M, N, K = 128, 768, 768
    workload = te.create_prim_func(dense(M, N, K))

    if not do_tune:
        # Manual path: apply schedule_dense with its fixed split factors.
        ir_module = tvm.IRModule({"main": workload})
        sch = tvm.tir.Schedule(ir_module)
        block = sch.get_block("compute")
        schedule_dense(sch, block, M, do_tune)
    else:
        with tempfile.TemporaryDirectory() as work_dir:
            # 8 trials of replay_trace keeps the tuning run short for a test.
            config = ms.TuneConfig(
                strategy="replay_trace",
                num_trials_per_iter=8,
                max_trials_per_task=8,
                max_trials_global=8,
            )

            def schedule_dense_for_tune(sch):
                # M is unused on the tuning path (do_tune=True samples the split).
                block = sch.get_block("compute")
                return schedule_dense(sch, block, None, True)

            sch = ms.tune_tir(
                mod=workload,
                target=target,
                config=config,
                work_dir=work_dir,
                space=ms.space_generator.ScheduleFn(schedule_dense_for_tune),
                builder=get_hexagon_local_builder(),
                runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
            )

    with hexagon_launcher.start_session() as session:
        verify_dense(sch, target, M, N, K, session)

Reply via email to