This is an automated email from the ASF dual-hosted git repository.
csullivan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new d87fa854b8 [Hexagon] Initial support for meta schedule tuning (#12587)
d87fa854b8 is described below
commit d87fa854b8eb0c8f603d8dc459121eaa1a365e12
Author: masahi <[email protected]>
AuthorDate: Sat Aug 27 02:01:24 2022 +0900
[Hexagon] Initial support for meta schedule tuning (#12587)
Enables AutoTVM-style, template-based tuning for Hexagon.
To run compiled code on Hexagon, we need to use Hexagon `Session` object
https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35
in the metaschedule `RPCRunner`. But for RPC "session", `RPCRunner` expects an
instance of `RPCSession`,
https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32,
to be created and used by various customizable functions.
Since `RPCSession` and Hexagon `Session` have slightly different APIs, we
cannot use `RPCRunner` with customizable functions directly. So I introduced an
alternative implementation of `RPCRunner` for Hexagon.
The test is disabled for the simulator since `HexagonLauncherSimulator` is not
pickle-able due to its `multiprocessing.Process` attribute:
https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614
Output log from tuning `vrmpy` dense (included in the test)
```
ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted
Latency (us) | Trials | Terminated
--------------------------------------------------------------------------------------------------------------
0 | main | 150994944 | 1 | 380.3399 | 397.0000 |
397.0000 | 32 |
--------------------------------------------------------------------------------------------------------------
```
---
apps/hexagon_api/CMakeLists.txt | 2 +
python/tvm/contrib/hexagon/meta_schedule.py | 166 ++++++++++++++++
python/tvm/contrib/hexagon/session.py | 8 +-
python/tvm/contrib/hexagon/tools.py | 7 +
python/tvm/meta_schedule/default_config.py | 6 +-
python/tvm/target/target.py | 5 +
python/tvm/tir/tensor_intrin/__init__.py | 2 +-
python/tvm/tir/tensor_intrin/hexagon.py | 71 +++++++
src/target/target_kind.cc | 1 +
.../contrib/test_hexagon/test_meta_schedule.py | 211 +++++++++++++++++++++
10 files changed, 472 insertions(+), 7 deletions(-)
diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt
index aa971c8753..9a05cf3675 100644
--- a/apps/hexagon_api/CMakeLists.txt
+++ b/apps/hexagon_api/CMakeLists.txt
@@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc
"-DUSE_HEXAGON_RPC=ON"
"-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
"-DUSE_ALTERNATIVE_LINKER=OFF"
+ "-DUSE_RANDOM=ON"
INSTALL_COMMAND ""
BUILD_ALWAYS ON
)
@@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc
"-DUSE_ALTERNATIVE_LINKER=OFF"
"-DUSE_CUSTOM_LOGGING=ON"
"-DUSE_HEXAGON_QHL=ON"
+ "-DUSE_RANDOM=ON"
"${GTEST_FLAG}"
INSTALL_COMMAND ""
BUILD_ALWAYS ON
diff --git a/python/tvm/contrib/hexagon/meta_schedule.py
b/python/tvm/contrib/hexagon/meta_schedule.py
new file mode 100644
index 0000000000..8a4de74b61
--- /dev/null
+++ b/python/tvm/contrib/hexagon/meta_schedule.py
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta schedule tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+ EvaluatorConfig,
+ RunnerInput,
+ RunnerFuture,
+ PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+ default_alloc_argument,
+ default_run_evaluator,
+ RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
+@derived_object
+class HexagonRPCRunner(PyRunner):
+ """RPCRunner for Hexagon. See the documentation of RPCRunner for more
details."""
+
+ def __init__(
+ self,
+ hexagon_launcher: HexagonLauncherRPC,
+ evaluator_config: Optional[EvaluatorConfig] = None,
+ cooldown_sec: float = 0.0,
+ alloc_repeat: int = 1,
+ max_workers: Optional[int] = None,
+ initializer: Optional[Callable[[], None]] = None,
+ ):
+ """
+ Parameters
+ ----------
+ hexagon_launcher : HexagonLauncherRPC
+ The RPC launcher for Hexagon. It is needed for creating
hexagon.Session
+ object inside the worker function.
+ evaluator_config: EvaluatorConfig
+ The evaluator configuration.
+ cooldown_sec: float
+ The cooldown in seconds.
+ alloc_repeat: int
+ The number of times to random fill the allocation.
+ max_workers: Optional[int] = None
+ The maximum number of connections. Defaults to number of logical
CPU cores.
+ initializer: Optional[Callable[[], None]]
+ The initializer function.
+ """
+
+ super().__init__()
+ self.hexagon_launcher = hexagon_launcher
+ self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+ self.cooldown_sec = cooldown_sec
+ self.alloc_repeat = alloc_repeat
+ if max_workers is None:
+ max_workers = cpu_count(logical=True)
+ self.pool = PopenPoolExecutor(
+ max_workers=max_workers,
+ timeout=100,
+ initializer=initializer,
+ )
+
+ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
+ results = []
+ for runner_input in runner_inputs:
+ future = RPCRunnerFuture(
+ future=self.pool.submit(
+ _worker_func,
+ self.hexagon_launcher,
+ self.evaluator_config,
+ self.alloc_repeat,
+ str(runner_input.artifact_path),
+ tuple(arg_info.as_json() for arg_info in
runner_input.args_info),
+ ),
+ timeout_sec=100,
+ )
+ results.append(future)
+ return results
+
+
+def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat,
artifact_path, args_info):
+ with hexagon_launcher.start_session() as session:
+ device = session.device
+ _, remote_path = os.path.split(artifact_path)
+ uploaded = session.upload(artifact_path, remote_path)
+ rt_mod = session.load_module(uploaded)
+ repeated_args = default_alloc_argument(
+ session,
+ device,
+ args_info,
+ alloc_repeat,
+ )
+ costs = default_run_evaluator(
+ session,
+ rt_mod,
+ device,
+ evaluator_config,
+ repeated_args,
+ )
+ return costs
+
+
+def get_hexagon_local_builder():
+ """Return Hexagon-compatible Builder for meta schedule."""
+
+ def export_func(mod):
+ binary_path = export_module(mod, tempfile.mkdtemp())
+ return str(binary_path)
+
+ return LocalBuilder(f_export=export_func)
+
+
+def get_hexagon_rpc_runner(
+ hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100
+):
+ """Return Hexagon-compatible RPC Runner for meta schedule.
+
+ Parameters
+ ----------
+ hexagon_launcher : HexagonLauncherRPC
+ The RPC launcher for Hexagon.
+ number: int
+ The number of times to run this function for taking average.
+ We call these runs as one `repeat` of measurement.
+ repeat: int
+ The number of times to repeat the measurement.
+ In total, the function will be invoked (1 + number x repeat) times,
+ where the first one is warm up and will be discarded.
+ The returned result contains `repeat` costs,
+ each of which is an average of `number` costs.
+ min_repeat_ms: int
+ Minimum repeat time in ms. if the execution latency is too short,
+ increase the number of runs to the given time (in ms) to reduce the
measurement error.
+ """
+ evaluator_config = EvaluatorConfig(
+ number=number,
+ repeat=repeat,
+ min_repeat_ms=min_repeat_ms,
+ enable_cpu_cache_flush=False,
+ )
+
+ return HexagonRPCRunner(
+ hexagon_launcher,
+ evaluator_config,
+ )
diff --git a/python/tvm/contrib/hexagon/session.py
b/python/tvm/contrib/hexagon/session.py
index 0c0bf296df..9308e396b2 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -30,6 +30,7 @@ from tvm.relay.backend.executor_factory import (
AOTExecutorFactoryModule,
GraphExecutorFactoryModule,
)
+from .tools import export_module
class Session:
@@ -110,6 +111,9 @@ class Session:
return self._device
+ def get_function(self, name):
+ return self._rpc.get_function(name)
+
def upload(self, local_path: Union[str, pathlib.Path], remote_filename:
str) -> pathlib.Path:
"""Upload a local file to the remote workspace.
@@ -154,10 +158,8 @@ class Session:
if isinstance(module, tvm.runtime.Module):
with tempfile.TemporaryDirectory() as temp_dir:
- temp_dir = pathlib.Path(temp_dir)
binary_name = "test_binary.so"
- binary_path = temp_dir / binary_name
- module.save(str(binary_path))
+ binary_path = export_module(module, temp_dir, binary_name)
remote_file_path = self.upload(binary_path, binary_name)
else:
remote_file_path = module
diff --git a/python/tvm/contrib/hexagon/tools.py
b/python/tvm/contrib/hexagon/tools.py
index 1aec8c7d56..3f4adb90f6 100644
--- a/python/tvm/contrib/hexagon/tools.py
+++ b/python/tvm/contrib/hexagon/tools.py
@@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path],
files, hexagon_arch: st
cross_compile.output_format = "o"
c_files = [str(file) for file in files]
cross_compile(str(so_name), c_files, options=compile_options + options)
+
+
+def export_module(module, out_dir, binary_name="test_binary.so"):
+ """Export Hexagon shared object to a file."""
+ binary_path = pathlib.Path(out_dir) / binary_name
+ module.save(str(binary_path))
+ return binary_path
diff --git a/python/tvm/meta_schedule/default_config.py
b/python/tvm/meta_schedule/default_config.py
index 0f1f7d3c2c..97cbfc58a6 100644
--- a/python/tvm/meta_schedule/default_config.py
+++ b/python/tvm/meta_schedule/default_config.py
@@ -178,7 +178,7 @@ def schedule_rules( # pylint: disable=redefined-outer-name
return sch_rules()
if sch_rules is not None:
raise TypeError(f"Expected `sch_rules` to be None or callable, but
gets: {sch_rules}")
- if target.kind.name == "llvm":
+ if target.kind.name in ["llvm", "hexagon"]:
return _DefaultLLVM.schedule_rules()
if target.kind.name in ["cuda", "rocm", "vulkan"]:
return _DefaultCUDA.schedule_rules()
@@ -194,7 +194,7 @@ def postproc( # pylint: disable=redefined-outer-name
return postproc()
if postproc is not None:
raise TypeError(f"Expected `postproc` to be None or callable, but
gets: {postproc}")
- if target.kind.name == "llvm":
+ if target.kind.name in ["llvm", "hexagon"]:
return _DefaultLLVM.postprocs()
if target.kind.name in ["cuda", "rocm", "vulkan"]:
return _DefaultCUDA.postprocs()
@@ -212,7 +212,7 @@ def mutator_probs( # pylint: disable=redefined-outer-name
raise TypeError(
f"Expected `mutator_probs` to be None or callable, but gets:
{mutator_probs}"
)
- if target.kind.name == "llvm":
+ if target.kind.name in ["llvm", "hexagon"]:
return _DefaultLLVM.mutator_probs()
if target.kind.name in ["cuda", "rocm", "vulkan"]:
return _DefaultCUDA.mutator_probs()
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index a558fcbeaf..1e9e2e698c 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs):
Whether to use QFloat HVX instructions.
use_ieee_fp : bool (default: False)
Whether to use IEEE HVX instructions
+ num_cores : int (default: 4)
+ The number of HVX threads. This attribute is required by meta
scheduler.
Note: Floating point support in HVX requires LLVM 14+.
"""
@@ -740,6 +742,9 @@ def hexagon(cpu_ver="v66", **kwargs):
args_list = target_str.split() + llvm_str.split()
+ num_cores = config["num_cores"] if "num_cores" in kwargs else 4
+ args_list.append("--num-cores=%d" % num_cores)
+
return Target(" ".join(["hexagon"] + args_list))
diff --git a/python/tvm/tir/tensor_intrin/__init__.py
b/python/tvm/tir/tensor_intrin/__init__.py
index f0725b666e..7e5a26bdeb 100644
--- a/python/tvm/tir/tensor_intrin/__init__.py
+++ b/python/tvm/tir/tensor_intrin/__init__.py
@@ -16,4 +16,4 @@
# under the License.
# pylint: disable=unused-import
"""Intrinsics for tensorization."""
-from . import arm_cpu, cuda, rocm, x86
+from . import arm_cpu, cuda, rocm, x86, hexagon
diff --git a/python/tvm/tir/tensor_intrin/hexagon.py
b/python/tvm/tir/tensor_intrin/hexagon.py
new file mode 100644
index 0000000000..0227312d63
--- /dev/null
+++ b/python/tvm/tir/tensor_intrin/hexagon.py
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name,missing-function-docstring
+"""Intrinsics for Hexagon tensorization."""
+from tvm.script import tir as T
+from .. import TensorIntrin
+
+
[email protected]_func
+def dot_product_32x4_u8u8i32_desc(
+ A: T.Buffer((4,), "uint8", offset_factor=1),
+ B: T.Buffer((32, 4), "uint8", offset_factor=1),
+ C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+ with T.block("root"):
+ T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+ T.writes(C[0:32])
+ for i in T.serial(0, 32):
+ with T.init():
+ C[i] = T.int32(0)
+ for k in T.serial(0, 4):
+ with T.block("update"):
+ vi, vk = T.axis.remap("SR", [i, k])
+ C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk],
"int32")
+
+
[email protected]_func
+def dot_product_32x4_u8u8i32_vrmpy(
+ A: T.Buffer((4,), "uint8", offset_factor=1),
+ B: T.Buffer((32, 4), "uint8", offset_factor=1),
+ C: T.Buffer((32,), "int32", offset_factor=1),
+) -> None:
+ with T.block("root"):
+ T.reads(C[0:32], A[0:4], B[0:32, 0:4])
+ T.writes(C[0:32])
+
+ A_u8x4 = A.vload([0], "uint8x4")
+ A_i32 = T.reinterpret(A_u8x4, dtype="int32")
+
+ B_i8x128 = B.vload([0, 0], dtype="uint8x128")
+ B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32")
+
+ C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin(
+ T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"),
+ T.uint32(3),
+ C[T.ramp(T.int32(0), 1, 32)],
+ B_i32x32,
+ A_i32,
+ dtype="int32x32",
+ )
+
+
+VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy"
+
+TensorIntrin.register(
+ VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc,
dot_product_32x4_u8u8i32_vrmpy
+)
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index e3b2d7b096..a95f55357f 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon)
.add_attr_option<String>("mcpu")
.add_attr_option<String>("mtriple")
.add_attr_option<Array<String>>("llvm-options")
+ .add_attr_option<Integer>("num-cores")
.set_default_keys({"hexagon"});
TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU);
diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py
b/tests/python/contrib/test_hexagon/test_meta_schedule.py
new file mode 100644
index 0000000000..96d18c9b30
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder,
get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
[email protected]_module
+class MatmulModule:
+ @T.prim_func
+ def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint:
disable=no-self-argument
+ T.func_attr({"global_symbol": "main", "tir.noalias": True})
+ A = T.match_buffer(a, (16, 16), "float32")
+ B = T.match_buffer(b, (16, 16), "float32")
+ C = T.match_buffer(c, (16, 16), "float32")
+ for i, j, k in T.grid(16, 16, 16):
+ with T.block("matmul"):
+ vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+ with T.init():
+ C[vi, vj] = 0.0
+ C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
[email protected]_hexagon
+def test_builder_runner(hexagon_launcher):
+ if hexagon_launcher._serial_number == "simulator":
+ pytest.skip(msg="Tuning on simulator not supported.")
+
+ target_hexagon = tvm.target.hexagon("v68", link_params=True)
+ target = tvm.target.Target(target_hexagon, host=target_hexagon)
+ mod = MatmulModule
+
+ builder = get_hexagon_local_builder()
+ runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1,
min_repeat_ms=0)
+
+ (builder_result,) = builder.build([BuilderInput(mod, target)])
+ assert builder_result.artifact_path is not None
+ assert builder_result.error_msg is None
+
+ runner_input = RunnerInput(
+ builder_result.artifact_path,
+ "llvm",
+ [
+ TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+ TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+ TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+ ],
+ )
+
+ # Run the module
+ (runner_future,) = runner.run([runner_input])
+ runner_result = runner_future.result()
+
+ assert runner_result.error_msg is None
+ for result in runner_result.run_secs:
+ if isinstance(result, FloatImm):
+ result = result.value
+ assert isinstance(result, float)
+ assert result >= 0.0
+
+
+def dense(m, n, k):
+ X = te.placeholder((m, k), name="X", dtype="uint8")
+ packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW",
dtype="uint8")
+
+ ak = te.reduce_axis((0, k), name="k")
+ out = te.compute(
+ (m, n),
+ lambda i, j: te.sum(
+ X[i, ak].astype("int32")
+ * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j %
32, ak % 4].astype(
+ "int32"
+ ),
+ axis=ak,
+ ),
+ name="compute",
+ )
+ return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+ a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+ if do_tune:
+ y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+ a_yo, a_yi = sch.split(a_y, factors=y_factors)
+ else:
+ a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+ a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+ sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+ a_xi, a_k = sch.get_loops(block)[-2:]
+ a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+ sch.reorder(a_ko, a_xi, a_ki)
+
+ fused = sch.fuse(a_yo, a_xo)
+
+ sch.parallel(fused)
+
+ dec = sch.decompose_reduction(block, a_ko)
+
+ init_loop = sch.get_loops(dec)[-1]
+ sch.vectorize(init_loop)
+
+ sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+ f = tvm.build(sch.mod["main"], target=target, name="dense")
+ mod = hexagon_session.load_module(f)
+ dev = hexagon_session.device
+
+ a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+ b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+ c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+ packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32,
4)).astype("uint8")
+
+ for r_idx in range(N // 32):
+ for ko in range(K // 4):
+ for s_idx in range(32):
+ for t_idx in range(4):
+ packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 +
s_idx][ko * 4 + t_idx]
+
+ a = tvm.nd.array(a_np, dev)
+ b = tvm.nd.array(packW, dev)
+ c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+ mod(a, b, c)
+ np.testing.assert_equal(c.numpy(), c_np)
+
+ evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+ gflops = (N * M * K) * 2 / 1e9
+ time_ms = evaluator(a, b, c).mean * 1e3
+ print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
[email protected](reason="xgboost not installed on CI")
[email protected]_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+ if hexagon_launcher._serial_number == "simulator":
+ pytest.skip(msg="Tuning on simulator not supported.")
+
+ do_tune = True
+ target_hexagon = tvm.target.hexagon("v68")
+ target = tvm.target.Target(target_hexagon, host=target_hexagon)
+
+ M, N, K = 128, 768, 768
+ workload = te.create_prim_func(dense(M, N, K))
+
+ if not do_tune:
+ ir_module = tvm.IRModule({"main": workload})
+ sch = tvm.tir.Schedule(ir_module)
+ block = sch.get_block("compute")
+ schedule_dense(sch, block, M, do_tune)
+ else:
+ with tempfile.TemporaryDirectory() as work_dir:
+ config = ms.TuneConfig(
+ strategy="replay_trace",
+ num_trials_per_iter=8,
+ max_trials_per_task=8,
+ max_trials_global=8,
+ )
+
+ def schedule_dense_for_tune(sch):
+ block = sch.get_block("compute")
+ return schedule_dense(sch, block, None, True)
+
+ sch = ms.tune_tir(
+ mod=workload,
+ target=target,
+ config=config,
+ work_dir=work_dir,
+ space=ms.space_generator.ScheduleFn(schedule_dense_for_tune),
+ builder=get_hexagon_local_builder(),
+ runner=get_hexagon_rpc_runner(hexagon_launcher, number=10),
+ )
+
+ with hexagon_launcher.start_session() as session:
+ verify_dense(sch, target, M, N, K, session)