This is an automated email from the ASF dual-hosted git repository.
ekalda pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 5dc25afc87 [microNPU][ETHOSU] Add Vela's logic to select configuration
block (#15186)
5dc25afc87 is described below
commit 5dc25afc873fd2c1cecabbf87515f29d459986e2
Author: Aleksei-grovety <[email protected]>
AuthorDate: Tue Jul 4 11:34:11 2023 +0300
[microNPU][ETHOSU] Add Vela's logic to select configuration block (#15186)
For the case when cascader is enabled, the logic of choosing the optimal
configuration block from TVM will be used in other cases, the Vela's logic will
be used except the cases when dev_force_block_config option is specified.
---
.../tvm/relay/backend/contrib/ethosu/vela_api.py | 86 +++++++++++++++++++++-
tests/python/contrib/test_ethosu/test_networks.py | 4 +-
.../contrib/test_ethosu/test_replace_conv2d.py | 14 ++--
tests/python/contrib/test_ethosu/test_vela_api.py | 50 +++++++++++++
4 files changed, 145 insertions(+), 9 deletions(-)
diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py
b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
index 45c232a461..22f5cdd83b 100644
--- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py
+++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
@@ -27,7 +27,12 @@ from typing import List, Optional, Tuple
import numpy as np # type: ignore
from ethosu.vela import api as vapi # type: ignore
+from ethosu.vela.architecture_allocator import find_block_config
from ethosu.vela.architecture_features import Accelerator, create_default_arch
+from ethosu.vela.operation import NpuBlockType
+from ethosu.vela.register_command_stream_generator import resampling_mode_map
+from ethosu.vela.register_command_stream_util import to_kernel
+from ethosu.vela.shape4d import Shape4D
import tvm
from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
@@ -56,6 +61,9 @@ def get_optimal_block_config(
Therefore, we need to pick an optimal block configuration considering
bandwidth
to bring IFM blocks and the number of OFM block computes need to happen
to cover the OFM as indicated by the npu op.
+ For the case when cascader is enabled, the logic of choosing the optimal
configuration block
+ from TVM will be used in other cases, the Vela's logic will be used except
+ the cases when dev_force_block_config option is specified.
Parameters
----------
@@ -73,8 +81,82 @@ def get_optimal_block_config(
if options and options.dev_force_block_config:
block_config = [int(v) for v in
options.dev_force_block_config.split("x")]
return vapi.NpuShape3D(height=block_config[0], width=block_config[1],
depth=block_config[2])
- all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
- return _get_optimal_block_config(all_valid_block_configs)
+ elif options and options.enable_cascader:
+ all_valid_block_configs = vapi.npu_find_block_configs(npu_op,
accel_config)
+ return _get_optimal_block_config(all_valid_block_configs)
+ else:
+ return _find_block_config_with_vela(npu_op, accel_config)
+
+
+def _find_block_config_with_vela(
+ npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
+) -> vapi.NpuShape3D:
+ """An internal function to get block config using Vela's logic.
+
+ Parameters
+ ----------
+ npu_op : ethosu.vela.api.NpuOperation
+ The NPU operation
+ accelerator : ethosu.vela.api.NpuAccelerator
+ The NPU accelerator
+
+ Returns
+ -------
+ ethosu.vela.api.NpuShape3D :
+ The optimal block config for the operator
+ """
+ if isinstance(npu_op, vapi.NpuConv2DOperation):
+ block_type = NpuBlockType.ConvolutionMxN
+ elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
+ block_type = NpuBlockType.ConvolutionDepthWise
+ elif isinstance(npu_op, vapi.NpuPoolingOperation):
+ block_type = (
+ NpuBlockType.ReduceSum
+ if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
+ else NpuBlockType.Pooling
+ )
+ elif isinstance(npu_op, vapi.NpuElementWiseOperation):
+ block_type = NpuBlockType.ElementWise
+ else:
+ assert 0, "Unsupported operation"
+
+ ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width,
npu_op.ifm.shape.depth)
+ ifm2_shape = None
+ if npu_op.ifm2:
+ ifm2_shape = Shape4D(
+ 1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width,
npu_op.ifm2.shape.depth
+ )
+ ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width,
npu_op.ofm.shape.depth)
+
+ ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
+ ifm_bits = npu_op.ifm.data_type.size_in_bits()
+ lut_banks = 0
+ if npu_op.activation:
+ lut_banks = 2 if npu_op.activation.op_type ==
vapi.NpuActivationOp.TABLE_LOOKUP else 0
+
+ has_scaling = True
+ for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
+ if tensor and tensor.quantization is None:
+ has_scaling = False
+ break
+
+ arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))
+
+ cfg = find_block_config(
+ arch,
+ block_type,
+ ofm_shape,
+ ifm_shape,
+ ifm2_shape,
+ npu_op.ifm2_scalar is not None,
+ ifm_bits,
+ to_kernel(npu_op.kernel),
+ lut_banks,
+ has_scaling,
+ ifm_resampling_mode,
+ )
+ assert cfg is not None, f"There is no configuration suitable for
{accelerator}"
+ return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width,
cfg.ofm_block.depth)
def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D])
-> vapi.NpuShape3D:
diff --git a/tests/python/contrib/test_ethosu/test_networks.py
b/tests/python/contrib/test_ethosu/test_networks.py
index a5490cbe2b..308c06f504 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -44,9 +44,9 @@ MOBILENET_V2_URL = (
@pytest.mark.parametrize(
"accel_type, model_url, workspace_size",
[
- ("ethos-u65-256", MOBILENET_V1_URL, 2338848),
+ ("ethos-u65-256", MOBILENET_V1_URL, 2338864),
("ethos-u65-256", MOBILENET_V2_URL, 2264320),
- ("ethos-u55-256", MOBILENET_V1_URL, 1793376),
+ ("ethos-u55-256", MOBILENET_V1_URL, 1793392),
("ethos-u55-256", MOBILENET_V2_URL, 2217152),
("ethos-u55-128", MOBILENET_V2_URL, 2217152),
("ethos-u55-64", MOBILENET_V2_URL, 2217152),
diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 6bcea7008c..32d1303e12 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -633,11 +633,15 @@ def test_conv2d_double_cascade(trial):
reference_mod = trial[0]
params = trial[1:]
- func = _get_func(*params[:-1])
- mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
- script = mod.script()
- mod = tvm.script.from_source(script)
- tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
+ config = {
+ "enable_cascader": True,
+ }
+ with tvm.transform.PassContext(opt_level=3,
config={"relay.ext.ethos-u.options": config}):
+ func = _get_func(*params[:-1])
+ mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
+ script = mod.script()
+ mod = tvm.script.from_source(script)
+ tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"],
True)
# fmt: off
diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py
b/tests/python/contrib/test_ethosu/test_vela_api.py
index 9f95e4b709..16785e182a 100644
--- a/tests/python/contrib/test_ethosu/test_vela_api.py
+++ b/tests/python/contrib/test_ethosu/test_vela_api.py
@@ -222,6 +222,28 @@ class Module2:
__tvm_meta__ = None
+# fmt: off
[email protected]_module
+class Module3:
+ @T.prim_func
+ def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write:
T.Buffer((1, 299, 299, 3), "int8")):
+ T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol":
"main", "tir.noalias": T.bool(True)})
+ p2_global = T.allocate([128], "uint8", "global",
annotations={"disable_lower_builtin": T.bool(True)})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
+ p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None,
"DataPar", ""), "pragma_compute_cycles_hint", 1056):
+ p1_encoded = T.Buffer((128,), "uint8")
+ T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128,
p2_global_1[0])
+ nn = T.int32()
+ T.attr(T.iter_var(nn, None, "DataPar", ""),
"pragma_compute_cycles_hint", T.int64(179570))
+ ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
+ ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
+ T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0,
299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128,
"NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0,
0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2,
p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1),
T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)
+
+ __tvm_meta__ = None
+# fmt: on
+
+
def test_get_optimal_block_config():
block_configs_cases = [
{
@@ -559,5 +581,33 @@ def test_encode_weights(accel):
verify(_test_vec, _mock_enc_w)
+def test_find_block_config_with_vela():
+ block_configs_cases = [
+ {
+ "accel_type": vapi.NpuAccelerator.Ethos_U55_256,
+ "ref": vapi.NpuShape3D(30, 12, 8),
+ },
+ {
+ "accel_type": vapi.NpuAccelerator.Ethos_U55_128,
+ "ref": vapi.NpuShape3D(17, 10, 8),
+ },
+ {
+ "accel_type": vapi.NpuAccelerator.Ethos_U55_64,
+ "ref": vapi.NpuShape3D(25, 5, 8),
+ },
+ {
+ "accel_type": vapi.NpuAccelerator.Ethos_U55_32,
+ "ref": vapi.NpuShape3D(25, 5, 4),
+ },
+ ]
+
+ mod = Module3
+ ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
+ npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
+
+ for case in block_configs_cases:
+ assert vela_api._find_block_config_with_vela(npu_op,
case["accel_type"]) == case["ref"]
+
+
if __name__ == "__main__":
tvm.testing.main()