This is an automated email from the ASF dual-hosted git repository.

ekalda pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 5dc25afc87 [microNPU][ETHOSU] Add Vela's logic to select configuration 
block (#15186)
5dc25afc87 is described below

commit 5dc25afc873fd2c1cecabbf87515f29d459986e2
Author: Aleksei-grovety <[email protected]>
AuthorDate: Tue Jul 4 11:34:11 2023 +0300

    [microNPU][ETHOSU] Add Vela's logic to select configuration block (#15186)
    
    For the case when the cascader is enabled, the logic of choosing the optimal
configuration block from TVM will be used; in other cases, Vela's logic will
be used, except when the dev_force_block_config option is specified.
---
 .../tvm/relay/backend/contrib/ethosu/vela_api.py   | 86 +++++++++++++++++++++-
 tests/python/contrib/test_ethosu/test_networks.py  |  4 +-
 .../contrib/test_ethosu/test_replace_conv2d.py     | 14 ++--
 tests/python/contrib/test_ethosu/test_vela_api.py  | 50 +++++++++++++
 4 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py 
b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
index 45c232a461..22f5cdd83b 100644
--- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py
+++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
@@ -27,7 +27,12 @@ from typing import List, Optional, Tuple
 
 import numpy as np  # type: ignore
 from ethosu.vela import api as vapi  # type: ignore
+from ethosu.vela.architecture_allocator import find_block_config
 from ethosu.vela.architecture_features import Accelerator, create_default_arch
+from ethosu.vela.operation import NpuBlockType
+from ethosu.vela.register_command_stream_generator import resampling_mode_map
+from ethosu.vela.register_command_stream_util import to_kernel
+from ethosu.vela.shape4d import Shape4D
 
 import tvm
 from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator as tirtocs
@@ -56,6 +61,9 @@ def get_optimal_block_config(
     Therefore, we need to pick an optimal block configuration considering 
bandwidth
     to bring IFM blocks and the number of OFM block computes need to happen
     to cover the OFM as indicated by the npu op.
+    For the case when the cascader is enabled, the logic of choosing the optimal
+    configuration block from TVM will be used; in other cases, Vela's logic will
+    be used, except when the dev_force_block_config option is specified.
 
     Parameters
     ----------
@@ -73,8 +81,82 @@ def get_optimal_block_config(
     if options and options.dev_force_block_config:
         block_config = [int(v) for v in 
options.dev_force_block_config.split("x")]
         return vapi.NpuShape3D(height=block_config[0], width=block_config[1], 
depth=block_config[2])
-    all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
-    return _get_optimal_block_config(all_valid_block_configs)
+    elif options and options.enable_cascader:
+        all_valid_block_configs = vapi.npu_find_block_configs(npu_op, 
accel_config)
+        return _get_optimal_block_config(all_valid_block_configs)
+    else:
+        return _find_block_config_with_vela(npu_op, accel_config)
+
+
+def _find_block_config_with_vela(
+    npu_op: vapi.NpuOperation, accelerator: vapi.NpuAccelerator
+) -> vapi.NpuShape3D:
+    """An internal function to get block config using Vela's logic.
+
+    Parameters
+    ----------
+    npu_op : ethosu.vela.api.NpuOperation
+        The NPU operation
+    accelerator : ethosu.vela.api.NpuAccelerator
+        The NPU accelerator
+
+    Returns
+    -------
+    ethosu.vela.api.NpuShape3D :
+        The optimal block config for the operator
+    """
+    if isinstance(npu_op, vapi.NpuConv2DOperation):
+        block_type = NpuBlockType.ConvolutionMxN
+    elif isinstance(npu_op, vapi.NpuConvDepthWiseOperation):
+        block_type = NpuBlockType.ConvolutionDepthWise
+    elif isinstance(npu_op, vapi.NpuPoolingOperation):
+        block_type = (
+            NpuBlockType.ReduceSum
+            if npu_op.sub_op_type == vapi.NpuPoolingOp.REDUCE_SUM
+            else NpuBlockType.Pooling
+        )
+    elif isinstance(npu_op, vapi.NpuElementWiseOperation):
+        block_type = NpuBlockType.ElementWise
+    else:
+        assert 0, "Unsupported operation"
+
+    ifm_shape = Shape4D(1, npu_op.ifm.shape.height, npu_op.ifm.shape.width, 
npu_op.ifm.shape.depth)
+    ifm2_shape = None
+    if npu_op.ifm2:
+        ifm2_shape = Shape4D(
+            1, npu_op.ifm2.shape.height, npu_op.ifm2.shape.width, 
npu_op.ifm2.shape.depth
+        )
+    ofm_shape = Shape4D(1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, 
npu_op.ofm.shape.depth)
+
+    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
+    ifm_bits = npu_op.ifm.data_type.size_in_bits()
+    lut_banks = 0
+    if npu_op.activation:
+        lut_banks = 2 if npu_op.activation.op_type == 
vapi.NpuActivationOp.TABLE_LOOKUP else 0
+
+    has_scaling = True
+    for tensor in [npu_op.ifm, npu_op.ifm2, npu_op.ofm]:
+        if tensor and tensor.quantization is None:
+            has_scaling = False
+            break
+
+    arch = create_default_arch(Accelerator.from_npu_accelerator(accelerator))
+
+    cfg = find_block_config(
+        arch,
+        block_type,
+        ofm_shape,
+        ifm_shape,
+        ifm2_shape,
+        npu_op.ifm2_scalar is not None,
+        ifm_bits,
+        to_kernel(npu_op.kernel),
+        lut_banks,
+        has_scaling,
+        ifm_resampling_mode,
+    )
+    assert cfg is not None, f"There is no configuration suitable for 
{accelerator}"
+    return vapi.NpuShape3D(cfg.ofm_block.height, cfg.ofm_block.width, 
cfg.ofm_block.depth)
 
 
 def _get_optimal_block_config(all_valid_block_configs: List[vapi.NpuShape3D]) 
-> vapi.NpuShape3D:
diff --git a/tests/python/contrib/test_ethosu/test_networks.py 
b/tests/python/contrib/test_ethosu/test_networks.py
index a5490cbe2b..308c06f504 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -44,9 +44,9 @@ MOBILENET_V2_URL = (
 @pytest.mark.parametrize(
     "accel_type, model_url, workspace_size",
     [
-        ("ethos-u65-256", MOBILENET_V1_URL, 2338848),
+        ("ethos-u65-256", MOBILENET_V1_URL, 2338864),
         ("ethos-u65-256", MOBILENET_V2_URL, 2264320),
-        ("ethos-u55-256", MOBILENET_V1_URL, 1793376),
+        ("ethos-u55-256", MOBILENET_V1_URL, 1793392),
         ("ethos-u55-256", MOBILENET_V2_URL, 2217152),
         ("ethos-u55-128", MOBILENET_V2_URL, 2217152),
         ("ethos-u55-64", MOBILENET_V2_URL, 2217152),
diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py 
b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
index 6bcea7008c..32d1303e12 100644
--- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py
+++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py
@@ -633,11 +633,15 @@ def test_conv2d_double_cascade(trial):
 
     reference_mod = trial[0]
     params = trial[1:]
-    func = _get_func(*params[:-1])
-    mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
-    script = mod.script()
-    mod = tvm.script.from_source(script)
-    tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], True)
+    config = {
+        "enable_cascader": True,
+    }
+    with tvm.transform.PassContext(opt_level=3, 
config={"relay.ext.ethos-u.options": config}):
+        func = _get_func(*params[:-1])
+        mod, _ = _lower_to_tir(func, cascader=total_cascader(params[-1]))
+        script = mod.script()
+        mod = tvm.script.from_source(script)
+        tvm.ir.assert_structural_equal(mod["main"], reference_mod["main"], 
True)
 
 
 # fmt: off
diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py 
b/tests/python/contrib/test_ethosu/test_vela_api.py
index 9f95e4b709..16785e182a 100644
--- a/tests/python/contrib/test_ethosu/test_vela_api.py
+++ b/tests/python/contrib/test_ethosu/test_vela_api.py
@@ -222,6 +222,28 @@ class Module2:
     __tvm_meta__ = None
 
 
+# fmt: off
[email protected]_module
+class Module3:
+    @T.prim_func
+    def main(ethos_u_0_i0: T.Buffer((1, 299, 299, 2), "int8"), ethosu_write: 
T.Buffer((1, 299, 299, 3), "int8")):
+        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": 
"main", "tir.noalias": T.bool(True)})
+        p2_global = T.allocate([128], "uint8", "global", 
annotations={"disable_lower_builtin": T.bool(True)})
+        ax0_ax1_fused_ax2_fused_ax3_fused = T.int32()
+        p2_global_1 = T.Buffer((128,), "uint8", data=p2_global)
+        with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, 
"DataPar", ""), "pragma_compute_cycles_hint", 1056):
+            p1_encoded = T.Buffer((128,), "uint8")
+            T.call_extern("handle", "ethosu_copy", p1_encoded[0], 128, 
p2_global_1[0])
+        nn = T.int32()
+        T.attr(T.iter_var(nn, None, "DataPar", ""), 
"pragma_compute_cycles_hint", T.int64(179570))
+        ethos_u_0_i0_1 = T.Buffer((178802,), "int8", data=ethos_u_0_i0.data)
+        ethosu_write_1 = T.Buffer((268203,), "int8", data=ethosu_write.data)
+        T.call_extern("handle", "ethosu_conv2d", "int8", 299, 299, 2, 299, 0, 
299, ethos_u_0_i0_1[0], 0, 0, 0, T.float32(0.0039215683937072754), -128, 
"NHWC", 598, 2, 1, "int8", 299, 299, 3, 299, 0, 299, ethosu_write_1[0], 0, 0, 
0, T.float32(0.025585981085896492), -128, "NHWC", 897, 3, 1, 2, 3, 1, 1, 1, 2, 
p2_global_1[0], 96, T.int8(-1), T.int8(-1), 0, p2_global_1[96], 32, T.int8(-1), 
T.int8(-1), 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", 32, 12, 8)
+
+    __tvm_meta__ = None
+# fmt: on
+
+
 def test_get_optimal_block_config():
     block_configs_cases = [
         {
@@ -559,5 +581,33 @@ def test_encode_weights(accel):
         verify(_test_vec, _mock_enc_w)
 
 
+def test_find_block_config_with_vela():
+    block_configs_cases = [
+        {
+            "accel_type": vapi.NpuAccelerator.Ethos_U55_256,
+            "ref": vapi.NpuShape3D(30, 12, 8),
+        },
+        {
+            "accel_type": vapi.NpuAccelerator.Ethos_U55_128,
+            "ref": vapi.NpuShape3D(17, 10, 8),
+        },
+        {
+            "accel_type": vapi.NpuAccelerator.Ethos_U55_64,
+            "ref": vapi.NpuShape3D(25, 5, 8),
+        },
+        {
+            "accel_type": vapi.NpuAccelerator.Ethos_U55_32,
+            "ref": vapi.NpuShape3D(25, 5, 4),
+        },
+    ]
+
+    mod = Module3
+    ethosu_conv2d_call = mod["main"].body.body.seq[1].body.value
+    npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
+
+    for case in block_configs_cases:
+        assert vela_api._find_block_config_with_vela(npu_op, 
case["accel_type"]) == case["ref"]
+
+
 if __name__ == "__main__":
     tvm.testing.main()

Reply via email to