This is an automated email from the ASF dual-hosted git repository.

manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new f6ddd52dc0 [microNPU] Expose compute cycle annotations to TIR lowering 
(#11288)
f6ddd52dc0 is described below

commit f6ddd52dc00831059966a89dbd67e2fe6c683759
Author: Luke Hutton <[email protected]>
AuthorDate: Thu May 26 12:21:22 2022 +0100

    [microNPU] Expose compute cycle annotations to TIR lowering (#11288)
    
    * [microNPU] Expose compute cycle annotations to TIR lowering
    
    Adds an AttrStmt "compute_cycles_hint" to each NPU operation for later
    passes to consume.
    
    Change-Id: I09779bdab6de6ef2094db610bb20d6e052e68ee3
    
    * compute_cycles->compute_cycles_hint
    
    Change-Id: Iebd71e699522e92a28fd321ffdb41ed7924db4e0
    
    * add test to check annotations in compilation flow
    
    Change-Id: Idcdcc8c8b5536c4732f297246b71aa8378a2732c
    
    * add compute cycles hints for copy operations
    
    Change-Id: I007ba19732e16081fa2ea9baca40c64a653c93cf
    
    * fixing annotations for copies and improving test coverage
    
    Change-Id: Ib812c4151fab03f4c1adcc016b4e798003a22e5e
    
    * rebase
    
    Change-Id: I653101908706096ae25ad1ebf08e7b6c4f1196c7
---
 .../tvm/contrib/ethosu/cascader/plan_generator.py  |  24 +++-
 python/tvm/contrib/ethosu/cascader/scheduler.py    |  18 ++-
 .../relay/backend/contrib/ethosu/tir/scheduler.py  |   7 +
 src/contrib/ethosu/cascader/plan_generator.cc      |  64 ++++++---
 src/tir/contrib/ethosu/passes.cc                   |   9 +-
 .../test_ethosu/cascader/test_integration.py       | 145 +++++++++++++++++++++
 .../contrib/test_ethosu/cascader/test_scheduler.py |  85 ++++++++----
 7 files changed, 301 insertions(+), 51 deletions(-)

diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py 
b/python/tvm/contrib/ethosu/cascader/plan_generator.py
index 155e01431c..ed29ff4b59 100644
--- a/python/tvm/contrib/ethosu/cascader/plan_generator.py
+++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py
@@ -15,9 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
 """Algorithms to generate Plans for a CascaderGraph."""
-from typing import List, Dict
+from typing import List, Dict, Tuple
 
-from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion
+from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion, 
TensorConfig
 
 from . import _ffi_api
 from .cascader_options import CascaderOptions
@@ -55,3 +55,23 @@ def _generate_graph_plans(
         home_map,
         options,
     )
+
+
+def get_copy_cycles_hint(tensor_config: TensorConfig) -> Tuple[int, int]:
+    """
+    Returns a hint estimating the number of cycles for the copy
+    specified by tensor_config.
+
+    Parameters
+    ----------
+    tensor_config : TensorConfig
+        The tensor configuration to estimate.
+
+    Returns
+    -------
+    mem2mem_cycles : int
+        Total estimated cycles.
+    initial_mem2mem_cycles : int
+        Estimated cycles for the first block.
+    """
+    return _ffi_api.GetCopyCyclesHint(tensor_config)
diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py 
b/python/tvm/contrib/ethosu/cascader/scheduler.py
index d33abaf2b7..fd247e660a 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -31,6 +31,7 @@ from .parts import EthosuPart
 from .tensor_config import MemoryRegion
 from .proposal import Proposal
 from .proposal_generator import generate_proposals
+from .plan_generator import get_copy_cycles_hint
 from .graph import create_cascader_graph
 from .device_config import EthosuDeviceConfig
 from .logging import Logging
@@ -134,7 +135,11 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) 
-> None:
             if isinstance(part, EthosuPart):
                 tensor_config = plan.tensor_configs[part.output_tensor]
                 stripe_config = tensor_config.stripe_configs[0]
+                buffer_mode = tensor_config.buffer_mode
                 block_config = part.get_block_config(stripe_config)
+                compute_cycles = part.get_performance_info(
+                    stripe_config, buffer_mode
+                ).compute_cycles
                 iv = part.subgraph.output_tensor.op.axis[0]
                 block_shape = block_config.output_shape
                 if len(block_shape) == 4:
@@ -147,6 +152,10 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) 
-> None:
                 sch[part.subgraph.output_tensor].pragma(iv, 
"block_config_width", width)
                 sch[part.subgraph.output_tensor].pragma(iv, 
"block_config_depth", depth)
 
+                # Attach AttrStmt directly to npu op so it isn't removed by 
ReplaceOperators
+                npu_op = 
part.subgraph.output_tensor.op.input_tensors[0].op.input_tensors[0]
+                sch[npu_op].pragma(npu_op.op.axis[0], "compute_cycles_hint", 
compute_cycles)
+
         output_tensor_config = plan.output_config
         output_tensor = output_tensor_config.tensor
         output_part = output_tensor.producers[0]
@@ -156,6 +165,7 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> 
None:
         stripe_shape = [int(x) for x in stripe_config.shape]
         stripe_stage, stripe_axis = stripe_part(output_part, stripe_shape, sch)
         copy_te_tensors = []
+        compute_cycles_hints = []
         readers = defaultdict(list)
         for part in plan.part_group:
             if part != output_part:
@@ -167,8 +177,14 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) 
-> None:
                 if tensor_config.home_region != tensor_config.copy_region:
                     copy_te_tensors.append(part.subgraph.input_tensors[i])
 
-        for te_tensor in copy_te_tensors:
+                    compute_cycles_hint, _ = 
get_copy_cycles_hint(tensor_config)
+                    compute_cycles_hints.append(compute_cycles_hint)
+
+        for te_tensor, compute_cycles_hint in zip(copy_te_tensors, 
compute_cycles_hints):
             copy_stage = sch.cache_read(te_tensor, "global", 
readers[te_tensor])
+            sch[copy_stage].pragma(
+                copy_stage.op.axis[0], "compute_cycles_hint", 
compute_cycles_hint
+            )
             sch[copy_stage].compute_at(stripe_stage, stripe_axis)
 
 
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py 
b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
index 827a58055d..bcabe2b7c2 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
@@ -263,6 +263,13 @@ def schedule_cache_reads(sch):
         if stage.attach_type != 2:  # Not inlined
             if _detect_cache_read(stage):
                 fax = stage.fuse(*stage.op.axis)
+
+                # propagate pragmas placed on the outer loop
+                if len(stage.op.axis) > 0 and stage.op.axis[0] in 
stage.iter_var_attrs:
+                    attrs = stage.iter_var_attrs[stage.op.axis[0]]
+                    for k, v in zip(attrs.pragma_keys, attrs.pragma_values):
+                        stage.pragma(fax, k.value, v)
+
                 stage.pragma(fax, "op", "ethosu_copy")
 
 
diff --git a/src/contrib/ethosu/cascader/plan_generator.cc 
b/src/contrib/ethosu/cascader/plan_generator.cc
index 780f9adc2c..9545a511e7 100644
--- a/src/contrib/ethosu/cascader/plan_generator.cc
+++ b/src/contrib/ethosu/cascader/plan_generator.cc
@@ -301,6 +301,42 @@ int GetInteriorMemoryUsage(const 
std::vector<TensorConfig>& input_configs,
   return memory_usage;
 }
 
+/**
+ * \brief Returns a hint estimating the number of cycles required for
+ * the copy specified by tensor_config.
+ *
+ * \param tensor_config  The tensor configuration to estimate.
+ * \return mem2mem_cycles Total estimated cycles.
+ * \return initial_mem2mem_cycles Estimated cycles for the first block.
+ */
+std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
+  Tensor tensor = tensor_config->GetTensor();
+  MemoryRegion home_region = tensor_config->GetHomeRegion();
+  MemoryRegion copy_region = tensor_config->GetCopyRegion();
+  int initial_mem2mem_cycles = 0;
+  int mem2mem_cycles = 0;
+
+  // This Tensor needs to be copied - Count stripes for this config
+  for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
+    std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, 
true);
+    bool first_block = true;
+    for (const auto& block : input_blocks) {
+      int bytes_transferred = mul_reduce(block.first) * 
tensor->GetDataType().bytes() *
+                              tensor->GetCompressionRatio() * block.second;
+      int read_cycles = bytes_transferred * home_region->read_bandwidth + 
home_region->read_latency;
+      int write_cycles = bytes_transferred * copy_region->write_bandwidth;
+
+      if (first_block) {
+        first_block = false;
+        initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
+      }
+      mem2mem_cycles += std::max(read_cycles, write_cycles);
+    }
+  }
+
+  return {mem2mem_cycles, initial_mem2mem_cycles};
+}
+
 std::vector<Plan> GenerateSinglePlans(
     const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
     const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, 
ObjectPtrEqual>&
@@ -372,28 +408,12 @@ std::vector<Plan> GenerateSinglePlans(
         BlockConfig block_config = perf_info->block_config;
         for (size_t i = 0; i < input_configs.size(); i++) {
           Tensor tensor = input_configs[i]->GetTensor();
-          MemoryRegion home_region = input_configs[i]->GetHomeRegion();
           MemoryRegion copy_region = input_configs[i]->GetCopyRegion();
 
           if (input_configs[i]->DoCopy()) {
-            // This Tensor needs to be copied - Count stripes for this config
-            for (const auto& stripe_config : 
input_configs[i]->GetStripeConfigs()) {
-              std::map<std::vector<int>, int> input_blocks = 
CountStripes(stripe_config, true);
-              bool first_block = true;
-              for (const auto& block : input_blocks) {
-                int bytes_transferred = mul_reduce(block.first) * 
tensor->GetDataType().bytes() *
-                                        tensor->GetCompressionRatio() * 
block.second;
-                int read_cycles = bytes_transferred * 
home_region->read_bandwidth +
-                                  
input_configs[i]->GetHomeRegion()->read_latency;
-                int write_cycles = bytes_transferred * 
copy_region->write_bandwidth;
-
-                if (first_block) {
-                  first_block = false;
-                  initial_mem2mem_cycles += std::max(read_cycles, 
write_cycles);
-                }
-                mem2mem_cycles += std::max(read_cycles, write_cycles);
-              }
-            }
+            std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
+            mem2mem_cycles += ret.first;
+            initial_mem2mem_cycles += ret.second;
           }
           float read_efficiency =
               GetTransferEfficiency(tensor, 
block_config->GetInputBlockShape(), copy_region);
@@ -585,6 +605,12 @@ 
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans")
       return tclosed_plans;
     });
 
+TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint")
+    .set_body_typed([](TensorConfig tensor_config) {
+      std::pair<int, int> ret = GetCopyCyclesHint(tensor_config);
+      return Array<Integer>({ret.first, ret.second});
+    });
+
 }  // namespace cascader
 }  // namespace ethosu
 }  // namespace contrib
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index 2b7b2b4741..09c359c55a 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -168,9 +168,14 @@ class CopyComputeReorderingMutator : public 
StmtExprMutator {
   }
 
   tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
-    auto eval_node{stmt.as<EvaluateNode>()};
+    Stmt eval_stmt = stmt;
+    if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
+      eval_stmt = attr_stmt->body;
+    }
+
+    auto eval_node{eval_stmt.as<EvaluateNode>()};
     ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
-                      << stmt->GetTypeKey();
+                      << eval_stmt->GetTypeKey();
     auto call_node{eval_node->value.as<CallNode>()};
     ICHECK(call_node) << "Expected expression to be a call node, but was "
                       << eval_node->value->GetTypeKey();
diff --git a/tests/python/contrib/test_ethosu/cascader/test_integration.py 
b/tests/python/contrib/test_ethosu/cascader/test_integration.py
new file mode 100644
index 0000000000..8e1f020861
--- /dev/null
+++ b/tests/python/contrib/test_ethosu/cascader/test_integration.py
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wrong-import-position,invalid-name
+
+"""
+Test the cascader in the compilation flow.
+"""
+
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend.contrib.ethosu.codegen import _create_cascader
+from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir
+from tvm.contrib.ethosu.cascader import MemoryRegion, EthosuDeviceConfig
+
+from .. import infra as test_infra
+from . import infra as cascader_test_infra
+
+
+def _ethos_u55_cascader():
+    sram = MemoryRegion(
+        name="SRAM",
+        size=10**6,
+        read_bandwidth=16,
+        write_bandwidth=16,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
+    flash = MemoryRegion(name="FLASH", size=10**7, read_bandwidth=4, 
write_bandwidth=4)
+
+    device_config = EthosuDeviceConfig("ethos-u55-256")
+    cascader_options = cascader_test_infra.make_options(
+        cascade_region=sram,
+        max_proposals=64,
+        stripe_factors=4,
+        max_plan_size=10,
+        max_open_plans=8,
+        max_closed_plans=32,
+        always_copy_size=1024,
+        disable_pareto_plans=False,
+        disable_pareto_proposals=False,
+        enable_striping=False,
+    )
+    return _create_cascader(
+        options=cascader_options,
+        io_region=sram,
+        constant_region=flash,
+        working_regions=[sram],
+        device_config=device_config,
+    )
+
+
+def _compile_model(relay_function):
+    mod = tvm.IRModule()
+    mod["main"] = relay_function
+    mod = relay.transform.InferType()(mod)
+    tir_mod = _lower_to_tir(mod["main"], _ethos_u55_cascader())[0]
+    return tir_mod["main"]
+
+
+def _create_single_conv2d():
+    ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+    conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1), 
(1, 1))
+    func = relay.Function(relay.analysis.free_vars(conv1), conv1)
+    return func
+
+
+def _create_double_conv2d():
+    ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+    conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1), 
(1, 1))
+    conv2 = test_infra.make_ethosu_conv2d(conv1, 4, 4, (1, 3), (1, 1), (1, 1), 
(1, 1))
+    func = relay.Function(relay.analysis.free_vars(conv2), conv2)
+    return func
+
+
+def _create_scalar_add():
+    ifm = relay.var("x", shape=(1, 5, 4, 3), dtype="int8")
+    ifm2 = relay.const(np.ones((1, 1, 1, 1)), dtype="int8")
+    add = test_infra.make_ethosu_binary_elementwise(
+        ifm, ifm2, ifm_channels=3, ifm2_channels=1, operator_type="ADD", 
ofm_dtype="int8"
+    )
+    func = relay.Function(relay.analysis.free_vars(add), add)
+    return func
+
+
+def test_single_conv_compute_cycles_hint():
+    """
+    Check the "compute_cycles_hint" annotation remains in the lowering flow
+    for single convolution.
+    """
+    primfunc = _compile_model(_create_single_conv2d())
+    ops = primfunc.body.body.body.seq
+
+    compute_cycles_hints = [2304, 640, 320]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint
+
+
+def test_double_conv_compute_cycles_hint():
+    """
+    Check the "compute_cycles_hint" annotation remains in the lowering flow
+    for double convolution.
+    """
+    primfunc = _compile_model(_create_double_conv2d())
+    ops = primfunc.body.body.body.body.body.body.seq
+
+    compute_cycles_hints = [2304, 640, 768, 640, 320, 240]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint
+
+
+def test_scalar_add_compute_cycles_hint():
+    """
+    Check the "compute_cycles_hint" annotation remains in the lowering flow
+    for add with scalar values.
+    """
+    primfunc = _compile_model(_create_scalar_add())
+    ops = primfunc.body.body.seq
+
+    compute_cycles_hints = [16, 24]
+    for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+        assert op.attr_key == "pragma_compute_cycles_hint"
+        assert op.value == compute_cycle_hint
diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py 
b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
index 6ac188187e..c97cfeb7a9 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -14,37 +14,68 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+# pylint: disable=wrong-import-position, invalid-name
+
 import pytest
 
+pytest.importorskip("ethosu.vela")
+
 import tvm.contrib.ethosu.cascader as cs
 
-from .infra import ethosu_enabled
-
-if ethosu_enabled:
-
-    def test_cascade(
-        SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1StartTE, 
MobileNetv1TE
-    ):
-        fixtures = [
-            TwoConv2DTE,
-            TwoConv2DWithSliceTE,
-            MobileNetv1StartTE,
-            MobileNetv1TE,
-        ]
-        device_config = cs.EthosuDeviceConfig("ethos-u55-256")
-        for sch, te_graph, const_dict in fixtures:
-            options = cs.CascaderOptions(
-                cascade_region=SRAM,
-                max_proposals=64,
-                stripe_factors=4,
-                max_plan_size=10,
-                max_open_plans=8,
-                max_closed_plans=32,
-                always_copy_size=1024,
-                disable_pareto_plans=False,
-                disable_pareto_proposals=False,
-            )
-            cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, 
[SRAM], device_config)
+from . import infra
+
+
+def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, 
MobileNetv1StartTE, MobileNetv1TE):
+    fixtures = [
+        TwoConv2DTE,
+        TwoConv2DWithSliceTE,
+        MobileNetv1StartTE,
+        MobileNetv1TE,
+    ]
+    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
+    for sch, te_graph, const_dict in fixtures:
+        options = infra.make_options(
+            cascade_region=SRAM,
+            max_proposals=64,
+            stripe_factors=4,
+            max_plan_size=10,
+            max_open_plans=8,
+            max_closed_plans=32,
+            always_copy_size=1024,
+            disable_pareto_plans=False,
+            disable_pareto_proposals=False,
+        )
+        cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], 
device_config)
+
+
+def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE):
+    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
+    options = infra.make_options(
+        cascade_region=SRAM,
+        max_proposals=64,
+        stripe_factors=4,
+        max_plan_size=10,
+        max_open_plans=8,
+        max_closed_plans=32,
+        always_copy_size=1024,
+        disable_pareto_plans=False,
+        disable_pareto_proposals=False,
+    )
+    sch, te_graph, const_dict = TwoConv2DTE
+    cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], 
device_config)
+
+    # Stages that should have compute cycle annotations
+    # [copy, copy, conv2d, copy, conv2d]
+    stages = [6, 8, 9, 18, 19]
+    # Expected hints for each operation
+    compute_cycles_hints = [4096, 5120, 1632, 2560, 3072]
+
+    for stage, compute_cycles_hint in zip(stages, compute_cycles_hints):
+        op = sch.stages[stage]
+        op_iter_vars = op.leaf_iter_vars[0]
+        op_attrs = op.iter_var_attrs[op_iter_vars]
+        assert op_attrs.pragma_keys[0] == "compute_cycles_hint"
+        assert op_attrs.pragma_values[0] == compute_cycles_hint
 
 
 if __name__ == "__main__":

Reply via email to