This is an automated email from the ASF dual-hosted git repository.
manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new f6ddd52dc0 [microNPU] Expose compute cycle annotations to TIR lowering
(#11288)
f6ddd52dc0 is described below
commit f6ddd52dc00831059966a89dbd67e2fe6c683759
Author: Luke Hutton <[email protected]>
AuthorDate: Thu May 26 12:21:22 2022 +0100
[microNPU] Expose compute cycle annotations to TIR lowering (#11288)
* [microNPU] Expose compute cycle annotations to TIR lowering
Adds an AttrStmt "compute_cycles_hint" to each NPU operation for later
passes to consume.
Change-Id: I09779bdab6de6ef2094db610bb20d6e052e68ee3
* compute_cycles->compute_cycles_hint
Change-Id: Iebd71e699522e92a28fd321ffdb41ed7924db4e0
* add test to check annotations in compilation flow
Change-Id: Idcdcc8c8b5536c4732f297246b71aa8378a2732c
* add compute cycles hints for copy operations
Change-Id: I007ba19732e16081fa2ea9baca40c64a653c93cf
* fixing annotations for copies and improving test coverage
Change-Id: Ib812c4151fab03f4c1adcc016b4e798003a22e5e
* rebase
Change-Id: I653101908706096ae25ad1ebf08e7b6c4f1196c7
---
.../tvm/contrib/ethosu/cascader/plan_generator.py | 24 +++-
python/tvm/contrib/ethosu/cascader/scheduler.py | 18 ++-
.../relay/backend/contrib/ethosu/tir/scheduler.py | 7 +
src/contrib/ethosu/cascader/plan_generator.cc | 64 ++++++---
src/tir/contrib/ethosu/passes.cc | 9 +-
.../test_ethosu/cascader/test_integration.py | 145 +++++++++++++++++++++
.../contrib/test_ethosu/cascader/test_scheduler.py | 85 ++++++++----
7 files changed, 301 insertions(+), 51 deletions(-)
diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py
b/python/tvm/contrib/ethosu/cascader/plan_generator.py
index 155e01431c..ed29ff4b59 100644
--- a/python/tvm/contrib/ethosu/cascader/plan_generator.py
+++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py
@@ -15,9 +15,9 @@
# specific language governing permissions and limitations
# under the License.
"""Algorithms to generate Plans for a CascaderGraph."""
-from typing import List, Dict
+from typing import List, Dict, Tuple
-from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion
+from tvm.contrib.ethosu.cascader.tensor_config import MemoryRegion,
TensorConfig
from . import _ffi_api
from .cascader_options import CascaderOptions
@@ -55,3 +55,23 @@ def _generate_graph_plans(
home_map,
options,
)
+
+
+def get_copy_cycles_hint(tensor_config: TensorConfig) -> Tuple[int, int]:
+ """
+ Returns a hint estimating the number of cycles for the copy
+ specified by tensor_config.
+
+ Parameters
+ ----------
+ tensor_config : TensorConfig
+ The tensor configuration to estimate.
+
+ Returns
+ -------
+ mem2mem_cycles : int
+ Total estimated cycles.
+ initial_mem2mem_cycles : int
+ Estimated cycles for the first block.
+ """
+ return _ffi_api.GetCopyCyclesHint(tensor_config)
diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py
b/python/tvm/contrib/ethosu/cascader/scheduler.py
index d33abaf2b7..fd247e660a 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -31,6 +31,7 @@ from .parts import EthosuPart
from .tensor_config import MemoryRegion
from .proposal import Proposal
from .proposal_generator import generate_proposals
+from .plan_generator import get_copy_cycles_hint
from .graph import create_cascader_graph
from .device_config import EthosuDeviceConfig
from .logging import Logging
@@ -134,7 +135,11 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule)
-> None:
if isinstance(part, EthosuPart):
tensor_config = plan.tensor_configs[part.output_tensor]
stripe_config = tensor_config.stripe_configs[0]
+ buffer_mode = tensor_config.buffer_mode
block_config = part.get_block_config(stripe_config)
+ compute_cycles = part.get_performance_info(
+ stripe_config, buffer_mode
+ ).compute_cycles
iv = part.subgraph.output_tensor.op.axis[0]
block_shape = block_config.output_shape
if len(block_shape) == 4:
@@ -147,6 +152,10 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule)
-> None:
sch[part.subgraph.output_tensor].pragma(iv,
"block_config_width", width)
sch[part.subgraph.output_tensor].pragma(iv,
"block_config_depth", depth)
+ # Attach AttrStmt directly to npu op so it isn't removed by
ReplaceOperators
+ npu_op =
part.subgraph.output_tensor.op.input_tensors[0].op.input_tensors[0]
+ sch[npu_op].pragma(npu_op.op.axis[0], "compute_cycles_hint",
compute_cycles)
+
output_tensor_config = plan.output_config
output_tensor = output_tensor_config.tensor
output_part = output_tensor.producers[0]
@@ -156,6 +165,7 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) ->
None:
stripe_shape = [int(x) for x in stripe_config.shape]
stripe_stage, stripe_axis = stripe_part(output_part, stripe_shape, sch)
copy_te_tensors = []
+ compute_cycles_hints = []
readers = defaultdict(list)
for part in plan.part_group:
if part != output_part:
@@ -167,8 +177,14 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule)
-> None:
if tensor_config.home_region != tensor_config.copy_region:
copy_te_tensors.append(part.subgraph.input_tensors[i])
- for te_tensor in copy_te_tensors:
+ compute_cycles_hint, _ =
get_copy_cycles_hint(tensor_config)
+ compute_cycles_hints.append(compute_cycles_hint)
+
+ for te_tensor, compute_cycles_hint in zip(copy_te_tensors,
compute_cycles_hints):
copy_stage = sch.cache_read(te_tensor, "global",
readers[te_tensor])
+ sch[copy_stage].pragma(
+ copy_stage.op.axis[0], "compute_cycles_hint",
compute_cycles_hint
+ )
sch[copy_stage].compute_at(stripe_stage, stripe_axis)
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
index 827a58055d..bcabe2b7c2 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py
@@ -263,6 +263,13 @@ def schedule_cache_reads(sch):
if stage.attach_type != 2: # Not inlined
if _detect_cache_read(stage):
fax = stage.fuse(*stage.op.axis)
+
+ # propagate pragmas placed on the outer loop
+ if len(stage.op.axis) > 0 and stage.op.axis[0] in
stage.iter_var_attrs:
+ attrs = stage.iter_var_attrs[stage.op.axis[0]]
+ for k, v in zip(attrs.pragma_keys, attrs.pragma_values):
+ stage.pragma(fax, k.value, v)
+
stage.pragma(fax, "op", "ethosu_copy")
diff --git a/src/contrib/ethosu/cascader/plan_generator.cc
b/src/contrib/ethosu/cascader/plan_generator.cc
index 780f9adc2c..9545a511e7 100644
--- a/src/contrib/ethosu/cascader/plan_generator.cc
+++ b/src/contrib/ethosu/cascader/plan_generator.cc
@@ -301,6 +301,42 @@ int GetInteriorMemoryUsage(const
std::vector<TensorConfig>& input_configs,
return memory_usage;
}
+/**
+ * \brief Returns a hint estimating the number of cycles required for
+ * the copy specified by tensor_config.
+ *
+ * \param tensor_config The tensor configuration to estimate.
+ * \return mem2mem_cycles Total estimated cycles.
+ * \return initial_mem2mem_cycles Estimated cycles for the first block.
+ */
+std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
+ Tensor tensor = tensor_config->GetTensor();
+ MemoryRegion home_region = tensor_config->GetHomeRegion();
+ MemoryRegion copy_region = tensor_config->GetCopyRegion();
+ int initial_mem2mem_cycles = 0;
+ int mem2mem_cycles = 0;
+
+ // This Tensor needs to be copied - Count stripes for this config
+ for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
+ std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config,
true);
+ bool first_block = true;
+ for (const auto& block : input_blocks) {
+ int bytes_transferred = mul_reduce(block.first) *
tensor->GetDataType().bytes() *
+ tensor->GetCompressionRatio() * block.second;
+ int read_cycles = bytes_transferred * home_region->read_bandwidth +
home_region->read_latency;
+ int write_cycles = bytes_transferred * copy_region->write_bandwidth;
+
+ if (first_block) {
+ first_block = false;
+ initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
+ }
+ mem2mem_cycles += std::max(read_cycles, write_cycles);
+ }
+ }
+
+ return {mem2mem_cycles, initial_mem2mem_cycles};
+}
+
std::vector<Plan> GenerateSinglePlans(
const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash,
ObjectPtrEqual>&
@@ -372,28 +408,12 @@ std::vector<Plan> GenerateSinglePlans(
BlockConfig block_config = perf_info->block_config;
for (size_t i = 0; i < input_configs.size(); i++) {
Tensor tensor = input_configs[i]->GetTensor();
- MemoryRegion home_region = input_configs[i]->GetHomeRegion();
MemoryRegion copy_region = input_configs[i]->GetCopyRegion();
if (input_configs[i]->DoCopy()) {
- // This Tensor needs to be copied - Count stripes for this config
- for (const auto& stripe_config :
input_configs[i]->GetStripeConfigs()) {
- std::map<std::vector<int>, int> input_blocks =
CountStripes(stripe_config, true);
- bool first_block = true;
- for (const auto& block : input_blocks) {
- int bytes_transferred = mul_reduce(block.first) *
tensor->GetDataType().bytes() *
- tensor->GetCompressionRatio() *
block.second;
- int read_cycles = bytes_transferred *
home_region->read_bandwidth +
-
input_configs[i]->GetHomeRegion()->read_latency;
- int write_cycles = bytes_transferred *
copy_region->write_bandwidth;
-
- if (first_block) {
- first_block = false;
- initial_mem2mem_cycles += std::max(read_cycles,
write_cycles);
- }
- mem2mem_cycles += std::max(read_cycles, write_cycles);
- }
- }
+ std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
+ mem2mem_cycles += ret.first;
+ initial_mem2mem_cycles += ret.second;
}
float read_efficiency =
GetTransferEfficiency(tensor,
block_config->GetInputBlockShape(), copy_region);
@@ -585,6 +605,12 @@
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans")
return tclosed_plans;
});
+TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint")
+ .set_body_typed([](TensorConfig tensor_config) {
+ std::pair<int, int> ret = GetCopyCyclesHint(tensor_config);
+ return Array<Integer>({ret.first, ret.second});
+ });
+
} // namespace cascader
} // namespace ethosu
} // namespace contrib
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index 2b7b2b4741..09c359c55a 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -168,9 +168,14 @@ class CopyComputeReorderingMutator : public
StmtExprMutator {
}
tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(const Stmt& stmt) {
- auto eval_node{stmt.as<EvaluateNode>()};
+ Stmt eval_stmt = stmt;
+ if (const auto* attr_stmt = eval_stmt.as<AttrStmtNode>()) {
+ eval_stmt = attr_stmt->body;
+ }
+
+ auto eval_node{eval_stmt.as<EvaluateNode>()};
ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
- << stmt->GetTypeKey();
+ << eval_stmt->GetTypeKey();
auto call_node{eval_node->value.as<CallNode>()};
ICHECK(call_node) << "Expected expression to be a call node, but was "
<< eval_node->value->GetTypeKey();
diff --git a/tests/python/contrib/test_ethosu/cascader/test_integration.py
b/tests/python/contrib/test_ethosu/cascader/test_integration.py
new file mode 100644
index 0000000000..8e1f020861
--- /dev/null
+++ b/tests/python/contrib/test_ethosu/cascader/test_integration.py
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wrong-import-position,invalid-name
+
+"""
+Test the cascader in the compilation flow.
+"""
+
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.relay.backend.contrib.ethosu.codegen import _create_cascader
+from tvm.relay.backend.contrib.ethosu.tir.compiler import _lower_to_tir
+from tvm.contrib.ethosu.cascader import MemoryRegion, EthosuDeviceConfig
+
+from .. import infra as test_infra
+from . import infra as cascader_test_infra
+
+
+def _ethos_u55_cascader():
+ sram = MemoryRegion(
+ name="SRAM",
+ size=10**6,
+ read_bandwidth=16,
+ write_bandwidth=16,
+ read_latency=0,
+ write_latency=0,
+ burst_length=1,
+ )
+ flash = MemoryRegion(name="FLASH", size=10**7, read_bandwidth=4,
write_bandwidth=4)
+
+ device_config = EthosuDeviceConfig("ethos-u55-256")
+ cascader_options = cascader_test_infra.make_options(
+ cascade_region=sram,
+ max_proposals=64,
+ stripe_factors=4,
+ max_plan_size=10,
+ max_open_plans=8,
+ max_closed_plans=32,
+ always_copy_size=1024,
+ disable_pareto_plans=False,
+ disable_pareto_proposals=False,
+ enable_striping=False,
+ )
+ return _create_cascader(
+ options=cascader_options,
+ io_region=sram,
+ constant_region=flash,
+ working_regions=[sram],
+ device_config=device_config,
+ )
+
+
+def _compile_model(relay_function):
+ mod = tvm.IRModule()
+ mod["main"] = relay_function
+ mod = relay.transform.InferType()(mod)
+ tir_mod = _lower_to_tir(mod["main"], _ethos_u55_cascader())[0]
+ return tir_mod["main"]
+
+
+def _create_single_conv2d():
+ ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+ conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1),
(1, 1))
+ func = relay.Function(relay.analysis.free_vars(conv1), conv1)
+ return func
+
+
+def _create_double_conv2d():
+ ifm = relay.var("x", shape=(1, 8, 8, 4), dtype="int8")
+ conv1 = test_infra.make_ethosu_conv2d(ifm, 4, 4, (3, 3), (1, 1), (1, 1),
(1, 1))
+ conv2 = test_infra.make_ethosu_conv2d(conv1, 4, 4, (1, 3), (1, 1), (1, 1),
(1, 1))
+ func = relay.Function(relay.analysis.free_vars(conv2), conv2)
+ return func
+
+
+def _create_scalar_add():
+ ifm = relay.var("x", shape=(1, 5, 4, 3), dtype="int8")
+ ifm2 = relay.const(np.ones((1, 1, 1, 1)), dtype="int8")
+ add = test_infra.make_ethosu_binary_elementwise(
+ ifm, ifm2, ifm_channels=3, ifm2_channels=1, operator_type="ADD",
ofm_dtype="int8"
+ )
+ func = relay.Function(relay.analysis.free_vars(add), add)
+ return func
+
+
+def test_single_conv_compute_cycles_hint():
+ """
+ Check the "compute_cycles_hint" annotation remains in the lowering flow
+ for single convolution.
+ """
+ primfunc = _compile_model(_create_single_conv2d())
+ ops = primfunc.body.body.body.seq
+
+ compute_cycles_hints = [2304, 640, 320]
+ for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+ assert op.attr_key == "pragma_compute_cycles_hint"
+ assert op.value == compute_cycle_hint
+
+
+def test_double_conv_compute_cycles_hint():
+ """
+ Check the "compute_cycles_hint" annotation remains in the lowering flow
+ for double convolution.
+ """
+ primfunc = _compile_model(_create_double_conv2d())
+ ops = primfunc.body.body.body.body.body.body.seq
+
+ compute_cycles_hints = [2304, 640, 768, 640, 320, 240]
+ for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+ assert op.attr_key == "pragma_compute_cycles_hint"
+ assert op.value == compute_cycle_hint
+
+
+def test_scalar_add_compute_cycles_hint():
+ """
+ Check the "compute_cycles_hint" annotation remains in the lowering flow
+ for add with scalar values.
+ """
+ primfunc = _compile_model(_create_scalar_add())
+ ops = primfunc.body.body.seq
+
+ compute_cycles_hints = [16, 24]
+ for op, compute_cycle_hint in zip(ops, compute_cycles_hints):
+ assert op.attr_key == "pragma_compute_cycles_hint"
+ assert op.value == compute_cycle_hint
diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
index 6ac188187e..c97cfeb7a9 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -14,37 +14,68 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+# pylint: disable=wrong-import-position, invalid-name
+
import pytest
+pytest.importorskip("ethosu.vela")
+
import tvm.contrib.ethosu.cascader as cs
-from .infra import ethosu_enabled
-
-if ethosu_enabled:
-
- def test_cascade(
- SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1StartTE,
MobileNetv1TE
- ):
- fixtures = [
- TwoConv2DTE,
- TwoConv2DWithSliceTE,
- MobileNetv1StartTE,
- MobileNetv1TE,
- ]
- device_config = cs.EthosuDeviceConfig("ethos-u55-256")
- for sch, te_graph, const_dict in fixtures:
- options = cs.CascaderOptions(
- cascade_region=SRAM,
- max_proposals=64,
- stripe_factors=4,
- max_plan_size=10,
- max_open_plans=8,
- max_closed_plans=32,
- always_copy_size=1024,
- disable_pareto_plans=False,
- disable_pareto_proposals=False,
- )
- cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH,
[SRAM], device_config)
+from . import infra
+
+
+def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE,
MobileNetv1StartTE, MobileNetv1TE):
+ fixtures = [
+ TwoConv2DTE,
+ TwoConv2DWithSliceTE,
+ MobileNetv1StartTE,
+ MobileNetv1TE,
+ ]
+ device_config = cs.EthosuDeviceConfig("ethos-u55-256")
+ for sch, te_graph, const_dict in fixtures:
+ options = infra.make_options(
+ cascade_region=SRAM,
+ max_proposals=64,
+ stripe_factors=4,
+ max_plan_size=10,
+ max_open_plans=8,
+ max_closed_plans=32,
+ always_copy_size=1024,
+ disable_pareto_plans=False,
+ disable_pareto_proposals=False,
+ )
+ cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM],
device_config)
+
+
+def test_compute_cycles_annotation(SRAM, FLASH, TwoConv2DTE):
+ device_config = cs.EthosuDeviceConfig("ethos-u55-256")
+ options = infra.make_options(
+ cascade_region=SRAM,
+ max_proposals=64,
+ stripe_factors=4,
+ max_plan_size=10,
+ max_open_plans=8,
+ max_closed_plans=32,
+ always_copy_size=1024,
+ disable_pareto_plans=False,
+ disable_pareto_proposals=False,
+ )
+ sch, te_graph, const_dict = TwoConv2DTE
+ cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM],
device_config)
+
+ # Stages that should have compute cycle annotations
+ # [copy, copy, conv2d, copy, conv2d]
+ stages = [6, 8, 9, 18, 19]
+ # Expected hints for each operation
+ compute_cycles_hints = [4096, 5120, 1632, 2560, 3072]
+
+ for stage, compute_cycles_hint in zip(stages, compute_cycles_hints):
+ op = sch.stages[stage]
+ op_iter_vars = op.leaf_iter_vars[0]
+ op_attrs = op.iter_var_attrs[op_iter_vars]
+ assert op_attrs.pragma_keys[0] == "compute_cycles_hint"
+ assert op_attrs.pragma_values[0] == compute_cycles_hint
if __name__ == "__main__":