This is an automated email from the ASF dual-hosted git repository.
leandron pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new ae45b04772 [Relay][Strategy] Use x86 dense schedules for arm_cpu
(#15470)
ae45b04772 is described below
commit ae45b04772830d7c7b09b89bb9434e0b0a52f9e6
Author: Luke Hutton <[email protected]>
AuthorDate: Mon Aug 7 12:13:21 2023 +0100
[Relay][Strategy] Use x86 dense schedules for arm_cpu (#15470)
Currently the fallback used when compiling a dense operation with
targets such as `llvm -device=arm_cpu` is `dense.generic`. This results in
very poor performance. Although https://github.com/apache/tvm/pull/13775
meant that x86 schedules are used in cases where no strategy is provided
by arm_cpu, the dense strategy is registered due to the existence of
specialized schedules for arm_cpu e.g. a schedule for embedded devices.
This commit ensures x86 schedules are used in place of a generic
schedule, which yields much better performance.
The commit also follows the same approach as the x86 strategy for the
`dense.generic` schedule: it is only registered when the auto-scheduler
or meta-schedule is enabled.
A test has been added to check the intended schedules are picked when
compiling with `arm_cpu`.
Change-Id: I8697f630d4acfab71a9626cf9e0dc3086987f163
---
python/tvm/relay/op/strategy/arm_cpu.py | 62 ++++++++++++++--------
.../relay/strategy/test_select_implementation.py | 38 +++++++++++++
2 files changed, 79 insertions(+), 21 deletions(-)
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py
b/python/tvm/relay/op/strategy/arm_cpu.py
index bbbe5bb732..24966019db 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -559,33 +559,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type,
target):
wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
name="dense_dsp.arm_cpu",
)
- else:
- # For dynamic matrix-vector multiply we use a hand written kernel.
- if (
- isinstance(inputs[0].shape[0], (int, tir.IntImm))
- and inputs[0].shape[0] == 1
- and (
- topi.utils.is_dynamic_shape(inputs[0].shape)
- or topi.utils.is_dynamic_shape(inputs[1].shape)
- )
- ):
- strategy.add_implementation(
- wrap_compute_dense(topi.x86.dense_dynamic),
- wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
- name="dense_dynamic.x86",
- plevel=20,
- )
- return strategy
- logger.warning("dense is not optimized for arm cpu.")
+ return strategy
+
+ # For dynamic matrix-vector multiply we use a hand written kernel.
+ if (
+ isinstance(inputs[0].shape[0], (int, tir.IntImm))
+ and inputs[0].shape[0] == 1
+ and (
+ topi.utils.is_dynamic_shape(inputs[0].shape)
+ or topi.utils.is_dynamic_shape(inputs[1].shape)
+ )
+ ):
+ strategy.add_implementation(
+ wrap_compute_dense(topi.x86.dense_dynamic),
+ wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+ name="dense_dynamic.x86",
+ plevel=20,
+ )
+ return strategy
+
+ need_auto_scheduler_layout = is_auto_scheduler_enabled()
+ need_meta_schedule_layout = is_meta_schedule_enabled()
+ if need_auto_scheduler_layout or need_meta_schedule_layout:
strategy.add_implementation(
wrap_compute_dense(
topi.nn.dense,
- need_auto_scheduler_layout=is_auto_scheduler_enabled(),
- need_meta_schedule_layout=is_meta_schedule_enabled(),
+ need_auto_scheduler_layout=need_auto_scheduler_layout,
+ need_meta_schedule_layout=need_meta_schedule_layout,
),
- wrap_topi_schedule(topi.generic.schedule_dense),
+ naive_schedule,
name="dense.generic",
+ plevel=11,
)
+
+ # Fallback to x86 schedules as there is currently no arm_cpu schedule for
dense
+ strategy.add_implementation(
+ wrap_compute_dense(topi.x86.dense_nopack),
+ wrap_topi_schedule(topi.x86.schedule_dense_nopack),
+ name="dense_nopack.x86",
+ plevel=5,
+ )
+ strategy.add_implementation(
+ wrap_compute_dense(topi.x86.dense_pack),
+ wrap_topi_schedule(topi.x86.schedule_dense_pack),
+ name="dense_pack.x86",
+ plevel=10,
+ )
+
return strategy
diff --git a/tests/python/relay/strategy/test_select_implementation.py
b/tests/python/relay/strategy/test_select_implementation.py
index eae186524c..20dfe9670a 100644
--- a/tests/python/relay/strategy/test_select_implementation.py
+++ b/tests/python/relay/strategy/test_select_implementation.py
@@ -16,7 +16,10 @@
# under the License.
""" Tests strategy selection for Relay ops """
+
import pytest
+import numpy as np
+
import tvm
from tvm import relay
from tvm import te
@@ -149,5 +152,40 @@ def test_int8_depthwise_conv2d(target, expected_impl):
assert impl.name == expected_impl
[email protected](
+ "target,expected_valid_impl,expected_impl",
+ [("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"],
"dense_pack.x86")],
+)
+def test_dense(target, expected_valid_impl, expected_impl):
+ target = tvm.target.Target(target)
+
+ data_shape = (30, 40)
+ weight_shape = (30, 40)
+ dtype = "float32"
+
+ out = relay.nn.dense(
+ relay.var("data", shape=data_shape, dtype=dtype),
+ relay.var("weight", shape=weight_shape, dtype=dtype),
+ out_dtype=dtype,
+ )
+ out = run_infer_type(out)
+
+ with target:
+ args = [
+ out.op,
+ out.attrs,
+ [te.placeholder(data_shape, dtype), te.placeholder(weight_shape,
dtype)],
+ out.checked_type,
+ target,
+ ]
+ valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
+ selected_impl, _ =
relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
+
+ assert len(valid_impl) == len(expected_valid_impl)
+ for impl in valid_impl:
+ assert impl.name in expected_valid_impl
+ assert selected_impl.name == expected_impl
+
+
if __name__ == "__main__":
tvm.testing.main()