This is an automated email from the ASF dual-hosted git repository.

leandron pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new ae45b04772 [Relay][Strategy] Use x86 dense schedules for arm_cpu 
(#15470)
ae45b04772 is described below

commit ae45b04772830d7c7b09b89bb9434e0b0a52f9e6
Author: Luke Hutton <[email protected]>
AuthorDate: Mon Aug 7 12:13:21 2023 +0100

    [Relay][Strategy] Use x86 dense schedules for arm_cpu (#15470)
    
    Currently the fallback used when compiling a dense operation with
    targets such as `llvm -device=arm_cpu` is `dense.generic`. This results
    in very poor performance. Although https://github.com/apache/tvm/pull/13775
    meant that x86 schedules are used in cases where no strategy is provided
    by arm_cpu, the dense strategy is registered due to the existence of
    specialized schedules for arm_cpu e.g. a schedule for embedded devices.
    This commit ensures x86 schedules are used in place of a generic
    schedule, which yields much better performance.
    
    The commit also follows the same approach for the `dense.generic`
    schedule as the x86 strategy. This will only be used when auto-scheduler
    or meta-schedule is enabled.
    
    A test has been added to check the intended schedules are picked when
    compiling with `arm_cpu`.
    
    Change-Id: I8697f630d4acfab71a9626cf9e0dc3086987f163
---
 python/tvm/relay/op/strategy/arm_cpu.py            | 62 ++++++++++++++--------
 .../relay/strategy/test_select_implementation.py   | 38 +++++++++++++
 2 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py 
b/python/tvm/relay/op/strategy/arm_cpu.py
index bbbe5bb732..24966019db 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -559,33 +559,53 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, 
target):
             wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
             name="dense_dsp.arm_cpu",
         )
-    else:
-        # For dynamic matrix-vector multiply we use a hand written kernel.
-        if (
-            isinstance(inputs[0].shape[0], (int, tir.IntImm))
-            and inputs[0].shape[0] == 1
-            and (
-                topi.utils.is_dynamic_shape(inputs[0].shape)
-                or topi.utils.is_dynamic_shape(inputs[1].shape)
-            )
-        ):
-            strategy.add_implementation(
-                wrap_compute_dense(topi.x86.dense_dynamic),
-                wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
-                name="dense_dynamic.x86",
-                plevel=20,
-            )
-            return strategy
-        logger.warning("dense is not optimized for arm cpu.")
+        return strategy
+
+    # For dynamic matrix-vector multiply we use a hand written kernel.
+    if (
+        isinstance(inputs[0].shape[0], (int, tir.IntImm))
+        and inputs[0].shape[0] == 1
+        and (
+            topi.utils.is_dynamic_shape(inputs[0].shape)
+            or topi.utils.is_dynamic_shape(inputs[1].shape)
+        )
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.x86.dense_dynamic),
+            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
+            name="dense_dynamic.x86",
+            plevel=20,
+        )
+        return strategy
+
+    need_auto_scheduler_layout = is_auto_scheduler_enabled()
+    need_meta_schedule_layout = is_meta_schedule_enabled()
+    if need_auto_scheduler_layout or need_meta_schedule_layout:
         strategy.add_implementation(
             wrap_compute_dense(
                 topi.nn.dense,
-                need_auto_scheduler_layout=is_auto_scheduler_enabled(),
-                need_meta_schedule_layout=is_meta_schedule_enabled(),
+                need_auto_scheduler_layout=need_auto_scheduler_layout,
+                need_meta_schedule_layout=need_meta_schedule_layout,
             ),
-            wrap_topi_schedule(topi.generic.schedule_dense),
+            naive_schedule,
             name="dense.generic",
+            plevel=11,
         )
+
+    # Fallback to x86 schedules as there is currently no arm_cpu schedule for 
dense
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_nopack),
+        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
+        name="dense_nopack.x86",
+        plevel=5,
+    )
+    strategy.add_implementation(
+        wrap_compute_dense(topi.x86.dense_pack),
+        wrap_topi_schedule(topi.x86.schedule_dense_pack),
+        name="dense_pack.x86",
+        plevel=10,
+    )
+
     return strategy
 
 
diff --git a/tests/python/relay/strategy/test_select_implementation.py 
b/tests/python/relay/strategy/test_select_implementation.py
index eae186524c..20dfe9670a 100644
--- a/tests/python/relay/strategy/test_select_implementation.py
+++ b/tests/python/relay/strategy/test_select_implementation.py
@@ -16,7 +16,10 @@
 # under the License.
 
 """ Tests strategy selection for Relay ops """
+
 import pytest
+import numpy as np
+
 import tvm
 from tvm import relay
 from tvm import te
@@ -149,5 +152,40 @@ def test_int8_depthwise_conv2d(target, expected_impl):
     assert impl.name == expected_impl
 
 
[email protected](
+    "target,expected_valid_impl,expected_impl",
+    [("llvm -device=arm_cpu", ["dense_pack.x86", "dense_nopack.x86"], 
"dense_pack.x86")],
+)
+def test_dense(target, expected_valid_impl, expected_impl):
+    target = tvm.target.Target(target)
+
+    data_shape = (30, 40)
+    weight_shape = (30, 40)
+    dtype = "float32"
+
+    out = relay.nn.dense(
+        relay.var("data", shape=data_shape, dtype=dtype),
+        relay.var("weight", shape=weight_shape, dtype=dtype),
+        out_dtype=dtype,
+    )
+    out = run_infer_type(out)
+
+    with target:
+        args = [
+            out.op,
+            out.attrs,
+            [te.placeholder(data_shape, dtype), te.placeholder(weight_shape, 
dtype)],
+            out.checked_type,
+            target,
+        ]
+        valid_impl = relay.backend.te_compiler.get_valid_implementations(*args)
+        selected_impl, _ = 
relay.backend.te_compiler.select_implementation(*args, use_autotvm=False)
+
+    assert len(valid_impl) == len(expected_valid_impl)
+    for impl in valid_impl:
+        assert impl.name in expected_valid_impl
+    assert selected_impl.name == expected_impl
+
+
 if __name__ == "__main__":
     tvm.testing.main()

Reply via email to