This is an automated email from the ASF dual-hosted git repository.
ekalda pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 63a95d54f4 [Relay][Strategy] Fix `arm_cpu` int8 conv2d schedule
selection for 32-bit targets (#15468)
63a95d54f4 is described below
commit 63a95d54f44032eb5c0658a4ca6f775119722239
Author: Luke Hutton <[email protected]>
AuthorDate: Fri Aug 4 16:26:03 2023 +0100
[Relay][Strategy] Fix `arm_cpu` int8 conv2d schedule selection for 32-bit
targets (#15468)
[Relay][Strategy] Fix `arm_cpu` int8 conv2d schedule selection for
32-bit targets
https://github.com/apache/tvm/pull/12455 slightly altered the behaviour
when selecting an int8 conv2d schedule. Previously conditions that
decide which schedule to select used `is_aarch64` which checks for the
existence of `aarch64` in the target triple. However, the conditions now
use `has_asimd` which is true if `aarch64` exists in the target triple
OR `+neon` is used in the mattr.
Both `conv2d_NHWC_quantized_interleaved.arm_cpu` and
`depthwise_conv2d_nhwc.arm_cpu` make calls to LLVM intrinsics that
require both `aarch64` and `+neon`. But in the case of the target
`rasp4b`, the updated conditions result in compilation failure since
the target has `+neon` but doesn't have `aarch64` in the target triple.
The conditions have been updated to fix the compilation failure.
Likewise, the previous behaviour of the condition for
`conv2d_nhwc_spatial_pack.arm_cpu` has been restored to ensure a program
with a 32-bit target can still be compiled.
Finally, we should only select the `depthwise_conv2d_nhwc_dsp.arm_cpu`
schedule when a backend that understands `pragma_import_c` has been
selected, i.e. "c".
For a more detailed discussion of the issue please see:
https://discuss.tvm.apache.org/t/tflite-llvm-llvm-error-when-compiling-tflite-model/15411
---
python/tvm/relay/op/strategy/arm_cpu.py | 10 ++-
.../relay/strategy/test_select_implementation.py | 97 ++++++++++++++++++++++
2 files changed, 103 insertions(+), 4 deletions(-)
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py
b/python/tvm/relay/op/strategy/arm_cpu.py
index dc3b16aa82..bbbe5bb732 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -211,21 +211,23 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
name="conv2d_nhwc_dsp.arm_cpu",
)
elif kernel_layout == "HWIO":
+ is_aarch64 = target.features.is_aarch64
has_asimd = target.features.has_asimd
has_dot_prod = target.features.has_dotprod
+
if has_dot_prod and data.dtype in ["int8", "uint8"]:
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
name="conv2d_NHWC_quantized_native.arm_cpu",
)
- if has_asimd and data.dtype in ["int8", "uint8"]:
+ if is_aarch64 and has_asimd and data.dtype in ["int8",
"uint8"]:
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
name="conv2d_NHWC_quantized_interleaved.arm_cpu",
)
- if (not has_asimd) or (data.dtype not in ["int8", "uint8"]):
+ if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]):
# TODO(@giuseros)
# This strategy errors out for quantized data types when
tuning.
# Let's use this only for non-aarch64 or non-quantized
cases
@@ -285,7 +287,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
)
elif layout == "NHWC":
assert kernel_layout == "HWOI"
- if target.features.has_asimd:
+ if target.features.is_aarch64 and target.features.has_asimd:
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
@@ -298,7 +300,6 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
# The int8 implementation DOES need the DSP unit (for SXTB16), but
it is not
# possible to use the DSP unit to speed up a NHWC depthwise
convolution (though
# an NCHW convolution would benefit).
-
elif (
dilation_w == dilation_h == 1
and kernel.shape[3] == 1 # channel_multiplier == 1
@@ -308,6 +309,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
or (data.shape[3] % 2 == 0 and data.dtype == "int16")
)
and (padding != "SAME" or data.shape[1] % stride_h ==
data.shape[2] % stride_w == 0)
+ and target.kind.name == "c"
# Ideally we should check that kernel is a Relay constant, but
strategy functions
# don't have access to the data needed to check this.
):
diff --git a/tests/python/relay/strategy/test_select_implementation.py
b/tests/python/relay/strategy/test_select_implementation.py
index 3e63bc4751..eae186524c 100644
--- a/tests/python/relay/strategy/test_select_implementation.py
+++ b/tests/python/relay/strategy/test_select_implementation.py
@@ -52,5 +52,102 @@ def test_concatenate(target, expected_implementation):
assert impl.name == expected_implementation
[email protected](
+ "target,expected_impl",
+ [
+ ("llvm -device=arm_cpu", "conv2d_nhwc_spatial_pack.arm_cpu"),
+ (
+ "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
+ "conv2d_NHWC_quantized_interleaved.arm_cpu",
+ ),
+ (
+ "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
+ "conv2d_nhwc_spatial_pack.arm_cpu",
+ ),
+ ],
+)
+def test_int8_conv2d(target, expected_impl):
+ target = tvm.target.Target(target)
+
+ dtype = "int8"
+ data_shape = (1, 1, 1, 4)
+ weight_shape = (1, 1, 4, 4)
+ data_layout = "NHWC"
+ kernel_layout = "HWIO"
+ channels = 4
+ kernel_size = (1, 1)
+
+ out = relay.nn.conv2d(
+ relay.var("data", shape=data_shape, dtype=dtype),
+ relay.var("weight", shape=weight_shape, dtype=dtype),
+ kernel_size=kernel_size,
+ channels=channels,
+ data_layout=data_layout,
+ kernel_layout=kernel_layout,
+ )
+ out = run_infer_type(out)
+
+ with target:
+ impl, _ = relay.backend.te_compiler.select_implementation(
+ out.op,
+ out.attrs,
+ [te.placeholder(data_shape, dtype), te.placeholder(weight_shape,
dtype)],
+ out.checked_type,
+ target,
+ )
+
+ assert impl.name == expected_impl
+
+
[email protected](
+ "target,expected_impl",
+ [
+ ("llvm -device=arm_cpu", "depthwise_conv2d_nhwc.generic"),
+ (
+ "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon",
+ "depthwise_conv2d_nhwc.arm_cpu",
+ ),
+ (
+ "llvm -device=arm_cpu -mtriple=armv8l-linux-gnu -mattr=+neon",
+ "depthwise_conv2d_nhwc.generic",
+ ),
+ ("c -device=arm_cpu -mcpu=cortex-m55",
"depthwise_conv2d_nhwc_dsp.arm_cpu"),
+ ],
+)
+def test_int8_depthwise_conv2d(target, expected_impl):
+ target = tvm.target.Target(target)
+
+ dtype = "int8"
+ out_dtype = "int32"
+ data_shape = (2, 2, 4, 8)
+ weight_shape = (2, 2, 8, 1)
+ data_layout = "NHWC"
+ kernel_layout = "HWOI"
+ groups = 8
+ kernel_size = (2, 2)
+
+ out = relay.nn.conv2d(
+ relay.var("data", shape=data_shape, dtype=dtype),
+ relay.var("weight", shape=weight_shape, dtype=dtype),
+ kernel_size=kernel_size,
+ data_layout=data_layout,
+ kernel_layout=kernel_layout,
+ groups=groups,
+ out_dtype=out_dtype,
+ )
+ out = run_infer_type(out)
+
+ with target:
+ impl, _ = relay.backend.te_compiler.select_implementation(
+ out.op,
+ out.attrs,
+ [te.placeholder(data_shape, dtype), te.placeholder(weight_shape,
dtype)],
+ out.checked_type,
+ target,
+ )
+
+ assert impl.name == expected_impl
+
+
if __name__ == "__main__":
tvm.testing.main()