This is an automated email from the ASF dual-hosted git repository.
areusch pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new c51f429 [microTVM] Update support for ARMv7m intrinsic (#8990)
c51f429 is described below
commit c51f4295c4b13f5c0b7f853264a64eca9c1dab8f
Author: Sergei Smirnov <[email protected]>
AuthorDate: Thu Sep 30 16:29:58 2021 -0700
[microTVM] Update support for ARMv7m intrinsic (#8990)
* [microTVM] Update support for ARMv7m intrinsic
- Improved implementaion of gemm function for conv2d
- Removed %4 restriction for channels
- Added test case to verify SMLAD intrinsic speed acceleration
Signed-off-by: Sergey Smirnov <[email protected]>
* [microTVM] Update support for ARMv7m intrinsic
- Improved implementaion of gemm function for conv2d
- Removed %4 restriction for channels
- Added test case to verify SMLAD intrinsic speed acceleration
Signed-off-by: Sergey Smirnov <[email protected]>
* Implemented discussed changes.
* Removed unnecessary test files.
* Formatting fixed.
* Formatting fixed2.
* Formatting fixed3.
* Formatting fixed4.
* Formatting fixed5.
* Fixed test time result checking.
* Check rebuild.
* Formatting fixed.
* Formatting fixed.
* [microTVM] Update support for ARMv7m intrinsic
- Improved implementaion of gemm function for conv2d
- Removed %4 restriction for channels
- Added test case to verify SMLAD intrinsic speed acceleration
Signed-off-by: Sergey Smirnov <[email protected]>
* Implemented discussed changes.
* Removed unnecessary test files.
* Formatting fixed.
* Formatting fixed2.
* Formatting fixed3.
* Formatting fixed4.
* Formatting fixed5.
* Fixed test time result checking.
* Check rebuild.
* Formatting fixed.
* Issue 8717 Add schedule for depthwise_conv2d_nhwc
* Resolve merge conflict.
* Resolve merge conflicts.
* Fixed formatting.
* From Issue 8717//
Fixed micro model library test. Checking size reduced to 16 bytes from
2466816.
* From Issue 8717.
Removed changes.
* From Issue 8717. Fixed typo.
* Fixed import.
* Fixed import and method call.
* Added QEMU testing comment.
* Fixed ZEPHYR_BOARD usage.
* Fixed tests. Removed issue 8717 changes.
* Formatting fixed.
* Removed test call from base_box_test.sh
---
python/tvm/relay/op/strategy/arm_cpu.py | 3 +-
.../topi/arm_cpu/cortex_m7/conv2d/direct_simd.py | 13 +-
.../topi/arm_cpu/cortex_m7/micro_kernel/gemm.py | 328 +++++++++++++++++++--
tests/micro/zephyr/test_utils.py | 129 ++++++++
tests/micro/zephyr/test_zephyr_aot.py | 100 +------
tests/micro/zephyr/test_zephyr_armv7m.py | 225 ++++++++++++++
6 files changed, 679 insertions(+), 119 deletions(-)
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py
b/python/tvm/relay/op/strategy/arm_cpu.py
index 2d331d0..e8731a0 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -128,8 +128,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
name="conv2d_hwcn.generic",
)
elif layout == "NHWC":
- channels = data.shape[3]
- if "SMLAD" in isa and (channels % 4) == 0 and kernel_layout ==
"HWOI":
+ if "SMLAD" in isa and kernel_layout == "HWOI":
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_direct_simd),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd),
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
index 988c3a9..3073120 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
+++ b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
@@ -112,10 +112,14 @@ def conv2d_direct_simd_compute(cfg, data, kernel,
strides, padding, dilation, ou
cfg.reduce_axis(in_channels.value),
)
- assert in_channels.value % 4 == 0
owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
cio, cii = cfg.define_split(
- "tile_ci", ci, policy="factors", num_outputs=2, filter=lambda x:
x.size[-1] % 4 == 0
+ "tile_ci",
+ ci,
+ policy="factors",
+ num_outputs=2,
+ # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
+ filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
)
coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)
@@ -134,6 +138,11 @@ def conv2d_direct_simd_compute(cfg, data, kernel, strides,
padding, dilation, ou
cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
cfg.define_knob("unroll_explicit", [0, 1])
+ if cfg.is_fallback:
+ cfg.fallback_split("tile_ow", [-1, out_width.value])
+ cfg.fallback_split("tile_ci", [-1, in_channels.value])
+ cfg.fallback_split("tile_co", [-1, out_channels.value])
+
return conv
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
index fb6f7a5..9a00fe2 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
@@ -46,9 +46,8 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
K = K.value
if isinstance(N, tvm.tir.IntImm):
N = N.value
- assert K % 4 == 0
# TODO(weberlo, areusch): support more dtypes?
- assert in_dtype == "int8"
+ assert in_dtype in ("int8", "int16")
assert out_dtype == "int32"
A = te.placeholder((M, K), name="a", dtype=in_dtype)
B = te.placeholder((N, K), name="b", dtype=in_dtype)
@@ -71,13 +70,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
def intrin_func(ins, outs):
aa, bb = ins
cc = outs[0]
+ gemm_func_prefix = "gemm" if in_dtype == "int8" else "gemm16"
def _reduce_update():
ib = tvm.tir.ir_builder.create()
ib.emit(
tvm.tir.call_extern(
"int32",
- f"gemm_{M}x{K}x{N}_update_{uniq_id}",
+ f"{gemm_func_prefix}_{M}x{K}x{N}_update_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
@@ -102,7 +102,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
ib.emit(
tvm.tir.call_extern(
"int32",
- f"gemm_{M}x{K}x{N}_body_{uniq_id}",
+ f"{gemm_func_prefix}_{M}x{K}x{N}_body_{uniq_id}",
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
@@ -122,7 +122,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
def gemm_MxKxN_impl(M, K, N, uniq_id):
"""Emit C code for gemm impl."""
# TODO(weberlo, areusch): are there any SIMD tricks to zero out arrays
quickly?
- aa_pad_size = M * K
+ # aa_pad_size = M * K
bb_pad_size = N * K
# code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
cc_code = f"""
@@ -132,32 +132,270 @@ extern "C"
#include <arm_math.h>
#include <arm_nnsupportfunctions.h>
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_body_rest_{uniq_id}(
+ int K,
+ int8_t *aa, int8_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ int k_base = (K / 4) * 4;
+ switch ( K % 4 ) {{
+ case 1:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
+ }}
+ }}
+ break;
+ case 2:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
+ + (int32_t) a_ptr[1] * (int32_t) b_ptr[1];
+ }}
+ }}
+ break;
+ case 3:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
+ + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]
+ + (int32_t) a_ptr[2] * (int32_t) b_ptr[2];
+ }}
+ }}
+ break;
+ }}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_loop_{uniq_id}(
+ int8_t *aa, int8_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int32_t sum = 0;
+ for (int l = 0; l < {K}; l++) {{
+ sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
+ }}
+ // NOTE: this is the line where `*_body` differs from `*_update`. here
+ // we're *setting* the result, instead of accumulating, because we know
+ // the `i` and `j` itervars span their entire respective axes.
+ cc[i*C_stride + j] = sum;
+ }}
+ }}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_body_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
- int16_t aa_pad[{aa_pad_size}];
int16_t bb_pad[{bb_pad_size}];
+ if ( {M} < 16 || {N} < 16 )
+ return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+
+ for (int i = 0; i < {N}; i++)
+ for (int j = 0; j < {K} / 4; j++)
+ read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4],
(int32_t*) &bb_pad[i*{K} + j*4 + 2]);
+
for (int i = 0; i < {M}; i++) {{
- for (int j = 0; j < {K} / 4; j++) {{
- read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4],
(int32_t*) &aa_pad[i*{K} + j*4 + 2]);
+ int16_t aa_pad_line[{K}];
+ for (int l = 0; l < {K} / 4; l++)
+ read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4],
(int32_t*) &aa_pad_line[l*4 + 2]);
+
+ for (int j = 0; j < {N}; j++) {{
+ int32_t *aa_ptr = (int32_t *) aa_pad_line;
+ int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
+ int32_t sum = 0;
+ for (int l = 0; l < 2 * ({K} / 4); l++) {{
+ sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
+ ++ aa_ptr; ++ bb_ptr;
+ }}
+ // NOTE: this is the line where `*_body` differs from `*_update`. here
+ // we're *setting* the result, instead of accumulating, because we know
+ // the `i` and `j` itervars span their entire respective axes.
+ cc[i*C_stride + j] = sum;
}}
}}
- for (int i = 0; i < {N}; i++) {{
- for (int j = 0; j < {K} / 4; j++) {{
+ if ( {K} % 4 != 0 )
+ gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
+
+ return 0;
+}}
+
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{N}_update_rest_{uniq_id}(
+ int K,
+ int8_t *aa, int8_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ int k_base = (K / 4) * 4;
+ switch ( K % 4 ) {{
+ case 1:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
+ }}
+ }}
+ break;
+ case 2:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
+ + (int32_t) a_ptr[1] * (int32_t) b_ptr[1];
+ }}
+ }}
+ break;
+ case 3:
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int8_t *a_ptr = &aa[i * A_stride + k_base];
+ int8_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0]
+ + (int32_t) a_ptr[1] * (int32_t) b_ptr[1]
+ + (int32_t) a_ptr[2] * (int32_t) b_ptr[2];
+ }}
+ }}
+ break;
+ }}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_loop_{uniq_id}(
+ int8_t *aa, int8_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int32_t sum = 0;
+ for (int l = 0; l < {K}; l++) {{
+ sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
+ }}
+ cc[i*C_stride + j] += sum;
+ }}
+ }}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
+ int8_t *aa, int8_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ int16_t bb_pad[{bb_pad_size}];
+
+ if ( {M} < 16 || {N} < 16 )
+ return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+
+ for (int i = 0; i < {N}; i++)
+ for (int j = 0; j < {K} / 4; j++)
read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4],
(int32_t*) &bb_pad[i*{K} + j*4 + 2]);
+
+ for (int i = 0; i < {M}; i++) {{
+ int16_t aa_pad_line[{K}];
+ for (int l = 0; l < {K} / 4; l++)
+ read_and_pad(&aa[i*A_stride + l*4], (int32_t*) &aa_pad_line[l*4],
(int32_t*) &aa_pad_line[l*4 + 2]);
+
+ for (int j = 0; j < {N}; j++) {{
+ int32_t *aa_ptr = (int32_t *) aa_pad_line;
+ int32_t *bb_ptr = (int32_t *) &bb_pad[j*{K}];
+ int32_t sum = 0;
+ for (int l = 0; l < 2 * ({K} / 4); l++) {{
+ sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
+ ++ aa_ptr; ++ bb_ptr;
+ }}
+ cc[i*C_stride + j] += sum;
+ }}
+ }}
+
+ if ( {K} % 4 != 0 )
+ gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
+
+ return 0;
+}}
+
+
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_body_rest_{uniq_id}(
+ int K,
+ int16_t *aa, int16_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ int k_base = (K / 2) * 2;
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int16_t *a_ptr = &aa[i * A_stride + k_base];
+ int16_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] = (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
+ }}
+ }}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(
+ int16_t *aa, int16_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int32_t sum = 0;
+ for (int l = 0; l < {K}; l++) {{
+ sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
+ }}
+ // NOTE: this is the line where `*_body` differs from `*_update`. here
+ // we're *setting* the result, instead of accumulating, because we know
+ // the `i` and `j` itervars span their entire respective axes.
+ cc[i*C_stride + j] = sum;
}}
}}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}(
+ int16_t *aa, int16_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ if ( {M} < 2 || {N} < 2 )
+ return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
+ int32_t *aa_ptr = (int32_t *) &aa[i*A_stride];
+ int32_t *bb_ptr = (int32_t *) &bb[j*B_stride];
+
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
- sum = __SMLAD(
- *((int32_t*) &aa_pad[i*{K} + l*2]),
- *((int32_t*) &bb_pad[j*{K} + l*2]),
- sum);
+ sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
+ ++ aa_ptr; ++ bb_ptr;
}}
// NOTE: this is the line where `*_body` differs from `*_update`. here
// we're *setting* the result, instead of accumulating, because we know
@@ -166,46 +404,80 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_body_{uniq_id}(
}}
}}
+ if ( {K} % 2 != 0 )
+ gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
+
return 0;
}}
+
#ifdef __cplusplus
extern "C"
#endif
-__STATIC_FORCEINLINE int32_t gemm_{M}x{K}x{N}_update_{uniq_id}(
- int8_t *aa, int8_t *bb, int32_t *cc,
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{N}_update_rest_{uniq_id}(
+ int K,
+ int16_t *aa, int16_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
- int16_t aa_pad[{aa_pad_size}];
- int16_t bb_pad[{bb_pad_size}];
-
+ int k_base = (K / 2) * 2;
for (int i = 0; i < {M}; i++) {{
- for (int j = 0; j < {K} / 4; j++) {{
- read_and_pad(&aa[i*A_stride + j*4], (int32_t*) &aa_pad[i*{K} + j*4],
(int32_t*) &aa_pad[i*{K} + j*4 + 2]);
+ for (int j = 0; j < {N}; j++) {{
+ int16_t *a_ptr = &aa[i * A_stride + k_base];
+ int16_t *b_ptr = &bb[j * B_stride + k_base];
+ cc[i * C_stride + j] += (int32_t) a_ptr[0] * (int32_t) b_ptr[0];
}}
}}
+ return 0;
+}}
- for (int i = 0; i < {N}; i++) {{
- for (int j = 0; j < {K} / 4; j++) {{
- read_and_pad(&bb[i*B_stride + j*4], (int32_t*) &bb_pad[i*{K} + j*4],
(int32_t*) &bb_pad[i*{K} + j*4 + 2]);
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(
+ int16_t *aa, int16_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ for (int i = 0; i < {M}; i++) {{
+ for (int j = 0; j < {N}; j++) {{
+ int32_t sum = 0;
+ for (int l = 0; l < {K}; l++) {{
+ sum += (int32_t) aa[i*A_stride + l] * (int32_t) bb[j*B_stride + l];
+ }}
+ cc[i*C_stride + j] += sum;
}}
}}
+ return 0;
+}}
+
+#ifdef __cplusplus
+extern "C"
+#endif
+__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}(
+ int16_t *aa, int16_t *bb, int32_t *cc,
+ int A_stride, int B_stride, int C_stride) {{
+ if ( {M} < 2 || {N} < 2 )
+ return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
+ int32_t *aa_ptr = (int32_t *) &aa[i*A_stride];
+ int32_t *bb_ptr = (int32_t *) &bb[j*B_stride];
+
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
- sum = __SMLAD(
- *((int32_t*) &aa_pad[i*{K} + l*2]),
- *((int32_t*) &bb_pad[j*{K} + l*2]),
- sum);
+ sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
+ ++ aa_ptr; ++ bb_ptr;
}}
cc[i*C_stride + j] += sum;
}}
}}
+ if ( {K} % 2 != 0 )
+ gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
+
return 0;
}}
+
+
#ifdef __cplusplus
extern "C"
#endif
diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py
index 54c3de2..c27c869 100644
--- a/tests/micro/zephyr/test_utils.py
+++ b/tests/micro/zephyr/test_utils.py
@@ -14,8 +14,21 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+import io
+import os
import json
import pathlib
+import logging
+import tarfile
+
+import numpy as np
+
+from urllib.request import urlopen, urlretrieve
+from urllib.error import HTTPError
+import json
+import requests
+
+import tvm.micro
TEMPLATE_PROJECT_DIR = (
@@ -60,3 +73,119 @@ def has_fpu(board: str):
fpu_boards = [name for name, board in board_properties.items() if
board["fpu"]]
return board in fpu_boards
+
+
+def build_project(temp_dir, zephyr_board, west_cmd, mod, build_config,
extra_files_tar=None):
+ project_dir = temp_dir / "project"
+ project = tvm.micro.generate_project(
+ str(TEMPLATE_PROJECT_DIR),
+ mod,
+ project_dir,
+ {
+ "extra_files_tar": extra_files_tar,
+ "project_type": "aot_demo",
+ "west_cmd": west_cmd,
+ "verbose": bool(build_config.get("debug")),
+ "zephyr_board": zephyr_board,
+ },
+ )
+ project.build()
+ return project, project_dir
+
+
+def create_header_file(tensor_name, npy_data, output_path, tar_file):
+ """
+ This method generates a header file containing the data contained in the
numpy array provided.
+ It is used to capture the tensor data (for both inputs and expected
outputs).
+ """
+ header_file = io.StringIO()
+ header_file.write("#include <stddef.h>\n")
+ header_file.write("#include <stdint.h>\n")
+ header_file.write("#include <dlpack/dlpack.h>\n")
+ header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n")
+
+ if npy_data.dtype == "int8":
+ header_file.write(f"int8_t {tensor_name}[] =")
+ elif npy_data.dtype == "int32":
+ header_file.write(f"int32_t {tensor_name}[] = ")
+ elif npy_data.dtype == "uint8":
+ header_file.write(f"uint8_t {tensor_name}[] = ")
+ elif npy_data.dtype == "float32":
+ header_file.write(f"float {tensor_name}[] = ")
+ else:
+ raise ValueError("Data type not expected.")
+
+ header_file.write("{")
+ for i in np.ndindex(npy_data.shape):
+ header_file.write(f"{npy_data[i]}, ")
+ header_file.write("};\n\n")
+
+ header_file_bytes = bytes(header_file.getvalue(), "utf-8")
+ raw_path = pathlib.Path(output_path) / f"{tensor_name}.h"
+ ti = tarfile.TarInfo(name=str(raw_path))
+ ti.size = len(header_file_bytes)
+ ti.mode = 0o644
+ ti.type = tarfile.REGTYPE
+ tar_file.addfile(ti, io.BytesIO(header_file_bytes))
+
+
+def _read_line(fd, timeout_sec: int):
+ data = ""
+ new_line = False
+ while True:
+ if new_line:
+ break
+ new_data = fd.read(1, timeout_sec=timeout_sec)
+ logging.debug(f"read data: {new_data}")
+ for item in new_data:
+ new_c = chr(item)
+ data = data + new_c
+ if new_c == "\n":
+ new_line = True
+ break
+ return data
+
+
+def get_message(fd, expr: str, timeout_sec: int):
+ while True:
+ data = _read_line(fd, timeout_sec)
+ logging.debug(f"new line: {data}")
+ if expr in data:
+ return data
+
+
+# TODO move CMSIS integration to microtvm_api_server.py
+# see
https://discuss.tvm.apache.org/t/tvm-capturing-dependent-libraries-of-code-generated-tir-initially-for-use-in-model-library-format/11080
+def loadCMSIS(temp_dir):
+ REPO_PATH = "ARM-software/CMSIS_5"
+ BRANCH = "master"
+ API_PATH_URL = f"https://api.github.com/repos/{REPO_PATH}/git/trees"
+ RAW_PATH_URL = f"https://raw.githubusercontent.com/{REPO_PATH}/{BRANCH}"
+
+ url =
"https://api.github.com/repos/ARM-software/CMSIS_5/git/trees/master?recursive=1"
+ r = requests.get(url)
+ res = r.json()
+
+ include_trees = {}
+
+ for file in res["tree"]:
+ if file["path"] in {"CMSIS/DSP/Include", "CMSIS/DSP/Include/dsp",
"CMSIS/NN/Include"}:
+ include_trees.update({file["path"]: file["sha"]})
+
+ for path, sha in include_trees.items():
+ url = f"{API_PATH_URL}/{sha}"
+ content = json.load(urlopen(url))
+ temp_path = f"{temp_dir}"
+ if path == "CMSIS/DSP/Include/dsp":
+ temp_path = f"{temp_dir}/dsp"
+ if not os.path.isdir(temp_path):
+ os.makedirs(temp_path)
+ for item in content["tree"]:
+ if item["type"] == "blob":
+ file_name = item["path"]
+ file_url = f"{RAW_PATH_URL}/{path}/{file_name}"
+ print(file_name, " ", file_url)
+ try:
+ urlretrieve(file_url, f"{temp_path}/{file_name}")
+ except HTTPError as e:
+ print(f"Failed to download {file_url}: {e}")
diff --git a/tests/micro/zephyr/test_zephyr_aot.py
b/tests/micro/zephyr/test_zephyr_aot.py
index f03b8ec..a8a7a99 100644
--- a/tests/micro/zephyr/test_zephyr_aot.py
+++ b/tests/micro/zephyr/test_zephyr_aot.py
@@ -27,6 +27,7 @@ import pytest
import numpy as np
import tvm
+import tvm.testing
from tvm.micro.project_api import server
import tvm.relay as relay
@@ -36,85 +37,6 @@ from tvm.micro.interface_api import
generate_c_interface_header
import test_utils
-def _build_project(temp_dir, zephyr_board, west_cmd, mod, build_config,
extra_files_tar=None):
- project_dir = temp_dir / "project"
- project = tvm.micro.generate_project(
- str(test_utils.TEMPLATE_PROJECT_DIR),
- mod,
- project_dir,
- {
- "extra_files_tar": extra_files_tar,
- "project_type": "aot_demo",
- "west_cmd": west_cmd,
- "verbose": bool(build_config.get("debug")),
- "zephyr_board": zephyr_board,
- },
- )
- project.build()
- return project, project_dir
-
-
-def _create_header_file(tensor_name, npy_data, output_path, tar_file):
- """
- This method generates a header file containing the data contained in the
numpy array provided.
- It is used to capture the tensor data (for both inputs and expected
outputs).
- """
- header_file = io.StringIO()
- header_file.write("#include <stddef.h>\n")
- header_file.write("#include <stdint.h>\n")
- header_file.write("#include <dlpack/dlpack.h>\n")
- header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n")
-
- if npy_data.dtype == "int8":
- header_file.write(f"int8_t {tensor_name}[] =")
- elif npy_data.dtype == "int32":
- header_file.write(f"int32_t {tensor_name}[] = ")
- elif npy_data.dtype == "uint8":
- header_file.write(f"uint8_t {tensor_name}[] = ")
- elif npy_data.dtype == "float32":
- header_file.write(f"float {tensor_name}[] = ")
- else:
- raise ValueError("Data type not expected.")
-
- header_file.write("{")
- for i in np.ndindex(npy_data.shape):
- header_file.write(f"{npy_data[i]}, ")
- header_file.write("};\n\n")
-
- header_file_bytes = bytes(header_file.getvalue(), "utf-8")
- raw_path = pathlib.Path(output_path) / f"{tensor_name}.h"
- ti = tarfile.TarInfo(name=str(raw_path))
- ti.size = len(header_file_bytes)
- ti.mode = 0o644
- ti.type = tarfile.REGTYPE
- tar_file.addfile(ti, io.BytesIO(header_file_bytes))
-
-
-def _read_line(fd, timeout_sec: int):
- data = ""
- new_line = False
- while True:
- if new_line:
- break
- new_data = fd.read(1, timeout_sec=timeout_sec)
- logging.debug(f"read data: {new_data}")
- for item in new_data:
- new_c = chr(item)
- data = data + new_c
- if new_c == "\n":
- new_line = True
- break
- return data
-
-
-def _get_message(fd, expr: str, timeout_sec: int):
- while True:
- data = _read_line(fd, timeout_sec)
- logging.debug(f"new line: {data}")
- if expr in data:
- return data
-
-
@tvm.testing.requires_micro
def test_tflite(temp_dir, board, west_cmd, tvm_debug):
"""Testing a TFLite model."""
@@ -175,12 +97,12 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug):
)
tf.add(header_path, arcname=os.path.relpath(header_path,
tar_temp_dir))
- _create_header_file("input_data", sample, "include", tf)
- _create_header_file(
+ test_utils.create_header_file("input_data", sample, "include", tf)
+ test_utils.create_header_file(
"output_data", np.zeros(shape=output_shape, dtype="float32"),
"include", tf
)
- project, _ = _build_project(
+ project, _ = test_utils.build_project(
temp_dir,
board,
west_cmd,
@@ -192,9 +114,9 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug):
project.flash()
with project.transport() as transport:
timeout_read = 60
- _get_message(transport, "#wakeup", timeout_sec=timeout_read)
+ test_utils.get_message(transport, "#wakeup", timeout_sec=timeout_read)
transport.write(b"start\n", timeout_sec=5)
- result_line = _get_message(transport, "#result",
timeout_sec=timeout_read)
+ result_line = test_utils.get_message(transport, "#result",
timeout_sec=timeout_read)
result_line = result_line.strip("\n")
result_line = result_line.split(":")
@@ -236,10 +158,14 @@ def test_qemu_make_fail(temp_dir, board, west_cmd,
tvm_debug):
lowered.libmod_name, ["input_1"], ["output"],
model_files_path
)
tf.add(header_path, arcname=os.path.relpath(header_path,
tar_temp_dir))
- _create_header_file("input_data", np.zeros(shape=shape,
dtype=dtype), "include", tf)
- _create_header_file("output_data", np.zeros(shape=shape,
dtype=dtype), "include", tf)
+ test_utils.create_header_file(
+ "input_data", np.zeros(shape=shape, dtype=dtype), "include", tf
+ )
+ test_utils.create_header_file(
+ "output_data", np.zeros(shape=shape, dtype=dtype), "include",
tf
+ )
- project, project_dir = _build_project(
+ project, project_dir = test_utils.build_project(
temp_dir,
board,
west_cmd,
diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py
b/tests/micro/zephyr/test_zephyr_armv7m.py
new file mode 100644
index 0000000..350f7e2
--- /dev/null
+++ b/tests/micro/zephyr/test_zephyr_armv7m.py
@@ -0,0 +1,225 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import logging
+import os
+import pathlib
+import sys
+import tarfile
+import tempfile
+
+import pytest
+import numpy as np
+
+import test_utils
+
+import tvm
+import tvm.rpc
+import tvm.micro
+import tvm.testing
+from tvm import relay
+
+from tvm.contrib.download import download_testdata
+from tvm.micro.interface_api import generate_c_interface_header
+
+import conftest
+
+
+_LOG = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+def _open_tflite_model():
+ # Import TFLite model
+
+ model_url =
"https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/mnist_model_quant.tflite"
+ model_path = download_testdata(model_url, "mnist_model_quant.tflite",
module="model")
+
+ tflite_model_buf = open(model_path, "rb").read()
+
+ try:
+ import tflite
+
+ tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+ except AttributeError:
+ import tflite.Model
+
+ tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+
+ relay_mod, params = relay.frontend.from_tflite(tflite_model)
+
+ return relay_mod, params
+
+
+def _get_test_data(testdata_dir):
+
+ from PIL import Image
+
+ image_files = ["digit-2.jpg"]
+
+ for file in image_files:
+ img = Image.open(testdata_dir / file).resize((28, 28))
+ img = np.asarray(img).astype("uint8")
+ sample = np.reshape(img, -1)
+
+ output_shape = (1, 10)
+
+ return sample, output_shape
+
+
+def _apply_desired_layout_simd(relay_mod):
+
+ desired_layouts = {"qnn.conv2d": ["NHWC", "HWOI"], "nn.conv2d": ["NHWC",
"HWOI"]}
+
+ seq = tvm.transform.Sequential(
+ [relay.transform.RemoveUnusedFunctions(),
relay.transform.ConvertLayout(desired_layouts)]
+ )
+
+ with tvm.transform.PassContext(opt_level=3):
+ return seq(relay_mod)
+
+
+def _apply_desired_layout_no_simd(relay_mod):
+
+ desired_layouts = {"qnn.conv2d": ["NHWC", "HWIO"], "nn.conv2d": ["NHWC",
"HWIO"]}
+
+ seq = tvm.transform.Sequential(
+ [relay.transform.RemoveUnusedFunctions(),
relay.transform.ConvertLayout(desired_layouts)]
+ )
+
+ with tvm.transform.PassContext(opt_level=3):
+ return seq(relay_mod)
+
+
+def _generate_project(temp_dir, board, west_cmd, lowered, build_config,
sample, output_shape):
+
+ with tempfile.NamedTemporaryFile() as tar_temp_file:
+ with tarfile.open(tar_temp_file.name, "w:gz") as tf:
+ with tempfile.TemporaryDirectory() as tar_temp_dir:
+ model_files_path = os.path.join(tar_temp_dir, "include")
+ os.mkdir(model_files_path)
+ test_utils.loadCMSIS(model_files_path)
+ tf.add(model_files_path,
arcname=os.path.relpath(model_files_path, tar_temp_dir))
+ header_path = generate_c_interface_header(
+ lowered.libmod_name, ["input_1"], ["output"],
model_files_path
+ )
+ tf.add(header_path, arcname=os.path.relpath(header_path,
tar_temp_dir))
+
+ test_utils.create_header_file("input_data", sample, "include", tf)
+ test_utils.create_header_file(
+ "output_data", np.zeros(shape=output_shape, dtype="float32"),
"include", tf
+ )
+
+ project, _ = test_utils.build_project(
+ temp_dir,
+ board,
+ west_cmd,
+ lowered,
+ build_config,
+ extra_files_tar=tar_temp_file.name,
+ )
+
+ return project
+
+
+def _run_model(temp_dir, board, west_cmd, lowered, build_config, sample,
output_shape):
+
+ project = _generate_project(
+ temp_dir, board, west_cmd, lowered, build_config, sample, output_shape
+ )
+
+ project.flash()
+
+ with project.transport() as transport:
+ timeout_read = 60
+ transport.write(b"start\n", timeout_sec=5)
+ result_line = test_utils.get_message(transport, "#result",
timeout_sec=timeout_read)
+
+ result_line = result_line.strip("\n")
+ result_line = result_line.split(":")
+ result = int(result_line[1])
+ time = int(result_line[2])
+ logging.info(f"Result: {result}\ttime: {time} ms")
+
+ return result, time
+
+
[email protected]_micro
+def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug):
+ """Testing a ARM v7m SIMD extension."""
+
+ if board not in [
+ "mps2_an521",
+ "stm32f746xx_disco",
+ "nucleo_f746zg",
+ "nucleo_l4r5zi",
+ ]:
+ pytest.skip(msg="Platform does not support ARM v7m SIMD extenion.")
+
+ model = test_utils.ZEPHYR_BOARDS[board]
+
+ build_config = {"debug": tvm_debug}
+
+ this_dir = pathlib.Path(os.path.dirname(__file__))
+ testdata_dir = this_dir.parent / "testdata" / "mnist"
+
+ relay_mod, params = _open_tflite_model()
+
+ sample, output_shape = _get_test_data(testdata_dir)
+
+ relay_mod_simd = _apply_desired_layout_simd(relay_mod)
+ # kernel layout "HWIO" is not supported by arm_cpu SIMD extension (see
tvm\python\relay\op\strategy\arm_cpu.py)
+ relay_mod_no_simd = _apply_desired_layout_no_simd(relay_mod)
+
+ target = tvm.target.target.micro(
+ model,
+ options=[
+ "-keys=arm_cpu,cpu",
+ "-link-params=1",
+ "--executor=aot",
+ "--unpacked-api=1",
+ "--interface-api=c",
+ ],
+ )
+
+ temp_dir_simd = temp_dir / "simd"
+ temp_dir_no_simd = temp_dir / "nosimd"
+
+ os.makedirs(temp_dir_simd, exist_ok=True)
+ os.makedirs(temp_dir_no_simd, exist_ok=True)
+
+ with tvm.transform.PassContext(opt_level=3,
config={"tir.disable_vectorize": True}):
+ lowered_simd = relay.build(relay_mod_simd, target, params=params)
+ lowered_no_simd = relay.build(relay_mod_no_simd, target, params=params)
+ result_simd, time_simd = _run_model(
+ temp_dir_simd, board, west_cmd, lowered_simd, build_config,
sample, output_shape
+ )
+ result_no_simd, time_no_simd = _run_model(
+ temp_dir_no_simd, board, west_cmd, lowered_no_simd, build_config,
sample, output_shape
+ )
+
+ assert result_no_simd == result_simd == 2
+
+ # Time performance measurements on QEMU emulator are always equal to zero.
+ if board not in [
+ "mps2_an521",
+ ]:
+ assert time_no_simd > time_simd
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__] + sys.argv[1:]))