This is an automated email from the ASF dual-hosted git repository.

mousius pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e046daf9e [CMSIS-NN] Reduction in code size of AOT test runner binary (#13815)
0e046daf9e is described below

commit 0e046daf9e51724b3910aa7ba199069b09e2707e
Author: Nicola Lancellotti <[email protected]>
AuthorDate: Mon Feb 27 12:13:53 2023 +0100

    [CMSIS-NN] Reduction in code size of AOT test runner binary (#13815)
    
    * [CMSIS-NN] Reduction in code size of AOT test runner binary
    
    
    Co-authored-by: Ashutosh Parkhi <[email protected]>
---
 .../dsp/micro_kernel/multi_channel_convolve.py     | 58 ++++++++++++++--------
 tests/python/relay/aot/corstone300.mk              |  8 +--
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
index 25588964ee..90ca04ac9f 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/multi_channel_convolve.py
@@ -116,15 +116,15 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
             tensor_c3210, \
             sum_c0, sum_c1, sum_c2, sum_c3) {{ \
           \
-          uint32_t kernel_c3210 = *arranged_kernel++; \
+          int32_t kernel_c3210 = *arranged_kernel++; \
           \
-          uint32_t tensor_c20 = __sxtb16(tensor_c3210); \
-          uint32_t kernel_c20 = __sxtb16(kernel_c3210); \
+          int32_t tensor_c20 = __sxtb16(tensor_c3210); \
+          int32_t kernel_c20 = __sxtb16(kernel_c3210); \
           sum_c0 = __builtin_arm_smlabb(tensor_c20, kernel_c20, sum_c0); \
           sum_c2 = __builtin_arm_smlatt(tensor_c20, kernel_c20, sum_c2); \
           \
-          uint32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
-          uint32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
+          int32_t tensor_c31 = __sxtb16(__ror(tensor_c3210, 8)); \
+          int32_t kernel_c31 = __sxtb16(__ror(kernel_c3210, 8)); \
           sum_c1 = __builtin_arm_smlabb(tensor_c31, kernel_c31, sum_c1); \
           sum_c3 = __builtin_arm_smlatt(tensor_c31, kernel_c31, sum_c3); \
         }}
@@ -134,22 +134,30 @@ def _quad_int8_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, ke
         extern "C"
         #endif
         int32_t {_get_func_name("int8", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-            uint32_t *out,
-            uint32_t *tensor,
-            uint32_t *kernel) {{
+            int32_t *out,
+            int8_t *tensor,
+            int8_t *kernel) {{
 
-          uint32_t sum_c0 = 0;
-          uint32_t sum_c1 = 0;
-          uint32_t sum_c2 = 0;
-          uint32_t sum_c3 = 0;
+          int32_t sum_c0 = 0;
+          int32_t sum_c1 = 0;
+          int32_t sum_c2 = 0;
+          int32_t sum_c3 = 0;
+
+          int32_t kernel_i32[{kernel_h} * {kernel_w}];
+          memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
+          int32_t *arranged_kernel = kernel_i32;
+
+          int32_t tensor_length = {((kernel_w - 1) * (channels // 4) + (kernel_h - 1) * tensor_w * (channels // 4)) + 1};
+          int32_t tensor_i32[tensor_length];
+          memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
 
           #pragma GCC unroll 3
           for (int i = 0; i < {kernel_h}; i++) {{
             #pragma GCC unroll 3
             for (int j = 0; j < {kernel_w}; j++) {{
               TVMGEN_QUAD_INT8_CHANNEL_REARRANGE_SUM_DSP(
-                kernel,
-                *(tensor + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
+                arranged_kernel,
+                *(tensor_i32 + j * {channels // 4} + i * {tensor_w * (channels // 4)}),
                 sum_c0, sum_c1, sum_c2, sum_c3)
             }}
           }}
@@ -179,20 +187,26 @@ def _dual_int16_channel_convolve_impl(_tensor_h, tensor_w, channels, kernel_h, k
         extern "C"
         #endif
         int32_t {_get_func_name("int16", tensor_w, channels, kernel_h, kernel_w, suffix)}(
-            uint32_t *out,
-            uint32_t *tensor,
-            uint32_t *kernel) {{
+            int32_t *out,
+            int16_t *tensor,
+            int16_t *kernel) {{
+
+          int32_t sum_c0 = 0;
+          int32_t sum_c1 = 0;
+
+          int32_t kernel_i32[{kernel_h} * {kernel_w}];
+          memcpy(kernel_i32, kernel, {kernel_h} * {kernel_w} * sizeof(int32_t));
 
-          uint32_t sum_c0 = 0;
-          uint32_t sum_c1 = 0;
+          int32_t tensor_length = {((kernel_w - 1) * (channels // 2) + (kernel_h - 1) * tensor_w * (channels // 2)) + 1};
+          int32_t tensor_i32[tensor_length];
+          memcpy(tensor_i32, tensor, tensor_length * sizeof(int32_t));
 
           #pragma GCC unroll 3
           for (int i = 0; i < {kernel_h}; i++) {{
             #pragma GCC unroll 3
             for (int j = 0; j < {kernel_w}; j++) {{
-              uint32_t tensor_c10 = *(tensor + j * {channels // 2}
-                + i * {tensor_w * (channels // 2)});
-              uint32_t kernel_c10 = *kernel++;
+              int32_t tensor_c10 = tensor_i32[j * {channels // 2} + i * {tensor_w * (channels // 2)}];
+              int32_t kernel_c10 = kernel_i32[{kernel_w} * i + j];
               sum_c0 = __builtin_arm_smlabb(tensor_c10, kernel_c10, sum_c0);
               sum_c1 = __builtin_arm_smlatt(tensor_c10, kernel_c10, sum_c1);
             }}
diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk
index 61373ec3ef..1ac1ebfa0c 100644
--- a/tests/python/relay/aot/corstone300.mk
+++ b/tests/python/relay/aot/corstone300.mk
@@ -43,7 +43,7 @@ DRIVER_PATH=${ETHOSU_PATH}/core_driver
 CMSIS_PATH=${ETHOSU_PATH}/cmsis
 ETHOSU_PLATFORM_PATH=/opt/arm/ethosu/core_platform
 CORSTONE_300_PATH = ${ETHOSU_PLATFORM_PATH}/targets/corstone-300
-PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
+PKG_COMPILE_OPTS = -Wall -Ofast -Wno-incompatible-pointer-types -Wno-format -Werror-implicit-function-declaration -mcpu=${MCPU}${MCPU_FLAGS} -mthumb -mfloat-abi=${MFLOAT_ABI} -std=gnu99
 CMAKE = /opt/arm/cmake/bin/cmake
 CC = arm-none-eabi-gcc
 AR = arm-none-eabi-ar
@@ -64,7 +64,8 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
 CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=${TVM_ROOT}/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake \
        -DCMAKE_SYSTEM_PROCESSOR=${MCPU}
 
-PKG_LDFLAGS = -lm -specs=nosys.specs -static -T ${AOT_TEST_ROOT}/corstone300.ld
+# -fdata-sections together with --gc-section may lead to smaller statically-linked executables
+PKG_LDFLAGS = -lm -specs=nosys.specs -static -Wl,--gc-sections -T ${AOT_TEST_ROOT}/corstone300.ld
 
 $(ifeq VERBOSE,1)
 QUIET ?=
@@ -113,9 +114,10 @@ ${build_dir}/libcmsis_startup.a: $(CMSIS_STARTUP_SRCS)
        $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_startup.a) $(abspath $(build_dir))/libcmsis_startup/*.o
        $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_startup.a)
 
+# -fdata-sections together with --gc-section may lead to smaller statically-linked executables
 ${build_dir}/libcmsis_nn.a: $(CMSIS_NN_SRCS)
        $(QUIET)mkdir -p $(abspath $(build_dir)/libcmsis_nn)
-       $(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -D${ARM_CPU} $^
+       $(QUIET)cd $(abspath $(build_dir)/libcmsis_nn) && $(CC) -c $(PKG_CFLAGS) -ffunction-sections -fdata-sections -D${ARM_CPU} $^
        $(QUIET)$(AR) -cr $(abspath $(build_dir)/libcmsis_nn.a) $(abspath $(build_dir))/libcmsis_nn/*.o
        $(QUIET)$(RANLIB) $(abspath $(build_dir)/libcmsis_nn.a)
 

Reply via email to