This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git

commit fb17e33894c3524a0089189aaa1ef03ce6c56bff
Author: xinhaiteng <xinhait...@xiaomi.com>
AuthorDate: Sat Mar 2 21:45:24 2024 +0800

    Neon optimized Add operator
    
    VELAPLATFO-25411
    
    On the basis of CMSIS-NN, NEON intrinsics are used to optimize the Add
    operator: each loop iteration applies the input offsets and computes the
    addition for eight input and output values at once.
    
    Signed-off-by: xinhaiteng <xinhait...@xiaomi.com>
---
 .../operators/neon/arm_elementwise_add_s8.c        | 149 +++++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
new file mode 100644
index 000000000..678ae69fd
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
@@ -0,0 +1,149 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
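+/* Per-element computation, identical to the scalar tail loop below:
+ *
+ *   a1  = arm_nn_requantize((*input_1 + input_1_offset) << left_shift, input_1_mult, input_1_shift);
+ *   a2  = arm_nn_requantize((*input_2 + input_2_offset) << left_shift, input_2_mult, input_2_shift);
+ *   sum = arm_nn_requantize(a1 + a2, out_mult, out_shift) + out_offset;
+ *   *output = MIN(MAX(sum, out_activation_min), out_activation_max);
+ *
+ * The NEON loop performs the same steps on eight elements per iteration. */
+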
+/* NEON counterpart of arm_nn_requantize(): saturating rounding doubling
+ * high multiply by 'multiplier', combined with a shift by 'shift'
+ * (rounding to nearest for right shifts, i.e. shift <= 0). */
+__STATIC_FORCEINLINE int32x4_t arm_requantize_neon(const int32x4_t val, const int32_t multiplier, const int32_t shift)
+{
+    int32x4_t dividend = vqrdmulhq_n_s32(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier);
+    int32_t exponent = RIGHT_SHIFT(shift);
+    int32x4_t shift__ = vdupq_n_s32(-exponent);
+    int32x4_t fixup__ = vshrq_n_s32(vandq_s32(dividend, shift__), 31);
+    int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup__);
+    return vrshlq_s32(fixed_up_dividend, shift__);
+}
+
+/**
+ *  @ingroup Public
+ */
+
+/**
+ * @addtogroup groupElementwise
+ * @{
+ */
+
+/*
+ * s8 elementwise add
+ *
+ * Refer to the header file for details.
+ *
+ */
+
+arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
+                                           const int8_t *input_2_vect,
+                                           const int32_t input_1_offset,
+                                           const int32_t input_1_mult,
+                                           const int32_t input_1_shift,
+                                           const int32_t input_2_offset,
+                                           const int32_t input_2_mult,
+                                           const int32_t input_2_shift,
+                                           const int32_t left_shift,
+                                           int8_t *output,
+                                           const int32_t out_offset,
+                                           const int32_t out_mult,
+                                           const int32_t out_shift,
+                                           const int32_t out_activation_min,
+                                           const int32_t out_activation_max,
+                                           const int32_t block_size)
+{
+    int32_t loop_count = block_size / 8;
+    const int8_t *input_1 = input_1_vect;
+    const int8_t *input_2 = input_2_vect;
+    int8_t *output_ = output;
+
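+    /* Main loop: handle eight int8 elements per iteration with NEON. */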
+    while (loop_count)
+    {
+        int8x8_t res;
+        int8x8_t input_1_s8;
+        int8x8_t input_2_s8;
+        int16x8_t i1_val_16;
+        int16x8_t input_1_s16;
+        int16x8_t input_2_s16;
+        int32x4_t input_1_s16_low;
+        int32x4_t input_1_s16_high;
+        int32x4_t input_2_s16_low;
+        int32x4_t input_2_s16_high;
+
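+        /* Load eight s8 values from each input and sign-extend to s16. */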
+        input_1_s8 = vld1_s8(input_1);
+        input_2_s8 = vld1_s8(input_2);
+        input_1_s16 = vmovl_s8(input_1_s8);
+        input_2_s16 = vmovl_s8(input_2_s8);
+        input_1 += 8;
+        input_2 += 8;
+
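+        /* vaddw_s16 widens each s16 half to s32 while adding the input offset. */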
+        input_1_s16_low  = vaddw_s16(vdupq_n_s32(input_1_offset), vget_low_s16(input_1_s16));
+        input_1_s16_high = vaddw_s16(vdupq_n_s32(input_1_offset), vget_high_s16(input_1_s16));
+        input_2_s16_low  = vaddw_s16(vdupq_n_s32(input_2_offset), vget_low_s16(input_2_s16));
+        input_2_s16_high = vaddw_s16(vdupq_n_s32(input_2_offset), vget_high_s16(input_2_s16));
+
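+        /* Apply the shared left shift before requantization. */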
+        input_1_s16_low  = vshlq_s32(input_1_s16_low, vdupq_n_s32(left_shift));
+        input_2_s16_low  = vshlq_s32(input_2_s16_low, vdupq_n_s32(left_shift));
+        input_1_s16_high = vshlq_s32(input_1_s16_high, vdupq_n_s32(left_shift));
+        input_2_s16_high = vshlq_s32(input_2_s16_high, vdupq_n_s32(left_shift));
+
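+        /* Requantize both inputs to the common scale. */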
+        input_1_s16_low  = arm_requantize_neon(input_1_s16_low, input_1_mult, input_1_shift);
+        input_1_s16_high = arm_requantize_neon(input_1_s16_high, input_1_mult, input_1_shift);
+        input_2_s16_low  = arm_requantize_neon(input_2_s16_low, input_2_mult, input_2_shift);
+        input_2_s16_high = arm_requantize_neon(input_2_s16_high, input_2_mult, input_2_shift);
+
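+        /* Add the requantized inputs. */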
+        input_1_s16_low  = vaddq_s32(input_1_s16_low, input_2_s16_low);
+        input_1_s16_high = vaddq_s32(input_1_s16_high, input_2_s16_high);
+
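+        /* Requantize the sum to the output scale, then add the output offset. */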
+        input_1_s16_low  = arm_requantize_neon(input_1_s16_low, out_mult, out_shift);
+        input_1_s16_high = arm_requantize_neon(input_1_s16_high, out_mult, out_shift);
+
+        input_1_s16_low  = vaddq_s32(input_1_s16_low, vdupq_n_s32(out_offset));
+        input_1_s16_high = vaddq_s32(input_1_s16_high, vdupq_n_s32(out_offset));
+
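+        /* Clamp to the activation range. */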
+        input_1_s16_low  = vmaxq_s32(input_1_s16_low, vdupq_n_s32(out_activation_min));
+        input_1_s16_high = vmaxq_s32(input_1_s16_high, vdupq_n_s32(out_activation_min));
+        input_1_s16_low  = vminq_s32(input_1_s16_low, vdupq_n_s32(out_activation_max));
+        input_1_s16_high = vminq_s32(input_1_s16_high, vdupq_n_s32(out_activation_max));
+
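+        /* Narrow s32 -> s16 -> s8 and store eight results. */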
+        i1_val_16 = vcombine_s16(vmovn_s32(input_1_s16_low), vmovn_s32(input_1_s16_high));
+        res = vmovn_s16(i1_val_16);
+
+        vst1_s8(output_, res);
+        output_ += 8;
+        loop_count--;
+    }
+
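+    /* Process the remaining (block_size % 8) elements with the scalar CMSIS-NN path. */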
+    loop_count = block_size % 8;
+    while (loop_count)
+    {
+        int32_t a1 = (*input_1++ + input_1_offset) << left_shift;
+        int32_t a2 = (*input_2++ + input_2_offset) << left_shift;
+        a1 = arm_nn_requantize(a1, input_1_mult, input_1_shift);
+        a2 = arm_nn_requantize(a2, input_2_mult, input_2_shift);
+
+        int32_t sum = a1 + a2;
+        sum = arm_nn_requantize(sum, out_mult, out_shift);
+        sum += out_offset;
+
+        sum = MAX(sum, out_activation_min);
+        sum = MIN(sum, out_activation_max);
+        *output_ = (int8_t) sum;
+        loop_count--;
+        output_++;
+    }
+
+    return (ARM_CMSIS_NN_SUCCESS);
+}
