This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git

commit 25937282edf193bd8db07bb069fd81f20e0d114f
Author: jihandong <jihand...@xiaomi.com>
AuthorDate: Thu Oct 17 16:16:58 2024 +0800

    ml: follow nxstyle
    
    Signed-off-by: jihandong <jihand...@xiaomi.com>
---
 .../tflite-micro/operators/neon/arm_convolve_s8.c  | 330 +++++++------
 .../operators/neon/arm_elementwise_add_s8.c        | 258 +++++-----
 .../operators/neon/arm_nn_mat_mult_kernel_s8_s16.c | 539 +++++++++++----------
 .../operators/neon/arm_q7_to_q15_with_offset.c     |  70 +--
 4 files changed, 644 insertions(+), 553 deletions(-)

diff --git a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c 
b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
index 454c467a1..3d8e5af5c 100644
--- a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
+++ b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
@@ -1,5 +1,8 @@
-/*
- * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+/****************************************************************************
+ * apps/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
+ *
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or
+ * its affiliates <open-source-off...@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -14,191 +17,210 @@
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
 
 #include <arm_neon.h>
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
-/**
- *  @ingroup Public
- */
-
-/**
- * @addtogroup NNConv
- * @{
- */
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
 
-/*
- * Basic s8 convolution function.
- *
- * Refer header file for details. Optimal use case for the DSP/MVE 
implementation is when input and output channels
- * are multiples of 4 or atleast greater than 4.
+/* Basic s8 convolution function.
  *
+ * Refer header file for details. Optimal use case for the DSP/MVE
+ * implementation is when input and output channels are multiples of 4 or
+ * at least greater than 4.
  */
-arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
-                                    const cmsis_nn_conv_params *conv_params,
-                                    const cmsis_nn_per_channel_quant_params 
*quant_params,
-                                    const cmsis_nn_dims *input_dims,
-                                    const int8_t *input_data,
-                                    const cmsis_nn_dims *filter_dims,
-                                    const int8_t *filter_data,
-                                    const cmsis_nn_dims *bias_dims,
-                                    const int32_t *bias_data,
-                                    const cmsis_nn_dims *output_dims,
-                                    int8_t *output_data)
+
+arm_cmsis_nn_status
+arm_convolve_s8(const cmsis_nn_context *ctx,
+                const cmsis_nn_conv_params *conv_params,
+                const cmsis_nn_per_channel_quant_params *quant_params,
+                const cmsis_nn_dims *input_dims,
+                const int8_t *input_data,
+                const cmsis_nn_dims *filter_dims,
+                const int8_t *filter_data,
+                const cmsis_nn_dims *bias_dims,
+                const int32_t *bias_data,
+                const cmsis_nn_dims *output_dims,
+                int8_t *output_data)
 {
-    (void)bias_dims;
+  (void)bias_dims;
 
-    if (ctx->buf == NULL)
+  if (ctx->buf == NULL)
     {
-        return ARM_CMSIS_NN_ARG_ERROR;
+      return ARM_CMSIS_NN_ARG_ERROR;
     }
-    int16_t *buffer_a = (int16_t *)ctx->buf;
-
-    const int32_t input_batches = input_dims->n;
-    const uint16_t input_x = input_dims->w;
-    const uint16_t input_y = input_dims->h;
-    const uint16_t input_ch = input_dims->c;
-    const uint16_t kernel_x = filter_dims->w;
-    const uint16_t kernel_y = filter_dims->h;
-    const uint16_t output_x = output_dims->w;
-    const uint16_t output_y = output_dims->h;
-    const uint16_t output_ch = output_dims->c;
-
-    const uint16_t pad_x = conv_params->padding.w;
-    const uint16_t pad_y = conv_params->padding.h;
-    const uint16_t stride_x = conv_params->stride.w;
-    const uint16_t stride_y = conv_params->stride.h;
-    const int32_t dilation_x = conv_params->dilation.w;
-    const int32_t dilation_y = conv_params->dilation.h;
-    const int32_t out_offset = conv_params->output_offset;
-    const int32_t out_activation_min = conv_params->activation.min;
-    const int32_t out_activation_max = conv_params->activation.max;
-    const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
-    const int32_t input_offset = conv_params->input_offset;
-
-    int32_t *output_mult = quant_params->multiplier;
-    int32_t *output_shift = quant_params->shift;
-
-    int i_batch;
-    for (i_batch = 0; i_batch < input_batches; i_batch++)
+
+  int16_t *buffer_a = (int16_t *)ctx->buf;
+
+  const int32_t input_batches = input_dims->n;
+  const uint16_t input_x = input_dims->w;
+  const uint16_t input_y = input_dims->h;
+  const uint16_t input_ch = input_dims->c;
+  const uint16_t kernel_x = filter_dims->w;
+  const uint16_t kernel_y = filter_dims->h;
+  const uint16_t output_x = output_dims->w;
+  const uint16_t output_y = output_dims->h;
+  const uint16_t output_ch = output_dims->c;
+
+  const uint16_t pad_x = conv_params->padding.w;
+  const uint16_t pad_y = conv_params->padding.h;
+  const uint16_t stride_x = conv_params->stride.w;
+  const uint16_t stride_y = conv_params->stride.h;
+  const int32_t dilation_x = conv_params->dilation.w;
+  const int32_t dilation_y = conv_params->dilation.h;
+  const int32_t out_offset = conv_params->output_offset;
+  const int32_t out_activation_min = conv_params->activation.min;
+  const int32_t out_activation_max = conv_params->activation.max;
+  const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
+  const int32_t input_offset = conv_params->input_offset;
+
+  int32_t *output_mult = quant_params->multiplier;
+  int32_t *output_shift = quant_params->shift;
+
+  int i_batch;
+  for (i_batch = 0; i_batch < input_batches; i_batch++)
     {
-        const int32_t remainder = rhs_cols % 4;
-        const int32_t aligned_rhs_cols = remainder != 0 ? rhs_cols + 4 - 
remainder : rhs_cols;
-        /**
-         * Use Im2col to speed up conv2d calculations.
-         * Use as a ping-pong buffer for unordered elements.
-         */
-        int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2;
-        int16_t *im2col_buf_start_s16 = buffer_a;
-        int8_t *out = output_data;
-        int32_t lhs_rows = 0;
-        /* This part implements the im2col function */
-        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+      const int32_t remainder = rhs_cols % 4;
+      const int32_t aligned_rhs_cols = remainder != 0 ?
+          rhs_cols + 4 - remainder : rhs_cols;
+
+      /**
+       * Use Im2col to speed up conv2d calculations.
+       * Use as a ping-pong buffer for unordered elements.
+       */
+
+      int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2;
+      int16_t *im2col_buf_start_s16 = buffer_a;
+      int8_t *out = output_data;
+      int32_t lhs_rows = 0;
+
+      /* This part implements the im2col function */
+
+      for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
         {
-            const int32_t base_idx_x = stride_x * i_out_x - pad_x;
-            for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+          const int32_t base_idx_x = stride_x * i_out_x - pad_x;
+          for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
             {
-                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
-                for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+              const int32_t base_idx_y = stride_y * i_out_y - pad_y;
+              for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                 {
-                    int32_t k_x = base_idx_x + dilation_x * i_ker_x;
-                    int32_t k_y = base_idx_y - dilation_y;
-                    for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
+                  int32_t k_x = base_idx_x + dilation_x * i_ker_x;
+                  int32_t k_y = base_idx_y - dilation_y;
+                  for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
                     {
-                        k_y += dilation_y;
-                        arm_memcpy_s8(im2col_buf,
-                                      input_data + (k_y * input_x + k_x) * 
input_ch,
-                                      input_ch);
-                        im2col_buf += input_ch;
+                      k_y += dilation_y;
+                      arm_memcpy_s8(im2col_buf,
+                          input_data + (k_y * input_x + k_x) * input_ch,
+                          input_ch);
+                      im2col_buf += input_ch;
                     }
                 }
-                lhs_rows++;
-                /* Extend the input data from int8 to int16, and add offset. */
-                arm_q7_to_q15_with_offset(im2col_buf - rhs_cols,
-                                          im2col_buf_start_s16,
-                                          rhs_cols,
-                                          (int16_t)input_offset);
-                im2col_buf_start_s16 += aligned_rhs_cols;
-                if (lhs_rows & 2)
+
+              lhs_rows++;
+
+              /* Extend the input data from int8 to int16, and add offset. */
+
+              arm_q7_to_q15_with_offset(im2col_buf - rhs_cols,
+                                        im2col_buf_start_s16,
+                                        rhs_cols,
+                                        (int16_t)input_offset);
+              im2col_buf_start_s16 += aligned_rhs_cols;
+              if (lhs_rows & 2)
                 {
-                    out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
-                                                        buffer_a,
-                                                        output_ch,
-                                                        output_shift,
-                                                        output_mult,
-                                                        out_offset,
-                                                        out_activation_min,
-                                                        out_activation_max,
-                                                        rhs_cols,
-                                                        aligned_rhs_cols,
-                                                        bias_data,
-                                                        out);
-                    /* counter reset */
-                    im2col_buf_start_s16 = buffer_a;
-                    im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1);
-                    lhs_rows = 0;
+                  out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
+                                                      buffer_a,
+                                                      output_ch,
+                                                      output_shift,
+                                                      output_mult,
+                                                      out_offset,
+                                                      out_activation_min,
+                                                      out_activation_max,
+                                                      rhs_cols,
+                                                      aligned_rhs_cols,
+                                                      bias_data,
+                                                      out);
+
+                  /* counter reset */
+
+                  im2col_buf_start_s16 = buffer_a;
+                  im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1);
+                  lhs_rows = 0;
                 }
             }
         }
-        if (lhs_rows != 0)
+
+      if (lhs_rows != 0)
         {
-            const int8_t *ker_a = filter_data;
-            int i;
-            for (i = 0; i < output_ch; i++)
+          const int8_t *ker_a = filter_data;
+          int i;
+
+          for (i = 0; i < output_ch; i++)
             {
-                /* Load the accumulator with bias first */
-                uint16_t col_count = rhs_cols / 8;
-                int32_t sum = 0;
-                const int16_t *ip_as_col = buffer_a;
-                int32x4_t res_s32 = vdupq_n_s32(0);
-                if (bias_data)
+              /* Load the accumulator with bias first */
+
+              uint16_t col_count = rhs_cols / 8;
+              int32_t sum = 0;
+              const int16_t *ip_as_col = buffer_a;
+              int32x4_t res_s32 = vdupq_n_s32(0);
+              if (bias_data)
                 {
-                    sum = bias_data[i];
+                  sum = bias_data[i];
                 }
-                while (col_count)
+
+              while (col_count)
                 {
-                    int8x8_t filter_s8 = vld1_s8(ker_a);
-                    int16x8_t input_s16 = vld1q_s16(ip_as_col);
-                    int16x8_t filter_s16 = vmovl_s8(filter_s8);
-                    ker_a += 8;
-                    ip_as_col += 8;
-                    res_s32 = vmlal_s16(res_s32,
-                                        vget_low_s16(input_s16),
-                                        vget_low_s16(filter_s16));
-                    res_s32 = vmlal_s16(res_s32,
-                                        vget_high_s16(input_s16),
-                                        vget_high_s16(filter_s16));
-                    col_count --;
+                  int8x8_t filter_s8 = vld1_s8(ker_a);
+                  int16x8_t input_s16 = vld1q_s16(ip_as_col);
+                  int16x8_t filter_s16 = vmovl_s8(filter_s8);
+                  ker_a += 8;
+                  ip_as_col += 8;
+                  res_s32 = vmlal_s16(res_s32,
+                                      vget_low_s16(input_s16),
+                                      vget_low_s16(filter_s16));
+                  res_s32 = vmlal_s16(res_s32,
+                                      vget_high_s16(input_s16),
+                                      vget_high_s16(filter_s16));
+                  col_count--;
                 }
-                sum += vgetq_lane_s32(res_s32, 0);
-                sum += vgetq_lane_s32(res_s32, 1);
-                sum += vgetq_lane_s32(res_s32, 2);
-                sum += vgetq_lane_s32(res_s32, 3);
-                col_count = rhs_cols % 8;
-                while (col_count)
+
+              sum += vgetq_lane_s32(res_s32, 0);
+              sum += vgetq_lane_s32(res_s32, 1);
+              sum += vgetq_lane_s32(res_s32, 2);
+              sum += vgetq_lane_s32(res_s32, 3);
+              col_count = rhs_cols % 8;
+              while (col_count)
                 {
-                    int8_t ker_a1 = *ker_a++;
-                    int16_t ip_b1 = *ip_as_col++;
-                    sum += ker_a1 * ip_b1;
-                    col_count--;
+                  int8_t ker_a1 = *ker_a++;
+                  int16_t ip_b1 = *ip_as_col++;
+                  sum += ker_a1 * ip_b1;
+                  col_count--;
                 }
-                sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
-                sum += out_offset;
-                sum = MAX(sum, out_activation_min);
-                sum = MIN(sum, out_activation_max);
-                *out++ = (int8_t)sum;
+
+              sum = arm_nn_requantize(sum,
+                  output_mult[i], output_shift[i]);
+              sum += out_offset;
+              sum = MAX(sum, out_activation_min);
+              sum = MIN(sum, out_activation_max);
+              *out++ = (int8_t)sum;
             }
         }
-        /* Advance to the next batch */
-        input_data += (input_x * input_y * input_ch);
-        output_data += (output_x * output_y * output_ch);
+
+      /* Advance to the next batch */
+
+      input_data += (input_x * input_y * input_ch);
+      output_data += (output_x * output_y * output_ch);
     }
-    /* Return to application */
-    return ARM_CMSIS_NN_SUCCESS;
+
+  /* Return to application */
+
+  return ARM_CMSIS_NN_SUCCESS;
 }
 
-/**
- * @} end of NNConv group
- */
\ No newline at end of file
diff --git a/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c 
b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
index 678ae69fd..d7147fbce 100644
--- a/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
+++ b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
@@ -1,5 +1,8 @@
-/*
- * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+/****************************************************************************
+ * apps/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c
+ *
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or
+ * its affiliates <open-source-off...@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -14,136 +17,157 @@
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
 
 #include <arm_neon.h>
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
 /* Note: __SHIFT is expected to be <=0 */
-__STATIC_FORCEINLINE int32x4_t arm_requantize_neon(const int32x4_t val, const 
int32_t multiplier, const int32_t shift)
+
+__STATIC_FORCEINLINE int32x4_t
+arm_requantize_neon(const int32x4_t val,
+                    const int32_t multiplier,
+                    const int32_t shift)
 {
-    int32x4_t dividend = vqrdmulhq_n_s32(vshlq_s32(val, 
vdupq_n_s32(LEFT_SHIFT(shift))), multiplier);
-    int32_t exponent = RIGHT_SHIFT(shift);
-    int32x4_t shift__ = vdupq_n_s32(-exponent);
-    int32x4_t fixup__ = vshrq_n_s32(vandq_s32(dividend, shift__), 31);
-    int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup__);
-    return vrshlq_s32(fixed_up_dividend, shift__);
+  int32x4_t dividend = vqrdmulhq_n_s32(
+      vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier);
+  int32_t exponent = RIGHT_SHIFT(shift);
+  int32x4_t shift__ = vdupq_n_s32(-exponent);
+  int32x4_t fixup__ = vshrq_n_s32(vandq_s32(dividend, shift__), 31);
+  int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup__);
+  return vrshlq_s32(fixed_up_dividend, shift__);
 }
 
-/**
- *  @ingroup Public
- */
-
-/**
- * @addtogroup groupElementwise
- * @{
- */
-
-/*
- * s8 elementwise add
- *
- * Refer header file for details.
- *
- */
-
-arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
-                                           const int8_t *input_2_vect,
-                                           const int32_t input_1_offset,
-                                           const int32_t input_1_mult,
-                                           const int32_t input_1_shift,
-                                           const int32_t input_2_offset,
-                                           const int32_t input_2_mult,
-                                           const int32_t input_2_shift,
-                                           const int32_t left_shift,
-                                           int8_t *output,
-                                           const int32_t out_offset,
-                                           const int32_t out_mult,
-                                           const int32_t out_shift,
-                                           const int32_t out_activation_min,
-                                           const int32_t out_activation_max,
-                                           const int32_t block_size)
+arm_cmsis_nn_status
+arm_elementwise_add_s8(const int8_t *input_1_vect,
+                       const int8_t *input_2_vect,
+                       const int32_t input_1_offset,
+                       const int32_t input_1_mult,
+                       const int32_t input_1_shift,
+                       const int32_t input_2_offset,
+                       const int32_t input_2_mult,
+                       const int32_t input_2_shift,
+                       const int32_t left_shift,
+                       int8_t *output,
+                       const int32_t out_offset,
+                       const int32_t out_mult,
+                       const int32_t out_shift,
+                       const int32_t out_activation_min,
+                       const int32_t out_activation_max,
+                       const int32_t block_size)
 {
-    int32_t loop_count = block_size / 8;
-    const int8_t *input_1 = input_1_vect;
-    const int8_t *input_2 = input_2_vect;
-    int8_t *output_ = output;
+  int32_t loop_count = block_size / 8;
+  const int8_t *input_1 = input_1_vect;
+  const int8_t *input_2 = input_2_vect;
+  int8_t *output_ = output;
 
-    while (loop_count)
+  while (loop_count)
     {
-        int8x8_t res;
-        int8x8_t input_1_s8;
-        int8x8_t input_2_s8;
-        int16x8_t i1_val_16;
-        int16x8_t input_1_s16;
-        int16x8_t input_2_s16;
-        int32x4_t input_1_s16_low;
-        int32x4_t input_1_s16_high;
-        int32x4_t input_2_s16_low;
-        int32x4_t input_2_s16_high;
-
-        input_1_s8 = vld1_s8(input_1);
-        input_2_s8 = vld1_s8(input_2);
-        input_1_s16 = vmovl_s8(input_1_s8);
-        input_2_s16 = vmovl_s8(input_2_s8);
-        input_1 += 8;
-        input_2 += 8;
-
-        input_1_s16_low  = vaddw_s16(vdupq_n_s32(input_1_offset), 
vget_low_s16(input_1_s16));
-        input_1_s16_high = vaddw_s16(vdupq_n_s32(input_1_offset), 
vget_high_s16(input_1_s16));
-        input_2_s16_low  = vaddw_s16(vdupq_n_s32(input_2_offset), 
vget_low_s16(input_2_s16));
-        input_2_s16_high = vaddw_s16(vdupq_n_s32(input_2_offset), 
vget_high_s16(input_2_s16));
-
-        input_1_s16_low  = vshlq_s32(input_1_s16_low, vdupq_n_s32(left_shift));
-        input_2_s16_low  = vshlq_s32(input_2_s16_low, vdupq_n_s32(left_shift));
-        input_1_s16_high = vshlq_s32(input_1_s16_high, 
vdupq_n_s32(left_shift));
-        input_2_s16_high = vshlq_s32(input_2_s16_high, 
vdupq_n_s32(left_shift));
-
-        input_1_s16_low  = arm_requantize_neon(input_1_s16_low, input_1_mult, 
input_1_shift);
-        input_1_s16_high = arm_requantize_neon(input_1_s16_high, input_1_mult, 
input_1_shift);
-        input_2_s16_low  = arm_requantize_neon(input_2_s16_low, input_2_mult, 
input_2_shift);
-        input_2_s16_high = arm_requantize_neon(input_2_s16_high, input_2_mult, 
input_2_shift);
-
-        input_1_s16_low  = vaddq_s32(input_1_s16_low, input_2_s16_low);
-        input_1_s16_high = vaddq_s32(input_1_s16_high, input_2_s16_high);
-
-        input_1_s16_low  = arm_requantize_neon(input_1_s16_low, out_mult, 
out_shift);
-        input_1_s16_high = arm_requantize_neon(input_1_s16_high, out_mult, 
out_shift);
-
-        input_1_s16_low  = vaddq_s32(input_1_s16_low, vdupq_n_s32(out_offset));
-        input_1_s16_high = vaddq_s32(input_1_s16_high, 
vdupq_n_s32(out_offset));
-
-        input_1_s16_low  = vmaxq_s32(input_1_s16_low, 
vdupq_n_s32(out_activation_min));
-        input_1_s16_high = vmaxq_s32(input_1_s16_high, 
vdupq_n_s32(out_activation_min));
-        input_1_s16_low  = vminq_s32(input_1_s16_low, 
vdupq_n_s32(out_activation_max));
-        input_1_s16_high = vminq_s32(input_1_s16_high, 
vdupq_n_s32(out_activation_max));
-
-        i1_val_16 = vcombine_s16(vmovn_s32(input_1_s16_low), 
vmovn_s32(input_1_s16_high));
-        res = vmovn_s16(i1_val_16);
-
-        vst1_s8(output_, res);
-        output_ += 8;
-        loop_count--;
+      int8x8_t res;
+      int8x8_t input_1_s8;
+      int8x8_t input_2_s8;
+      int16x8_t i1_val_16;
+      int16x8_t input_1_s16;
+      int16x8_t input_2_s16;
+      int32x4_t input_1_s16_low;
+      int32x4_t input_1_s16_high;
+      int32x4_t input_2_s16_low;
+      int32x4_t input_2_s16_high;
+
+      input_1_s8 = vld1_s8(input_1);
+      input_2_s8 = vld1_s8(input_2);
+      input_1_s16 = vmovl_s8(input_1_s8);
+      input_2_s16 = vmovl_s8(input_2_s8);
+      input_1 += 8;
+      input_2 += 8;
+
+      input_1_s16_low  = vaddw_s16(
+          vdupq_n_s32(input_1_offset), vget_low_s16(input_1_s16));
+      input_1_s16_high = vaddw_s16(
+          vdupq_n_s32(input_1_offset), vget_high_s16(input_1_s16));
+      input_2_s16_low  = vaddw_s16(
+          vdupq_n_s32(input_2_offset), vget_low_s16(input_2_s16));
+      input_2_s16_high = vaddw_s16(
+          vdupq_n_s32(input_2_offset), vget_high_s16(input_2_s16));
+
+      input_1_s16_low  = vshlq_s32(
+          input_1_s16_low, vdupq_n_s32(left_shift));
+      input_2_s16_low  = vshlq_s32(
+          input_2_s16_low, vdupq_n_s32(left_shift));
+      input_1_s16_high = vshlq_s32(
+          input_1_s16_high, vdupq_n_s32(left_shift));
+      input_2_s16_high = vshlq_s32(
+          input_2_s16_high, vdupq_n_s32(left_shift));
+
+      input_1_s16_low  = arm_requantize_neon(
+          input_1_s16_low, input_1_mult, input_1_shift);
+      input_1_s16_high = arm_requantize_neon(
+          input_1_s16_high, input_1_mult, input_1_shift);
+      input_2_s16_low  = arm_requantize_neon(
+          input_2_s16_low, input_2_mult, input_2_shift);
+      input_2_s16_high = arm_requantize_neon(
+          input_2_s16_high, input_2_mult, input_2_shift);
+
+      input_1_s16_low  = vaddq_s32(
+          input_1_s16_low, input_2_s16_low);
+      input_1_s16_high = vaddq_s32(
+          input_1_s16_high, input_2_s16_high);
+
+      input_1_s16_low  = arm_requantize_neon(
+          input_1_s16_low, out_mult, out_shift);
+      input_1_s16_high = arm_requantize_neon(
+          input_1_s16_high, out_mult, out_shift);
+
+      input_1_s16_low  = vaddq_s32(
+          input_1_s16_low, vdupq_n_s32(out_offset));
+      input_1_s16_high = vaddq_s32(
+          input_1_s16_high, vdupq_n_s32(out_offset));
+
+      input_1_s16_low  = vmaxq_s32(
+          input_1_s16_low, vdupq_n_s32(out_activation_min));
+      input_1_s16_high = vmaxq_s32(
+          input_1_s16_high, vdupq_n_s32(out_activation_min));
+      input_1_s16_low  = vminq_s32(
+          input_1_s16_low, vdupq_n_s32(out_activation_max));
+      input_1_s16_high = vminq_s32(
+          input_1_s16_high, vdupq_n_s32(out_activation_max));
+
+      i1_val_16 = vcombine_s16(
+          vmovn_s32(input_1_s16_low), vmovn_s32(input_1_s16_high));
+      res = vmovn_s16(i1_val_16);
+
+      vst1_s8(output_, res);
+      output_ += 8;
+      loop_count--;
     }
 
-    loop_count = block_size % 8;
-    while (loop_count)
+  loop_count = block_size % 8;
+  while (loop_count)
     {
-        int32_t a1 = (*input_1++ + input_1_offset) << left_shift;
-        int32_t a2 = (*input_2++ + input_2_offset) << left_shift;
-        a1 = arm_nn_requantize(a1, input_1_mult, input_1_shift);
-        a2 = arm_nn_requantize(a2, input_2_mult, input_2_shift);
-
-        int32_t sum = a1 + a2;
-        sum = arm_nn_requantize(sum, out_mult, out_shift);
-        sum += out_offset;
-
-        sum = MAX(sum, out_activation_min);
-        sum = MIN(sum, out_activation_max);
-        *output_ = (int8_t) sum;
-        loop_count--;
-        output_++;
+      int32_t a1 = (*input_1++ + input_1_offset) << left_shift;
+      int32_t a2 = (*input_2++ + input_2_offset) << left_shift;
+      a1 = arm_nn_requantize(a1, input_1_mult, input_1_shift);
+      a2 = arm_nn_requantize(a2, input_2_mult, input_2_shift);
+
+      int32_t sum = a1 + a2;
+      sum = arm_nn_requantize(sum, out_mult, out_shift);
+      sum += out_offset;
+
+      sum = MAX(sum, out_activation_min);
+      sum = MIN(sum, out_activation_max);
+      *output_ = (int8_t) sum;
+      loop_count--;
+      output_++;
     }
 
-    return (ARM_CMSIS_NN_SUCCESS);
+  return (ARM_CMSIS_NN_SUCCESS);
 }
diff --git 
a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c 
b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
index 43cd33b30..190edb74c 100644
--- a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
+++ b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
@@ -1,5 +1,8 @@
-/*
- * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+/****************************************************************************
+ * apps/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
+ *
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or
+ * its affiliates <open-source-off...@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -14,9 +17,9 @@
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */
+ ****************************************************************************/
 
-/* ----------------------------------------------------------------------
+/****************************************************************************
  * Project:      CMSIS NN Library
  * Title:        arm_nn_mat_mult_kernel_s8_s16.c
  * Description:  Matrix-multiplication function for convolution
@@ -25,18 +28,24 @@
  * $Revision:    V.2.0.0
  *
  * Target :  Arm(R) M-Profile Architecture
- * -------------------------------------------------------------------- */
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
 
 #include <arm_neon.h>
 #include "arm_nnfunctions.h"
 #include "arm_nnsupportfunctions.h"
 
-/*
- * Matrix-multiplication function for convolution with per-channel 
requantization.
- *
- * Refer header file for details.
- *
+/* Matrix-multiplication function for convolution with per-channel
+ * requantization. Refer header file for details.
  */
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
 int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a,
                                       const int16_t *input_b,
                                       const uint16_t output_ch,
@@ -50,318 +59,352 @@ int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t 
*input_a,
                                       const int32_t *const output_bias,
                                       int8_t *out_0)
 {
-    int8_t *out_1 = out_0 + output_ch;
-    const int32_t *bias = output_bias;
+  int8_t *out_1 = out_0 + output_ch;
+  const int32_t *bias = output_bias;
+
+  uint16_t row_count = output_ch / 4;
+  const int8_t *ip_a0 = input_a;
 
-    uint16_t row_count = output_ch / 4;
-    const int8_t *ip_a0 = input_a;
+  /* This loop iterates over the rows of A */
 
-    /* this loop over rows in A */
-    while (row_count)
+  while (row_count)
     {
-        int32_t col_count = num_col_a / 8;
-        const int16_t *ip_b0 = input_b;
-        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
-        const int8_t *ip_a[4] = {ip_a0,
-                                 ip_a0 + num_col_a,
-                                 ip_a0 + 2 * num_col_a,
-                                 ip_a0 + 3 * num_col_a};
-        int32_t ch_out[4][2] = {0};
-        int32x4_t res[8];
-
-        for (int i = 0; i < 8; i++)
+      int32_t col_count = num_col_a / 8;
+      const int16_t *ip_b0 = input_b;
+      const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+      const int8_t *ip_a[4] =
+        {
+          ip_a0,
+          ip_a0 + num_col_a,
+          ip_a0 + 2 * num_col_a,
+          ip_a0 + 3 * num_col_a
+        };
+
+      int32_t ch_out[4][2] =
         {
-            res[i] = vdupq_n_s32(0);
+          0
+        };
+
+      int32x4_t res[8];
+
+      for (int i = 0; i < 8; i++)
+        {
+          res[i] = vdupq_n_s32(0);
         }
 
-        /* Init accumulator with bias for channel N and N + 1 */
-        if (bias)
+      /* Init accumulator with bias for channel N and N + 1 */
+
+      if (bias)
         {
-            for (int i = 0; i < 4; i++)
+          for (int i = 0; i < 4; i++)
             {
-                ch_out[i][0] = *bias;
-                ch_out[i][1] = *bias++;
+              ch_out[i][0] = *bias;
+              ch_out[i][1] = *bias++;
             }
         }
 
-        /**
-         * Each time eight int8 data of four filters and eight int16 data
-         * of two inputs are read.First, the filter data is expanded to
-         * int16, and then cross-multiplied to obtain eight calculation 
results.
-         */
-        while (col_count)
+      /* Each time eight int8 data of four filters and eight int16 data
+       * of two inputs are read. First, the filter data is expanded to
+       * int16, and then cross-multiplied to obtain eight
+       * calculation results.
+       */
+
+      while (col_count)
         {
-            int8x8_t filter_s8[4];
-            int16x8_t input_s16[2];
-            int16x8_t filter_s16[4];
+          int8x8_t filter_s8[4];
+          int16x8_t input_s16[2];
+          int16x8_t filter_s16[4];
 
-            input_s16[0] = vld1q_s16(ip_b0);
-            ip_b0 += 8;
-            input_s16[1] = vld1q_s16(ip_b1);
-            ip_b1 += 8;
+          input_s16[0] = vld1q_s16(ip_b0);
+          ip_b0 += 8;
+          input_s16[1] = vld1q_s16(ip_b1);
+          ip_b1 += 8;
 
-            for (int i = 0; i < 4; i++)
+          for (int i = 0; i < 4; i++)
             {
-                filter_s8[i] = vld1_s8(ip_a[i]);
-                ip_a[i] += 8;
-                filter_s16[i] = vmovl_s8(filter_s8[i]);
-                res[i * 2]     = vmlal_s16(res[i * 2],
-                                           vget_low_s16(filter_s16[i]),
-                                           vget_low_s16(input_s16[0]));
-                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
-                                           vget_low_s16(filter_s16[i]),
-                                           vget_low_s16(input_s16[1]));
-                res[i * 2]     = vmlal_s16(res[i * 2],
-                                           vget_high_s16(filter_s16[i]),
-                                           vget_high_s16(input_s16[0]));
-                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
-                                           vget_high_s16(filter_s16[i]),
-                                           vget_high_s16(input_s16[1]));
+              filter_s8[i] = vld1_s8(ip_a[i]);
+              ip_a[i] += 8;
+              filter_s16[i] = vmovl_s8(filter_s8[i]);
+              res[i * 2]     = vmlal_s16(res[i * 2],
+                                         vget_low_s16(filter_s16[i]),
+                                         vget_low_s16(input_s16[0]));
+              res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                         vget_low_s16(filter_s16[i]),
+                                         vget_low_s16(input_s16[1]));
+              res[i * 2]     = vmlal_s16(res[i * 2],
+                                         vget_high_s16(filter_s16[i]),
+                                         vget_high_s16(input_s16[0]));
+              res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                         vget_high_s16(filter_s16[i]),
+                                         vget_high_s16(input_s16[1]));
             }
 
-            col_count --;
+          col_count--;
         }
-        for (int i = 0; i < 4; i++)
+
+      for (int i = 0; i < 4; i++)
         {
-            for (int j = 0; j < 2; j++)
+          for (int j = 0; j < 2; j++)
             {
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
             }
         }
 
-        col_count = num_col_a % 8;
-        while (col_count)
+      col_count = num_col_a % 8;
+      while (col_count) /* while over col_count */
         {
-            int16_t b0 = *ip_b0++;
-            int16_t b1 = *ip_b1++;
+          int16_t b0 = *ip_b0++;
+          int16_t b1 = *ip_b1++;
 
-            for (int i = 0; i < 4; i++)
+          for (int i = 0; i < 4; i++)
             {
-                int8_t input_remaining = *(ip_a[i]++);
-                ch_out[i][0] += input_remaining * b0;
-                ch_out[i][1] += input_remaining * b1;
+              int8_t input_remaining = *(ip_a[i]++);
+              ch_out[i][0] += input_remaining * b0;
+              ch_out[i][1] += input_remaining * b1;
             }
-            col_count--;
-        } /* while over col_count */
 
-        for (int i = 0; i < 4; i++)
+          col_count--;
+        }
+
+      for (int i = 0; i < 4; i++)
         {
-            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, 
*out_shift);
-            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, 
*out_shift);
-            ch_out[i][0] += out_offset;
-            ch_out[i][1] += out_offset;
-            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
-            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
-            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
-            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
-            *out_0++ = (int8_t)ch_out[i][0];
-            *out_1++ = (int8_t)ch_out[i][1];
-            out_mult++;
-            out_shift++;
+          ch_out[i][0] = arm_nn_requantize(
+              ch_out[i][0], *out_mult, *out_shift);
+          ch_out[i][1] = arm_nn_requantize(
+              ch_out[i][1], *out_mult, *out_shift);
+          ch_out[i][0] += out_offset;
+          ch_out[i][1] += out_offset;
+          ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+          ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+          ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+          ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+          *out_0++ = (int8_t)ch_out[i][0];
+          *out_1++ = (int8_t)ch_out[i][1];
+          out_mult++;
+          out_shift++;
         }
 
-        /* skip row */
-        ip_a0 = ip_a[3];
-        row_count--;
+      /* skip row */
+
+      ip_a0 = ip_a[3];
+      row_count--;
     }
-    row_count = output_ch % 4;
-    if (row_count >= 2)
+
+  row_count = output_ch % 4;
+  if (row_count >= 2)
     {
-        int32_t col_count = num_col_a / 8;
-        const int8_t *ip_a1 = ip_a0 + num_col_a;
-        const int16_t *ip_b0 = input_b;
-        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
-        int32_t ch_out[2][2] = {0};
-        int32x4_t res[4];
-
-        /* Init accumulator with bias for channel N and N + 1 */
-        if (bias)
+      int32_t col_count = num_col_a / 8;
+      const int8_t *ip_a1 = ip_a0 + num_col_a;
+      const int16_t *ip_b0 = input_b;
+      const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+      int32_t ch_out[2][2] =
+        {
+          0
+        };
+
+      int32x4_t res[4];
+
+      /* Init accumulator with bias for channel N and N + 1 */
+
+      if (bias)
         {
-            for (int i = 0; i < 2; i++)
+          for (int i = 0; i < 2; i++)
             {
-                ch_out[i][0] = *bias;
-                ch_out[i][1] = *bias++;
+              ch_out[i][0] = *bias;
+              ch_out[i][1] = *bias++;
             }
         }
 
-        for (int i = 0; i < 4; i++)
+      for (int i = 0; i < 4; i++)
         {
-            res[i] = vdupq_n_s32(0);
+          res[i] = vdupq_n_s32(0);
         }
 
-        /**
-         * Each time eight int8 data of four filters and eight int16 data
-         * of two inputs are read.First, the filter data is expanded to
-         * int16, and then cross-multiplied to obtain 8 calculation results.
-         */
-        while (col_count)
+      /* Each time eight int8 data of four filters and eight int16 data
+       * of two inputs are read. First, the filter data is expanded to
+       * int16, and then cross-multiplied to obtain 8 calculation results.
+       */
+
+      while (col_count)
         {
-            int8x8_t filter_s8[2];
-            int16x8_t input_s16[2];
-            int16x8_t filter_s16[2];
+          int8x8_t filter_s8[2];
+          int16x8_t input_s16[2];
+          int16x8_t filter_s16[2];
 
-            filter_s8[0] = vld1_s8(ip_a0);
-            ip_a0 += 8;
-            filter_s8[1] = vld1_s8(ip_a1);
-            ip_a1 += 8;
+          filter_s8[0] = vld1_s8(ip_a0);
+          ip_a0 += 8;
+          filter_s8[1] = vld1_s8(ip_a1);
+          ip_a1 += 8;
 
-            input_s16[0] = vld1q_s16(ip_b0);
-            ip_b0 += 8;
-            input_s16[1] = vld1q_s16(ip_b1);
-            ip_b1 += 8;
+          input_s16[0] = vld1q_s16(ip_b0);
+          ip_b0 += 8;
+          input_s16[1] = vld1q_s16(ip_b1);
+          ip_b1 += 8;
 
-            for (int i = 0; i < 2; i++)
+          for (int i = 0; i < 2; i++)
             {
-                filter_s16[i] = vmovl_s8(filter_s8[i]);
-                res[i * 2]     = vmlal_s16(res[i * 2],
-                                           vget_low_s16(filter_s16[i]),
-                                           vget_low_s16(input_s16[0]));
-                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
-                                           vget_low_s16(filter_s16[i]),
-                                           vget_low_s16(input_s16[1]));
-                res[i * 2]     = vmlal_s16(res[i * 2],
-                                           vget_high_s16(filter_s16[i]),
-                                           vget_high_s16(input_s16[0]));
-                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
-                                           vget_high_s16(filter_s16[i]),
-                                           vget_high_s16(input_s16[1]));
+              filter_s16[i] = vmovl_s8(filter_s8[i]);
+              res[i * 2]     = vmlal_s16(res[i * 2],
+                                         vget_low_s16(filter_s16[i]),
+                                         vget_low_s16(input_s16[0]));
+              res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                         vget_low_s16(filter_s16[i]),
+                                         vget_low_s16(input_s16[1]));
+              res[i * 2]     = vmlal_s16(res[i * 2],
+                                         vget_high_s16(filter_s16[i]),
+                                         vget_high_s16(input_s16[0]));
+              res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                         vget_high_s16(filter_s16[i]),
+                                         vget_high_s16(input_s16[1]));
             }
 
-            col_count --;
+          col_count--;
         }
-        for (int i = 0; i < 2; i++)
+
+      for (int i = 0; i < 2; i++)
         {
-            for (int j = 0; j < 2; j++)
+          for (int j = 0; j < 2; j++)
             {
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
-                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+              ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
             }
         }
 
-        col_count = num_col_a % 8;
-        while (col_count)
+      col_count = num_col_a % 8;
+      while (col_count) /* while over col_count */
         {
-            int8_t a0 = *ip_a0++; // filter
-            int8_t a1 = *ip_a1++;
-            int16_t b0 = *ip_b0++; // input
-            int16_t b1 = *ip_b1++;
-
-            ch_out[0][0] += a0 * b0;
-            ch_out[1][1] += a1 * b1;
-            ch_out[1][0] += a1 * b0;
-            ch_out[0][1] += a0 * b1;
-            col_count--;
-        } /* while over col_count */
-
-        for (int i = 0; i < 2; i++)
+          int8_t a0 = *ip_a0++; /* filter */
+          int8_t a1 = *ip_a1++;
+          int16_t b0 = *ip_b0++; /* input */
+          int16_t b1 = *ip_b1++;
+
+          ch_out[0][0] += a0 * b0;
+          ch_out[1][1] += a1 * b1;
+          ch_out[1][0] += a1 * b0;
+          ch_out[0][1] += a0 * b1;
+          col_count--;
+        }
+
+      for (int i = 0; i < 2; i++)
         {
-            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, 
*out_shift);
-            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, 
*out_shift);
-            ch_out[i][0] += out_offset;
-            ch_out[i][1] += out_offset;
-            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
-            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
-            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
-            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
-            *out_0++ = (int8_t)ch_out[i][0];
-            *out_1++ = (int8_t)ch_out[i][1];
-            out_mult++;
-            out_shift++;
+          ch_out[i][0] = arm_nn_requantize(
+              ch_out[i][0], *out_mult, *out_shift);
+          ch_out[i][1] = arm_nn_requantize(
+              ch_out[i][1], *out_mult, *out_shift);
+          ch_out[i][0] += out_offset;
+          ch_out[i][1] += out_offset;
+          ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+          ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+          ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+          ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+          *out_0++ = (int8_t)ch_out[i][0];
+          *out_1++ = (int8_t)ch_out[i][1];
+          out_mult++;
+          out_shift++;
         }
 
-        /* skip row */
-        ip_a0 += num_col_a;
-        row_count -= 2;
+      /* skip row */
+
+      ip_a0 += num_col_a;
+      row_count -= 2;
     }
 
-    /* compute the last odd numbered row if any */
-    if (output_ch & 0x1)
+  /* compute the last odd numbered row if any */
+
+  if (output_ch & 0x1)
     {
-        int32_t col_count = num_col_a / 8;
-        const int16_t *ip_b0 = input_b;
-        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
-        int32_t ch_out[2] = {0};
-        int32x4_t res[2];
-
-        /* load the bias */
-        if (bias)
+      int32_t col_count = num_col_a / 8;
+      const int16_t *ip_b0 = input_b;
+      const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+      int32_t ch_out[2] =
         {
-            ch_out[0] = *bias;
-            ch_out[1] = *bias++;
-        }
+          0
+        };
+
+      int32x4_t res[2];
 
-        res[0] = vdupq_n_s32(0);
-        res[1] = vdupq_n_s32(0);
+      /* load the bias */
 
-        while(col_count)
+      if (bias)
         {
-            int8x8_t filter_s8 = vld1_s8(ip_a0);
-            int16x8_t filter_s16 = vmovl_s8(filter_s8);
-            int16x8_t input_0_s16 = vld1q_s16(ip_b0);
-            int16x8_t input_1_s16 = vld1q_s16(ip_b1);
-            ip_a0 += 8;
-            ip_b0 += 8;
-            ip_b1 += 8;
-            res[0] = vmlal_s16(res[0],
-                               vget_low_s16(filter_s16),
-                               vget_low_s16(input_0_s16));
-            res[1] = vmlal_s16(res[1],
-                               vget_low_s16(filter_s16),
-                               vget_low_s16(input_1_s16));
-            res[0] = vmlal_s16(res[0],
-                               vget_high_s16(filter_s16),
-                               vget_high_s16(input_0_s16));
-            res[1] = vmlal_s16(res[1],
-                               vget_high_s16(filter_s16),
-                               vget_high_s16(input_1_s16));
-            col_count --;
+          ch_out[0] = *bias;
+          ch_out[1] = *bias++;
         }
 
-        ch_out[0] += vgetq_lane_s32(res[0], 0);
-        ch_out[0] += vgetq_lane_s32(res[0], 1);
-        ch_out[0] += vgetq_lane_s32(res[0], 2);
-        ch_out[0] += vgetq_lane_s32(res[0], 3);
+      res[0] = vdupq_n_s32(0);
+      res[1] = vdupq_n_s32(0);
+      while (col_count)
+        {
+          int8x8_t filter_s8 = vld1_s8(ip_a0);
+          int16x8_t filter_s16 = vmovl_s8(filter_s8);
+          int16x8_t input_0_s16 = vld1q_s16(ip_b0);
+          int16x8_t input_1_s16 = vld1q_s16(ip_b1);
+          ip_a0 += 8;
+          ip_b0 += 8;
+          ip_b1 += 8;
+          res[0] = vmlal_s16(res[0],
+                             vget_low_s16(filter_s16),
+                             vget_low_s16(input_0_s16));
+          res[1] = vmlal_s16(res[1],
+                             vget_low_s16(filter_s16),
+                             vget_low_s16(input_1_s16));
+          res[0] = vmlal_s16(res[0],
+                             vget_high_s16(filter_s16),
+                             vget_high_s16(input_0_s16));
+          res[1] = vmlal_s16(res[1],
+                             vget_high_s16(filter_s16),
+                             vget_high_s16(input_1_s16));
+          col_count--;
+        }
 
-        ch_out[1] += vgetq_lane_s32(res[1], 0);
-        ch_out[1] += vgetq_lane_s32(res[1], 1);
-        ch_out[1] += vgetq_lane_s32(res[1], 2);
-        ch_out[1] += vgetq_lane_s32(res[1], 3);
+      ch_out[0] += vgetq_lane_s32(res[0], 0);
+      ch_out[0] += vgetq_lane_s32(res[0], 1);
+      ch_out[0] += vgetq_lane_s32(res[0], 2);
+      ch_out[0] += vgetq_lane_s32(res[0], 3);
 
-        col_count = num_col_a % 8;
-        while (col_count)
+      ch_out[1] += vgetq_lane_s32(res[1], 0);
+      ch_out[1] += vgetq_lane_s32(res[1], 1);
+      ch_out[1] += vgetq_lane_s32(res[1], 2);
+      ch_out[1] += vgetq_lane_s32(res[1], 3);
+
+      col_count = num_col_a % 8;
+      while (col_count)
         {
-            int8_t a0 = *ip_a0++;
-            int16_t b0 = *ip_b0++;
-            int16_t b1 = *ip_b1++;
+          int8_t a0 = *ip_a0++;
+          int16_t b0 = *ip_b0++;
+          int16_t b1 = *ip_b1++;
 
-            ch_out[0] += a0 * b0;
-            ch_out[1] += a0 * b1;
-            col_count--;
+          ch_out[0] += a0 * b0;
+          ch_out[1] += a0 * b1;
+          col_count--;
         }
 
-        ch_out[0] = arm_nn_requantize(ch_out[0], *out_mult, *out_shift);
-        ch_out[0] += out_offset;
-        ch_out[0] = MAX(ch_out[0], activation_min);
-        ch_out[0] = MIN(ch_out[0], activation_max);
-        *out_0++ = (int8_t)ch_out[0];
-
-        ch_out[1] = arm_nn_requantize(ch_out[1], *out_mult, *out_shift);
-        ch_out[1] += out_offset;
-        ch_out[1] = MAX(ch_out[1], activation_min);
-        ch_out[1] = MIN(ch_out[1], activation_max);
-        *out_1++ = (int8_t)ch_out[1];
-
-        out_mult++;
-        out_shift++;
+      ch_out[0] = arm_nn_requantize(
+          ch_out[0], *out_mult, *out_shift);
+      ch_out[0] += out_offset;
+      ch_out[0] = MAX(ch_out[0], activation_min);
+      ch_out[0] = MIN(ch_out[0], activation_max);
+      *out_0++ = (int8_t)ch_out[0];
+
+      ch_out[1] = arm_nn_requantize(
+          ch_out[1], *out_mult, *out_shift);
+      ch_out[1] += out_offset;
+      ch_out[1] = MAX(ch_out[1], activation_min);
+      ch_out[1] = MIN(ch_out[1], activation_max);
+      *out_1++ = (int8_t)ch_out[1];
+
+      out_mult++;
+      out_shift++;
     }
 
-    out_0 += output_ch;
+  out_0 += output_ch;
+
+  /* return the new output pointer with offset */
 
-    /* return the new output pointer with offset */
-    return out_0;
+  return out_0;
 }
diff --git a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c 
b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
index b7156e473..770e1a88c 100644
--- a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
+++ b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
@@ -1,60 +1,62 @@
-/*
- * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+/****************************************************************************
+ * apps/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
+ *
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or
+ * its affiliates <open-source-off...@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in_q7x4 compliance with the License.
+ * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in_q7x4 writing, software
+ * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- */
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
 
 #include <arm_neon.h>
 #include "arm_nnsupportfunctions.h"
 
-/**
- * @ingroup groupSupport
- */
-
-/**
- * @addtogroup supportConversion
- * @{
- */
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
 
-void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, int32_t 
block_size, int16_t offset)
+void arm_q7_to_q15_with_offset(const int8_t *src,
+                               int16_t *dst,
+                               int32_t block_size,
+                               int16_t offset)
 {
-    int32_t block_cnt;
+  int32_t block_cnt;
 
-    block_cnt = block_size / 8;
-    int16x8_t offset_s16 = vdupq_n_s16(offset);
-    while (block_cnt)
+  block_cnt = block_size / 8;
+  int16x8_t offset_s16 = vdupq_n_s16(offset);
+  while (block_cnt)
     {
-        int8x8_t src_s8 = vld1_s8(src);
-        int16x8_t src_s16 = vmovl_s8(src_s8);
-        src += 8;
-        src_s16 = vaddq_s16(offset_s16, src_s16);
-        block_cnt--;
-        vst1q_s16(dst, src_s16);
-        dst += 8;
+      int8x8_t src_s8 = vld1_s8(src);
+      int16x8_t src_s16 = vmovl_s8(src_s8);
+      src += 8;
+      src_s16 = vaddq_s16(offset_s16, src_s16);
+      block_cnt--;
+      vst1q_s16(dst, src_s16);
+      dst += 8;
     }
 
-    block_cnt = block_size % 8;
-    while (block_cnt > 0)
+  block_cnt = block_size % 8;
+  while (block_cnt > 0)
     {
-        *dst++ = (int16_t)*src++ + offset;
+      *dst++ = (int16_t)*src++ + offset;
 
-        /* Decrement the loop counter */
-        block_cnt--;
+      /* Decrement the loop counter */
+
+      block_cnt--;
     }
 }
-
-/**
- * @} end of Doxygen group
- */

Reply via email to