This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git
commit 25937282edf193bd8db07bb069fd81f20e0d114f Author: jihandong <jihand...@xiaomi.com> AuthorDate: Thu Oct 17 16:16:58 2024 +0800 ml: follow nxstyle Signed-off-by: jihandong <jihand...@xiaomi.com> --- .../tflite-micro/operators/neon/arm_convolve_s8.c | 330 +++++++------ .../operators/neon/arm_elementwise_add_s8.c | 258 +++++----- .../operators/neon/arm_nn_mat_mult_kernel_s8_s16.c | 539 +++++++++++---------- .../operators/neon/arm_q7_to_q15_with_offset.c | 70 +-- 4 files changed, 644 insertions(+), 553 deletions(-) diff --git a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c index 454c467a1..3d8e5af5c 100644 --- a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c +++ b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c @@ -1,5 +1,8 @@ -/* - * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com> +/**************************************************************************** + * apps/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or + * its affiliates <open-source-off...@arm.com> * * SPDX-License-Identifier: Apache-2.0 * @@ -14,191 +17,210 @@ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ #include <arm_neon.h> #include "arm_nnfunctions.h" #include "arm_nnsupportfunctions.h" -/** - * @ingroup Public - */ - -/** - * @addtogroup NNConv - * @{ - */ +/**************************************************************************** + * Public Functions + ****************************************************************************/ -/* - * Basic s8 convolution function. - * - * Refer header file for details. Optimal use case for the DSP/MVE implementation is when input and output channels - * are multiples of 4 or atleast greater than 4. +/* Basic s8 convolution function. * + * Refer header file for details. Optimal use case for the DSP/MVE + * implementation is when input and output channels are multiples of 4 or + * atleast greater than 4. 
*/ -arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx, - const cmsis_nn_conv_params *conv_params, - const cmsis_nn_per_channel_quant_params *quant_params, - const cmsis_nn_dims *input_dims, - const int8_t *input_data, - const cmsis_nn_dims *filter_dims, - const int8_t *filter_data, - const cmsis_nn_dims *bias_dims, - const int32_t *bias_data, - const cmsis_nn_dims *output_dims, - int8_t *output_data) + +arm_cmsis_nn_status +arm_convolve_s8(const cmsis_nn_context *ctx, + const cmsis_nn_conv_params *conv_params, + const cmsis_nn_per_channel_quant_params *quant_params, + const cmsis_nn_dims *input_dims, + const int8_t *input_data, + const cmsis_nn_dims *filter_dims, + const int8_t *filter_data, + const cmsis_nn_dims *bias_dims, + const int32_t *bias_data, + const cmsis_nn_dims *output_dims, + int8_t *output_data) { - (void)bias_dims; + (void)bias_dims; - if (ctx->buf == NULL) + if (ctx->buf == NULL) { - return ARM_CMSIS_NN_ARG_ERROR; + return ARM_CMSIS_NN_ARG_ERROR; } - int16_t *buffer_a = (int16_t *)ctx->buf; - - const int32_t input_batches = input_dims->n; - const uint16_t input_x = input_dims->w; - const uint16_t input_y = input_dims->h; - const uint16_t input_ch = input_dims->c; - const uint16_t kernel_x = filter_dims->w; - const uint16_t kernel_y = filter_dims->h; - const uint16_t output_x = output_dims->w; - const uint16_t output_y = output_dims->h; - const uint16_t output_ch = output_dims->c; - - const uint16_t pad_x = conv_params->padding.w; - const uint16_t pad_y = conv_params->padding.h; - const uint16_t stride_x = conv_params->stride.w; - const uint16_t stride_y = conv_params->stride.h; - const int32_t dilation_x = conv_params->dilation.w; - const int32_t dilation_y = conv_params->dilation.h; - const int32_t out_offset = conv_params->output_offset; - const int32_t out_activation_min = conv_params->activation.min; - const int32_t out_activation_max = conv_params->activation.max; - const int32_t rhs_cols = kernel_x * kernel_y * input_ch; - const int32_t input_offset = conv_params->input_offset; - - int32_t *output_mult = quant_params->multiplier; - int32_t *output_shift = quant_params->shift; - - int i_batch; - for (i_batch = 0; i_batch < input_batches; i_batch++) + + int16_t *buffer_a = (int16_t *)ctx->buf; + + const int32_t input_batches = input_dims->n; + const uint16_t input_x = input_dims->w; + const uint16_t input_y = input_dims->h; + const uint16_t input_ch = input_dims->c; + const uint16_t kernel_x = filter_dims->w; + const uint16_t kernel_y = filter_dims->h; + const uint16_t output_x = output_dims->w; + const uint16_t output_y = output_dims->h; + const uint16_t output_ch = output_dims->c; + + const uint16_t pad_x = conv_params->padding.w; + const uint16_t pad_y = conv_params->padding.h; + const uint16_t stride_x = conv_params->stride.w; + const uint16_t stride_y = conv_params->stride.h; + const int32_t dilation_x = conv_params->dilation.w; + const int32_t dilation_y = conv_params->dilation.h; + const int32_t out_offset = conv_params->output_offset; + const int32_t out_activation_min = conv_params->activation.min; + const int32_t out_activation_max = conv_params->activation.max; + const int32_t rhs_cols = kernel_x * kernel_y * input_ch; + const int32_t input_offset = conv_params->input_offset; + + int32_t *output_mult = quant_params->multiplier; + int32_t *output_shift = quant_params->shift; + + int i_batch; + for (i_batch = 0; i_batch < input_batches; i_batch++) { - const int32_t remainder = rhs_cols % 4; - const int32_t aligned_rhs_cols = remainder != 0 ? 
rhs_cols + 4 - remainder : rhs_cols; - /** - * Use Im2col to speed up conv2d calculations. - * Use as a ping-pong buffer for unordered elements. - */ - int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2; - int16_t *im2col_buf_start_s16 = buffer_a; - int8_t *out = output_data; - int32_t lhs_rows = 0; - /* This part implements the im2col function */ - for (int i_out_x = 0; i_out_x < output_x; i_out_x++) + const int32_t remainder = rhs_cols % 4; + const int32_t aligned_rhs_cols = remainder != 0 ? + rhs_cols + 4 - remainder : rhs_cols; + + /** + * Use Im2col to speed up conv2d calculations. + * Use as a ping-pong buffer for unordered elements. + */ + + int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2; + int16_t *im2col_buf_start_s16 = buffer_a; + int8_t *out = output_data; + int32_t lhs_rows = 0; + + /* This part implements the im2col function */ + + for (int i_out_x = 0; i_out_x < output_x; i_out_x++) { - const int32_t base_idx_x = stride_x * i_out_x - pad_x; - for (int i_out_y = 0; i_out_y < output_y; i_out_y++) + const int32_t base_idx_x = stride_x * i_out_x - pad_x; + for (int i_out_y = 0; i_out_y < output_y; i_out_y++) { - const int32_t base_idx_y = stride_y * i_out_y - pad_y; - for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) + const int32_t base_idx_y = stride_y * i_out_y - pad_y; + for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++) { - int32_t k_x = base_idx_x + dilation_x * i_ker_x; - int32_t k_y = base_idx_y - dilation_y; - for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) + int32_t k_x = base_idx_x + dilation_x * i_ker_x; + int32_t k_y = base_idx_y - dilation_y; + for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++) { - k_y += dilation_y; - arm_memcpy_s8(im2col_buf, - input_data + (k_y * input_x + k_x) * input_ch, - input_ch); - im2col_buf += input_ch; + k_y += dilation_y; + arm_memcpy_s8(im2col_buf, + input_data + (k_y * input_x + k_x) * input_ch, + input_ch); + im2col_buf += input_ch; } } - lhs_rows++; - /* Extend the input data from int8 to int16, and add offset. */ - arm_q7_to_q15_with_offset(im2col_buf - rhs_cols, - im2col_buf_start_s16, - rhs_cols, - (int16_t)input_offset); - im2col_buf_start_s16 += aligned_rhs_cols; - if (lhs_rows & 2) + + lhs_rows++; + + /* Extend the input data from int8 to int16, and add offset. 
*/ + + arm_q7_to_q15_with_offset(im2col_buf - rhs_cols, + im2col_buf_start_s16, + rhs_cols, + (int16_t)input_offset); + im2col_buf_start_s16 += aligned_rhs_cols; + if (lhs_rows & 2) { - out = arm_nn_mat_mult_kernel_s8_s16(filter_data, - buffer_a, - output_ch, - output_shift, - output_mult, - out_offset, - out_activation_min, - out_activation_max, - rhs_cols, - aligned_rhs_cols, - bias_data, - out); - /* counter reset */ - im2col_buf_start_s16 = buffer_a; - im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1); - lhs_rows = 0; + out = arm_nn_mat_mult_kernel_s8_s16(filter_data, + buffer_a, + output_ch, + output_shift, + output_mult, + out_offset, + out_activation_min, + out_activation_max, + rhs_cols, + aligned_rhs_cols, + bias_data, + out); + + /* counter reset */ + + im2col_buf_start_s16 = buffer_a; + im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1); + lhs_rows = 0; } } } - if (lhs_rows != 0) + + if (lhs_rows != 0) { - const int8_t *ker_a = filter_data; - int i; - for (i = 0; i < output_ch; i++) + const int8_t *ker_a = filter_data; + int i; + + for (i = 0; i < output_ch; i++) { - /* Load the accumulator with bias first */ - uint16_t col_count = rhs_cols / 8; - int32_t sum = 0; - const int16_t *ip_as_col = buffer_a; - int32x4_t res_s32 = vdupq_n_s32(0); - if (bias_data) + /* Load the accumulator with bias first */ + + uint16_t col_count = rhs_cols / 8; + int32_t sum = 0; + const int16_t *ip_as_col = buffer_a; + int32x4_t res_s32 = vdupq_n_s32(0); + if (bias_data) { - sum = bias_data[i]; + sum = bias_data[i]; } - while (col_count) + + while (col_count) { - int8x8_t filter_s8 = vld1_s8(ker_a); - int16x8_t input_s16 = vld1q_s16(ip_as_col); - int16x8_t filter_s16 = vmovl_s8(filter_s8); - ker_a += 8; - ip_as_col += 8; - res_s32 = vmlal_s16(res_s32, - vget_low_s16(input_s16), - vget_low_s16(filter_s16)); - res_s32 = vmlal_s16(res_s32, - vget_high_s16(input_s16), - vget_high_s16(filter_s16)); - col_count --; + int8x8_t filter_s8 = vld1_s8(ker_a); + int16x8_t input_s16 = vld1q_s16(ip_as_col); + int16x8_t filter_s16 = vmovl_s8(filter_s8); + ker_a += 8; + ip_as_col += 8; + res_s32 = vmlal_s16(res_s32, + vget_low_s16(input_s16), + vget_low_s16(filter_s16)); + res_s32 = vmlal_s16(res_s32, + vget_high_s16(input_s16), + vget_high_s16(filter_s16)); + col_count--; } - sum += vgetq_lane_s32(res_s32, 0); - sum += vgetq_lane_s32(res_s32, 1); - sum += vgetq_lane_s32(res_s32, 2); - sum += vgetq_lane_s32(res_s32, 3); - col_count = rhs_cols % 8; - while (col_count) + + sum += vgetq_lane_s32(res_s32, 0); + sum += vgetq_lane_s32(res_s32, 1); + sum += vgetq_lane_s32(res_s32, 2); + sum += vgetq_lane_s32(res_s32, 3); + col_count = rhs_cols % 8; + while (col_count) { - int8_t ker_a1 = *ker_a++; - int16_t ip_b1 = *ip_as_col++; - sum += ker_a1 * ip_b1; - col_count--; + int8_t ker_a1 = *ker_a++; + int16_t ip_b1 = *ip_as_col++; + sum += ker_a1 * ip_b1; + col_count--; } - sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]); - sum += out_offset; - sum = MAX(sum, out_activation_min); - sum = MIN(sum, out_activation_max); - *out++ = (int8_t)sum; + + sum = arm_nn_requantize(sum, + output_mult[i], output_shift[i]); + sum += out_offset; + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *out++ = (int8_t)sum; } } - /* Advance to the next batch */ - input_data += (input_x * input_y * input_ch); - output_data += (output_x * output_y * output_ch); + + /* Advance to the next batch */ + + input_data += (input_x * input_y * input_ch); + output_data += (output_x * output_y * output_ch); } 
- /* Return to application */ - return ARM_CMSIS_NN_SUCCESS; + + /* Return to application */ + + return ARM_CMSIS_NN_SUCCESS; } -/** - * @} end of NNConv group - */ \ No newline at end of file diff --git a/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c index 678ae69fd..d7147fbce 100644 --- a/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c +++ b/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c @@ -1,5 +1,8 @@ -/* - * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com> +/**************************************************************************** + * apps/mlearning/tflite-micro/operators/neon/arm_elementwise_add_s8.c + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or + * its affiliates <open-source-off...@arm.com> * * SPDX-License-Identifier: Apache-2.0 * @@ -14,136 +17,157 @@ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ #include <arm_neon.h> #include "arm_nnfunctions.h" #include "arm_nnsupportfunctions.h" +/**************************************************************************** + * Public Functions + ****************************************************************************/ + /* Note: __SHIFT is expected to be <=0 */ -__STATIC_FORCEINLINE int32x4_t arm_requantize_neon(const int32x4_t val, const int32_t multiplier, const int32_t shift) + +__STATIC_FORCEINLINE int32x4_t +arm_requantize_neon(const int32x4_t val, + const int32_t multiplier, + const int32_t shift) { - int32x4_t dividend = vqrdmulhq_n_s32(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier); - int32_t exponent = RIGHT_SHIFT(shift); - int32x4_t shift__ = vdupq_n_s32(-exponent); - int32x4_t fixup__ = vshrq_n_s32(vandq_s32(dividend, shift__), 31); - int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup__); - return vrshlq_s32(fixed_up_dividend, shift__); + int32x4_t dividend = vqrdmulhq_n_s32( + vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier); + int32_t exponent = RIGHT_SHIFT(shift); + int32x4_t shift__ = vdupq_n_s32(-exponent); + int32x4_t fixup__ = vshrq_n_s32(vandq_s32(dividend, shift__), 31); + int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup__); + return vrshlq_s32(fixed_up_dividend, shift__); } -/** - * @ingroup Public - */ - -/** - * @addtogroup groupElementwise - * @{ - */ - -/* - * s8 elementwise add - * - * Refer header file for details. 
- * - */ - -arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect, - const int8_t *input_2_vect, - const int32_t input_1_offset, - const int32_t input_1_mult, - const int32_t input_1_shift, - const int32_t input_2_offset, - const int32_t input_2_mult, - const int32_t input_2_shift, - const int32_t left_shift, - int8_t *output, - const int32_t out_offset, - const int32_t out_mult, - const int32_t out_shift, - const int32_t out_activation_min, - const int32_t out_activation_max, - const int32_t block_size) +arm_cmsis_nn_status +arm_elementwise_add_s8(const int8_t *input_1_vect, + const int8_t *input_2_vect, + const int32_t input_1_offset, + const int32_t input_1_mult, + const int32_t input_1_shift, + const int32_t input_2_offset, + const int32_t input_2_mult, + const int32_t input_2_shift, + const int32_t left_shift, + int8_t *output, + const int32_t out_offset, + const int32_t out_mult, + const int32_t out_shift, + const int32_t out_activation_min, + const int32_t out_activation_max, + const int32_t block_size) { - int32_t loop_count = block_size / 8; - const int8_t *input_1 = input_1_vect; - const int8_t *input_2 = input_2_vect; - int8_t *output_ = output; + int32_t loop_count = block_size / 8; + const int8_t *input_1 = input_1_vect; + const int8_t *input_2 = input_2_vect; + int8_t *output_ = output; - while (loop_count) + while (loop_count) { - int8x8_t res; - int8x8_t input_1_s8; - int8x8_t input_2_s8; - int16x8_t i1_val_16; - int16x8_t input_1_s16; - int16x8_t input_2_s16; - int32x4_t input_1_s16_low; - int32x4_t input_1_s16_high; - int32x4_t input_2_s16_low; - int32x4_t input_2_s16_high; - - input_1_s8 = vld1_s8(input_1); - input_2_s8 = vld1_s8(input_2); - input_1_s16 = vmovl_s8(input_1_s8); - input_2_s16 = vmovl_s8(input_2_s8); - input_1 += 8; - input_2 += 8; - - input_1_s16_low = vaddw_s16(vdupq_n_s32(input_1_offset), vget_low_s16(input_1_s16)); - input_1_s16_high = vaddw_s16(vdupq_n_s32(input_1_offset), vget_high_s16(input_1_s16)); - input_2_s16_low = vaddw_s16(vdupq_n_s32(input_2_offset), vget_low_s16(input_2_s16)); - input_2_s16_high = vaddw_s16(vdupq_n_s32(input_2_offset), vget_high_s16(input_2_s16)); - - input_1_s16_low = vshlq_s32(input_1_s16_low, vdupq_n_s32(left_shift)); - input_2_s16_low = vshlq_s32(input_2_s16_low, vdupq_n_s32(left_shift)); - input_1_s16_high = vshlq_s32(input_1_s16_high, vdupq_n_s32(left_shift)); - input_2_s16_high = vshlq_s32(input_2_s16_high, vdupq_n_s32(left_shift)); - - input_1_s16_low = arm_requantize_neon(input_1_s16_low, input_1_mult, input_1_shift); - input_1_s16_high = arm_requantize_neon(input_1_s16_high, input_1_mult, input_1_shift); - input_2_s16_low = arm_requantize_neon(input_2_s16_low, input_2_mult, input_2_shift); - input_2_s16_high = arm_requantize_neon(input_2_s16_high, input_2_mult, input_2_shift); - - input_1_s16_low = vaddq_s32(input_1_s16_low, input_2_s16_low); - input_1_s16_high = vaddq_s32(input_1_s16_high, input_2_s16_high); - - input_1_s16_low = arm_requantize_neon(input_1_s16_low, out_mult, out_shift); - input_1_s16_high = arm_requantize_neon(input_1_s16_high, out_mult, out_shift); - - input_1_s16_low = vaddq_s32(input_1_s16_low, vdupq_n_s32(out_offset)); - input_1_s16_high = vaddq_s32(input_1_s16_high, vdupq_n_s32(out_offset)); - - input_1_s16_low = vmaxq_s32(input_1_s16_low, vdupq_n_s32(out_activation_min)); - input_1_s16_high = vmaxq_s32(input_1_s16_high, vdupq_n_s32(out_activation_min)); - input_1_s16_low = vminq_s32(input_1_s16_low, vdupq_n_s32(out_activation_max)); - input_1_s16_high = 
vminq_s32(input_1_s16_high, vdupq_n_s32(out_activation_max)); - - i1_val_16 = vcombine_s16(vmovn_s32(input_1_s16_low), vmovn_s32(input_1_s16_high)); - res = vmovn_s16(i1_val_16); - - vst1_s8(output_, res); - output_ += 8; - loop_count--; + int8x8_t res; + int8x8_t input_1_s8; + int8x8_t input_2_s8; + int16x8_t i1_val_16; + int16x8_t input_1_s16; + int16x8_t input_2_s16; + int32x4_t input_1_s16_low; + int32x4_t input_1_s16_high; + int32x4_t input_2_s16_low; + int32x4_t input_2_s16_high; + + input_1_s8 = vld1_s8(input_1); + input_2_s8 = vld1_s8(input_2); + input_1_s16 = vmovl_s8(input_1_s8); + input_2_s16 = vmovl_s8(input_2_s8); + input_1 += 8; + input_2 += 8; + + input_1_s16_low = vaddw_s16( + vdupq_n_s32(input_1_offset), vget_low_s16(input_1_s16)); + input_1_s16_high = vaddw_s16( + vdupq_n_s32(input_1_offset), vget_high_s16(input_1_s16)); + input_2_s16_low = vaddw_s16( + vdupq_n_s32(input_2_offset), vget_low_s16(input_2_s16)); + input_2_s16_high = vaddw_s16( + vdupq_n_s32(input_2_offset), vget_high_s16(input_2_s16)); + + input_1_s16_low = vshlq_s32( + input_1_s16_low, vdupq_n_s32(left_shift)); + input_2_s16_low = vshlq_s32( + input_2_s16_low, vdupq_n_s32(left_shift)); + input_1_s16_high = vshlq_s32( + input_1_s16_high, vdupq_n_s32(left_shift)); + input_2_s16_high = vshlq_s32( + input_2_s16_high, vdupq_n_s32(left_shift)); + + input_1_s16_low = arm_requantize_neon( + input_1_s16_low, input_1_mult, input_1_shift); + input_1_s16_high = arm_requantize_neon( + input_1_s16_high, input_1_mult, input_1_shift); + input_2_s16_low = arm_requantize_neon( + input_2_s16_low, input_2_mult, input_2_shift); + input_2_s16_high = arm_requantize_neon( + input_2_s16_high, input_2_mult, input_2_shift); + + input_1_s16_low = vaddq_s32( + input_1_s16_low, input_2_s16_low); + input_1_s16_high = vaddq_s32( + input_1_s16_high, input_2_s16_high); + + input_1_s16_low = arm_requantize_neon( + input_1_s16_low, out_mult, out_shift); + input_1_s16_high = arm_requantize_neon( + input_1_s16_high, out_mult, out_shift); + + input_1_s16_low = vaddq_s32( + input_1_s16_low, vdupq_n_s32(out_offset)); + input_1_s16_high = vaddq_s32( + input_1_s16_high, vdupq_n_s32(out_offset)); + + input_1_s16_low = vmaxq_s32( + input_1_s16_low, vdupq_n_s32(out_activation_min)); + input_1_s16_high = vmaxq_s32( + input_1_s16_high, vdupq_n_s32(out_activation_min)); + input_1_s16_low = vminq_s32( + input_1_s16_low, vdupq_n_s32(out_activation_max)); + input_1_s16_high = vminq_s32( + input_1_s16_high, vdupq_n_s32(out_activation_max)); + + i1_val_16 = vcombine_s16( + vmovn_s32(input_1_s16_low), vmovn_s32(input_1_s16_high)); + res = vmovn_s16(i1_val_16); + + vst1_s8(output_, res); + output_ += 8; + loop_count--; } - loop_count = block_size % 8; - while (loop_count) + loop_count = block_size % 8; + while (loop_count) { - int32_t a1 = (*input_1++ + input_1_offset) << left_shift; - int32_t a2 = (*input_2++ + input_2_offset) << left_shift; - a1 = arm_nn_requantize(a1, input_1_mult, input_1_shift); - a2 = arm_nn_requantize(a2, input_2_mult, input_2_shift); - - int32_t sum = a1 + a2; - sum = arm_nn_requantize(sum, out_mult, out_shift); - sum += out_offset; - - sum = MAX(sum, out_activation_min); - sum = MIN(sum, out_activation_max); - *output_ = (int8_t) sum; - loop_count--; - output_++; + int32_t a1 = (*input_1++ + input_1_offset) << left_shift; + int32_t a2 = (*input_2++ + input_2_offset) << left_shift; + a1 = arm_nn_requantize(a1, input_1_mult, input_1_shift); + a2 = arm_nn_requantize(a2, input_2_mult, input_2_shift); + + int32_t sum = a1 + a2; + sum = 
arm_nn_requantize(sum, out_mult, out_shift); + sum += out_offset; + + sum = MAX(sum, out_activation_min); + sum = MIN(sum, out_activation_max); + *output_ = (int8_t) sum; + loop_count--; + output_++; } - return (ARM_CMSIS_NN_SUCCESS); + return (ARM_CMSIS_NN_SUCCESS); } diff --git a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c index 43cd33b30..190edb74c 100644 --- a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c +++ b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c @@ -1,5 +1,8 @@ -/* - * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com> +/**************************************************************************** + * apps/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or + * its affiliates <open-source-off...@arm.com> * * SPDX-License-Identifier: Apache-2.0 * @@ -14,9 +17,9 @@ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + ****************************************************************************/ -/* ---------------------------------------------------------------------- +/**************************************************************************** * Project: CMSIS NN Library * Title: arm_nn_mat_mult_kernel_s8_s16.c * Description: Matrix-multiplication function for convolution @@ -25,18 +28,24 @@ * $Revision: V.2.0.0 * * Target : Arm(R) M-Profile Architecture - * -------------------------------------------------------------------- */ + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ #include <arm_neon.h> #include "arm_nnfunctions.h" #include "arm_nnsupportfunctions.h" -/* - * Matrix-multiplication function for convolution with per-channel requantization. - * - * Refer header file for details. - * +/* Matrix-multiplication function for convolution with per-channel + * requantization. Refer header file for details. 
*/ + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, const int16_t *input_b, const uint16_t output_ch, @@ -50,318 +59,352 @@ int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a, const int32_t *const output_bias, int8_t *out_0) { - int8_t *out_1 = out_0 + output_ch; - const int32_t *bias = output_bias; + int8_t *out_1 = out_0 + output_ch; + const int32_t *bias = output_bias; + + uint16_t row_count = output_ch / 4; + const int8_t *ip_a0 = input_a; - uint16_t row_count = output_ch / 4; - const int8_t *ip_a0 = input_a; + /* this loop over rows in A */ - /* this loop over rows in A */ - while (row_count) + while (row_count) { - int32_t col_count = num_col_a / 8; - const int16_t *ip_b0 = input_b; - const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; - const int8_t *ip_a[4] = {ip_a0, - ip_a0 + num_col_a, - ip_a0 + 2 * num_col_a, - ip_a0 + 3 * num_col_a}; - int32_t ch_out[4][2] = {0}; - int32x4_t res[8]; - - for (int i = 0; i < 8; i++) + int32_t col_count = num_col_a / 8; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; + const int8_t *ip_a[4] = + { + ip_a0, + ip_a0 + num_col_a, + ip_a0 + 2 * num_col_a, + ip_a0 + 3 * num_col_a + }; + + int32_t ch_out[4][2] = { - res[i] = vdupq_n_s32(0); + 0 + }; + + int32x4_t res[8]; + + for (int i = 0; i < 8; i++) + { + res[i] = vdupq_n_s32(0); } - /* Init accumulator with bias for channel N and N + 1 */ - if (bias) + /* Init accumulator with bias for channel N and N + 1 */ + + if (bias) { - for (int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - ch_out[i][0] = *bias; - ch_out[i][1] = *bias++; + ch_out[i][0] = *bias; + ch_out[i][1] = *bias++; } } - /** - * Each time eight int8 data of four filters and eight int16 data - * of two inputs are read.First, the filter data is expanded to - * int16, and then cross-multiplied to obtain eight calculation results. - */ - while (col_count) + /* Each time eight int8 data of four filters and eight int16 data + * of two inputs are read.First, the filter data is expanded to + * int16, and then cross-multiplied to obtain eight + * calculation results. 
+ */ + + while (col_count) { - int8x8_t filter_s8[4]; - int16x8_t input_s16[2]; - int16x8_t filter_s16[4]; + int8x8_t filter_s8[4]; + int16x8_t input_s16[2]; + int16x8_t filter_s16[4]; - input_s16[0] = vld1q_s16(ip_b0); - ip_b0 += 8; - input_s16[1] = vld1q_s16(ip_b1); - ip_b1 += 8; + input_s16[0] = vld1q_s16(ip_b0); + ip_b0 += 8; + input_s16[1] = vld1q_s16(ip_b1); + ip_b1 += 8; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - filter_s8[i] = vld1_s8(ip_a[i]); - ip_a[i] += 8; - filter_s16[i] = vmovl_s8(filter_s8[i]); - res[i * 2] = vmlal_s16(res[i * 2], - vget_low_s16(filter_s16[i]), - vget_low_s16(input_s16[0])); - res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], - vget_low_s16(filter_s16[i]), - vget_low_s16(input_s16[1])); - res[i * 2] = vmlal_s16(res[i * 2], - vget_high_s16(filter_s16[i]), - vget_high_s16(input_s16[0])); - res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], - vget_high_s16(filter_s16[i]), - vget_high_s16(input_s16[1])); + filter_s8[i] = vld1_s8(ip_a[i]); + ip_a[i] += 8; + filter_s16[i] = vmovl_s8(filter_s8[i]); + res[i * 2] = vmlal_s16(res[i * 2], + vget_low_s16(filter_s16[i]), + vget_low_s16(input_s16[0])); + res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], + vget_low_s16(filter_s16[i]), + vget_low_s16(input_s16[1])); + res[i * 2] = vmlal_s16(res[i * 2], + vget_high_s16(filter_s16[i]), + vget_high_s16(input_s16[0])); + res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], + vget_high_s16(filter_s16[i]), + vget_high_s16(input_s16[1])); } - col_count --; + col_count--; } - for (int i = 0; i < 4; i++) + + for (int i = 0; i < 4; i++) { - for (int j = 0; j < 2; j++) + for (int j = 0; j < 2; j++) { - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0); - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1); - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2); - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3); } } - col_count = num_col_a % 8; - while (col_count) + col_count = num_col_a % 8; + while (col_count) /* while over col_count */ { - int16_t b0 = *ip_b0++; - int16_t b1 = *ip_b1++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - int8_t input_remaining = *(ip_a[i]++); - ch_out[i][0] += input_remaining * b0; - ch_out[i][1] += input_remaining * b1; + int8_t input_remaining = *(ip_a[i]++); + ch_out[i][0] += input_remaining * b0; + ch_out[i][1] += input_remaining * b1; } - col_count--; - } /* while over col_count */ - for (int i = 0; i < 4; i++) + col_count--; + } + + for (int i = 0; i < 4; i++) { - ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, *out_shift); - ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, *out_shift); - ch_out[i][0] += out_offset; - ch_out[i][1] += out_offset; - ch_out[i][0] = MAX(ch_out[i][0], activation_min); - ch_out[i][1] = MAX(ch_out[i][1], activation_min); - ch_out[i][0] = MIN(ch_out[i][0], activation_max); - ch_out[i][1] = MIN(ch_out[i][1], activation_max); - *out_0++ = (int8_t)ch_out[i][0]; - *out_1++ = (int8_t)ch_out[i][1]; - out_mult++; - out_shift++; + ch_out[i][0] = arm_nn_requantize( + ch_out[i][0], *out_mult, *out_shift); + ch_out[i][1] = arm_nn_requantize( + ch_out[i][1], *out_mult, *out_shift); + ch_out[i][0] += out_offset; + ch_out[i][1] += out_offset; + ch_out[i][0] = MAX(ch_out[i][0], activation_min); + ch_out[i][1] = MAX(ch_out[i][1], activation_min); + 
ch_out[i][0] = MIN(ch_out[i][0], activation_max); + ch_out[i][1] = MIN(ch_out[i][1], activation_max); + *out_0++ = (int8_t)ch_out[i][0]; + *out_1++ = (int8_t)ch_out[i][1]; + out_mult++; + out_shift++; } - /* skip row */ - ip_a0 = ip_a[3]; - row_count--; + /* skip row */ + + ip_a0 = ip_a[3]; + row_count--; } - row_count = output_ch % 4; - if (row_count >= 2) + + row_count = output_ch % 4; + if (row_count >= 2) { - int32_t col_count = num_col_a / 8; - const int8_t *ip_a1 = ip_a0 + num_col_a; - const int16_t *ip_b0 = input_b; - const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; - int32_t ch_out[2][2] = {0}; - int32x4_t res[4]; - - /* Init accumulator with bias for channel N and N + 1 */ - if (bias) + int32_t col_count = num_col_a / 8; + const int8_t *ip_a1 = ip_a0 + num_col_a; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; + int32_t ch_out[2][2] = + { + 0 + }; + + int32x4_t res[4]; + + /* Init accumulator with bias for channel N and N + 1 */ + + if (bias) { - for (int i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - ch_out[i][0] = *bias; - ch_out[i][1] = *bias++; + ch_out[i][0] = *bias; + ch_out[i][1] = *bias++; } } - for (int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - res[i] = vdupq_n_s32(0); + res[i] = vdupq_n_s32(0); } - /** - * Each time eight int8 data of four filters and eight int16 data - * of two inputs are read.First, the filter data is expanded to - * int16, and then cross-multiplied to obtain 8 calculation results. - */ - while (col_count) + /* Each time eight int8 data of four filters and eight int16 data + * of two inputs are read.First, the filter data is expanded to + * int16, and then cross-multiplied to obtain 8 calculation results. + */ + + while (col_count) { - int8x8_t filter_s8[2]; - int16x8_t input_s16[2]; - int16x8_t filter_s16[2]; + int8x8_t filter_s8[2]; + int16x8_t input_s16[2]; + int16x8_t filter_s16[2]; - filter_s8[0] = vld1_s8(ip_a0); - ip_a0 += 8; - filter_s8[1] = vld1_s8(ip_a1); - ip_a1 += 8; + filter_s8[0] = vld1_s8(ip_a0); + ip_a0 += 8; + filter_s8[1] = vld1_s8(ip_a1); + ip_a1 += 8; - input_s16[0] = vld1q_s16(ip_b0); - ip_b0 += 8; - input_s16[1] = vld1q_s16(ip_b1); - ip_b1 += 8; + input_s16[0] = vld1q_s16(ip_b0); + ip_b0 += 8; + input_s16[1] = vld1q_s16(ip_b1); + ip_b1 += 8; - for (int i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - filter_s16[i] = vmovl_s8(filter_s8[i]); - res[i * 2] = vmlal_s16(res[i * 2], - vget_low_s16(filter_s16[i]), - vget_low_s16(input_s16[0])); - res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], - vget_low_s16(filter_s16[i]), - vget_low_s16(input_s16[1])); - res[i * 2] = vmlal_s16(res[i * 2], - vget_high_s16(filter_s16[i]), - vget_high_s16(input_s16[0])); - res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], - vget_high_s16(filter_s16[i]), - vget_high_s16(input_s16[1])); + filter_s16[i] = vmovl_s8(filter_s8[i]); + res[i * 2] = vmlal_s16(res[i * 2], + vget_low_s16(filter_s16[i]), + vget_low_s16(input_s16[0])); + res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], + vget_low_s16(filter_s16[i]), + vget_low_s16(input_s16[1])); + res[i * 2] = vmlal_s16(res[i * 2], + vget_high_s16(filter_s16[i]), + vget_high_s16(input_s16[0])); + res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1], + vget_high_s16(filter_s16[i]), + vget_high_s16(input_s16[1])); } - col_count --; + col_count--; } - for (int i = 0; i < 2; i++) + + for (int i = 0; i < 2; i++) { - for (int j = 0; j < 2; j++) + for (int j = 0; j < 2; j++) { - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0); - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1); - 
ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2); - ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2); + ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3); } } - col_count = num_col_a % 8; - while (col_count) + col_count = num_col_a % 8; + while (col_count) /* while over col_count */ { - int8_t a0 = *ip_a0++; // filter - int8_t a1 = *ip_a1++; - int16_t b0 = *ip_b0++; // input - int16_t b1 = *ip_b1++; - - ch_out[0][0] += a0 * b0; - ch_out[1][1] += a1 * b1; - ch_out[1][0] += a1 * b0; - ch_out[0][1] += a0 * b1; - col_count--; - } /* while over col_count */ - - for (int i = 0; i < 2; i++) + int8_t a0 = *ip_a0++; /* filter */ + int8_t a1 = *ip_a1++; + int16_t b0 = *ip_b0++; /* input */ + int16_t b1 = *ip_b1++; + + ch_out[0][0] += a0 * b0; + ch_out[1][1] += a1 * b1; + ch_out[1][0] += a1 * b0; + ch_out[0][1] += a0 * b1; + col_count--; + } + + for (int i = 0; i < 2; i++) { - ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, *out_shift); - ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, *out_shift); - ch_out[i][0] += out_offset; - ch_out[i][1] += out_offset; - ch_out[i][0] = MAX(ch_out[i][0], activation_min); - ch_out[i][1] = MAX(ch_out[i][1], activation_min); - ch_out[i][0] = MIN(ch_out[i][0], activation_max); - ch_out[i][1] = MIN(ch_out[i][1], activation_max); - *out_0++ = (int8_t)ch_out[i][0]; - *out_1++ = (int8_t)ch_out[i][1]; - out_mult++; - out_shift++; + ch_out[i][0] = arm_nn_requantize( + ch_out[i][0], *out_mult, *out_shift); + ch_out[i][1] = arm_nn_requantize( + ch_out[i][1], *out_mult, *out_shift); + ch_out[i][0] += out_offset; + ch_out[i][1] += out_offset; + ch_out[i][0] = MAX(ch_out[i][0], activation_min); + ch_out[i][1] = MAX(ch_out[i][1], activation_min); + ch_out[i][0] = MIN(ch_out[i][0], activation_max); + ch_out[i][1] = MIN(ch_out[i][1], activation_max); + *out_0++ = (int8_t)ch_out[i][0]; + *out_1++ = (int8_t)ch_out[i][1]; + out_mult++; + out_shift++; } - /* skip row */ - ip_a0 += num_col_a; - row_count -= 2; + /* skip row */ + + ip_a0 += num_col_a; + row_count -= 2; } - /* compute the last odd numbered row if any */ - if (output_ch & 0x1) + /* compute the last odd numbered row if any */ + + if (output_ch & 0x1) { - int32_t col_count = num_col_a / 8; - const int16_t *ip_b0 = input_b; - const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; - int32_t ch_out[2] = {0}; - int32x4_t res[2]; - - /* load the bias */ - if (bias) + int32_t col_count = num_col_a / 8; + const int16_t *ip_b0 = input_b; + const int16_t *ip_b1 = ip_b0 + aligned_num_col_a; + int32_t ch_out[2] = { - ch_out[0] = *bias; - ch_out[1] = *bias++; - } + 0 + }; + + int32x4_t res[2]; - res[0] = vdupq_n_s32(0); - res[1] = vdupq_n_s32(0); + /* load the bias */ - while(col_count) + if (bias) { - int8x8_t filter_s8 = vld1_s8(ip_a0); - int16x8_t filter_s16 = vmovl_s8(filter_s8); - int16x8_t input_0_s16 = vld1q_s16(ip_b0); - int16x8_t input_1_s16 = vld1q_s16(ip_b1); - ip_a0 += 8; - ip_b0 += 8; - ip_b1 += 8; - res[0] = vmlal_s16(res[0], - vget_low_s16(filter_s16), - vget_low_s16(input_0_s16)); - res[1] = vmlal_s16(res[1], - vget_low_s16(filter_s16), - vget_low_s16(input_1_s16)); - res[0] = vmlal_s16(res[0], - vget_high_s16(filter_s16), - vget_high_s16(input_0_s16)); - res[1] = vmlal_s16(res[1], - vget_high_s16(filter_s16), - vget_high_s16(input_1_s16)); - col_count --; + ch_out[0] = *bias; + ch_out[1] = *bias++; } - ch_out[0] += vgetq_lane_s32(res[0], 0); - 
ch_out[0] += vgetq_lane_s32(res[0], 1); - ch_out[0] += vgetq_lane_s32(res[0], 2); - ch_out[0] += vgetq_lane_s32(res[0], 3); + res[0] = vdupq_n_s32(0); + res[1] = vdupq_n_s32(0); + while (col_count) + { + int8x8_t filter_s8 = vld1_s8(ip_a0); + int16x8_t filter_s16 = vmovl_s8(filter_s8); + int16x8_t input_0_s16 = vld1q_s16(ip_b0); + int16x8_t input_1_s16 = vld1q_s16(ip_b1); + ip_a0 += 8; + ip_b0 += 8; + ip_b1 += 8; + res[0] = vmlal_s16(res[0], + vget_low_s16(filter_s16), + vget_low_s16(input_0_s16)); + res[1] = vmlal_s16(res[1], + vget_low_s16(filter_s16), + vget_low_s16(input_1_s16)); + res[0] = vmlal_s16(res[0], + vget_high_s16(filter_s16), + vget_high_s16(input_0_s16)); + res[1] = vmlal_s16(res[1], + vget_high_s16(filter_s16), + vget_high_s16(input_1_s16)); + col_count--; + } - ch_out[1] += vgetq_lane_s32(res[1], 0); - ch_out[1] += vgetq_lane_s32(res[1], 1); - ch_out[1] += vgetq_lane_s32(res[1], 2); - ch_out[1] += vgetq_lane_s32(res[1], 3); + ch_out[0] += vgetq_lane_s32(res[0], 0); + ch_out[0] += vgetq_lane_s32(res[0], 1); + ch_out[0] += vgetq_lane_s32(res[0], 2); + ch_out[0] += vgetq_lane_s32(res[0], 3); - col_count = num_col_a % 8; - while (col_count) + ch_out[1] += vgetq_lane_s32(res[1], 0); + ch_out[1] += vgetq_lane_s32(res[1], 1); + ch_out[1] += vgetq_lane_s32(res[1], 2); + ch_out[1] += vgetq_lane_s32(res[1], 3); + + col_count = num_col_a % 8; + while (col_count) { - int8_t a0 = *ip_a0++; - int16_t b0 = *ip_b0++; - int16_t b1 = *ip_b1++; + int8_t a0 = *ip_a0++; + int16_t b0 = *ip_b0++; + int16_t b1 = *ip_b1++; - ch_out[0] += a0 * b0; - ch_out[1] += a0 * b1; - col_count--; + ch_out[0] += a0 * b0; + ch_out[1] += a0 * b1; + col_count--; } - ch_out[0] = arm_nn_requantize(ch_out[0], *out_mult, *out_shift); - ch_out[0] += out_offset; - ch_out[0] = MAX(ch_out[0], activation_min); - ch_out[0] = MIN(ch_out[0], activation_max); - *out_0++ = (int8_t)ch_out[0]; - - ch_out[1] = arm_nn_requantize(ch_out[1], *out_mult, *out_shift); - ch_out[1] += out_offset; - ch_out[1] = MAX(ch_out[1], activation_min); - ch_out[1] = MIN(ch_out[1], activation_max); - *out_1++ = (int8_t)ch_out[1]; - - out_mult++; - out_shift++; + ch_out[0] = arm_nn_requantize( + ch_out[0], *out_mult, *out_shift); + ch_out[0] += out_offset; + ch_out[0] = MAX(ch_out[0], activation_min); + ch_out[0] = MIN(ch_out[0], activation_max); + *out_0++ = (int8_t)ch_out[0]; + + ch_out[1] = arm_nn_requantize( + ch_out[1], *out_mult, *out_shift); + ch_out[1] += out_offset; + ch_out[1] = MAX(ch_out[1], activation_min); + ch_out[1] = MIN(ch_out[1], activation_max); + *out_1++ = (int8_t)ch_out[1]; + + out_mult++; + out_shift++; } - out_0 += output_ch; + out_0 += output_ch; + + /* return the new output pointer with offset */ - /* return the new output pointer with offset */ - return out_0; + return out_0; } diff --git a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c index b7156e473..770e1a88c 100644 --- a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c +++ b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c @@ -1,60 +1,62 @@ -/* - * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com> +/**************************************************************************** + * apps/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c + * + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or + * its affiliates <open-source-off...@arm.com> * * 
SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in_q7x4 compliance with the License. + * not use this file except in compliance with the License. * You may obtain a copy of the License at * * www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in_q7x4 writing, software + * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an AS IS BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ #include <arm_neon.h> #include "arm_nnsupportfunctions.h" -/** - * @ingroup groupSupport - */ - -/** - * @addtogroup supportConversion - * @{ - */ +/**************************************************************************** + * Public Functions + ****************************************************************************/ -void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, int32_t block_size, int16_t offset) +void arm_q7_to_q15_with_offset(const int8_t *src, + int16_t *dst, + int32_t block_size, + int16_t offset) { - int32_t block_cnt; + int32_t block_cnt; - block_cnt = block_size / 8; - int16x8_t offset_s16 = vdupq_n_s16(offset); - while (block_cnt) + block_cnt = block_size / 8; + int16x8_t offset_s16 = vdupq_n_s16(offset); + while (block_cnt) { - int8x8_t src_s8 = vld1_s8(src); - int16x8_t src_s16 = vmovl_s8(src_s8); - src += 8; - src_s16 = vaddq_s16(offset_s16, src_s16); - block_cnt--; - vst1q_s16(dst, src_s16); - dst += 8; + int8x8_t src_s8 = vld1_s8(src); + int16x8_t src_s16 = vmovl_s8(src_s8); + src += 8; + src_s16 = vaddq_s16(offset_s16, src_s16); + block_cnt--; + vst1q_s16(dst, src_s16); + dst += 8; } - block_cnt = block_size % 8; - while (block_cnt > 0) + block_cnt = block_size % 8; + while (block_cnt > 0) { - *dst++ = (int16_t)*src++ + offset; + *dst++ = (int16_t)*src++ + offset; - /* Decrement the loop counter */ - block_cnt--; + /* Decrement the loop counter */ + + block_cnt--; } } - -/** - * @} end of Doxygen group - */
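For readers skimming the diff: the im2col loop in arm_convolve_s8.c gathers the receptive field of each output pixel into a contiguous row so that the convolution reduces to one dot product per output channel. The sketch below is a minimal scalar illustration of that idea, not the committed code; it assumes the window lies fully inside the input (no padding handling), and the helper name im2col_one_pixel is made up for illustration.

#include <stdint.h>
#include <string.h>

/* Hypothetical sketch: copy the kernel_x * kernel_y * input_ch
 * receptive field of one output pixel into a contiguous row. The
 * committed code additionally handles padding, widens the copied
 * bytes to int16 with an input offset, and ping-pongs two such rows
 * so arm_nn_mat_mult_kernel_s8_s16 can consume them in pairs.
 */

static void im2col_one_pixel(const int8_t *input, int8_t *col,
                             int input_x, int input_ch,
                             int kernel_x, int kernel_y,
                             int base_x, int base_y)
{
  for (int kx = 0; kx < kernel_x; kx++) /* same loop order as the diff */
    {
      for (int ky = 0; ky < kernel_y; ky++)
        {
          const int8_t *src = input +
            ((base_y + ky) * input_x + (base_x + kx)) * input_ch;

          memcpy(col, src, input_ch);   /* one pixel, all channels */
          col += input_ch;
        }
    }
}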
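arm_requantize_neon in arm_elementwise_add_s8.c vectorizes the usual CMSIS-NN fixed-point requantization: a rounding-doubling high multiply (vqrdmulhq_n_s32) followed by a rounding divide by a power of two. Below is a scalar sketch of the same arithmetic, assuming it follows the standard gemmlowp-style definitions; the saturating corner case of vqrdmulh (both operands INT32_MIN) is ignored here, and shift is taken to be <= 0 as the note above the helper requires.

#include <stdint.h>

/* High 32 bits of 2 * a * b, rounded to nearest; per lane this is
 * what vqrdmulhq_n_s32 computes, minus saturation of the
 * a == b == INT32_MIN corner case.
 */

static int32_t doubling_high_mul(int32_t a, int32_t b)
{
  int64_t prod = (int64_t)a * (int64_t)b;
  int64_t nudge = prod >= 0 ? (1 << 30) : (1 - (1 << 30));
  return (int32_t)((prod + nudge) >> 31);
}

/* Arithmetic right shift with round-to-nearest; the scalar
 * counterpart of the fixup + vrshlq_s32 sequence in the helper.
 */

static int32_t rounding_divide_by_pot(int32_t x, int32_t exponent)
{
  int32_t mask = ((int32_t)1 << exponent) - 1;
  int32_t remainder = x & mask;
  int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

static int32_t requantize_ref(int32_t val, int32_t multiplier,
                              int32_t shift)
{
  return rounding_divide_by_pot(doubling_high_mul(val, multiplier),
                                -shift);
}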
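The inner loops in arm_nn_mat_mult_kernel_s8_s16.c (and the leftover-row path in arm_convolve_s8.c) all reduce to one NEON pattern: load eight int8 filter values, widen them to int16 with vmovl_s8, and multiply-accumulate against eight int16 inputs into four int32 lanes with vmlal_s16. A self-contained sketch of that pattern follows; the name dot_s8_s16 is made up for illustration, and it must be built for an Arm target with NEON.

#include <arm_neon.h>
#include <stdint.h>

static int32_t dot_s8_s16(const int8_t *a, const int16_t *b, int len)
{
  int32x4_t acc = vdupq_n_s32(0);
  int i = 0;

  for (; i + 8 <= len; i += 8)
    {
      int16x8_t a16 = vmovl_s8(vld1_s8(a + i)); /* widen 8 x s8 */
      int16x8_t b16 = vld1q_s16(b + i);

      acc = vmlal_s16(acc, vget_low_s16(a16), vget_low_s16(b16));
      acc = vmlal_s16(acc, vget_high_s16(a16), vget_high_s16(b16));
    }

  /* Horizontal add of the four lanes, then a scalar tail for the
   * len % 8 leftovers, exactly as the kernels above do.
   */

  int32_t sum = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1) +
                vgetq_lane_s32(acc, 2) + vgetq_lane_s32(acc, 3);

  for (; i < len; i++)
    {
      sum += a[i] * b[i];
    }

  return sum;
}

The committed kernel keeps several independent accumulator registers per row pair to hide vmlal_s16 latency; the sketch uses a single accumulator for clarity.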
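arm_q7_to_q15_with_offset.c is the simplest of the four files: widen each int8 element to int16 and add the quantization offset, eight lanes at a time (vmovl_s8 plus vaddq_s16) with a scalar tail. A scalar reference with a tiny self-check; the offset of 128 is chosen only for the demonstration.

#include <stdint.h>
#include <stdio.h>

static void q7_to_q15_with_offset_ref(const int8_t *src, int16_t *dst,
                                      int32_t block_size, int16_t offset)
{
  for (int32_t i = 0; i < block_size; i++)
    {
      dst[i] = (int16_t)src[i] + offset; /* widen, then add offset */
    }
}

int main(void)
{
  const int8_t src[4] =
  {
    -128, -1, 0, 127
  };

  int16_t dst[4];

  q7_to_q15_with_offset_ref(src, dst, 4, 128);
  for (int i = 0; i < 4; i++)
    {
      printf("%d ", dst[i]); /* prints: 0 127 128 255 */
    }

  printf("\n");
  return 0;
}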