This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git


commit a37ca36a8a9afc418a830e7dab8469c1f49c34de
Author:     xinhaiteng <xinhait...@xiaomi.com>
AuthorDate: Sat Mar 2 21:46:44 2024 +0800

    Neon optimized Conv operator
    
    Based on CMSIS-NN, the Conv operator was optimized. Using Neon
    acceleration, 8 input values and 8 filter values are multiplied in a
    single loop iteration. Using the im2col technique, the input data is
    unrolled into a matrix; each pass of the outer loop combines 2 rows of
    im2col input data with 4 rows of filter data to produce a 2x4 block of
    outputs.
    
    Signed-off-by: xinhaiteng <xinhait...@xiaomi.com>
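The core Neon pattern used throughout this patch, shown here as a minimal
standalone sketch (illustrative only; dot8_s8_s16 is a placeholder name, not
part of the patch): eight s8 filter values are widened to s16 with vmovl_s8,
multiply-accumulated against eight s16 input values with vmlal_s16 into four
s32 lanes, and reduced to a scalar at the end.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Dot product of 8 s8 filter values and 8 s16 input values. */
    static int32_t dot8_s8_s16(const int8_t *filter, const int16_t *input)
    {
        int16x8_t f_s16 = vmovl_s8(vld1_s8(filter)); /* widen 8 x s8 to s16 */
        int16x8_t i_s16 = vld1q_s16(input);          /* load 8 x s16 */
        int32x4_t acc = vdupq_n_s32(0);

        /* Multiply-accumulate low and high halves into 4 s32 lanes. */
        acc = vmlal_s16(acc, vget_low_s16(f_s16), vget_low_s16(i_s16));
        acc = vmlal_s16(acc, vget_high_s16(f_s16), vget_high_s16(i_s16));

        /* Horizontal reduction of the four lanes. */
        return vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1) +
               vgetq_lane_s32(acc, 2) + vgetq_lane_s32(acc, 3);
    }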
---
 .../tflite-micro/operators/neon/arm_convolve_s8.c  | 204 ++++++++++++
 .../operators/neon/arm_nn_mat_mult_kernel_s8_s16.c | 363 +++++++++++++++++++++
 .../operators/neon/arm_q7_to_q15_with_offset.c     |  60 ++++
 3 files changed, 627 insertions(+)

diff --git a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
new file mode 100644
index 000000000..454c467a1
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
@@ -0,0 +1,204 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup Public
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Basic s8 convolution function.
+ *
+ * Refer to the header file for details. The optimal use case for this implementation is when input and output
+ * channels are multiples of 4 or at least greater than 4.
+ *
+ */
+arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
+                                    const cmsis_nn_conv_params *conv_params,
+                                    const cmsis_nn_per_channel_quant_params *quant_params,
+                                    const cmsis_nn_dims *input_dims,
+                                    const int8_t *input_data,
+                                    const cmsis_nn_dims *filter_dims,
+                                    const int8_t *filter_data,
+                                    const cmsis_nn_dims *bias_dims,
+                                    const int32_t *bias_data,
+                                    const cmsis_nn_dims *output_dims,
+                                    int8_t *output_data)
+{
+    (void)bias_dims;
+
+    if (ctx->buf == NULL)
+    {
+        return ARM_CMSIS_NN_ARG_ERROR;
+    }
+    int16_t *buffer_a = (int16_t *)ctx->buf;
+
+    const int32_t input_batches = input_dims->n;
+    const uint16_t input_x = input_dims->w;
+    const uint16_t input_y = input_dims->h;
+    const uint16_t input_ch = input_dims->c;
+    const uint16_t kernel_x = filter_dims->w;
+    const uint16_t kernel_y = filter_dims->h;
+    const uint16_t output_x = output_dims->w;
+    const uint16_t output_y = output_dims->h;
+    const uint16_t output_ch = output_dims->c;
+
+    const uint16_t pad_x = conv_params->padding.w;
+    const uint16_t pad_y = conv_params->padding.h;
+    const uint16_t stride_x = conv_params->stride.w;
+    const uint16_t stride_y = conv_params->stride.h;
+    const int32_t dilation_x = conv_params->dilation.w;
+    const int32_t dilation_y = conv_params->dilation.h;
+    const int32_t out_offset = conv_params->output_offset;
+    const int32_t out_activation_min = conv_params->activation.min;
+    const int32_t out_activation_max = conv_params->activation.max;
+    const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
+    const int32_t input_offset = conv_params->input_offset;
+
+    int32_t *output_mult = quant_params->multiplier;
+    int32_t *output_shift = quant_params->shift;
+
+    int i_batch;
+    for (i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+        const int32_t remainder = rhs_cols % 4;
+        const int32_t aligned_rhs_cols = remainder != 0 ? rhs_cols + 4 - remainder : rhs_cols;
+        /**
+         * Use Im2col to speed up conv2d calculations.
+         * Use as a ping-pong buffer for unordered elements.
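+         * Layout: buffer_a starts with space for two s16 rows of
+         * aligned_rhs_cols elements each; the raw s8 im2col data is gathered
+         * just past the first s16 row and widened in place. The s16
+         * destination always trails the s8 source, so the overlap is safe.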
+         */
+        int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2;
+        int16_t *im2col_buf_start_s16 = buffer_a;
+        int8_t *out = output_data;
+        int32_t lhs_rows = 0;
+        /* This part implements the im2col function. Note that the im2col
+         * rows are gathered in (kernel_x, kernel_y, channel) order, and
+         * out-of-range (padded) positions are not guarded, so the filter
+         * layout must match this ordering and every sampled position must
+         * fall inside the input.
+         */
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+        {
+            const int32_t base_idx_x = stride_x * i_out_x - pad_x;
+            for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+            {
+                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
+                for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+                {
+                    int32_t k_x = base_idx_x + dilation_x * i_ker_x;
+                    int32_t k_y = base_idx_y - dilation_y;
+                    for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
+                    {
+                        k_y += dilation_y;
+                        arm_memcpy_s8(im2col_buf,
+                                      input_data + (k_y * input_x + k_x) * input_ch,
+                                      input_ch);
+                        im2col_buf += input_ch;
+                    }
+                }
+                lhs_rows++;
+                /* Extend the input data from int8 to int16, and add the offset. */
+                arm_q7_to_q15_with_offset(im2col_buf - rhs_cols,
+                                          im2col_buf_start_s16,
+                                          rhs_cols,
+                                          (int16_t)input_offset);
+                im2col_buf_start_s16 += aligned_rhs_cols;
+                if (lhs_rows == 2)
+                {
+                    out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
+                                                        buffer_a,
+                                                        output_ch,
+                                                        output_shift,
+                                                        output_mult,
+                                                        out_offset,
+                                                        out_activation_min,
+                                                        out_activation_max,
+                                                        rhs_cols,
+                                                        aligned_rhs_cols,
+                                                        bias_data,
+                                                        out);
+                    /* counter reset */
+                    im2col_buf_start_s16 = buffer_a;
+                    im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1);
+                    lhs_rows = 0;
+                }
+            }
+        }
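+        /* At most one im2col row can be left over here; compute it with a
+         * plain 8-wide dot product per output channel instead of the
+         * 2x4 matrix-multiplication kernel.
+         */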
+        if (lhs_rows != 0)
+        {
+            const int8_t *ker_a = filter_data;
+            int i;
+            for (i = 0; i < output_ch; i++)
+            {
+                /* Load the accumulator with the bias first */
+                uint16_t col_count = rhs_cols / 8;
+                int32_t sum = 0;
+                const int16_t *ip_as_col = buffer_a;
+                int32x4_t res_s32 = vdupq_n_s32(0);
+                if (bias_data)
+                {
+                    sum = bias_data[i];
+                }
+                while (col_count)
+                {
+                    int8x8_t filter_s8 = vld1_s8(ker_a);
+                    int16x8_t input_s16 = vld1q_s16(ip_as_col);
+                    int16x8_t filter_s16 = vmovl_s8(filter_s8);
+                    ker_a += 8;
+                    ip_as_col += 8;
+                    res_s32 = vmlal_s16(res_s32,
+                                        vget_low_s16(input_s16),
+                                        vget_low_s16(filter_s16));
+                    res_s32 = vmlal_s16(res_s32,
+                                        vget_high_s16(input_s16),
+                                        vget_high_s16(filter_s16));
+                    col_count--;
+                }
+                sum += vgetq_lane_s32(res_s32, 0);
+                sum += vgetq_lane_s32(res_s32, 1);
+                sum += vgetq_lane_s32(res_s32, 2);
+                sum += vgetq_lane_s32(res_s32, 3);
+                col_count = rhs_cols % 8;
+                while (col_count)
+                {
+                    int8_t ker_a1 = *ker_a++;
+                    int16_t ip_b1 = *ip_as_col++;
+                    sum += ker_a1 * ip_b1;
+                    col_count--;
+                }
+                sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
+                sum += out_offset;
+                sum = MAX(sum, out_activation_min);
+                sum = MIN(sum, out_activation_max);
+                *out++ = (int8_t)sum;
+            }
+        }
+        /* Advance to the next batch */
+        input_data += (input_x * input_y * input_ch);
+        output_data += (output_x * output_y * output_ch);
+    }
+    /* Return to application */
+    return ARM_CMSIS_NN_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
\ No newline at end of file
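For reference, the value each im2col row produces, as a scalar sketch
(illustrative only: conv_ref_one_pixel is a placeholder, dilation and padding
are left out, and the (kx, ky, c) traversal matches the gather order used
above). The im2col row is the flattened input window with input_offset added,
and each output channel is that row's dot product with one filter row, plus
bias and requantization.

    /* Scalar reference for one output pixel of one output channel. */
    static int32_t conv_ref_one_pixel(const int8_t *input, const int8_t *filter,
                                      int32_t input_x, int32_t input_ch,
                                      int32_t kernel_x, int32_t kernel_y,
                                      int32_t base_x, int32_t base_y,
                                      int32_t input_offset)
    {
        int32_t sum = 0;
        for (int32_t kx = 0; kx < kernel_x; kx++) /* x-major, as gathered */
        {
            for (int32_t ky = 0; ky < kernel_y; ky++)
            {
                const int8_t *pixel = input +
                    ((base_y + ky) * input_x + (base_x + kx)) * input_ch;
                for (int32_t c = 0; c < input_ch; c++)
                {
                    sum += ((int32_t)pixel[c] + input_offset) * (*filter++);
                }
            }
        }
        return sum;
    }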
diff --git a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
new file mode 100644
index 000000000..6cbea4de0
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
@@ -0,0 +1,363 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_mat_mult_kernel_s8_s16.c
+ * Description:  Matrix-multiplication function for convolution
+ *
+ * $Date:        29 May 2023
+ * $Revision:    V.2.0.0
+ *
+ * Target :      Arm(R) M-Profile Architecture
+ * -------------------------------------------------------------------- */
+
+#include <arm_neon.h>
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/*
+ * Matrix-multiplication function for convolution with per-channel requantization.
+ *
+ * Refer to the header file for details.
+ *
+ */
+int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a,
+                                      const int16_t *input_b,
+                                      const uint16_t output_ch,
+                                      const int32_t *out_shift,
+                                      const int32_t *out_mult,
+                                      const int32_t out_offset,
+                                      const int16_t activation_min,
+                                      const int16_t activation_max,
+                                      const int32_t num_col_a,
+                                      const int32_t aligned_num_col_a,
+                                      const int32_t *const output_bias,
+                                      int8_t *out_0)
+{
+    int8_t *out_1 = out_0 + output_ch;
+    const int32_t *bias = output_bias;
+
+    uint16_t row_count = output_ch / 4;
+    const int8_t *ip_a0 = input_a;
+
+    /* This loop processes four rows of A (the filter) per iteration */
+    while (row_count)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        const int8_t *ip_a[4] = {ip_a0,
+                                 ip_a0 + num_col_a,
+                                 ip_a0 + 2 * num_col_a,
+                                 ip_a0 + 3 * num_col_a};
+        int32_t ch_out[4][2] = {0};
+        int32x4_t res[8];
+
+        for (int i = 0; i < 8; i++)
+        {
+            res[i] = vdupq_n_s32(0);
+        }
+
+        /* Init the accumulators with the bias for channels N to N + 3 */
+        if (bias)
+        {
+            for (int i = 0; i < 4; i++)
+            {
+                ch_out[i][0] = *bias;
+                ch_out[i][1] = *bias++;
+            }
+        }
+
+        /**
+         * Each iteration reads eight int8 values from four filter rows and
+         * eight int16 values from two input rows. First, the filter data is
+         * widened to int16, and then cross-multiplied to obtain eight
+         * partial results.
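+         * Eight s32 accumulators (res[0..7]) form the 4x2 tile:
+         * res[2 * i] accumulates filter row i against input row 0, and
+         * res[2 * i + 1] accumulates filter row i against input row 1.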
+         */
+        while (col_count)
+        {
+            int8x8_t filter_s8[4];
+            int16x8_t input_s16[2];
+            int16x8_t filter_s16[4];
+
+            input_s16[0] = vld1q_s16(ip_b0);
+            ip_b0 += 8;
+            input_s16[1] = vld1q_s16(ip_b1);
+            ip_b1 += 8;
+
+            for (int i = 0; i < 4; i++)
+            {
+                filter_s8[i] = vld1_s8(ip_a[i]);
+                ip_a[i] += 8;
+                filter_s16[i] = vmovl_s8(filter_s8[i]);
+                res[i * 2] = vmlal_s16(res[i * 2],
+                                       vget_low_s16(filter_s16[i]),
+                                       vget_low_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[1]));
+                res[i * 2] = vmlal_s16(res[i * 2],
+                                       vget_high_s16(filter_s16[i]),
+                                       vget_high_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[1]));
+            }
+
+            col_count--;
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+            }
+        }
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int16_t b0 = *ip_b0++;
+            int16_t b1 = *ip_b1++;
+
+            for (int i = 0; i < 4; i++)
+            {
+                int8_t filter_remaining = *(ip_a[i]++);
+                ch_out[i][0] += filter_remaining * b0;
+                ch_out[i][1] += filter_remaining * b1;
+            }
+            col_count--;
+        } /* while over col_count */
+
+        for (int i = 0; i < 4; i++)
+        {
+            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, *out_shift);
+            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, *out_shift);
+            ch_out[i][0] += out_offset;
+            ch_out[i][1] += out_offset;
+            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+            *out_0++ = (int8_t)ch_out[i][0];
+            *out_1++ = (int8_t)ch_out[i][1];
+            out_mult++;
+            out_shift++;
+        }
+
+        /* advance to the next group of four filter rows */
+        ip_a0 = ip_a[3];
+        row_count--;
+    }
+    row_count = output_ch % 4;
+    if (row_count >= 2)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int8_t *ip_a1 = ip_a0 + num_col_a;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        int32_t ch_out[2][2] = {0};
+        int32x4_t res[4];
+
+        /* Init the accumulators with the bias for channels N and N + 1 */
+        if (bias)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                ch_out[i][0] = *bias;
+                ch_out[i][1] = *bias++;
+            }
+        }
+
+        for (int i = 0; i < 4; i++)
+        {
+            res[i] = vdupq_n_s32(0);
+        }
+
+        /**
+         * Each iteration reads eight int8 values from two filter rows and
+         * eight int16 values from two input rows. First, the filter data is
+         * widened to int16, and then cross-multiplied to obtain four
+         * partial results.
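+         * Four s32 accumulators (res[0..3]) form the 2x2 tile, in the same
+         * res[2 * i + j] layout as above.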
+         */
+        while (col_count)
+        {
+            int8x8_t filter_s8[2];
+            int16x8_t input_s16[2];
+            int16x8_t filter_s16[2];
+
+            filter_s8[0] = vld1_s8(ip_a0);
+            ip_a0 += 8;
+            filter_s8[1] = vld1_s8(ip_a1);
+            ip_a1 += 8;
+
+            input_s16[0] = vld1q_s16(ip_b0);
+            ip_b0 += 8;
+            input_s16[1] = vld1q_s16(ip_b1);
+            ip_b1 += 8;
+
+            for (int i = 0; i < 2; i++)
+            {
+                filter_s16[i] = vmovl_s8(filter_s8[i]);
+                res[i * 2] = vmlal_s16(res[i * 2],
+                                       vget_low_s16(filter_s16[i]),
+                                       vget_low_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[1]));
+                res[i * 2] = vmlal_s16(res[i * 2],
+                                       vget_high_s16(filter_s16[i]),
+                                       vget_high_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[1]));
+            }
+
+            col_count--;
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+            }
+        }
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int8_t a0 = *ip_a0++;  /* filter */
+            int8_t a1 = *ip_a1++;
+            int16_t b0 = *ip_b0++; /* input */
+            int16_t b1 = *ip_b1++;
+
+            ch_out[0][0] += a0 * b0;
+            ch_out[0][1] += a0 * b1;
+            ch_out[1][0] += a1 * b0;
+            ch_out[1][1] += a1 * b1;
+            col_count--;
+        } /* while over col_count */
+
+        for (int i = 0; i < 2; i++)
+        {
+            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, *out_shift);
+            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, *out_shift);
+            ch_out[i][0] += out_offset;
+            ch_out[i][1] += out_offset;
+            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+            *out_0++ = (int8_t)ch_out[i][0];
+            *out_1++ = (int8_t)ch_out[i][1];
+            out_mult++;
+            out_shift++;
+        }
+
+        /* skip the second row */
+        ip_a0 += num_col_a;
+        row_count -= 2;
+    }
+
+    /* compute the last odd numbered row if any */
+    if (output_ch & 0x1)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        int32_t ch_out[2] = {0};
+        int32x4_t res[2];
+
+        /* load the bias */
+        if (bias)
+        {
+            ch_out[0] = *bias;
+            ch_out[1] = *bias++;
+        }
+
+        res[0] = vdupq_n_s32(0);
+        res[1] = vdupq_n_s32(0);
+
+        while (col_count)
+        {
+            int8x8_t filter_s8 = vld1_s8(ip_a0);
+            int16x8_t filter_s16 = vmovl_s8(filter_s8);
+            int16x8_t input_0_s16 = vld1q_s16(ip_b0);
+            int16x8_t input_1_s16 = vld1q_s16(ip_b1);
+            ip_a0 += 8;
+            ip_b0 += 8;
+            ip_b1 += 8;
+            res[0] = vmlal_s16(res[0],
+                               vget_low_s16(filter_s16),
+                               vget_low_s16(input_0_s16));
+            res[1] = vmlal_s16(res[1],
+                               vget_low_s16(filter_s16),
+                               vget_low_s16(input_1_s16));
+            res[0] = vmlal_s16(res[0],
+                               vget_high_s16(filter_s16),
+                               vget_high_s16(input_0_s16));
+            res[1] = vmlal_s16(res[1],
+                               vget_high_s16(filter_s16),
+                               vget_high_s16(input_1_s16));
+            col_count--;
+        }
+
+        /* Neon lane indices must be compile-time constants */
+        ch_out[0] += vgetq_lane_s32(res[0], 0);
+        ch_out[0] += vgetq_lane_s32(res[0], 1);
+        ch_out[0] += vgetq_lane_s32(res[0], 2);
+        ch_out[0] += vgetq_lane_s32(res[0], 3);
+        ch_out[1] += vgetq_lane_s32(res[1], 0);
+        ch_out[1] += vgetq_lane_s32(res[1], 1);
+        ch_out[1] += vgetq_lane_s32(res[1], 2);
+        ch_out[1] += vgetq_lane_s32(res[1], 3);
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int8_t a0 = *ip_a0++;
+            int16_t b0 = *ip_b0++;
+            int16_t b1 = *ip_b1++;
+
+            ch_out[0] += a0 * b0;
+            ch_out[1] += a0 * b1;
+            col_count--;
+        }
+
+        ch_out[0] = arm_nn_requantize(ch_out[0], *out_mult, *out_shift);
+        ch_out[0] += out_offset;
+        ch_out[0] = MAX(ch_out[0], activation_min);
+        ch_out[0] = MIN(ch_out[0], activation_max);
+        *out_0++ = (int8_t)ch_out[0];
+
+        ch_out[1] = arm_nn_requantize(ch_out[1], *out_mult, *out_shift);
+        ch_out[1] += out_offset;
+        ch_out[1] = MAX(ch_out[1], activation_min);
+        ch_out[1] = MIN(ch_out[1], activation_max);
+        *out_1++ = (int8_t)ch_out[1];
+
+        out_mult++;
+        out_shift++;
+    }
+
+    out_0 += output_ch;
+
+    /* return the new output pointer with offset */
+    return out_0;
+}
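As used by arm_convolve_s8 above, each call to this kernel consumes two
widened im2col rows and writes a 2 x output_ch block, returning the output
pointer advanced past both rows. A sketch of the calling pattern (assuming an
even number of output pixels; the variable names come from arm_convolve_s8):

    int16_t *rows = buffer_a; /* two s16 rows of aligned_rhs_cols each */
    int8_t *out = output_data;

    for (int32_t pair = 0; pair < (output_x * output_y) / 2; pair++)
    {
        /* ... gather and widen two im2col rows into 'rows' ... */
        out = arm_nn_mat_mult_kernel_s8_s16(filter_data, rows, output_ch,
                                            output_shift, output_mult,
                                            out_offset, out_activation_min,
                                            out_activation_max, rhs_cols,
                                            aligned_rhs_cols, bias_data, out);
    }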
diff --git a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
new file mode 100644
index 000000000..b7156e473
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup supportConversion
+ * @{
+ */
+
+void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, int32_t block_size, int16_t offset)
+{
+    int32_t block_cnt;
+
+    block_cnt = block_size / 8;
+    int16x8_t offset_s16 = vdupq_n_s16(offset);
+    while (block_cnt)
+    {
+        int8x8_t src_s8 = vld1_s8(src);
+        int16x8_t src_s16 = vmovl_s8(src_s8);
+        src += 8;
+        src_s16 = vaddq_s16(offset_s16, src_s16);
+        vst1q_s16(dst, src_s16);
+        dst += 8;
+        block_cnt--;
+    }
+
+    block_cnt = block_size % 8;
+    while (block_cnt > 0)
+    {
+        *dst++ = (int16_t)*src++ + offset;
+
+        /* Decrement the loop counter */
+        block_cnt--;
+    }
+}
+
+/**
+ * @} end of Doxygen group
+ */
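The conversion above behaves like this scalar loop (reference sketch;
q7_to_q15_ref is a placeholder name), with the Neon path handling eight
elements per iteration:

    /* Scalar equivalent of arm_q7_to_q15_with_offset. */
    static void q7_to_q15_ref(const int8_t *src, int16_t *dst,
                              int32_t block_size, int16_t offset)
    {
        for (int32_t i = 0; i < block_size; i++)
        {
            dst[i] = (int16_t)(src[i] + offset);
        }
    }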