This is an automated email from the ASF dual-hosted git repository.

pkarashchenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx.git


The following commit(s) were added to refs/heads/master by this push:
     new 57df1ddcbb Add armv7m assembly strcpy.
57df1ddcbb is described below

commit 57df1ddcbb22c1fe25caf1b1c4c7d8556a4df885
Author: XinStellaris <tianx...@xiaomi.com>
AuthorDate: Tue Mar 14 21:02:03 2023 +0800

    Add armv7m assembly strcpy.
    
    Signed-off-by: XinStellaris <tianx...@xiaomi.com>
---
 libs/libc/machine/arm/armv7-m/Kconfig           |   8 +
 libs/libc/machine/arm/armv7-m/Make.defs         |   4 +
 libs/libc/machine/arm/armv7-m/gnu/arch_strcpy.S | 308 ++++++++++++++++++++++++
 3 files changed, 320 insertions(+)

diff --git a/libs/libc/machine/arm/armv7-m/Kconfig b/libs/libc/machine/arm/armv7-m/Kconfig
index 60ac9d4b34..10ab6ee02a 100644
--- a/libs/libc/machine/arm/armv7-m/Kconfig
+++ b/libs/libc/machine/arm/armv7-m/Kconfig
@@ -45,6 +45,14 @@ config ARMV7M_STRCMP
        ---help---
                Enable optimized ARMv7-M specific strcmp() library function
 
+config ARMV7M_STRCPY
+       bool "Enable optimized strcpy() for ARMv7-M"
+       default n
+       select LIBC_ARCH_STRCPY
+       depends on ARCH_TOOLCHAIN_GNU
+       ---help---
+               Enable optimized ARMv7-M specific strcpy() library function
+
 config ARMV7M_STRLEN
        bool "Enable optimized strlen() for ARMv7-M"
        default n
diff --git a/libs/libc/machine/arm/armv7-m/Make.defs b/libs/libc/machine/arm/armv7-m/Make.defs
index 75c9109cd8..e86b896453 100644
--- a/libs/libc/machine/arm/armv7-m/Make.defs
+++ b/libs/libc/machine/arm/armv7-m/Make.defs
@@ -38,6 +38,10 @@ ifeq ($(CONFIG_ARMV7M_STRCMP),y)
 ASRCS += arch_strcmp.S
 endif
 
+ifeq ($(CONFIG_ARMV7M_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
 ifeq ($(CONFIG_ARMV7M_STRLEN),y)
 ASRCS += arch_strlen.S
 endif
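
(Usage note, illustrative only: a board configuration would enable the new routine with, for example,

    CONFIG_ARMV7M_STRCPY=y

in its defconfig.  The new ifeq block above then adds arch_strcpy.S to ASRCS, and the selected LIBC_ARCH_STRCPY should make libc pick this architecture-specific strcpy() instead of the generic C implementation.)
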
diff --git a/libs/libc/machine/arm/armv7-m/gnu/arch_strcpy.S b/libs/libc/machine/arm/armv7-m/gnu/arch_strcpy.S
new file mode 100644
index 0000000000..873279e16a
--- /dev/null
+++ b/libs/libc/machine/arm/armv7-m/gnu/arch_strcpy.S
@@ -0,0 +1,308 @@
+/***************************************************************************
+ * libs/libc/machine/arm/armv7-m/gnu/arch_strcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ***************************************************************************/
+
+/* This strcpy implementation borrows some ideas from arch_strcmp.S. */
+
+/* Parameters and result. */
+#define dst     r0
+#define src     r1
+#define result  r0
+
+/* Internal variables (callee-saved registers) */
+#define tmp1          r4
+#define tmp2          r5
+#define tmp3          r6
+#define src_offset    r7
+
+#ifdef __ARM_BIG_ENDIAN
+#  define MASK_0          0xff000000
+#  define MASK_1          0xff0000
+#  define MASK_2          0xff00
+#  define MASK_3          0xff
+#  define BYTE_0_SHIFT    24
+#  define BYTE_1_SHIFT    16
+#  define BYTE_2_SHIFT    8
+#  define BYTE_3_SHIFT    0
+#else
+#  define MASK_0          0xff
+#  define MASK_1          0xff00
+#  define MASK_2          0xff0000
+#  define MASK_3          0xff000000
+#  define BYTE_0_SHIFT    0
+#  define BYTE_1_SHIFT    8
+#  define BYTE_2_SHIFT    16
+#  define BYTE_3_SHIFT    24
+#endif
+
+
+    .syntax unified
+    .text
+    .align  2
+    .global strcpy
+    .thumb
+    .type   strcpy, %function
+
+strcpy:
+    push    {result, tmp1, tmp2, tmp3, src_offset}
+    eor     tmp1, dst, src
+    tst     tmp1, #3
+    /* If dst and src are not at the same byte offset from a word boundary */
+    bne     .Lstrs_diff_offset
+    /* Same byte offset: get that offset */
+    ands    tmp1, src, #3
+    beq     .Ldst_src_aligned
+    /* Get the number of bytes until src is word aligned */
+    rsb     tmp1, #4
+
+.Lbyte_copy_until_dst_src_aligned:
+    ldrb    tmp2, [src], #1
+    cmp     tmp2, #0
+    beq     .Lcopy_done
+    strb    tmp2, [dst], #1
+    subs    tmp1, #1
+    bne     .Lbyte_copy_until_dst_src_aligned
+
+.Ldst_src_aligned:
+    /* Now dst and src are aligned */
+    ldr     tmp1, [src], #4
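+    /* Word-at-a-time null check: (w - 0x01010101) & ~w & 0x80808080
+     * is non-zero if and only if at least one byte of w is zero.
+     */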
+    sub     tmp2, tmp1, #0x01010101
+    bic     tmp2, tmp1
+    tst     tmp2, #0x80808080
+    /* All zero means no zero byte is detected */
+    it      eq
+    streq   tmp1, [dst], #4
+    beq     .Ldst_src_aligned
+
+    /* There is a zero byte in the word; copy byte by byte until it is reached */
+    sub     src, #4
+.Lbyte_copy_until_zero:
+    ldrb    tmp2, [src], #1
+    cmp     tmp2, #0
+    beq     .Lcopy_done
+    strb    tmp2, [dst], #1
+    b       .Lbyte_copy_until_zero
+
+/* Make dst word aligned so that we never write to memory before dst.
+ * A word store covering bytes in front of dst would have to preserve them,
+ * which would require an atomic read-modify-write and complicate things,
+ * so copy byte by byte until dst is aligned.
+ */
+.Lstrs_diff_offset:
+    ands    tmp1, dst, #3
+    beq     .Ldiff_offset_loop_begin
+    /* Get the number of bytes until dst is word aligned */
+    rsb     tmp1, #4
+
+.Lbyte_copy_until_dst_aligned:
+    ldrb    tmp2, [src], #1
+    cmp     tmp2, #0
+    beq     .Lcopy_done
+    strb    tmp2, [dst], #1
+    subs    tmp1, #1
+    bne     .Lbyte_copy_until_dst_aligned
+
+.Ldiff_offset_loop_begin:
+    /* src_offset mustn't be 0 here */
+    and     src_offset, src, #3
+    lsls    src_offset, #3
+    bic     src, #3
+/* First word logic:
+ * prepend 0xff over the bytes that lie before src so that they can never
+ * look like a terminating zero; only the first word needs this.
+ */
+    ldr     tmp1, [src], #4
+    mov     tmp2, #0xffffffff
+    rsb     tmp3, src_offset, #32
+
+#ifdef __ARM_BIG_ENDIAN
+    lsls    tmp2, tmp3
+#else
+    lsrs    tmp2, tmp3
+#endif
+    orr     tmp1, tmp1, tmp2
+    /* Test if the first word contains zero */
+    sub     tmp3, tmp1, #0x01010101
+    bic     tmp3, tmp1
+    tst     tmp3, #0x80808080
+    /* non-zero means zero byte is detected */
+    bne     .Ltail_copy
+
+    /* before loop, set tmp2=tmp1 to simplify the logic in the loop */
+    mov     tmp2, tmp1
+.Ldiff_offset_loop:
+    mov     tmp1, tmp2
+    ldr     tmp2, [src], #4
+    /* Test if the word contains a zero byte */
+    sub     tmp3, tmp2, #0x01010101
+    bic     tmp3, tmp2
+    tst     tmp3, #0x80808080
+    /* non-zero means zero byte is detected */
+    bne     .Ltail_copy
+    /* Combine the remaining bytes of tmp1 with the leading bytes of tmp2 */
+#ifdef __ARM_BIG_ENDIAN
+    lsls    tmp1, src_offset
+    rsb     tmp3, src_offset, #32
+    lsrs    tmp3, tmp2, tmp3
+    orr     tmp1, tmp1, tmp3
+#else
+    lsrs    tmp1, src_offset
+    rsb     tmp3, src_offset, #32
+    lsls    tmp3, tmp2, tmp3
+    orr     tmp1, tmp1, tmp3
+#endif
+    str     tmp1, [dst], #4
+    b       .Ldiff_offset_loop
+
+.Ltail_copy:
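+    /* Byte-by-byte tail: the bytes of tmp1 from src_offset upwards, and
+     * then the bytes of tmp2, still have to be copied; the terminating
+     * zero byte is known to be among them.
+     */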
+    cmp     src_offset, #24
+    beq     .Loffset_3
+    cmp     src_offset, #16
+    beq     .Loffset_2
+    /* src_offset == 8 here */
+    ands    tmp3, tmp1, MASK_1
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_1_SHIFT
+    strb    tmp3, [dst], #1
+.Loffset_2:
+    ands    tmp3, tmp1, MASK_2
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_2_SHIFT
+    strb    tmp3, [dst], #1
+.Loffset_3:
+    ands    tmp3, tmp1, MASK_3
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_3_SHIFT
+    strb    tmp3, [dst], #1
+    ands    tmp3, tmp2, MASK_0
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_0_SHIFT
+    strb    tmp3, [dst], #1
+    ands    tmp3, tmp2, MASK_1
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_1_SHIFT
+    strb    tmp3, [dst], #1
+    ands    tmp3, tmp2, MASK_2
+    beq     .Lcopy_done
+    lsrs    tmp3, BYTE_2_SHIFT
+    strb    tmp3, [dst], #1
+.Lcopy_done:
+    mov     tmp3, #0
+    strb    tmp3, [dst]
+    pop     {result, tmp1, tmp2, tmp3, src_offset}
+    bx      lr
+
+#if 0
+/* Pseudo code of strcpy when dst and src are not at the same byte offset */
+
+/* Make dst word aligned so that we never write to memory before dst.
+ * A word store covering bytes in front of dst would have to preserve them,
+ * which would require an atomic read-modify-write and complicate things,
+ * so copy byte by byte until dst is aligned.
+ */
+    if ((dst & 3) == 0)
+        goto .diff_offset_loop_begin;
+    ByteCopyUntilDstAligned();
+
+.diff_offset_loop_begin:
+/* src_offset mustn't be 0 here */
+    src_offset = src & 3;
+    src_offset = src_offset * 8;
+    src = src & 0xfffffffc;
+    tmp1 = *src;
+    src += 4;
+/* first word logic
+ * prepend 0xff to make the algorithm simpler
+ * only the first word needs to be prepended
+ */
+    if (src_offset != 0)
+    {
+        tmp2 = 0xffffffff
+#if  big endian
+        tmp2 = tmp2 << (32 - src_offset)
+#else
+        tmp2 = tmp2 >> (32 - src_offset)
+#endif
+        tmp1 |= tmp2
+    }
+    if (HasZeroByte(tmp1))
+    {
+        goto .tail_copy;
+    }
+
+/* before loop, set tmp2=tmp1 to simplify the logic in the loop */
+    tmp2 = tmp1
+.diff_offset_loop:
+    tmp1 = tmp2;
+    tmp2 = *src;
+    src += 4;
+
+ /* double word tail means we have to copy from tmp1 and tmp2 to dst */
+    if (HasZeroByte(tmp2))
+    {
+        goto .tail_copy;
+    }
+/* Now let's fill dst */
+#if  big endian
+    tmp1 = tmp1 << (src_offset);
+    tmp1 |= tmp2 >> (32 - src_offset);
+    *dst = tmp1;
+#else
+    tmp1 = tmp1 >> (src_offset);
+    tmp1 |= tmp2 << (32 - src_offset);
+    *dst = tmp1;
+#endif
+    dst += 4;
+    goto .diff_offset_loop;
+
+/* byte by byte copy at the tail */
+.tail_copy:
+    if (src_offset == 24)
+        goto .offset_3;
+    if (src_offset == 16)
+        goto .offset_2;
+
+/* src_offset mustn't be 0 here */
+/* default: src_offset == 8 */
+    if ((tmp1 & MASK_1) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
+.offset_2:
+    if ((tmp1 & MASK_2) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
+.offset_3:
+    if ((tmp1 & MASK_3) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
+    if ((tmp2 & MASK_0) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
+    if ((tmp2 & MASK_1) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
+    if ((tmp2 & MASK_2) == 0)
+        goto .cpy_done;
+    *dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
+/* tmp2 BYTE3 must be zero here */
+
+.cpy_done:
+    *dst++ = 0;
+#endif  /* Pseudo code end */
+
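
For readers who prefer C, here is a minimal, illustrative sketch of the word-at-a-time idea used above: detect a zero byte in a whole word with (w - 0x01010101) & ~w & 0x80808080, copy word by word until that test fires, then finish byte by byte.  It is not part of the commit; the names has_zero_byte() and word_strcpy() are made up for this sketch, and it only mirrors the "same offset" fast path, not the shifted merge used when dst and src have different offsets.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Non-zero iff some byte of w is 0x00: subtracting 0x01 from every byte
     * sets the top bit of each byte that borrowed, and "& ~w" removes the
     * bytes whose top bit was already set in w.
     */

    static int has_zero_byte(uint32_t w)
    {
      return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
    }

    static char *word_strcpy(char *dst, const char *src)
    {
      char *d = dst;

      /* Copy byte by byte until src is word aligned */

      while (((uintptr_t)src & 3) != 0)
        {
          if ((*d++ = *src++) == '\0')
            {
              return dst;
            }
        }

      /* Copy a word at a time while no byte of the word is zero */

      for (; ; src += 4, d += 4)
        {
          uint32_t w;

          memcpy(&w, src, 4);      /* keeps the sketch portable */
          if (has_zero_byte(w))
            {
              break;
            }

          memcpy(d, &w, 4);
        }

      /* Tail: copy the remaining bytes including the terminator */

      while ((*d++ = *src++) != '\0')
        {
        }

      return dst;
    }

    int main(void)
    {
      char buf[32];

      printf("%s\n", word_strcpy(buf, "hello, armv7-m"));
      return 0;
    }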
