The branch main has been updated by fuz:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=fc0e38a7a67a6d43095efb00cf19ee5f95dcf710

commit fc0e38a7a67a6d43095efb00cf19ee5f95dcf710
Author:     Robert Clausecker <[email protected]>
AuthorDate: 2023-12-02 12:28:05 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2023-12-25 13:59:42 +0000

    lib/libc/amd64/string: add memccpy scalar, baseline implementation
    
    Based on the strlcpy code from D42863, this patch adds a SIMD-enhanced
    implementation of memccpy for amd64. A scalar implementation calling
    into memchr and memcpy to do the job is provided, too.
    
    Please note that this code does not behave exactly the same as the C
    implementation of memccpy for overlapping inputs. However, overlapping
    inputs are not allowed for this function by ISO/IEC 9899:1999 and neither
    has the C implementation any code to deal with the possibility. It just
    proceeds byte-by-byte, which may or may not do the expected thing for
    some overlaps. We do not document whether overlapping inputs are
    supported in memccpy(3).
    
    Tested by:      developers@, exp-run
    Approved by:    mjg
    MFC after:      1 month
    MFC to:         stable/14
    PR:             275785
    Differential Revision:  https://reviews.freebsd.org/D42902
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/memccpy.S    | 259 +++++++++++++++++++++++++++++++++++++
 2 files changed, 260 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc 
b/lib/libc/amd64/string/Makefile.inc
index 2b1e276cb3da..b569d2cb8be8 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -3,6 +3,7 @@ MDSRCS+= \
        bcmp.S \
        memchr.S \
        memcmp.S \
+       memccpy.S \
        memcpy.S \
        memmove.S \
        memset.S \
diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S
new file mode 100644
index 000000000000..a2d9e33b3d36
--- /dev/null
+++ b/lib/libc/amd64/string/memccpy.S
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <[email protected]>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT     .p2align 4, 0x90
+
+       .weak memccpy
+       .set memccpy, __memccpy
+ARCHFUNCS(__memccpy)
+       ARCHFUNC(__memccpy, scalar)
+       ARCHFUNC(__memccpy, baseline)
+ENDARCHFUNCS(__memccpy)
+
+ARCHENTRY(__memccpy, scalar)
+       push    %rbp                    # establish stack frame
+       mov     %rsp, %rbp
+       push    %rax                    # dummy push for alignment
+       push    %rbx
+       push    %rdi
+       push    %rsi
+
+       mov     %rsi, %rdi
+       mov     %edx, %esi
+       mov     %rcx, %rdx
+       mov     %rcx, %rbx
+       call    CNAME(__memchr)         # ptr = memchr(src, c, len)
+
+       pop     %rsi
+       pop     %rdi
+       lea     1(%rax), %rdx
+       sub     %rsi, %rdx              # size = ptr - src + 1
+       mov     %rbx, %rcx
+       lea     (%rdi, %rdx, 1), %rbx   # res = dest + size
+       test    %rax, %rax              # if (ptr == NULL)
+       cmovz   %rcx, %rdx              # size = len
+       cmovz   %rax, %rbx              # res = NULL
+       call    CNAME(memcpy)
+
+       mov     %rbx, %rax              # return (res)
+       pop     %rbx
+       leave
+       ret
+ARCHEND(__memccpy, scalar)
+
+ARCHENTRY(__memccpy, baseline)
+       sub             $1, %rcx                # RCX refers to last character 
in buffer
+       jb              .L0                     # go to special code path if 
len was 0
+
+       movd            %edx, %xmm4
+       mov             %rcx, %rdx
+       punpcklbw       %xmm4, %xmm4            # c -> cc
+       mov             %esi, %ecx
+       punpcklwd       %xmm4, %xmm4            # cc -> cccc
+       mov             %rsi, %r9               # stash a copy of the source 
pointer for later
+       pshufd          $0, %xmm4, %xmm4        # cccc -> cccccccccccccccc
+       and             $~0xf, %rsi
+       movdqa          %xmm4, %xmm1
+       pcmpeqb         (%rsi), %xmm1           # NUL found in head?
+       mov             $-1, %r8d
+       and             $0xf, %ecx
+       shl             %cl, %r8d               # mask of bytes in the string
+       pmovmskb        %xmm1, %eax
+       and             %r8d, %eax
+       jnz             .Lhead_nul
+
+       movdqa          16(%rsi), %xmm3         # load second string chunk
+       movdqu          (%r9), %xmm2            # load unaligned string head
+       mov             $32, %r8d
+       sub             %ecx, %r8d              # head length + length of 
second chunk
+       movdqa          %xmm4, %xmm1
+       pcmpeqb         %xmm3, %xmm1            # NUL found in second chunk?
+
+       sub             %r8, %rdx               # enough space left for the 
second chunk?
+       jb              .Lhead_buf_end
+
+       /* process second chunk */
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax
+       jnz             .Lsecond_nul
+
+       /* string didn't end in second chunk and neither did buffer -- not a 
runt! */
+       movdqa          32(%rsi), %xmm0         # load next string chunk
+       movdqa          %xmm4, %xmm1
+       movdqu          %xmm2, (%rdi)           # deposit head into buffer
+       sub             %rcx, %rdi              # adjust RDI to correspond to 
RSI
+       movdqu          %xmm3, 16(%rdi)         # deposit second chunk
+       sub             %rsi, %rdi              # express RDI as distance from 
RSI
+       add             $32, %rsi               # advance RSI past first two 
chunks
+       sub             $16, %rdx               # enough left for another round?
+       jb              1f
+
+       /* main loop unrolled twice */
+       ALIGN_TEXT
+0:     pcmpeqb         %xmm0, %xmm1            # NUL byte encountered?
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax
+       jnz             3f
+
+       movdqu          %xmm0, (%rsi, %rdi)
+       movdqa          16(%rsi), %xmm0         # load next string chunk
+       movdqa          %xmm4, %xmm1
+       cmp             $16, %rdx               # more than a full chunk left?
+       jb              2f
+
+       add             $32, %rsi               # advance pointers to next chunk
+       pcmpeqb         %xmm0, %xmm1            # NUL byte encountered?
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax
+       jnz             4f
+
+       movdqu          %xmm0, -16(%rsi, %rdi)
+       movdqa          (%rsi), %xmm0           # load next string chunk
+       movdqa          %xmm4, %xmm1
+       sub             $32, %rdx
+       jae             0b
+
+1:     sub             $16, %rsi               # undo second advancement
+       add             $16, %edx
+
+       /* 1--16 bytes left in the buffer but string has not ended yet */
+2:     pcmpeqb         %xmm1, %xmm0            # NUL byte encountered?
+       pmovmskb        %xmm0, %r8d
+       mov             %r8d, %ecx
+       bts             %edx, %r8d              # treat end of buffer as end of 
string
+       or              $0x10000, %eax          # ensure TZCNT finds a set bit
+       tzcnt           %r8d, %r8d              # find tail length
+       add             %rsi, %rdi              # restore RDI
+       movdqu          1(%rsi, %r8, 1), %xmm0  # load string tail
+       movdqu          %xmm0, 1(%rdi, %r8, 1)  # store string tail
+       lea             17(%rdi, %r8, 1), %rsi  # return value if terminator 
encountered
+       xor             %eax, %eax              # return value if no terminator 
encountered
+       bt              %r8d, %ecx              # terminator encountered inside 
buffer?
+       cmovc           %rsi, %rax              # if yes, return pointer, else 
NULL
+       ret
+
+4:     sub             $16, %rsi               # undo second advancement
+       add             $16, %rdx               # restore number of remaining 
bytes
+
+       /* string has ended but buffer has not */
+3:     tzcnt           %eax, %eax              # find length of string tail
+       movdqu          -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL)
+       add             %rsi, %rdi              # restore destination pointer
+       movdqu          %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. 
NUL)
+       lea             1(%rdi, %rax, 1), %rax  # compute return value
+       ret
+
+.Lhead_buf_end:
+       pmovmskb        %xmm1, %r8d
+       add             $32, %edx               # restore edx to (len-1) + ecx
+       shl             $16, %r8d               # place 2nd chunk NUL mask into 
bits 16--31
+       mov             %r8d, %r10d
+       bts             %rdx, %r8               # treat end of buffer as if 
terminator present
+       xor             %eax, %eax              # return value if terminator 
not found
+       tzcnt           %r8, %rdx               # find string/buffer len from 
alignment boundary
+       lea             1(%rdi, %rdx, 1), %r8   # return value if terminator 
found + rcx
+       sub             %rcx, %r8               # subtract rcx
+       bt              %rdx, %r10              # was the terminator present?
+       cmovc           %r8, %rax               # if yes, return pointer, else 
NULL
+       sub             %ecx, %edx              # find actual string/buffer len
+       jmp             .L0132
+
+.Lsecond_nul:
+       add             %r8, %rdx               # restore buffer length
+       tzcnt           %eax, %r8d              # where is the NUL byte?
+       lea             -16(%rcx), %eax
+       sub             %eax, %r8d              # string length
+       lea             1(%rdi, %r8, 1), %rax   # return value if NUL before 
end of buffer
+       xor             %ecx, %ecx              # return value if not
+       cmp             %r8, %rdx               # is the string shorter than 
the buffer?
+       cmova           %r8, %rdx               # copy only min(buflen, srclen) 
bytes
+       cmovb           %rcx, %rax              # return NUL if buffer ended 
before string
+.L0132:        cmp             $16, %rdx               # at least 17 bytes to 
copy (not incl NUL)?
+       jb              .L0116
+
+       /* copy 17--32 bytes */
+       movdqu          (%r9), %xmm0            # load first 16 bytes
+       movdqu          -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
+       movdqu          %xmm0, (%rdi)
+       movdqu          %xmm1, -15(%rdi, %rdx, 1)
+       ret
+
+.Lhead_nul:
+       tzcnt           %eax, %r8d              # where is the NUL byte?
+       sub             %ecx, %r8d              # ... from the beginning of the 
string?
+       lea             1(%rdi, %r8, 1), %rax   # return value if NUL before 
end of buffer
+       xor             %ecx, %ecx              # return value if not
+       cmp             %r8, %rdx               # is the string shorter than 
the buffer?
+       cmova           %r8, %rdx               # copy only min(buflen, srclen) 
bytes
+       cmovb           %rcx, %rax              # return NUL if buffer ended 
before string
+
+       /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: 
srclen) */
+.L0116:        cmp             $8, %rdx                # at least 9 bytes to 
copy?
+       jae             .L0916
+
+       cmp             $4, %rdx                # at least 5 bytes to copy?
+       jae             .L0508
+
+       cmp             $2, %rdx                # at least 3 bytes to copy?
+       jae             .L0304
+
+       /* copy one or two bytes */
+       movzbl          (%r9), %ecx             # load first byte from src
+       movzbl          (%r9, %rdx, 1), %esi    # load last byte from src
+       mov             %cl, (%rdi)             # deposit into destination
+       mov             %sil, (%rdi, %rdx, 1)
+       ret
+
+.L0304:        movzwl          (%r9), %ecx
+       movzwl          -1(%r9, %rdx, 1), %esi
+       mov             %cx, (%rdi)
+       mov             %si, -1(%rdi, %rdx, 1)
+       ret
+
+.L0508:        mov             (%r9), %ecx
+       mov             -3(%r9, %rdx, 1), %esi
+       mov             %ecx, (%rdi)
+       mov             %esi, -3(%rdi, %rdx, 1)
+       ret
+
+.L0916:        mov             (%r9), %rcx
+       mov             -7(%r9, %rdx, 1), %rsi
+       mov             %rcx, (%rdi)
+       mov             %rsi, -7(%rdi, %rdx, 1)
+       ret
+
+       /* length zero destination: return null pointer */
+.L0:   xor             %eax, %eax
+       ret
+ARCHEND(__memccpy, baseline)
+
+       .section .note.GNU-stack,"",%progbits

Reply via email to