The branch main has been updated by fuz:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=de12a689fad271f5a2ba7c188b0b5fb5cabf48e7

commit de12a689fad271f5a2ba7c188b0b5fb5cabf48e7
Author:     Robert Clausecker <[email protected]>
AuthorDate: 2023-08-24 16:43:40 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2023-09-08 21:22:20 +0000

    lib/libc/amd64/string: add memchr(3) scalar, baseline implementation
    
    This is conceptually similar to strchr(3), but there are
    slight changes to account for the buffer having an explicit
    buffer length.
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    MFC after:      1 week
    MFC to:         stable/14
    Differential Revision:  https://reviews.freebsd.org/D41598
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/memchr.S     | 204 +++++++++++++++++++++++++++++++++++++
 2 files changed, 205 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc 
b/lib/libc/amd64/string/Makefile.inc
index b03811f40062..7c6a1f429590 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -2,6 +2,7 @@
 MDSRCS+= \
        amd64_archlevel.c \
        bcmp.S \
+       memchr.S \
        memcmp.S \
        memcpy.S \
        memmove.S \
diff --git a/lib/libc/amd64/string/memchr.S b/lib/libc/amd64/string/memchr.S
new file mode 100644
index 000000000000..e10bd6c22f90
--- /dev/null
+++ b/lib/libc/amd64/string/memchr.S
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <[email protected]>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+        .weak memchr
+        .set memchr, __memchr
+ARCHFUNCS(__memchr)
+       ARCHFUNC(__memchr, scalar)
+       ARCHFUNC(__memchr, baseline)
+ENDARCHFUNCS(__memchr)
+
+ARCHENTRY(__memchr, scalar)
+       test    %rdx, %rdx      # empty input?
+       je      .Lnomatch
+
+       lea     (, %rdi, 8), %ecx
+       add     %rdi, %rdx      # pointer to end of buffer
+       and     $~7, %rdi       # align to 8 bytes
+       mov     (%rdi), %rax    # load first word
+       movzbl  %sil, %esi      # clear stray high bits
+       movabs  $0x0101010101010101, %r8
+       imul    %r8, %rsi       # replicate char 8 times
+
+       /* compute head and tail masks */
+       mov     %r8, %r10
+       movabs  $0x8080808080808080, %r9
+       shl     %cl, %r10       # 0x01 where string head is
+       lea     (, %rdx, 8), %ecx
+       xor     %r8, %r10       # 0x01 where it is not
+       neg     %r8             # negate 01..01 so we can use lea
+       mov     %r9, %r11
+       xor     %rsi, %rax      # str ^ c (0x00 where str[i] == c)
+       neg     %ecx
+       or      %r10, %rax      # except before the string
+       shr     %cl, %r11       # 0x80 where string tail is
+
+       add     $8, %rdi        # advance to next 8 bytes
+       cmp     %rdx, %rdi      # end of buffer reached during head?
+       jae     .Ltail          # and go to tail-processing code
+
+       /* main loop, unrolled twice */
+       ALIGN_TEXT
+0:     lea     (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+       not     %rax            # ~(str ^ c)
+       and     %r9, %rax       # ((str^c) - 0x01..01) & ~(str^c)
+       and     %rcx, %rax      # not including junk bytes
+       jnz     .Lmatch
+
+       mov     (%rdi), %rax
+       add     $8, %rdi
+       xor     %rsi, %rax      # str ^ c
+       cmp     %rdx, %rdi
+       jae     .Ltail
+
+       lea     (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+       not     %rax            # ~(str ^ c)
+       and     %r9, %rax       # ((str^c) - 0x01..01) & ~(str^c)
+       and     %rcx, %rax      # not including junk bytes
+       jnz     .Lmatch
+
+       mov     (%rdi), %rax
+       add     $8, %rdi
+       xor     %rsi, %rax      # str ^ c
+       cmp     %rdx, %rdi
+       jb      0b
+
+.Ltail:        lea     (%rax, %r8, 1), %rcx # (str ^ c) - 0x01..01
+       not     %rax            # ~(str ^ c)
+       and     %r11, %rax      # ((str^c) - 0x01..01) & ~(str^c)
+       and     %rcx, %rax      # not including junk bytes or bytes past buffer
+       jz      .Lnomatch
+
+.Lmatch:
+       tzcnt   %rax, %rax      # first match
+       shr     $3, %eax        # scale from bit to byte index
+       lea     -8(%rdi, %rax), %rax # pointer to found c
+       ret
+
+       /* no match found */
+.Lnomatch:
+       xor     %eax, %eax      # return null pointer
+       ret
+ARCHEND(__memchr, scalar)
+
+ARCHENTRY(__memchr, baseline)
+       test            %rdx, %rdx              # empty input?
+       je              .Lnomatchb
+
+       movd            %esi, %xmm2
+       mov             %edi, %ecx
+       add             %rdi, %rdx              # pointer to end of buffer
+       and             $~0x1f, %rdi            # align to 32 bytes
+       movdqa          (%rdi), %xmm0           # load first 32 bytes
+       movdqa          16(%rdi), %xmm1
+
+       punpcklbw       %xmm2, %xmm2            # c -> cc
+
+       mov             $-1, %r9d
+       shl             %cl, %r9d               # mask with zeroes before the 
string
+
+       punpcklwd       %xmm2, %xmm2            # cc -> cccc
+
+       mov             $-1, %r8d
+       xor             %ecx, %ecx
+       sub             %edx, %ecx              # edx = -ecx
+       shr             %cl, %r8d               # bytes in tail that are part 
of the buffer
+
+       pshufd          $0, %xmm2, %xmm2        # cccc -> cccccccccccccccc
+
+       add             $32, %rdi               # advance to next 32 bytes
+       mov             $-1, %eax
+       cmp             %rdx, %rdi              # end of buffer reached during 
head?
+       cmovae          %r8d, %eax              # if yes, do combined head/tail 
processing
+       and             %r9d, %eax              # mask of bytes in head part of 
string
+
+       /* process head */
+       pcmpeqb         %xmm2, %xmm1
+       pcmpeqb         %xmm2, %xmm0
+       pmovmskb        %xmm1, %esi
+       pmovmskb        %xmm0, %ecx
+       shl             $16, %esi
+       or              %esi, %ecx              # locations of matches
+       and             %ecx, %eax              # any match inside buffer?
+       jnz             .Lprecisematchb
+
+       cmp             %rdx, %rdi              # did the buffer end here?
+       jae             .Lnomatchb              # if yes we are done
+
+       /* main loop */
+       ALIGN_TEXT
+0:     movdqa          (%rdi), %xmm0           # load next string chunk
+       movdqa          16(%rdi), %xmm1
+       add             $32, %rdi
+       cmp             %rdx, %rdi              # ready for main loop?
+       jae             .Ltailb
+
+       pcmpeqb         %xmm2, %xmm0
+       pcmpeqb         %xmm2, %xmm1
+       por             %xmm1, %xmm0            # match in either half?
+       pmovmskb        %xmm0, %eax
+       test            %eax, %eax
+       jz              0b
+
+.Lmatchb:
+       pcmpeqb         -32(%rdi), %xmm2        # redo comparison of first 16 
bytes
+       pmovmskb        %xmm1, %ecx
+       pmovmskb        %xmm2, %eax
+       shl             $16, %ecx
+       or              %ecx, %eax              # location of matches
+
+.Lprecisematchb:
+       tzcnt           %eax, %eax              # find location of match
+       lea             -32(%rdi, %rax, 1), %rax # point to matching byte
+       ret
+
+.Ltailb:
+       pcmpeqb         %xmm2, %xmm1
+       pcmpeqb         %xmm2, %xmm0
+       pmovmskb        %xmm1, %edx
+       pmovmskb        %xmm0, %eax
+       shl             $16, %edx
+       or              %edx, %eax              # location of matches
+       and             %r8d, %eax              # mask out matches beyond buffer
+       bsf             %eax, %edx              # location of match
+       lea             -32(%rdi, %rdx, 1), %rdx # pointer to match (if any)
+       cmovnz          %rdx, %rax              # point to match if present,
+       ret                                     # else null pointer
+
+.Lnomatchb:
+       xor     %eax, %eax      # return null pointer
+       ret
+ARCHEND(__memchr, baseline)
+
+       .section .note.GNU-stack,"",%progbits

Reply via email to