The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=fb197a4f7751bb4e116989e57ba7fb12a981895f

commit fb197a4f7751bb4e116989e57ba7fb12a981895f
Author:     Robert Clausecker <[email protected]>
AuthorDate: 2023-12-06 10:05:47 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2023-12-25 14:00:05 +0000

    lib/libc/amd64/string: add memrchr() scalar, baseline implementation
    
    The scalar implementation is fairly simplistic and only performs
    slightly better than the generic C implementation. It could be
    improved by using the same algorithm as for memchr(), but that
    would have made the code a lot more complicated.
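
    For reference, a plain C version with the same semantics looks
    like the sketch below (illustrative only; the helper name
    memrchr_ref is made up here and this is not the actual generic
    implementation):

        #include <stddef.h>

        void *
        memrchr_ref(const void *s, int c, size_t n)
        {
                const unsigned char *p = s;

                /* scan backwards, return pointer to the last match */
                while (n-- > 0)
                        if (p[n] == (unsigned char)c)
                                return ((void *)(p + n));

                return (NULL);
        }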
    
    The baseline implementation is similar to timingsafe_memcmp().  It's
    slightly slower than memchr() due to the more complicated main
    loop, but I don't think that can be significantly improved.
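
    In outline, the vector strategy resembles the following SSE2
    sketch (illustrative only: the name memrchr_sse2_sketch is made
    up, unaligned loads and a scalar tail stand in for the alignment
    and masking tricks the assembly actually uses, and fls() is from
    <strings.h>):

        #include <emmintrin.h>
        #include <stddef.h>
        #include <strings.h>    /* fls() */

        void *
        memrchr_sse2_sketch(const void *s, int c, size_t n)
        {
                const unsigned char *p = s, *match = NULL;
                __m128i cc = _mm_set1_epi8((char)c);
                int mask = 0;
                size_t i;

                /* remember the last 16-byte chunk with a match */
                for (i = 0; i + 16 <= n; i += 16) {
                        int m = _mm_movemask_epi8(_mm_cmpeq_epi8(
                            _mm_loadu_si128((const __m128i *)(p + i)), cc));

                        if (m != 0) {
                                match = p + i;
                                mask = m;
                        }
                }

                for (; i < n; i++)      /* scalar tail */
                        if (p[i] == (unsigned char)c) {
                                match = p + i;
                                mask = 1;
                        }

                if (match == NULL)
                        return (NULL);

                /* highest set bit locates the last match in the chunk */
                return ((void *)(match + fls(mask) - 1));
        }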
    
    Tested by:      developers@, exp-run
    Approved by:    mjg
    MFC after:      1 month
    MFC to:         stable/14
    PR:             275785
    Differential Revision:  https://reviews.freebsd.org/D42925
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/memrchr.S    | 166 +++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index a14e8a768f01..b1369841bc74 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -6,6 +6,7 @@ MDSRCS+= \
        memccpy.S \
        memcpy.S \
        memmove.S \
+       memrchr.S \
        memset.S \
        stpcpy.S \
        stpncpy.S \
diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S
new file mode 100644
index 000000000000..4f6c5a238daa
--- /dev/null
+++ b/lib/libc/amd64/string/memrchr.S
@@ -0,0 +1,166 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define        ALIGN_TEXT      .p2align 4, 0x90
+
+ARCHFUNCS(memrchr)
+       ARCHFUNC(memrchr, scalar)
+       ARCHFUNC(memrchr, baseline)
+ENDARCHFUNCS(memrchr)
+
+ARCHENTRY(memrchr, scalar)
+       xor     %eax, %eax              # prospective return value
+       sub     $4, %rdx                # 4 bytes left to process?
+       jb      1f
+
+       ALIGN_TEXT
+0:     xor     %r8, %r8
+       lea     2(%rdi), %r10
+       cmp     %sil, 2(%rdi)
+       cmovne  %r8, %r10               # point to null if no match
+
+       cmp     %sil, (%rdi)
+       cmove   %rdi, %r8               # point to first char if match
+
+       lea     1(%rdi), %r9
+       cmp     %sil, 1(%rdi)
+       cmovne  %r8, %r9                # point to first result if no match in second
+
+       lea     3(%rdi), %r11
+       cmp     %sil, 3(%rdi)
+       cmovne  %r10, %r11
+
+       test    %r11, %r11
+       cmovz   %r9, %r11               # take first pair match if none in second
+
+       test    %r11, %r11
+       cmovnz  %r11, %rax              # take match in current set if any
+
+       add     $4, %rdi
+       sub     $4, %rdx
+       jae     0b
+
+1:     cmp     $-3, %edx               # at least one character left to process?
+       jb      2f
+
+       cmp     %sil, (%rdi)
+       cmove   %rdi, %rax
+
+       lea     1(%rdi), %rcx
+       cmp     $-2, %edx               # at least two characters left to process?
+       jb      2f
+
+       cmp     %sil, 1(%rdi)
+       cmove   %rcx, %rax
+
+       lea     2(%rdi), %rcx
+       cmp     $-1, %edx               # at least three characters left to process?
+       jb      2f
+
+       cmp     %sil, 2(%rdi)
+       cmove   %rcx, %rax
+
+2:     ret
+ARCHEND(memrchr, scalar)
+
+ARCHENTRY(memrchr, baseline)
+       movd            %esi, %xmm4
+       test            %rdx, %rdx              # empty buffer?
+       jz              .L0                     # if yes, return immediately
+
+       punpcklbw       %xmm4, %xmm4            # c -> cc
+       mov             %edi, %ecx
+       punpcklwd       %xmm4, %xmm4            # cc -> cccc
+       and             $~0xf, %rdi             # align source pointer
+       pshufd          $0, %xmm4, %xmm4        # cccc -> cccccccccccccccc
+       and             $0xf, %ecx
+       movdqa          %xmm4, %xmm0
+       mov             $-1, %r8d
+       pcmpeqb         (%rdi), %xmm0           # compare aligned head
+       shl             %cl, %r8d               # mask of bytes in the head of the buffer
+       pmovmskb        %xmm0, %eax
+
+       sub             $16, %rcx
+       and             %r8d, %eax              # match mask
+       add             %rcx, %rdx              # advance past head
+       cmc
+       jbe             .Lrunt                  # did the string end in the buffer?
+
+       mov             %rdi, %rsi              # pointer to matching chunk
+       add             $16, %rdi
+       sub             $16, %rdx               # enough left for another round?
+       jbe             1f
+
+       /* main loop unrolled twice */
+       ALIGN_TEXT
+0:     movdqa          %xmm4, %xmm0
+       pcmpeqb         (%rdi), %xmm0
+       pmovmskb        %xmm0, %r8d
+
+       cmp             $16, %rdx               # enough left for second chunk?
+       jbe             2f
+
+       movdqa          %xmm4, %xmm0
+       pcmpeqb         16(%rdi), %xmm0
+       pmovmskb        %xmm0, %ecx
+
+       lea             16(%rdi), %r9
+       test            %ecx, %ecx              # match found in second chunk?
+       cmovz           %r8d, %ecx              # if not, use match data from first chunk
+       cmovz           %rdi, %r9
+
+       test            %ecx, %ecx              # any match found?
+       cmovnz          %ecx, %eax              # if yes, overwrite previously found match
+       cmovnz          %r9, %rsi
+
+       add             $32, %rdi               # advance to next iteration
+       sub             $32, %rdx               # advance to next chunks
+       ja              0b
+
+       /* process remaining 1--16 bytes */
+1:     pcmpeqb         (%rdi), %xmm4
+       mov             $0xffff, %r8d
+       xor             %ecx, %ecx
+       sub             %edx, %ecx              # number of bytes to be masked out
+       pmovmskb        %xmm4, %r9d
+       shr             %cl, %r8d               # mask of bytes to be kept in the buffer
+       and             %r9d, %r8d
+       cmovnz          %r8d, %eax
+       cmovnz          %rdi, %rsi
+       bsr             %eax, %eax
+       lea             (%rsi, %rax, 1), %rsi   # pointer to match (or junk)
+       cmovnz          %rsi, %rax              # if any match was found, return it
+       ret
+
+       /* end of chunk reached within first half iteration */
+2:     test            %r8d, %r8d              # match in previous chunk?
+       cmovnz          %r8d, %eax              # if yes, overwrite previously found match
+       cmovnz          %rdi, %rsi
+       add             $16, %rdi               # point to tail
+       sub             $16, %edx
+       jmp             1b                      # handle tail the same otherwise
+
+       /* runt: string ends within head, edx has negated amount of invalid head bytes */
+.Lrunt:        mov             $0xffff, %r8d
+       xor             %ecx, %ecx
+       sub             %edx, %ecx
+       shr             %cl, %r8d
+       and             %r8d, %eax
+       bsr             %eax, %eax
+       lea             (%rdi, %rax, 1), %rdi
+       cmovnz          %rdi, %rax
+       ret
+
+       /* empty buffer: return a null pointer */
+.L0:   xor             %eax, %eax
+       ret
+ARCHEND(memrchr, baseline)
+
+       .section        .note.GNU-stack, "", %progbits
