The branch main has been updated by fuz:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=61f4c4d3dd38c5b79e04fff1dedb02071ebb33f2

commit 61f4c4d3dd38c5b79e04fff1dedb02071ebb33f2
Author:     Robert Clausecker <[email protected]>
AuthorDate: 2023-06-30 14:45:11 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2023-08-06 13:58:27 +0000

    lib/libc/amd64/string: add strchrnul implementations (scalar, baseline)
    
    A lot better than the generic (pre) implementaion.  We do not beat glibc
    for long strings, likely due to glibc switching to AVX once the input is
    sufficiently long.  X86-64-v3 and v4 implementations may be added at a
    future time.
    
    os: FreeBSD
    arch: amd64
    cpu: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
            │ strchrnul_pre.out │         strchrnul_scalar.out         │       
strchrnul_baseline.out        │
            │      sec/op       │    sec/op     vs base                │   
sec/op     vs base                │
    Short          129.68µ ± 3%    59.91µ ± 1%  -53.80% (p=0.000 n=20)   44.37µ 
± 1%  -65.79% (p=0.000 n=20)
    Mid             21.15µ ± 0%    19.30µ ± 0%   -8.76% (p=0.000 n=20)   12.30µ 
± 0%  -41.85% (p=0.000 n=20)
    Long           13.772µ ± 0%   11.028µ ± 0%  -19.92% (p=0.000 n=20)   3.285µ 
± 0%  -76.15% (p=0.000 n=20)
    geomean         33.55µ         23.36µ       -30.37%                  12.15µ 
      -63.80%
    
            │ strchrnul_pre.out │          strchrnul_scalar.out          │      
   strchrnul_baseline.out         │
            │        B/s        │      B/s       vs base                 │      
B/s       vs base                 │
    Short          919.3Mi ± 3%   1989.7Mi ± 1%  +116.45% (p=0.000 n=20)   
2686.8Mi ± 1%  +192.28% (p=0.000 n=20)
    Mid            5.505Gi ± 0%    6.033Gi ± 0%    +9.60% (p=0.000 n=20)    
9.466Gi ± 0%   +71.97% (p=0.000 n=20)
    Long           8.453Gi ± 0%   10.557Gi ± 0%   +24.88% (p=0.000 n=20)   
35.441Gi ± 0%  +319.26% (p=0.000 n=20)
    geomean        3.470Gi         4.983Gi        +43.62%                   
9.584Gi       +176.22%
    
    For comparison, glibc on the same machine:
    
            │ strchrnul_glibc.out │
            │       sec/op        │
    Short             49.73µ ± 0%
    Mid               14.60µ ± 0%
    Long              1.237µ ± 0%
    geomean           9.646µ
    
            │ strchrnul_glibc.out │
            │         B/s         │
    Short            2.341Gi ± 0%
    Mid              7.976Gi ± 0%
    Long             94.14Gi ± 0%
    geomean          12.07Gi
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    Differential Revision: https://reviews.freebsd.org/D41333
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/strchrnul.S  | 167 +++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc 
b/lib/libc/amd64/string/Makefile.inc
index 1bfefa7be98c..b5260cfd78d1 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -8,6 +8,7 @@ MDSRCS+= \
        memmove.S \
        memset.S \
        strcat.S \
+       strchrnul.S \
        strcmp.S \
        strlen.S \
        stpcpy.S
diff --git a/lib/libc/amd64/string/strchrnul.S 
b/lib/libc/amd64/string/strchrnul.S
new file mode 100644
index 000000000000..6d22f31850f8
--- /dev/null
+++ b/lib/libc/amd64/string/strchrnul.S
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <[email protected]>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT     .p2align 4,0x90 # 16-byte alignment, nop-filled
+
+       .weak   strchrnul
+       .set    strchrnul, __strchrnul
+
+ARCHFUNCS(__strchrnul)
+        ARCHFUNC(__strchrnul, scalar)
+       ARCHFUNC(__strchrnul, baseline)
+ENDARCHFUNCS(__strchrnul)
+
+/*
+ * strchrnul(str, c)
+ * This is implemented like strlen(str), but we check for the
+ * presence of both NUL and c in each iteration.
+ */
+ARCHENTRY(__strchrnul, scalar)
+       mov     %edi, %ecx
+       and     $~7, %rdi               # align to 8 byte
+       movzbl  %sil, %esi              # clear stray high bits
+       movabs  $0x0101010101010101, %r8
+       mov     (%rdi), %rax            # load first word
+       imul    %r8, %rsi               # replicate char 8 times
+       movabs  $0x8080808080808080, %r9
+
+       /*
+        * Unaligned input: align to 8 bytes.  Then proceed the same
+        * way as with aligned input, but ignore matches before the
+        * beginning of the string.  This is achieved by shifting r9
+        * into r10 to have 0x00 bytes before the string begins.
+        */
+       shl     $3, %ecx
+       mov     %r9, %r10
+       add     $8, %rdi
+       shl     %cl, %r10               # 0x80 where the string is
+       neg     %r8                     # negate 01..01 so we can use lea
+
+       mov     %rsi, %rcx
+       xor     %rax, %rcx              # str ^ c
+       lea     (%rax, %r8, 1), %rdx    # str - 0x01..01
+       lea     (%rcx, %r8, 1), %r11    # (str ^ c) - 0x01..01
+       not     %rax                    # ~str
+       not     %rcx                    # ~(str ^ c)
+       and     %rdx, %rax              # (str - 0x01..01) & ~str
+       and     %r11, %rcx              # ((str ^ c - 0x01..01) & ~(str ^ c)
+       or      %rcx, %rax              # matches for both
+       and     %r10, %rax              # not including junk bytes or bytes 
before the string
+       jnz     1f
+
+       /* main loop unrolled twice */
+       ALIGN_TEXT
+0:     mov     (%rdi), %rax            # str
+       mov     %rsi, %rcx
+       xor     %rax, %rcx              # str ^ c
+       lea     (%rax, %r8, 1), %rdx    # str - 0x01..01
+       lea     (%rcx, %r8, 1), %r11    # (str ^ c) - 0x01..01
+       not     %rax                    # ~str
+       not     %rcx                    # ~(str ^ c)
+       and     %rdx, %rax              # (str - 0x01..01) & ~str
+       and     %r11, %rcx              # ((str ^ c - 0x01..01) & ~(str ^ c)
+       or      %rcx, %rax              # matches for both
+       and     %r9, %rax               # not including junk bits
+       jnz     2f
+
+       mov     8(%rdi), %rax           # str
+       add     $16, %rdi
+       mov     %rsi, %rcx
+       xor     %rax, %rcx              # str ^ c
+       lea     (%rax, %r8, 1), %rdx    # str - 0x01..01
+       lea     (%rcx, %r8, 1), %r11    # (str ^ c) - 0x01..01
+       not     %rax                    # ~str
+       not     %rcx                    # ~(str ^ c)
+       and     %rdx, %rax              # (str - 0x01..01) & ~str
+       and     %r11, %rcx              # ((str ^ c - 0x01..01) & ~(str ^ c)
+       or      %rcx, %rax              # matches for both
+       and     %r9, %rax               # not including junk bits
+       jz      0b
+
+       /* NUL or c found */
+1:     sub     $8, %rdi                # undo advance past buffer
+2:     tzcnt   %rax, %rax              # first NUL or c byte match
+       shr     $3, %eax                # scale from bit to byte index
+       add     %rdi, %rax              # pointer to found c or NUL
+       ret
+ARCHEND(__strchrnul, scalar)
+
+ARCHENTRY(__strchrnul, baseline)
+       mov             %edi, %ecx
+       and             $~0xf, %rdi             # align to 16 byte
+       movdqa          (%rdi), %xmm1
+       movd            %esi, %xmm0
+       and             $0xf, %ecx              # distance from (%rdi) to start 
of string
+       pxor            %xmm2, %xmm2
+       mov             $-1, %edx
+       punpcklbw       %xmm0, %xmm0            # c -> cc
+       shl             %cl, %edx               # bits corresponding to bytes 
in the string
+       punpcklwd       %xmm0, %xmm0            # cc -> cccc
+       add             $16, %rdi
+
+       /* check for match in head */
+       pcmpeqb         %xmm1, %xmm2            # NUL bytes present?
+       pshufd          $0, %xmm0, %xmm0        # cccc -> cccccccccccccccc
+       pcmpeqb         %xmm0, %xmm1            # c present?
+       por             %xmm2, %xmm1            # either present?
+       pmovmskb        %xmm1, %eax
+       and             %edx, %eax              # match in the string?
+       jnz             1f
+
+       /* main loop unrolled twice */
+       ALIGN_TEXT
+0:     movdqa          (%rdi), %xmm1
+       pxor            %xmm2, %xmm2
+       pcmpeqb         %xmm1, %xmm2            # NUL bytes present?
+       pcmpeqb         %xmm0, %xmm1            # c present?
+       por             %xmm2, %xmm1            # either present?
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax              # match in the string?
+       jnz             2f
+
+       movdqa          16(%rdi), %xmm1
+       add             $32, %rdi
+       pxor            %xmm2, %xmm2
+       pcmpeqb         %xmm1, %xmm2            # NUL bytes present?
+       pcmpeqb         %xmm0, %xmm1            # c present?
+       por             %xmm2, %xmm1            # either present?
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax              # match in the string?
+       jz              0b
+
+1:     sub             $16, %rdi               # undo advance past buffer
+2:     tzcnt           %eax, %eax              # where is the match?
+       add             %rdi, %rax              # pointer to found c or NUL
+       ret
+ARCHEND(__strchrnul, baseline)
+
+       .section .note.GNU-stack,"",%progbits

Reply via email to