The branch stable/15 has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=514677bd33fb8f9d99d0f4f51be0efb4bcd86583

commit 514677bd33fb8f9d99d0f4f51be0efb4bcd86583
Author:     Strahinja Stanišić <[email protected]>
AuthorDate: 2024-07-17 11:19:52 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2025-11-30 00:43:05 +0000

    libc: scalar memchr() in RISC-V assembly
    
    Added an optimized memchr() implementation in RISC-V assembly and updated
    the relevant manpage.
    
            │ memchr_baseline │            memchr_scalar            │
            │     sec/op      │   sec/op     vs base                │
    Short         636.6µ ± 1%   495.9µ ± 1%  -22.10% (p=0.000 n=20)
    Mid           279.7µ ± 1%   224.1µ ± 1%  -19.87% (p=0.000 n=20)
    Long          138.8µ ± 0%   124.9µ ± 0%  -10.00% (p=0.000 n=20)
    geomean       291.3µ        240.3µ       -17.48%
    
            │ memchr_baseline │            memchr_scalar             │
            │       B/s       │     B/s       vs base                │
    Short        187.3Mi ± 1%   240.4Mi ± 1%  +28.37% (p=0.000 n=20)
    Mid          426.2Mi ± 1%   531.9Mi ± 1%  +24.79% (p=0.000 n=20)
    Long         859.0Mi ± 0%   954.4Mi ± 0%  +11.11% (p=0.000 n=20)
    geomean      409.3Mi        496.0Mi       +21.19%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46023
    
    (cherry picked from commit 563efdd3bd5d5f94e356444bb64fd66e13dda5e1)
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/memchr.S     | 188 +++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)
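[Editor's note: the implementation below is built on the classic SWAR
("SIMD within a register") byte-match test, which scans eight bytes per
load. A minimal C sketch of the idea follows; the names here are
illustrative only, not part of the commit.]

        #include <stdint.h>

        #define REP8_0x01       0x0101010101010101ULL
        #define REP8_0x80       0x8080808080808080ULL

        /* nonzero iff some byte of w is zero */
        static inline uint64_t
        has_zero(uint64_t w)
        {
                return ((w - REP8_0x01) & ~w & REP8_0x80);
        }

        /* nonzero iff some byte of w equals c; XOR turns matches into zeros */
        static inline uint64_t
        has_byte(uint64_t w, uint8_t c)
        {
                return (has_zero(w ^ (c * REP8_0x01)));
        }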

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index a9cf8bf52481..cdbc19d286fc 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -1,2 +1,3 @@
 MDSRCS+= \
+       memchr.S \
        strrchr.S
diff --git a/lib/libc/riscv/string/memchr.S b/lib/libc/riscv/string/memchr.S
new file mode 100644
index 000000000000..e6e04bfae96a
--- /dev/null
+++ b/lib/libc/riscv/string/memchr.S
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <[email protected]>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * a0 - const void *b
+ * a1 - int c
+ * a2 - size_t len
+ */
+ENTRY(memchr)
+       /*
+        * a0 - const char *ptr
+        * a1 - char cccccccc[8]
+        * a2 - char iter[8]
+        * a3 - uint8_t *end
+        * a4 - uint64_t *end_align
+        * a5 - uint64_t *end_unroll
+        */
+
+       beqz a2, .Lno_match
+
+       /* c = (uint8_t) c */
+       andi a1, a1, 0xFF
+
+       /*
+        * t0 = 0x0101010101010101
+        * t1 = 0x8080808080808080
+        * t2 = b << 3
+        * cccccccc = (uint8_t)c * t0
+        * end = b + len;
+        * ptr = b & ~0b111
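+        *
+        * If b + len wraps around, end is clamped to 0.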
+        */
+       add a3, a0, a2
+       li t0, 0x01010101
+       sltu t2, a0, a3
+       slli t1, t0, 32
+       neg t2, t2
+       or t0, t0, t1
+       and a3, a3, t2
+       slli t1, t0, 7
+       slli t2, a0, 3
+       and a0, a0, ~0b111
+       mul a1, t0, a1
+
+       ld a2, (a0)
+
+       /*
+        * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)
+        * iter = iter ^ cccccccc
+        * iter = iter | mask_start
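+        *
+        * mask_start sets the 0x01 bit in each byte before the
+        * start, so those bytes can never produce a match marker.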
+        */
+       sll t2, t0, t2
+       xor a2, a2, a1
+       xor t2, t2, t0
+       or a2, a2, t2
+
+       /*
+        * has_zero(iter)
+        * end_align = (end + 7) & ~0b111;
+        */
+       addi a4, a3, 7
+       not t2, a2
+       sub a2, a2, t0
+       and t2, t2, t1
+       andi a4, a4, ~0b111
+       and a2, a2, t2
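+
+       /* a2 now has 0x80 markers at the bytes that matched c */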
+
+       /* ptr = ptr + 8 */
+       addi a0, a0, 8
+
+       bnez a2, .Lfind_zero
+
+       /* if(ptr == end_align) */
+       beq a0, a4, .Lno_match
+
+       /* end_unroll = end_align & ~0b1111 */
+       andi a5, a4, ~0b1111
+
+       /*
+        * Instead of branching to check if `ptr` is 16-byte aligned:
+        *   - Probe the next 8 bytes for `c`
+        *   - Align `ptr` down to the nearest 16-byte boundary
+        *
+        * If `ptr` was already 16-byte aligned, those 8 bytes will be
+        * checked again inside the unrolled loop.
+        *
+        * This removes an unpredictable branch and improves performance.
+        */
+
+       ld a2, (a0)
+       xor a2, a2, a1
+
+       not t2, a2
+       sub a2, a2, t0
+       and t2, t2, t1
+       and a2, a2, t2
+
+       addi a0, a0, 8
+
+       bnez a2, .Lfind_zero
+
+       andi a0, a0, ~0b1111
+
+       /* while(ptr != end_unroll) */
+       beq a0, a5, .Lskip_loop
+.Lloop:
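+       /* check two words (16 bytes) per iteration */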
+       ld a2, (a0)
+       ld t3, 8(a0)
+
+       xor a2, a2, a1
+       xor t3, t3, a1
+
+       not t2, a2
+       not t4, t3
+       sub a2, a2, t0
+       sub t3, t3, t0
+       and t2, t2, t1
+       and t4, t4, t1
+       and a2, a2, t2
+       and t3, t3, t4
+
+       addi a0, a0, 8
+
+       bnez a2, .Lfind_zero
+
+       /* move into iter for find_zero */
+       mv a2, t3
+
+       addi a0, a0, 8
+
+       bnez a2, .Lfind_zero
+
+       bne a0, a5, .Lloop
+.Lskip_loop:
+
+       /* there might be one 8-byte word left */
+       beq a0, a4, .Lno_match
+
+       ld a2, (a0)
+       xor a2, a2, a1
+
+       not t2, a2
+       sub a2, a2, t0
+       and t2, t2, t1
+       and a2, a2, t2
+
+       addi a0, a0, 8
+
+       beqz a2, .Lno_match
+
+.Lfind_zero:
+       /*
+        * ptr = ptr - 8
+        * t1 = 0x0001020304050607
+        * iter = iter & (-iter)
+        * iter = iter >> 7
+        * iter = iter * t1
+        * iter = iter >> 56
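+        *
+        * The multiply-shift turns the isolated marker (1 << 8k)
+        * into k, the byte index of the first match (little endian).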
+        */
+       li t1, 0x10203000
+       neg t0, a2
+       slli t1, t1, 4
+       and a2, a2, t0
+       addi t1, t1, 0x405
+       srli a2, a2, 7
+       slli t1, t1, 16
+       addi a0, a0, -8
+       addi t1, t1, 0x607
+       mul a2, a2, t1
+       srli a2, a2, 56
+
+       /* left = end - ptr */
+       sub t0, a3, a0
+
+       /* return iter < left ? ptr + iter : NULL */
+       sltu t1, a2, t0
+       neg t1, t1
+       add a0, a0, a2
+       and a0, a0, t1
+       ret
+
+.Lno_match:
+       li a0, 0
+       ret
+END(memchr)
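[Editor's note: for reference, the .Lfind_zero tail maps back to roughly
the following C. This is a sketch only, assuming little-endian byte
order as on RISC-V; "bits" stands for the 0x80-marker word computed by
the has_zero steps above, and the names are hypothetical.]

        #include <stddef.h>
        #include <stdint.h>

        static inline void *
        find_match(const char *ptr, const char *end, uint64_t bits)
        {
                uint64_t idx;
                size_t left;
                uintptr_t mask;

                bits &= -bits;          /* isolate lowest 0x80 marker */
                bits >>= 7;             /* 0x80 << 8k  ->  1 << 8k */
                idx = (bits * 0x0001020304050607ULL) >> 56;     /* -> k */
                left = (size_t)(end - ptr);

                /* branchless select: ptr + idx if in range, else NULL */
                mask = -(uintptr_t)(idx < left);
                return ((void *)(((uintptr_t)ptr + idx) & mask));
        }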
