The branch stable/15 has been updated by fuz:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=3f0dda7c59280fe05b57fe695022053cd8b7d5f1

commit 3f0dda7c59280fe05b57fe695022053cd8b7d5f1
Author:     Strahinja Stanišić <[email protected]>
AuthorDate: 2024-08-04 15:12:00 +0000
Commit:     Robert Clausecker <[email protected]>
CommitDate: 2025-11-30 00:43:06 +0000

    libc: scalar strnlen() in RISC-V assembly
    
    Optimized implementation of strnlen() in RISC-V assembly
    
    Performance was measured using strperf on a HiFive Unmatched (SiFive HF105-001) board.
    
    os: FreeBSD
    arch: riscv
            │ strnlen_baseline │           strnlen_scalar            │
            │      sec/op      │   sec/op     vs base                │
    Short          787.0µ ± 0%   430.9µ ± 1%  -45.24% (p=0.000 n=20)
    Mid            621.6µ ± 0%   195.1µ ± 1%  -68.61% (p=0.000 n=20)
    Long           569.4µ ± 1%   100.6µ ± 0%  -82.34% (p=0.000 n=20)
    geomean        653.1µ        203.7µ       -68.81%
    
            │ strnlen_baseline │            strnlen_scalar            │
            │      MiB/s       │    MiB/s     vs base                 │
    Short           158.8 ± 0%    290.1 ± 1%   +82.62% (p=0.000 n=20)
    Mid             201.1 ± 0%    640.6 ± 1%  +218.59% (p=0.000 n=20)
    Long            219.5 ± 1%   1242.9 ± 0%  +466.19% (p=0.000 n=20)
    geomean         191.4         613.5       +220.57%
    
    MFC after:      1 month
    MFC to:         stable/15
    Approved by:    mhorne, markj (mentor)
    Reviewed by:    fuz, Jari Sihvola <[email protected]>
    Sponsored by:   Google LLC (GSoC 2024)
    Differential Revision:  https://reviews.freebsd.org/D46230
    
    (cherry picked from commit 5a52f0704435b089199201be0029e0d7c9ef2fce)
---
 lib/libc/riscv/string/Makefile.inc |   1 +
 lib/libc/riscv/string/strnlen.S    | 143 +++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)

diff --git a/lib/libc/riscv/string/Makefile.inc b/lib/libc/riscv/string/Makefile.inc
index ebea8d1d3412..4b97490a5494 100644
--- a/lib/libc/riscv/string/Makefile.inc
+++ b/lib/libc/riscv/string/Makefile.inc
@@ -3,4 +3,5 @@ MDSRCS+= \
        memcpy.S \
        memset.S \
        strlen.S \
+       strnlen.S \
        strrchr.S
diff --git a/lib/libc/riscv/string/strnlen.S b/lib/libc/riscv/string/strnlen.S
new file mode 100644
index 000000000000..c0fd959548ff
--- /dev/null
+++ b/lib/libc/riscv/string/strnlen.S
@@ -0,0 +1,143 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Strahinja Stanisic <[email protected]>
+ */
+
+#include <machine/asm.h>
+
+/*
+ * size_t strnlen(const char *s, size_t maxlen)
+ *
+ * Returns min(strlen(s), maxlen).  The string is scanned one aligned 8-byte
+ * word at a time (RV64, little-endian); a word x holds a zero byte iff
+ *     (x - 0x0101010101010101) & ~x & 0x8080808080808080
+ * is nonzero, and the lowest set bit of that value marks the first zero byte.
+ * Only aligned 16-byte blocks that hold part of the string are loaded from,
+ * so no load reaches into a page the string does not occupy.
+ */
+ENTRY(strnlen)
+       /*
+        * a0 - const char *s;
+        * a1 - size_t maxlen;      (clamped to SIZE_MAX - s)
+        * a2 - uint64_t *ptr;
+        * a3 - char iter[8];
+        * a4 - uint64_t *end;      (16-byte aligned)
+        * a6 - char iter2[8];      (second word of a block)
+        */
+       beqz a1, .Lnot_found
+
+       /* maxlen = min(maxlen, SIZE_MAX - s) so that s + maxlen cannot wrap */
+       not t3, a0
+       bgeu t3, a1, .Lclamped
+       mv a1, t3
+.Lclamped:
+       /* ptr = s & ~0b111 */
+       /* t0 = 0x0101010101010101, t1 = 0x8080808080808080 */
+       /* end = (s + maxlen + 15) & ~0b1111 */
+       /* mask_start = t0 >> ((-s.value) << 3) */
+       add a4, a0, a1
+       li t0, 0x01010101
+       addi a4, a4, 15
+       slli t1, t0, 32
+       neg t2, a0
+       andi a4, a4, ~0b1111
+       or t0, t0, t1
+       slli t2, t2, 3
+       andi a2, a0, ~0b111
+       slli t1, t0, 7
+       srl t2, t0, t2
+
+       /* if pointer is aligned skip the masked first word */
+       beq a0, a2, .Lskip_start
+
+       /* iter = *ptr | mask_start, hiding the bytes in front of s */
+       ld a3, (a2)
+       or a3, a3, t2
+
+       /* has_zero */
+       not t2, a3
+       sub a3, a3, t0
+       and t2, t2, t1
+       and a3, a3, t2
+
+       addi a2, a2, 8
+       bnez a3, .Lfind_zero
+
+.Lskip_start:
+       /* if (ptr & 8) check a single word, making ptr 16-byte aligned; */
+       /* that word shares its aligned block with s[0] */
+       andi t2, a2, 8
+       beqz t2, .Lskip_peel
+       ld a3, (a2)
+       /* has_zero */
+       not t2, a3
+       sub a3, a3, t0
+       and t2, t2, t1
+       and a3, a3, t2
+       addi a2, a2, 8
+       bnez a3, .Lfind_zero
+
+.Lskip_peel:
+       /* while (ptr != end) */
+       beq a2, a4, .Lnot_found
+.Lloop:
+       ld a3, (a2)
+       ld a6, 8(a2)
+
+       /* has_zero */
+       not t2, a3
+       not t3, a6
+       sub a3, a3, t0
+       sub a6, a6, t0
+       and t2, t2, t1
+       and t3, t3, t1
+       and a3, a3, t2
+       and a6, a6, t3
+
+       addi a2, a2, 8
+       bnez a3, .Lfind_zero
+
+       mv a3, a6
+       addi a2, a2, 8
+       bnez a3, .Lfind_zero
+
+       bne a2, a4, .Lloop
+
+.Lnot_found:
+       mv a0, a1
+       ret
+
+.Lfind_zero:
+       /* move ptr back */
+       addi a2, a2, -8
+
+       /* isolate lowest set bit */
+       neg t0, a3
+       and a3, a3, t0
+
+       li t0, 0x0001020304050607
+       srli a3, a3, 7
+
+       /*
+        * lowest set bit is 2^(8*k)
+        * multiplying by it shifts the idx array in t0 by k bytes to the left
+        */
+       mul a3, a3, t0
+
+       /* highest byte contains idx of first zero */
+       srli a3, a3, 56
+
+       /* zero_idx */
+       sub a2, a2, a0
+       add a2, a2, a3
+
+       /* min(zero_idx, maxlen), compared as unsigned */
+       sltu t1, a2, a1
+       sub a2, a2, a1
+       neg t1, t1
+       and a2, a2, t1
+       add a0, a1, a2
+       ret
+
+END(strnlen)

Reply via email to