[gcc r17-1979] AArch64: Add streaming compatible memory operations implementation in libgcc

Wilco Dijkstra via Gcc-cvs Mon, 29 Jun 2026 08:42:17 -0700

https://gcc.gnu.org/g:8bdd602c6dc534aa482653ad7e99f3332599c671


commit r17-1979-g8bdd602c6dc534aa482653ad7e99f3332599c671
Author: Claudio Bantaloukas <[email protected]>
Date:   Wed Jun 17 13:46:34 2026 +0000

    AArch64: Add streaming compatible memory operations implementation in libgcc
    
    The ACLE[1] specifies the following four functions in "Streaming-compatible
    versions of standard routines" as having "the same behavior as the standard C
    functions that they are named after" and external linkage.
    
    - void *__arm_sc_memcpy(void *dest, const void *src, size_t n)
        __arm_streaming_compatible;
    - void *__arm_sc_memmove(void *dest, const void *src, size_t n)
        __arm_streaming_compatible;
    - void *__arm_sc_memset(void *s, int c, size_t n)
        __arm_streaming_compatible;
    - void *__arm_sc_memchr(void *s, int c, size_t n)
        __arm_streaming_compatible;
    
    Declarations for these are already available in arm_sme.h.
    This patch provides implementations for these based entirely on scalar
    operations, along with basic tests that exercise the code both in streaming and
    non-streaming mode.
    
    See also [2] for further testing.
    
    [1] https://arm-software.github.io/acle/main/acle.html
    [2] https://github.com/ARM-software/optimized-routines/pull/94
    
    libgcc/ChangeLog:
            * config/aarch64/__arm_sc_memchr_scalar.S: New file.
            * config/aarch64/__arm_sc_memcpy_scalar.S: Likewise.
            * config/aarch64/__arm_sc_memset_scalar.S: Likewise.
            * config/aarch64/libgcc-sme.ver (GCC_17.0): Export
            __arm_sc_memchr, __arm_sc_memcpy, __arm_sc_memmove and
            __arm_sc_memset.
            * config/aarch64/t-aarch64: Add new files.
            * config/aarch64/aarch64-asm.h: Add ENTRY_ALIAS.
    
    gcc/testsuite/ChangeLog:
            * gcc.target/aarch64/acle/arm_sc_memchr.c: New file.
            * gcc.target/aarch64/acle/arm_sc_memcpy.c: Likewise.
            * gcc.target/aarch64/acle/arm_sc_memmove.c: Likewise.
            * gcc.target/aarch64/acle/arm_sc_memset.c: Likewise.
            * gcc.target/aarch64/sme/arm_sc_memchr.c: New file.
            * gcc.target/aarch64/sme/arm_sc_memcpy.c: Likewise.
            * gcc.target/aarch64/sme/arm_sc_memmove.c: Likewise.
            * gcc.target/aarch64/sme/arm_sc_memset.c: Likewise.

Diff:
---
 .../gcc.target/aarch64/acle/arm_sc_memchr.c        |  38 +++
 .../gcc.target/aarch64/acle/arm_sc_memcpy.c        |  23 ++
 .../gcc.target/aarch64/acle/arm_sc_memmove.c       |  39 ++++
 .../gcc.target/aarch64/acle/arm_sc_memset.c        |  33 +++
 .../gcc.target/aarch64/sme/arm_sc_memchr.c         |  27 +++
 .../gcc.target/aarch64/sme/arm_sc_memcpy.c         |  26 +++
 .../gcc.target/aarch64/sme/arm_sc_memmove.c        |  27 +++
 .../gcc.target/aarch64/sme/arm_sc_memset.c         |  33 +++
 libgcc/config/aarch64/__arm_sc_memchr_scalar.S     | 128 ++++++++++
 libgcc/config/aarch64/__arm_sc_memcpy_scalar.S     | 258 +++++++++++++++++++++
 libgcc/config/aarch64/__arm_sc_memset_scalar.S     | 178 ++++++++++++++
 libgcc/config/aarch64/aarch64-asm.h                |   5 +
 libgcc/config/aarch64/libgcc-sme.ver               |   7 +
 libgcc/config/aarch64/t-aarch64                    |   5 +-
 14 files changed, 826 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c
new file mode 100644
index 000000000000..03aa3cdcebdd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_variant_pcs } */
+/* { dg-options "-O1 -march=armv8-a" } */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+unsigned char basic[] = {'a', 'b', 'c', 'd', 'e', 'f'};
+
+int
+main (void)
+{
+  unsigned char buffer[1024];
+  memset (buffer, 0xaa, sizeof buffer);
+  buffer[128] = 'x';
+
+  assert (__arm_sc_memchr (basic, 'a', sizeof basic) == basic);
+  assert (__arm_sc_memchr (basic, 'd', sizeof basic) == basic + 3);
+  assert (__arm_sc_memchr (basic, 'f', sizeof basic) == basic + 5);
+  assert (__arm_sc_memchr (basic, 'x', sizeof basic) == NULL);
+
+  assert (__arm_sc_memchr (basic, 'd', 3) == NULL);
+  assert (__arm_sc_memchr (basic, 'd', 4) == basic + 3);
+  assert (__arm_sc_memchr (basic, 'a', 0) == NULL);
+
+  assert (__arm_sc_memchr (basic, 'a' + 256, sizeof basic) == basic);
+
+  assert (__arm_sc_memchr (basic + 1, 'a', sizeof (basic) - 1) == NULL);
+
+  assert (__arm_sc_memchr (buffer, 'x', sizeof buffer) == buffer + 128);
+
+  assert (__arm_sc_memchr (buffer, 'x', SIZE_MAX) == buffer + 128);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c
new file mode 100644
index 000000000000..193f2b1af5f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_variant_pcs } */
+/* { dg-options "-O1 -march=armv8-a" } */
+
+#include <assert.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+int
+main (void)
+{
+  char src[] = "abcdef";
+  char dst[8] = { 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x' };
+
+  assert (__arm_sc_memcpy (dst + 4, src, 0) == dst + 4);
+  assert (memcmp (dst, "xxxxxxxx", 8) == 0);
+
+  assert (__arm_sc_memcpy (dst + 1, src + 2, 3) == dst + 1);
+  assert (memcmp (dst, "xcdexxxx", 8) == 0);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c
new file mode 100644
index 000000000000..02597555888f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c
@@ -0,0 +1,39 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_variant_pcs } */
+/* { dg-options "-O1 -march=armv8-a" } */
+
+#include <assert.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+int
+main (void)
+{
+  char src[] = "abcdef";
+  char dst[16] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'};
+
+  assert (__arm_sc_memmove (dst + 4, src, 0) == dst + 4);
+  assert (memcmp (dst, "xxxxxxxx", 8) == 0);
+
+  assert (__arm_sc_memmove (dst + 1, src + 2, 3) == dst + 1);
+  assert (memcmp (dst, "xcdexxxx", 8) == 0);
+
+  __arm_sc_memcpy (dst, src, sizeof src);
+  assert (__arm_sc_memmove (dst + 2, dst, 4) == dst + 2);
+  assert (memcmp (dst, "ababcd", 6) == 0);
+
+  __arm_sc_memcpy (dst, src, sizeof src);
+  assert (__arm_sc_memmove (dst, dst + 2, 4) == dst);
+  assert (memcmp (dst, "cdefef", 6) == 0);
+
+  __arm_sc_memcpy (dst, src, sizeof src);
+  assert (__arm_sc_memmove (dst + 1, dst + 1, 3) == dst + 1);
+  assert (memcmp (dst, "abcdef", 6) == 0);
+
+  __arm_sc_memcpy (dst, src, sizeof src);
+  assert (__arm_sc_memmove (dst + 3, dst, 0) == dst + 3);
+  assert (memcmp (dst, "abcdef", 6) == 0);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c
new file mode 100644
index 000000000000..1f53f0c05620
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_variant_pcs } */
+/* { dg-options "-O1 -march=armv8-a" } */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+int
+main (void)
+{
+  unsigned char buf[16];
+
+  memset (buf, 0x11, sizeof buf);
+
+  assert (__arm_sc_memset (buf + 3, 0xaa, 5) == buf + 3);
+
+  assert (buf[2] == 0x11);
+  assert (buf[3] == 0xaa);
+  assert (buf[7] == 0xaa);
+  assert (buf[8] == 0x11);
+
+  assert (__arm_sc_memset (buf + 4, 0x55, 0) == buf + 4);
+  assert (buf[4] == 0xaa);
+
+  assert (__arm_sc_memset (buf, 0x1234, 2) == buf);
+  assert (buf[0] == 0x34);
+  assert (buf[1] == 0x34);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c
new file mode 100644
index 000000000000..65bc63de09f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target aarch64_sme_hw } } */
+/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */
+/* { dg-options "-O2 -fbuilding-libgcc" } */
+
+#pragma GCC target "+sme"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+unsigned char basic[] = {'a', 'b', 'c', 'd', 'e', 'f'};
+
+void
+test (void) __arm_streaming
+{
+  assert (__arm_sc_memchr (basic, 'x', sizeof basic) == NULL);
+  assert (__arm_sc_memchr (basic, 'd', 4) == basic + 3);
+}
+
+int
+main (void)
+{
+  test ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c
new file mode 100644
index 000000000000..e49e24d47c99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c
@@ -0,0 +1,26 @@
+/* { dg-do run { target aarch64_sme_hw } } */
+/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */
+/* { dg-options "-O2 -fbuilding-libgcc" } */
+
+#pragma GCC target "+sme"
+
+#include <assert.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+void
+test (void) __arm_streaming
+{
+  char src[] = "abcdef";
+  char dst[8] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'};
+  assert (__arm_sc_memcpy (dst + 1, src + 2, 3) == dst + 1);
+  assert (memcmp (dst, "xcdexxxx", 8) == 0);
+}
+
+int
+main (void)
+{
+  test ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c
new file mode 100644
index 000000000000..fa4207cdc081
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target aarch64_sme_hw } } */
+/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */
+/* { dg-options "-O2 -fbuilding-libgcc" } */
+
+#pragma GCC target "+sme"
+
+#include <assert.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+void
+test (void) __arm_streaming
+{
+  char src[] = "abcdef";
+  char dst[16] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'};
+
+  assert (__arm_sc_memmove (dst + 1, src + 2, 3) == dst + 1);
+  assert (memcmp (dst, "xcdexxxx", 8) == 0);
+}
+
+int
+main (void)
+{
+  test ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c
new file mode 100644
index 000000000000..0a138c065eee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c
@@ -0,0 +1,33 @@
+/* { dg-do run { target aarch64_sme_hw } } */
+/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */
+/* { dg-options "-fbuilding-libgcc" } */
+
+#pragma GCC target "+sme"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <arm_sme.h>
+
+void
+test (void) __arm_streaming
+{
+  unsigned char buf[16];
+
+  memset (buf, 0x11, sizeof buf);
+
+  assert (__arm_sc_memset (buf + 3, 0xaa, 5) == buf + 3);
+
+  assert (buf[2] == 0x11);
+  assert (buf[3] == 0xaa);
+  assert (buf[7] == 0xaa);
+  assert (buf[8] == 0x11);
+}
+
+int
+main (void)
+{
+  test ();
+  return 0;
+}
diff --git a/libgcc/config/aarch64/__arm_sc_memchr_scalar.S b/libgcc/config/aarch64/__arm_sc_memchr_scalar.S
new file mode 100644
index 000000000000..7ef026ff1928
--- /dev/null
+++ b/libgcc/config/aarch64/__arm_sc_memchr_scalar.S
@@ -0,0 +1,128 @@
+/* Support routine for SME.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "aarch64-asm.h"
+
+.arch armv8-a
+
+#define result                 x0
+#define src_in                 x0
+#define chr_in                 w1
+#define chr_repeated   x1
+#define count_in               x2
+
+#define src                            x3
+#define zeroones               x4
+#define        data1                   x5
+#define        tmp1                    x6
+#define tmp2                   x7
+#define found1                 x8
+#define src_end                        x9
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+/* This function implements the Hacker's Delight algorithm for finding
+   matching bytes. We find them in words by performing an XOR against the
+   repeated searched char. Then, x being the XOR result, we calculate
+   (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f). Words with a matching
+   byte will produce a non-zero result.
+
+   __arm_sc_mem* functions must be able to run in streaming mode and can thus
+   not make use of most AdvSIMD instructions. */
+
+variant_pcs (__arm_sc_memchr)
+
+ENTRY (__arm_sc_memchr)
+       cbz     count_in, L(no_match)
+
+       /* Replicate the needle byte across the rest of the word.  */
+       and     chr_in, chr_in, 255
+       orr     chr_in, chr_in, chr_in, lsl 8
+       orr     chr_in, chr_in, chr_in, lsl 16
+       orr     chr_repeated, chr_repeated, chr_repeated, lsl 32
+       mov     zeroones, REP8_01
+
+       bic     src, src_in, 7
+       ldr     data1, [src], 8
+#ifdef __AARCH64EB__
+       rev     data1, data1
+#endif
+       /* Skip unused bytes before src_in.  */
+       lsl     tmp1, src_in, 3
+       lsr     tmp2, zeroones, tmp1
+       lsr     data1, data1, tmp1
+
+       eor     data1, data1, chr_repeated
+       sub     tmp1, data1, tmp2
+       orr     tmp2, data1, REP8_7f
+       bic     found1, tmp1, tmp2
+       cbz     found1, L(loop_start)
+
+    /* Calculate the offset and return it if offset < count_in. */
+       rbit    found1, found1
+       clz     tmp1, found1
+       cmp     count_in, tmp1, lsr 3
+       add     result, src_in, tmp1, lsr 3
+       csel    result, result, xzr, hi
+       ret
+
+       .p2align 4
+
+L(no_match):
+       mov     result, 0
+       ret
+
+L(loop_start):
+       /* Deal with count_in being so large that src_end comes before src_in due
+          to wraparound. When this happens, set src_end to SIZE_MAX so that we do
+          not reject results based on the address.  */
+       adds    src_end, src_in, count_in
+       csinv   src_end, src_end, xzr, cc
+
+       .p2align 4
+L(loop):
+       cmp     src, src_end
+       b.hs    L(no_match)
+       ldr     data1, [src], 8
+#ifdef __AARCH64EB__
+       rev     data1, data1
+#endif
+       eor     data1, data1, chr_repeated
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, REP8_7f
+       bics    xzr, tmp1, tmp2
+
+       b.eq    L(loop)
+
+       /* Found a match - return the address if it is before src_end.  */
+       bic     found1, tmp1, tmp2
+       sub     src, src, 8
+       rbit    found1, found1
+       clz     tmp1, found1
+       add     result, src, tmp1, lsr 3
+       cmp     src_end, result
+       csel    result, result, xzr, hi
+       ret
+END (__arm_sc_memchr)
diff --git a/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S b/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S
new file mode 100644
index 000000000000..1e8dcffc7a7d
--- /dev/null
+++ b/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S
@@ -0,0 +1,258 @@
+/* Support routine for SME.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "aarch64-asm.h"
+
+.arch armv8-a
+
+#define dstin  x0
+#define src    x1
+#define count  x2
+#define dst    x3
+#define srcend x4
+#define dstend x5
+#define A_l    x6
+#define A_lw   w6
+#define A_h    x7
+#define B_l    x8
+#define B_lw   w8
+#define B_h    x9
+#define C_l    x10
+#define C_lw   w10
+#define C_h    x11
+#define D_l    x12
+#define D_h    x13
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+
+   __arm_sc_mem* functions must be able to run in streaming mode and can thus
+   not make use of most AdvSIMD instructions.
+*/
+
+variant_pcs (__arm_sc_memmove)
+ENTRY_ALIAS (__arm_sc_memmove)
+
+variant_pcs (__arm_sc_memcpy)
+
+ENTRY (__arm_sc_memcpy)
+       add     srcend, src, count
+       add     dstend, dstin, count
+       cmp     count, 128
+       b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
+
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
+       ldp     A_l, A_h, [src]
+       ldp     D_l, D_h, [srcend, -16]
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       /* Copy 8-15 bytes.  */
+L(copy16):
+       tbz     count, 3, L(copy8)
+       ldr     A_l, [src]
+       ldr     A_h, [srcend, -8]
+       str     A_l, [dstin]
+       str     A_h, [dstend, -8]
+       ret
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
+       ldr     A_lw, [src]
+       ldr     B_lw, [srcend, -4]
+       str     A_lw, [dstin]
+       str     B_lw, [dstend, -4]
+       ret
+
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
+       lsr     tmp1, count, 1
+       ldrb    A_lw, [src]
+       ldrb    C_lw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
+
+       .p2align 4
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
+       /* Copy more than 128 bytes.  */
+L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       ldp     D_l, D_h, [src]
+       and     tmp1, dstin, 15
+       bic     dst, dstin, 15
+       sub     src, src, tmp1
+       add     count, count, tmp1      /* Count is now 16 too large.  */
+       ldp     A_l, A_h, [src, 16]
+       stp     D_l, D_h, [dstin]
+       ldp     B_l, B_h, [src, 32]
+       ldp     C_l, C_h, [src, 48]
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 128 + 16  /* Test and readjust count.  */
+       b.ls    L(copy64_from_end)
+
+L(loop64):
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [src, 16]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [src, 32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [src, 48]
+       stp     D_l, D_h, [dst, 64]!
+       ldp     D_l, D_h, [src, 64]!
+       subs    count, count, 64
+       b.hi    L(loop64)
+
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+       ldp     E_l, E_h, [srcend, -64]
+       stp     A_l, A_h, [dst, 16]
+       ldp     A_l, A_h, [srcend, -48]
+       stp     B_l, B_h, [dst, 32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dst, 48]
+       ldp     C_l, C_h, [srcend, -16]
+       stp     D_l, D_h, [dst, 64]
+       stp     E_l, E_h, [dstend, -64]
+       stp     A_l, A_h, [dstend, -48]
+       stp     B_l, B_h, [dstend, -32]
+       stp     C_l, C_h, [dstend, -16]
+       ret
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+END (__arm_sc_memcpy)
diff --git a/libgcc/config/aarch64/__arm_sc_memset_scalar.S b/libgcc/config/aarch64/__arm_sc_memset_scalar.S
new file mode 100644
index 000000000000..1d3fc1c8c2c0
--- /dev/null
+++ b/libgcc/config/aarch64/__arm_sc_memset_scalar.S
@@ -0,0 +1,178 @@
+/* Support routine for SME.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "aarch64-asm.h"
+
+.arch armv8-a
+
+#define dstin  x0
+#define val    x1
+#define valw   w1
+#define count  x2
+#define dst    x3
+#define dstend x4
+#define zva_val        x5
+#define off    x3
+#define dstend2        x5
+
+/* __arm_sc_mem* functions must be able to run in streaming mode and can thus
+   not make use of most AdvSIMD instructions.  */
+
+variant_pcs (__arm_sc_memset)
+ENTRY (__arm_sc_memset)
+       /* Replicate the fill byte across the low 32 bits of the register.  */
+       and     valw, valw, 255
+       orr     valw, valw, valw, lsl 8
+       orr     valw, valw, valw, lsl 16
+
+       cmp     count, 16
+       b.lo    L(set_lt16)
+
+       /* Replicate the fill byte across the rest of the word.  */
+       orr     val, val, val, lsl 32
+
+       add     dstend, dstin, count
+       cmp     count, 64
+       b.hi    L(set_gt64)
+
+L(set_16_64):
+       /* Calculate an interior store offset based on half the count. This will be
+         one of 0, 16 or 32 bytes. Then perform four overlapping stores.  */
+       mov     off, 48
+       and     off, off, count, lsr 1
+       sub     dstend2, dstend, off
+
+       stp     val, val, [dstin]
+       add     off, dstin, off
+       stp     val, val, [off]
+       stp     val, val, [dstend2, -16]
+       stp     val, val, [dstend, -16]
+       ret
+
+       .p2align 4
+L(set_lt16):
+       add     dstend, dstin, count
+       cmp     count, 4
+       b.lo    1f
+
+       /* For sizes 4..15, calculate an interior offset based on the count. Then
+          perform four overlapping 32-bit stores.  */
+       lsr     off, count, 3                           /* off = count >> 3.  */
+       sub     dstend2, dstend, off, lsl 2
+       str     valw, [dstin]
+       str     valw, [dstin, off, lsl 2]
+       str     valw, [dstend2, -4]
+       str     valw, [dstend, -4]
+       ret
+
+       /* Potentially overlapping stores for sizes 0..3. */
+1:     cbz     count, 2f               /* Skip stores entirely for a zero-length memset.  */
+       lsr     off, count, 1
+       strb    valw, [dstin]
+       strb    valw, [dstin, off]
+       strb    valw, [dstend, -1]
+2:     ret
+
+       .p2align 4
+L(set_gt64):
+       bic     dst, dstin, 15          /* Align dst down to 16 bytes.  */
+       cmp     count, 128
+       b.hi    L(set_gt128)
+
+       /* For sizes 65..128, perform eight overlapping 16-byte stores.  */
+       stp     val, val, [dstin]
+       stp     val, val, [dstin, 16]
+       stp     val, val, [dstin, 32]
+       stp     val, val, [dstin, 48]
+       stp     val, val, [dstend, -64]
+       stp     val, val, [dstend, -48]
+       stp     val, val, [dstend, -32]
+       stp     val, val, [dstend, -16]
+       ret
+
+       .p2align 4
+L(set_gt128):
+       /* For sizes above 128, perform initial stores before entering loop on
+          aligned addresses.  */
+       stp     val, val, [dstin]
+       stp     val, val, [dst, 16]
+
+       /* We can use DC ZVA to zero memory when the fill byte is zero and the block
+          size reported by DCZID_EL0 is 64 bytes.  */
+       cbnz valw, L(no_zva)
+#ifndef SKIP_ZVA_CHECK
+       mrs     zva_val, dczid_el0
+       and     zva_val, zva_val, 31
+       cmp     zva_val, 4
+       b.ne    L(no_zva)
+#endif
+       /* Complete stores up to the first 64 byte aligned block.  */
+       stp     val, val, [dst, 32]
+       stp     val, val, [dst, 48]
+
+       bic     dst, dstin, 63          /* Align dst down to 64 bytes.  */
+       /* Compute count, accounting for the stores above and bias for the loop
+          exit.  */
+       sub     count, dstend, dst
+       sub     count, count, 64 + 64
+
+       /* Write last bytes before entering ZVA loop.  */
+       stp     val, val, [dstend, -64]
+       stp     val, val, [dstend, -48]
+       stp     val, val, [dstend, -32]
+       stp     val, val, [dstend, -16]
+
+       .p2align 4
+L(zva64_loop):
+       add     dst, dst, 64
+       dc      zva, dst                /* Zero the 64-byte cache block containing dst.  */
+       subs    count, count, 64
+       b.hi    L(zva64_loop)
+       ret
+
+       .p2align 3
+L(no_zva):
+       /* Compute count, accounting for the two stores in set_gt128 and bias for
+          the loop exit.  */
+       sub     count, dstend, dst
+       sub     count, count, 64 + 32
+
+L(no_zva_loop):
+       /* Perform four contiguous 16-byte stores (64 bytes) per iteration.  */
+       stp     val, val, [dst, 32]
+       stp     val, val, [dst, 48]
+       stp     val, val, [dst, 64]
+       stp     val, val, [dst, 80]
+       add     dst, dst, 64
+       subs    count, count, 64
+       b.hi    L(no_zva_loop)
+
+       /* Perform final stores for the tail of the range, potentially overlapping
+          with previous ones.  */
+       stp     val, val, [dstend, -64]
+       stp     val, val, [dstend, -48]
+       stp     val, val, [dstend, -32]
+       stp     val, val, [dstend, -16]
+       ret
+END (__arm_sc_memset)
diff --git a/libgcc/config/aarch64/aarch64-asm.h b/libgcc/config/aarch64/aarch64-asm.h
index 8ce7e3101480..5b5feefc342a 100644
--- a/libgcc/config/aarch64/aarch64-asm.h
+++ b/libgcc/config/aarch64/aarch64-asm.h
@@ -127,6 +127,11 @@ FEATURE_1_AND_MARK (BTI_FLAG|PAC_FLAG|GCS_FLAG)
 # endif
 #endif
 
+# define ENTRY_ALIAS(name)     \
+  .global name;                        \
+  SYMBOL_TYPE(name, %function);                \
+  name:
+
 #define ENTRY_ALIGN(name, align) \
   .global name;                \
   SYMBOL_TYPE(name, %function);                \
diff --git a/libgcc/config/aarch64/libgcc-sme.ver b/libgcc/config/aarch64/libgcc-sme.ver
index f8c67905ba71..8d8dc0fd4413 100644
--- a/libgcc/config/aarch64/libgcc-sme.ver
+++ b/libgcc/config/aarch64/libgcc-sme.ver
@@ -26,3 +26,10 @@ GCC_14.0 {
 GCC_16.0 {
   __arm_get_current_vg
 }
+
+GCC_17.0 {
+  __arm_sc_memchr
+  __arm_sc_memcpy
+  __arm_sc_memmove
+  __arm_sc_memset
+}
diff --git a/libgcc/config/aarch64/t-aarch64 b/libgcc/config/aarch64/t-aarch64
index 7bc51ad65248..e1391651466a 100644
--- a/libgcc/config/aarch64/t-aarch64
+++ b/libgcc/config/aarch64/t-aarch64
@@ -28,7 +28,10 @@ LIB2ADDEH += \
        $(srcdir)/config/aarch64/__arm_sme_state.S \
        $(srcdir)/config/aarch64/__arm_tpidr2_restore.S \
        $(srcdir)/config/aarch64/__arm_tpidr2_save.S \
-       $(srcdir)/config/aarch64/__arm_za_disable.S
+       $(srcdir)/config/aarch64/__arm_za_disable.S \
+       $(srcdir)/config/aarch64/__arm_sc_memchr_scalar.S \
+       $(srcdir)/config/aarch64/__arm_sc_memcpy_scalar.S \
+       $(srcdir)/config/aarch64/__arm_sc_memset_scalar.S
 
 SHLIB_MAPFILES += $(srcdir)/config/aarch64/libgcc-sme.ver
 LIBGCC2_CFLAGS += $(WERROR) -Wno-prio-ctor-dtor

[gcc r17-1979] AArch64: Add streaming compatible memory operations implementation in libgcc

Reply via email to