https://gcc.gnu.org/g:8bdd602c6dc534aa482653ad7e99f3332599c671
commit r17-1979-g8bdd602c6dc534aa482653ad7e99f3332599c671 Author: Claudio Bantaloukas <[email protected]> Date: Wed Jun 17 13:46:34 2026 +0000 AArch64: Add streaming compatible memory operations implementation in libgcc The ACLE[1] specifies the following four functions in "Streaming-compatible versions of standard routines" as having "the same behavior as the standard C functions that they are named after" and external linkage. - void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible; - void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible; - void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible; - void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible; Declarations for these are already available in arm_sme.h. This patch provides implementations for these based entirely on scalar operations, along with basic tests that exercise the code both in streaming and non-streaming mode. See also [2] for further testing. [1] https://arm-software.github.io/acle/main/acle.html [2] https://github.com/ARM-software/optimized-routines/pull/94 libgcc/ChangeLog: * config/aarch64/__arm_sc_memchr_scalar.S: New file. * config/aarch64/__arm_sc_memcpy_scalar.S: Likewise. * config/aarch64/__arm_sc_memset_scalar.S: Likewise. * config/aarch64/libgcc-sme.ver (GCC_17.0): Export __arm_sc_memchr, __arm_sc_memcpy, __arm_sc_memmove and __arm_sc_memset. * config/aarch64/t-aarch64: Add new files. * config/aarch64/aarch64-asm.h: Add ENTRY_ALIAS. gcc/testsuite/ChangeLog: * gcc.target/aarch64/acle/arm_sc_memchr.c: New file. * gcc.target/aarch64/acle/arm_sc_memcpy.c: Likewise. * gcc.target/aarch64/acle/arm_sc_memmove.c: Likewise. * gcc.target/aarch64/acle/arm_sc_memset.c: Likewise. * gcc.target/aarch64/sme/arm_sc_memchr.c: New file. * gcc.target/aarch64/sme/arm_sc_memcpy.c: Likewise. * gcc.target/aarch64/sme/arm_sc_memmove.c: Likewise. * gcc.target/aarch64/sme/arm_sc_memset.c: Likewise. 
Diff: --- .../gcc.target/aarch64/acle/arm_sc_memchr.c | 38 +++ .../gcc.target/aarch64/acle/arm_sc_memcpy.c | 23 ++ .../gcc.target/aarch64/acle/arm_sc_memmove.c | 39 ++++ .../gcc.target/aarch64/acle/arm_sc_memset.c | 33 +++ .../gcc.target/aarch64/sme/arm_sc_memchr.c | 27 +++ .../gcc.target/aarch64/sme/arm_sc_memcpy.c | 26 +++ .../gcc.target/aarch64/sme/arm_sc_memmove.c | 27 +++ .../gcc.target/aarch64/sme/arm_sc_memset.c | 33 +++ libgcc/config/aarch64/__arm_sc_memchr_scalar.S | 128 ++++++++++ libgcc/config/aarch64/__arm_sc_memcpy_scalar.S | 258 +++++++++++++++++++++ libgcc/config/aarch64/__arm_sc_memset_scalar.S | 178 ++++++++++++++ libgcc/config/aarch64/aarch64-asm.h | 5 + libgcc/config/aarch64/libgcc-sme.ver | 7 + libgcc/config/aarch64/t-aarch64 | 5 +- 14 files changed, 826 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c new file mode 100644 index 000000000000..03aa3cdcebdd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memchr.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_variant_pcs } */ +/* { dg-options "-O1 -march=armv8-a" } */ + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include <arm_sme.h> + +unsigned char basic[] = {'a', 'b', 'c', 'd', 'e', 'f'}; + +int +main (void) +{ + unsigned char buffer[1024]; + memset (buffer, 0xaa, sizeof buffer); + buffer[128] = 'x'; + + assert (__arm_sc_memchr (basic, 'a', sizeof basic) == basic); + assert (__arm_sc_memchr (basic, 'd', sizeof basic) == basic + 3); + assert (__arm_sc_memchr (basic, 'f', sizeof basic) == basic + 5); + assert (__arm_sc_memchr (basic, 'x', sizeof basic) == NULL); + + assert (__arm_sc_memchr (basic, 'd', 3) == NULL); + assert (__arm_sc_memchr (basic, 'd', 4) == basic + 3); + assert (__arm_sc_memchr (basic, 'a', 0) == NULL); + + assert (__arm_sc_memchr (basic, 'a' + 256, sizeof basic) == basic); + + assert (__arm_sc_memchr 
(basic + 1, 'a', sizeof (basic) - 1) == NULL); + + assert (__arm_sc_memchr (buffer, 'x', sizeof buffer) == buffer + 128); + + assert (__arm_sc_memchr (buffer, 'x', SIZE_MAX) == buffer + 128); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c new file mode 100644 index 000000000000..193f2b1af5f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memcpy.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_variant_pcs } */ +/* { dg-options "-O1 -march=armv8-a" } */ + +#include <assert.h> +#include <string.h> + +#include <arm_sme.h> + +int +main (void) +{ + char src[] = "abcdef"; + char dst[8] = { 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x' }; + + assert (__arm_sc_memcpy (dst + 4, src, 0) == dst + 4); + assert (memcmp (dst, "xxxxxxxx", 8) == 0); + + assert (__arm_sc_memcpy (dst + 1, src + 2, 3) == dst + 1); + assert (memcmp (dst, "xcdexxxx", 8) == 0); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c new file mode 100644 index 000000000000..02597555888f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memmove.c @@ -0,0 +1,39 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_variant_pcs } */ +/* { dg-options "-O1 -march=armv8-a" } */ + +#include <assert.h> +#include <string.h> + +#include <arm_sme.h> + +int +main (void) +{ + char src[] = "abcdef"; + char dst[16] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}; + + assert (__arm_sc_memmove (dst + 4, src, 0) == dst + 4); + assert (memcmp (dst, "xxxxxxxx", 8) == 0); + + assert (__arm_sc_memmove (dst + 1, src + 2, 3) == dst + 1); + assert (memcmp (dst, "xcdexxxx", 8) == 0); + + __arm_sc_memcpy (dst, src, sizeof src); + assert (__arm_sc_memmove (dst + 2, dst, 4) == dst + 2); + assert (memcmp (dst, "ababcd", 6) == 0); + + __arm_sc_memcpy (dst, src, sizeof src); + assert 
(__arm_sc_memmove (dst, dst + 2, 4) == dst); + assert (memcmp (dst, "cdefef", 6) == 0); + + __arm_sc_memcpy (dst, src, sizeof src); + assert (__arm_sc_memmove (dst + 1, dst + 1, 3) == dst + 1); + assert (memcmp (dst, "abcdef", 6) == 0); + + __arm_sc_memcpy (dst, src, sizeof src); + assert (__arm_sc_memmove (dst + 3, dst, 0) == dst + 3); + assert (memcmp (dst, "abcdef", 6) == 0); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c new file mode 100644 index 000000000000..1f53f0c05620 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/arm_sc_memset.c @@ -0,0 +1,33 @@ +/* { dg-do run } */ +/* { dg-require-effective-target aarch64_variant_pcs } */ +/* { dg-options "-O1 -march=armv8-a" } */ + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include <arm_sme.h> + +int +main (void) +{ + unsigned char buf[16]; + + memset (buf, 0x11, sizeof buf); + + assert (__arm_sc_memset (buf + 3, 0xaa, 5) == buf + 3); + + assert (buf[2] == 0x11); + assert (buf[3] == 0xaa); + assert (buf[7] == 0xaa); + assert (buf[8] == 0x11); + + assert (__arm_sc_memset (buf + 4, 0x55, 0) == buf + 4); + assert (buf[4] == 0xaa); + + assert (__arm_sc_memset (buf, 0x1234, 2) == buf); + assert (buf[0] == 0x34); + assert (buf[1] == 0x34); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c new file mode 100644 index 000000000000..65bc63de09f8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memchr.c @@ -0,0 +1,27 @@ +/* { dg-do run { target aarch64_sme_hw } } */ +/* { dg-do compile { target { ! 
{ aarch64_sme_hw } } } } */ +/* { dg-options "-O2 -fbuilding-libgcc" } */ + +#pragma GCC target "+sme" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include <arm_sme.h> + +unsigned char basic[] = {'a', 'b', 'c', 'd', 'e', 'f'}; + +void +test (void) __arm_streaming +{ + assert (__arm_sc_memchr (basic, 'x', sizeof basic) == NULL); + assert (__arm_sc_memchr (basic, 'd', 4) == basic + 3); +} + +int +main (void) +{ + test (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c new file mode 100644 index 000000000000..e49e24d47c99 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memcpy.c @@ -0,0 +1,26 @@ +/* { dg-do run { target aarch64_sme_hw } } */ +/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */ +/* { dg-options "-O2 -fbuilding-libgcc" } */ + +#pragma GCC target "+sme" + +#include <assert.h> +#include <string.h> + +#include <arm_sme.h> + +void +test (void) __arm_streaming +{ + char src[] = "abcdef"; + char dst[8] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}; + assert (__arm_sc_memcpy (dst + 1, src + 2, 3) == dst + 1); + assert (memcmp (dst, "xcdexxxx", 8) == 0); +} + +int +main (void) +{ + test (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c new file mode 100644 index 000000000000..fa4207cdc081 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memmove.c @@ -0,0 +1,27 @@ +/* { dg-do run { target aarch64_sme_hw } } */ +/* { dg-do compile { target { ! 
{ aarch64_sme_hw } } } } */ +/* { dg-options "-O2 -fbuilding-libgcc" } */ + +#pragma GCC target "+sme" + +#include <assert.h> +#include <string.h> + +#include <arm_sme.h> + +void +test (void) __arm_streaming +{ + char src[] = "abcdef"; + char dst[16] = {'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x'}; + + assert (__arm_sc_memmove (dst + 1, src + 2, 3) == dst + 1); + assert (memcmp (dst, "xcdexxxx", 8) == 0); +} + +int +main (void) +{ + test (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c new file mode 100644 index 000000000000..0a138c065eee --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sme/arm_sc_memset.c @@ -0,0 +1,33 @@ +/* { dg-do run { target aarch64_sme_hw } } */ +/* { dg-do compile { target { ! { aarch64_sme_hw } } } } */ +/* { dg-options "-fbuilding-libgcc" } */ + +#pragma GCC target "+sme" + +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include <arm_sme.h> + +void +test (void) __arm_streaming +{ + unsigned char buf[16]; + + memset (buf, 0x11, sizeof buf); + + assert (__arm_sc_memset (buf + 3, 0xaa, 5) == buf + 3); + + assert (buf[2] == 0x11); + assert (buf[3] == 0xaa); + assert (buf[7] == 0xaa); + assert (buf[8] == 0x11); +} + +int +main (void) +{ + test (); + return 0; +} diff --git a/libgcc/config/aarch64/__arm_sc_memchr_scalar.S b/libgcc/config/aarch64/__arm_sc_memchr_scalar.S new file mode 100644 index 000000000000..7ef026ff1928 --- /dev/null +++ b/libgcc/config/aarch64/__arm_sc_memchr_scalar.S @@ -0,0 +1,128 @@ +/* Support routine for SME. + Copyright (C) 2026 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. 
+ + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#include "aarch64-asm.h" + +.arch armv8-a + +#define result x0 +#define src_in x0 +#define chr_in w1 +#define chr_repeated x1 +#define count_in x2 + +#define src x3 +#define zeroones x4 +#define data1 x5 +#define tmp1 x6 +#define tmp2 x7 +#define found1 x8 +#define src_end x9 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* This function implements the hacker's delight algorithm for searching + matching values. We find them in words by performing an XOR against the + repeated searched char. Then, x being the xor result, we calculate + (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f). Words with a matching + byte will produce a non-zero result. + + __arm_sc_mem* functions must be able to run in streaming mode and can thus + not make use of most AdvSIMD instructions. */ + +variant_pcs (__arm_sc_memchr) + +ENTRY (__arm_sc_memchr) + cbz count_in, L(no_match) + + /* Replicate the needle byte across the rest of the word. */ + and chr_in, chr_in, 255 + orr chr_in, chr_in, chr_in, lsl 8 + orr chr_in, chr_in, chr_in, lsl 16 + orr chr_repeated, chr_repeated, chr_repeated, lsl 32 + mov zeroones, REP8_01 + + bic src, src_in, 7 + ldr data1, [src], 8 +#ifdef __AARCH64EB__ + rev data1, data1 +#endif + /* Skip unused bytes before src_in. 
*/ + lsl tmp1, src_in, 3 + lsr tmp2, zeroones, tmp1 + lsr data1, data1, tmp1 + + eor data1, data1, chr_repeated + sub tmp1, data1, tmp2 + orr tmp2, data1, REP8_7f + bic found1, tmp1, tmp2 + cbz found1, L(loop_start) + + /* Calculate the offset and return it if offset < count_in. */ + rbit found1, found1 + clz tmp1, found1 + cmp count_in, tmp1, lsr 3 + add result, src_in, tmp1, lsr 3 + csel result, result, xzr, hi + ret + + .p2align 4 + +L(no_match): + mov result, 0 + ret + +L(loop_start): + /* Deal with count_in being so large that src_end comes before src_in due + to wraparound. When this happens, set src_end to SIZE_MAX so that we do + not reject results based on the address. */ + adds src_end, src_in, count_in + csinv src_end, src_end, xzr, cc + + .p2align 4 +L(loop): + cmp src, src_end + b.hs L(no_match) + ldr data1, [src], 8 +#ifdef __AARCH64EB__ + rev data1, data1 +#endif + eor data1, data1, chr_repeated + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + bics xzr, tmp1, tmp2 + + b.eq L(loop) + + /* Found a match - return the address if it is before src_end. */ + bic found1, tmp1, tmp2 + sub src, src, 8 + rbit found1, found1 + clz tmp1, found1 + add result, src, tmp1, lsr 3 + cmp src_end, result + csel result, result, xzr, hi + ret +END (__arm_sc_memchr) diff --git a/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S b/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S new file mode 100644 index 000000000000..1e8dcffc7a7d --- /dev/null +++ b/libgcc/config/aarch64/__arm_sc_memcpy_scalar.S @@ -0,0 +1,258 @@ +/* Support routine for SME. + Copyright (C) 2026 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. 
+ + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#include "aarch64-asm.h" + +.arch armv8-a + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove. + It uses unaligned accesses and branchless sequences to keep the code small, + simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. + + __arm_sc_mem* functions must be able to run in streaming mode and can thus + not make use of most AdvSIMD instructions. 
+*/ + +variant_pcs (__arm_sc_memmove) +ENTRY_ALIAS (__arm_sc_memmove) + +variant_pcs (__arm_sc_memcpy) + +ENTRY (__arm_sc_memcpy) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. 
*/ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! 
+ subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret +END (__arm_sc_memcpy) diff --git a/libgcc/config/aarch64/__arm_sc_memset_scalar.S b/libgcc/config/aarch64/__arm_sc_memset_scalar.S new file mode 100644 index 000000000000..1d3fc1c8c2c0 --- /dev/null +++ b/libgcc/config/aarch64/__arm_sc_memset_scalar.S @@ -0,0 +1,178 @@ +/* Support routine for SME. + Copyright (C) 2026 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "aarch64-asm.h" + +.arch armv8-a + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 +#define off x3 +#define dstend2 x5 + +/* __arm_sc_mem* functions must be able to run in streaming mode and can thus + not make use of most AdvSIMD instructions. */ + +variant_pcs (__arm_sc_memset) +ENTRY (__arm_sc_memset) + /* Replicate the fill byte across the last half of the word. */ + and valw, valw, 255 + orr valw, valw, valw, lsl 8 + orr valw, valw, valw, lsl 16 + + cmp count, 16 + b.lo L(set_lt16) + + /* Replicate the fill byte across the rest of the word. */ + orr val, val, val, lsl 32 + + add dstend, dstin, count + cmp count, 64 + b.hi L(set_gt64) + +L(set_16_64): + /* Calculate an interior store offset based on half the count. This will be + one of 0, 16 or 32 bytes. Then perform four overlapping stores. */ + mov off, 48 + and off, off, count, lsr 1 + sub dstend2, dstend, off + + stp val, val, [dstin] + add off, dstin, off + stp val, val, [off] + stp val, val, [dstend2, -16] + stp val, val, [dstend, -16] + ret + + .p2align 4 +L(set_lt16): + add dstend, dstin, count + cmp count, 4 + b.lo 1f + + /* For sizes 4..15, calculate an interior offset based on the count. Then + perform four overlapping 32-bit stores. */ + lsr off, count, 3 /* off = count >> 3. */ + sub dstend2, dstend, off, lsl 2 + str valw, [dstin] + str valw, [dstin, off, lsl 2] + str valw, [dstend2, -4] + str valw, [dstend, -4] + ret + + /* Potentially overlapping stores for sizes 0..3. */ +1: cbz count, 2f /* Skip stores entirely for a zero-length memset. */ + lsr off, count, 1 + strb valw, [dstin] + strb valw, [dstin, off] + strb valw, [dstend, -1] +2: ret + + .p2align 4 +L(set_gt64): + bic dst, dstin, 15 /* Align dst down to 16 bytes. */ + cmp count, 128 + b.hi L(set_gt128) + + /* For sizes 65..128, perform eight overlapping 16-byte stores. 
*/ + stp val, val, [dstin] + stp val, val, [dstin, 16] + stp val, val, [dstin, 32] + stp val, val, [dstin, 48] + stp val, val, [dstend, -64] + stp val, val, [dstend, -48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret + + .p2align 4 +L(set_gt128): + /* For sizes above 128, perform initial stores before entering loop on + aligned addresses. */ + stp val, val, [dstin] + stp val, val, [dst, 16] + + /* We can use DC ZVA to zero memory when the fill byte is zero and the block + size reported by DCZID_EL0 is 64 bytes. */ + cbnz valw, L(no_zva) +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 + b.ne L(no_zva) +#endif + /* Complete stores up to the first 64-byte aligned block. */ + stp val, val, [dst, 32] + stp val, val, [dst, 48] + + bic dst, dstin, 63 /* Align dst down to 64 bytes. */ + /* Compute count, accounting for the stores above and bias for the loop + exit. */ + sub count, dstend, dst + sub count, count, 64 + 64 + + /* Write last bytes before entering ZVA loop. */ + stp val, val, [dstend, -64] + stp val, val, [dstend, -48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + + .p2align 4 +L(zva64_loop): + add dst, dst, 64 + dc zva, dst /* Zero the 64-byte cache block containing dst. */ + subs count, count, 64 + b.hi L(zva64_loop) + ret + + .p2align 3 +L(no_zva): + /* Compute count, accounting for the two stores in set_gt128 and bias for + the loop exit. */ + sub count, dstend, dst + sub count, count, 64 + 32 + +L(no_zva_loop): + /* Perform four contiguous 16-byte stores per iteration. */ + stp val, val, [dst, 32] + stp val, val, [dst, 48] + stp val, val, [dst, 64] + stp val, val, [dst, 80] + add dst, dst, 64 + subs count, count, 64 + b.hi L(no_zva_loop) + + /* Perform final stores for the tail of the range, potentially overlapping + with previous ones. 
*/ + stp val, val, [dstend, -64] + stp val, val, [dstend, -48] + stp val, val, [dstend, -32] + stp val, val, [dstend, -16] + ret +END (__arm_sc_memset) diff --git a/libgcc/config/aarch64/aarch64-asm.h b/libgcc/config/aarch64/aarch64-asm.h index 8ce7e3101480..5b5feefc342a 100644 --- a/libgcc/config/aarch64/aarch64-asm.h +++ b/libgcc/config/aarch64/aarch64-asm.h @@ -127,6 +127,11 @@ FEATURE_1_AND_MARK (BTI_FLAG|PAC_FLAG|GCS_FLAG) # endif #endif +# define ENTRY_ALIAS(name) \ + .global name; \ + SYMBOL_TYPE(name, %function); \ + name: + #define ENTRY_ALIGN(name, align) \ .global name; \ SYMBOL_TYPE(name, %function); \ diff --git a/libgcc/config/aarch64/libgcc-sme.ver b/libgcc/config/aarch64/libgcc-sme.ver index f8c67905ba71..8d8dc0fd4413 100644 --- a/libgcc/config/aarch64/libgcc-sme.ver +++ b/libgcc/config/aarch64/libgcc-sme.ver @@ -26,3 +26,10 @@ GCC_14.0 { GCC_16.0 { __arm_get_current_vg } + +GCC_17.0 { + __arm_sc_memchr + __arm_sc_memcpy + __arm_sc_memmove + __arm_sc_memset +} diff --git a/libgcc/config/aarch64/t-aarch64 b/libgcc/config/aarch64/t-aarch64 index 7bc51ad65248..e1391651466a 100644 --- a/libgcc/config/aarch64/t-aarch64 +++ b/libgcc/config/aarch64/t-aarch64 @@ -28,7 +28,10 @@ LIB2ADDEH += \ $(srcdir)/config/aarch64/__arm_sme_state.S \ $(srcdir)/config/aarch64/__arm_tpidr2_restore.S \ $(srcdir)/config/aarch64/__arm_tpidr2_save.S \ - $(srcdir)/config/aarch64/__arm_za_disable.S + $(srcdir)/config/aarch64/__arm_za_disable.S \ + $(srcdir)/config/aarch64/__arm_sc_memchr_scalar.S \ + $(srcdir)/config/aarch64/__arm_sc_memcpy_scalar.S \ + $(srcdir)/config/aarch64/__arm_sc_memset_scalar.S SHLIB_MAPFILES += $(srcdir)/config/aarch64/libgcc-sme.ver LIBGCC2_CFLAGS += $(WERROR) -Wno-prio-ctor-dtor
