On 13/11/2023 11:37, Victor Do Nascimento wrote:
The armv9.4-a architectural revision adds three new atomic operations
associated with the LSE128 feature:

   * LDCLRP - Atomic AND NOT (bitclear) of a location with 128-bit
   value held in a pair of registers, with original data loaded into
   the same 2 registers.
   * LDSETP - Atomic OR (bitset) of a location with 128-bit value held
   in a pair of registers, with original data loaded into the same 2
   registers.
   * SWPP - Atomic swap of one 128-bit value with 128-bit value held
   in a pair of registers.

This patch adds the logic required to make use of these when the
architectural feature is present and a suitable assembler available.

In order to do this, the following changes are made:

   1. Add a configure-time check for LSE128 support in the
   assembler.
   2. Edit host-config.h so that when N == 16, nifunc = 2.
   3. Where available due to LSE128, implement the second ifunc, making
   use of the novel instructions.
   4. For atomic functions unable to make use of these new
   instructions, define a new alias which causes the _i1 function
   variant to point ahead to the corresponding _i2 implementation.

libatomic/ChangeLog:

        * Makefile.am (AM_CPPFLAGS): Add conditional setting of
        -DHAVE_FEAT_LSE128.
        * acinclude.m4 (LIBAT_TEST_FEAT_LSE128): New.
        * config/linux/aarch64/atomic_16.S (LSE128): New macro
        definition.
        (libat_exchange_16): New LSE128 variant.
        (libat_fetch_or_16): Likewise.
        (libat_or_fetch_16): Likewise.
        (libat_fetch_and_16): Likewise.
        (libat_and_fetch_16): Likewise.
        * config/linux/aarch64/host-config.h (IFUNC_COND_2): New.
        (IFUNC_NCOND): Add operand size checking.
        (has_lse2): Renamed from `ifunc1`.
        (has_lse128): New.
        (HAS_LSE128): Likewise.
        * configure.ac: Add call to LIBAT_TEST_FEAT_LSE128.
        * configure (ac_subst_vars): Regenerated via autoreconf.
        * Makefile.in: Likewise.
        * auto-config.h.in: Likewise.
---
  libatomic/Makefile.am                        |   3 +
  libatomic/Makefile.in                        |   1 +
  libatomic/acinclude.m4                       |  19 +++
  libatomic/auto-config.h.in                   |   3 +
  libatomic/config/linux/aarch64/atomic_16.S   | 170 ++++++++++++++++++-
  libatomic/config/linux/aarch64/host-config.h |  27 ++-
  libatomic/configure                          |  59 ++++++-
  libatomic/configure.ac                       |   1 +
  8 files changed, 274 insertions(+), 9 deletions(-)

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index c0b8dea5037..24e843db67d 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -130,6 +130,9 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix 
_$(s)_.lo,$(SIZEOBJS)))
  ## On a target-specific basis, include alternates to be selected by IFUNC.
  if HAVE_IFUNC
  if ARCH_AARCH64_LINUX
+if ARCH_AARCH64_HAVE_LSE128
+AM_CPPFLAGS         = -DHAVE_FEAT_LSE128
+endif
  IFUNC_OPTIONS      = -march=armv8-a+lse
  libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix 
_$(s)_1_.lo,$(SIZEOBJS)))
  libatomic_la_SOURCES += atomic_16.S
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index dc2330b91fd..cd48fa21334 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -452,6 +452,7 @@ M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
  libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
        _$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
        $(am__append_4) $(am__append_5)
+@ARCH_AARCH64_HAVE_LSE128_TRUE@@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@AM_CPPFLAGS
 = -DHAVE_FEAT_LSE128
  @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp 
-DHAVE_KERNEL64
  @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
diff --git a/libatomic/acinclude.m4 b/libatomic/acinclude.m4
index f35ab5b60a5..4197db8f404 100644
--- a/libatomic/acinclude.m4
+++ b/libatomic/acinclude.m4
@@ -83,6 +83,25 @@ AC_DEFUN([LIBAT_TEST_ATOMIC_BUILTIN],[
    ])
  ])
+dnl
+dnl Test if the host assembler supports armv9.4-a LSE128 insns.
+dnl
+AC_DEFUN([LIBAT_TEST_FEAT_LSE128],[
+  AC_CACHE_CHECK([for armv9.4-a LSE128 insn support],
+    [libat_cv_have_feat_lse128],[
+    AC_LANG_CONFTEST([AC_LANG_PROGRAM([],[asm(".arch armv9-a+lse128")])])
+    if AC_TRY_EVAL(ac_link); then
+      eval libat_cv_have_feat_lse128=yes
+    else
+      eval libat_cv_have_feat_lse128=no
+    fi
+    rm -f conftest*
+  ])
+  LIBAT_DEFINE_YESNO([HAVE_FEAT_LSE128], [$libat_cv_have_feat_lse128],
+       [Have LSE128 support for 16 byte integers.])
+  AM_CONDITIONAL([ARCH_AARCH64_HAVE_LSE128], [test x$libat_cv_have_feat_lse128 
= xyes])
+])
+
  dnl
  dnl Test if we have __atomic_load and __atomic_store for mode $1, size $2
  dnl
diff --git a/libatomic/auto-config.h.in b/libatomic/auto-config.h.in
index ab3424a759e..7c78933b07d 100644
--- a/libatomic/auto-config.h.in
+++ b/libatomic/auto-config.h.in
@@ -105,6 +105,9 @@
  /* Define to 1 if you have the <dlfcn.h> header file. */
  #undef HAVE_DLFCN_H
+/* Have LSE128 support for 16 byte integers. */
+#undef HAVE_FEAT_LSE128
+
  /* Define to 1 if you have the <fenv.h> header file. */
  #undef HAVE_FENV_H
diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 3f6225830e6..44a773031f8 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -34,10 +34,14 @@
     writes, this will be true when using atomics in actual code.
The libat_<op>_16 entry points are ARMv8.0.
-   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
-
+   The libat_<op>_16_i1 entry points are used when LSE128 is available.
+   The libat_<op>_16_i2 entry points are used when LSE2 is available.  */
+#if HAVE_FEAT_LSE128
+       .arch   armv8-a+lse128

Shouldn't this match the test we run during configure, i.e. armv9-a+lse128? I'm not sure which is preferable, but it seems odd for them not to be the same.

+#else
        .arch   armv8-a+lse
+#endif
#define ENTRY(name, feat) \
        ENTRY1(name, feat)
@@ -66,7 +70,8 @@ name##feat:                           \
        .set alias##from, alias##to;
#define CORE
-#define LSE2   _i1
+#define LSE128 _i1
+#define LSE2   _i2
#define res0 x0
  #define res1 x1
@@ -201,6 +206,31 @@ ENTRY (libat_exchange_16, CORE)
  END (libat_exchange_16, CORE)
+#if HAVE_FEAT_LSE128
+ENTRY (libat_exchange_16, LSE128)
+       mov     tmp0, x0
+       mov     res0, in0
+       mov     res1, in1
+       cbnz    w4, 1f
+
+       /* RELAXED.  */
+       swpp    res0, res1, [tmp0]
+       ret
+1:
+       cmp     w4, ACQUIRE
+       b.hi    2f
+
+       /* ACQUIRE/CONSUME.  */
+       swppa   res0, res1, [tmp0]
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+2:     swppal  res0, res1, [tmp0]
+       ret
+END (libat_exchange_16, LSE128)
+#endif
+
+
  ENTRY (libat_compare_exchange_16, CORE)
        ldp     exp0, exp1, [x1]
        cbz     w4, 3f
@@ -389,6 +419,31 @@ ENTRY (libat_fetch_or_16, CORE)
  END (libat_fetch_or_16, CORE)
+#if HAVE_FEAT_LSE128
+ENTRY (libat_fetch_or_16, LSE128)
+       mov     tmp0, x0
+       mov     res0, in0
+       mov     res1, in1
+       cbnz    w4, 1f
+
+       /* RELAXED.  */
+       ldsetp  res0, res1, [tmp0]
+       ret
+1:
+       cmp     w4, ACQUIRE
+       b.hi    2f
+
+       /* ACQUIRE/CONSUME.  */
+       ldsetpa res0, res1, [tmp0]
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+2:     ldsetpal        res0, res1, [tmp0]
+       ret
+END (libat_fetch_or_16, LSE128)
+#endif
+
+
  ENTRY (libat_or_fetch_16, CORE)
        mov     x5, x0
        cbnz    w4, 2f
@@ -411,6 +466,36 @@ ENTRY (libat_or_fetch_16, CORE)
  END (libat_or_fetch_16, CORE)
+#if HAVE_FEAT_LSE128
+ENTRY (libat_or_fetch_16, LSE128)
+       mov     tmp0, in0       /* Save the operand: ldsetp* overwrites in0/in1 with the old value.  */
+       mov     tmp1, in1       /* Done before the branch so every ordering path sees the copy.  */
+       cbnz    w4, 1f
+
+       /* RELAXED.  */
+       ldsetp  in0, in1, [x0]
+       orr     res0, in0, tmp0 /* Result = old value | operand.  */
+       orr     res1, in1, tmp1
+       ret
+1:
+       cmp     w4, ACQUIRE
+       b.hi    2f
+
+       /* ACQUIRE/CONSUME.  */
+       ldsetpa in0, in1, [x0]
+       orr     res0, in0, tmp0
+       orr     res1, in1, tmp1
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+2:     ldsetpal        in0, in1, [x0]
+       orr     res0, in0, tmp0
+       orr     res1, in1, tmp1
+       ret
+END (libat_or_fetch_16, LSE128)
+#endif
+
+
  ENTRY (libat_fetch_and_16, CORE)
        mov     x5, x0
        cbnz    w4, 2f
@@ -433,6 +518,32 @@ ENTRY (libat_fetch_and_16, CORE)
  END (libat_fetch_and_16, CORE)
+#if HAVE_FEAT_LSE128
+ENTRY (libat_fetch_and_16, LSE128)
+       mov     tmp0, x0
+       mvn     res0, in0
+       mvn     res1, in1
+       cbnz    w4, 1f
+
+       /* RELAXED.  */
+       ldclrp  res0, res1, [tmp0]
+       ret
+
+1:
+       cmp     w4, ACQUIRE
+       b.hi    2f
+
+       /* ACQUIRE/CONSUME.  */
+       ldclrpa res0, res1, [tmp0]
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+2:     ldclrpal        res0, res1, [tmp0]
+       ret
+END (libat_fetch_and_16, LSE128)
+#endif
+
+
  ENTRY (libat_and_fetch_16, CORE)
        mov     x5, x0
        cbnz    w4, 2f
@@ -455,6 +566,37 @@ ENTRY (libat_and_fetch_16, CORE)
  END (libat_and_fetch_16, CORE)
+#if HAVE_FEAT_LSE128
+ENTRY (libat_and_fetch_16, LSE128)
+       mvn     tmp0, in0       /* ldclrp* is AND NOT, so pass ~operand to get an AND.  */
+       mvn     tmp1, in1       /* High half of the mask goes in tmp1, not tmp0.  */
+       cbnz    w4, 1f
+
+       /* RELAXED.  */
+       ldclrp  tmp0, tmp1, [x0]
+       and     res0, tmp0, in0 /* Result = old value & operand.  */
+       and     res1, tmp1, in1
+       ret
+
+1:
+       cmp     w4, ACQUIRE
+       b.hi    2f
+
+       /* ACQUIRE/CONSUME.  */
+       ldclrpa tmp0, tmp1, [x0]
+       and     res0, tmp0, in0
+       and     res1, tmp1, in1
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  The address is still in x0; x5 is never set here.  */
+2:     ldclrpal        tmp0, tmp1, [x0]
+       and     res0, tmp0, in0
+       and     res1, tmp1, in1
+       ret
+END (libat_and_fetch_16, LSE128)
+#endif
+
+
  ENTRY (libat_fetch_xor_16, CORE)
        mov     x5, x0
        cbnz    w4, 2f
@@ -560,6 +702,28 @@ ENTRY (libat_test_and_set_16, CORE)
  END (libat_test_and_set_16, CORE)
+/* Alias entry points which are the same in LSE2 and LSE128. */
+
+#if !HAVE_FEAT_LSE128
+ALIAS (libat_exchange_16, LSE128, LSE2)
+ALIAS (libat_fetch_or_16, LSE128, LSE2)
+ALIAS (libat_fetch_and_16, LSE128, LSE2)
+ALIAS (libat_or_fetch_16, LSE128, LSE2)
+ALIAS (libat_and_fetch_16, LSE128, LSE2)
+#endif
+ALIAS (libat_load_16, LSE128, LSE2)
+ALIAS (libat_store_16, LSE128, LSE2)
+ALIAS (libat_compare_exchange_16, LSE128, LSE2)
+ALIAS (libat_fetch_add_16, LSE128, LSE2)
+ALIAS (libat_add_fetch_16, LSE128, LSE2)
+ALIAS (libat_fetch_sub_16, LSE128, LSE2)
+ALIAS (libat_sub_fetch_16, LSE128, LSE2)
+ALIAS (libat_fetch_xor_16, LSE128, LSE2)
+ALIAS (libat_xor_fetch_16, LSE128, LSE2)
+ALIAS (libat_fetch_nand_16, LSE128, LSE2)
+ALIAS (libat_nand_fetch_16, LSE128, LSE2)
+ALIAS (libat_test_and_set_16, LSE128, LSE2)
+
  /* Alias entry points which are the same in baseline and LSE2.  */
ALIAS (libat_exchange_16, LSE2, CORE)
diff --git a/libatomic/config/linux/aarch64/host-config.h 
b/libatomic/config/linux/aarch64/host-config.h
index 30ef21c7715..d873e91b1c9 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -26,14 +26,17 @@
#ifdef HWCAP_USCAT
  # if N == 16
-#  define IFUNC_COND_1 (ifunc1 (hwcap))
+#  define IFUNC_COND_1         (has_lse128 (hwcap))
+#  define IFUNC_COND_2         (has_lse2 (hwcap))
+#  define IFUNC_NCOND(N)       2
  # else
-#  define IFUNC_COND_1 (hwcap & HWCAP_ATOMICS)
+#  define IFUNC_COND_1         (hwcap & HWCAP_ATOMICS)
+#  define IFUNC_NCOND(N)       1
  # endif
  #else
  #  define IFUNC_COND_1        (false)
+#  define IFUNC_NCOND(N)       1
  #endif
-#define IFUNC_NCOND(N) (1)
#endif /* HAVE_IFUNC */ @@ -56,7 +59,7 @@
  #define MIDR_PARTNUM(midr)    (((midr) >> 4) & 0xfff)
static inline bool
-ifunc1 (unsigned long hwcap)
+has_lse2 (unsigned long hwcap)
  {
    if (hwcap & HWCAP_USCAT)
      return true;
@@ -69,6 +72,22 @@ ifunc1 (unsigned long hwcap)
      return true;
    return false;
  }
+
+/* LSE128 atomic support encoded in ID_AA64ISAR0_EL1.Atomic,
+   bits[23:20].  The expected value is 0b0011.  Check that.  */
+#define HAS_LSE128() ({                                                \
+  unsigned long val;                                           \
+  asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (val));  \
+  (val & 0xf00000) >= 0x300000;                                 \
+    })
+

The pseudo-code for this register reads:

if PSTATE.EL == EL0 then
  if IsFeatureImplemented(FEAT_IDST) then
    if EL2Enabled() && HCR_EL2.TGE == '1' then
      AArch64.SystemAccessTrap(EL2, 0x18);
    else
      AArch64.SystemAccessTrap(EL1, 0x18);
  else
    UNDEFINED;
...

So this instruction may result in SIGILL if run on cores without FEAT_IDST. SystemAccessTrap just punts the problem up to the kernel or hypervisor as well.

I think we need a hwcap bit to work this out, which is the preferred way on Linux anyway. Something like this? :) https://lore.kernel.org/linux-arm-kernel/20231003124544.858804-2-joey.go...@arm.com/T/



+static inline bool
+has_lse128 (unsigned long hwcap)
+{
+  if (has_lse2 (hwcap) && HAS_LSE128 ())

Why does this need to test for LSE2? Surely that's mandatory if LSE128 is implemented.

+    return true;
+  return false;
+}
  #endif
#include_next <host-config.h>
diff --git a/libatomic/configure b/libatomic/configure
index d579bab96f8..ee3bbb97d69 100755
--- a/libatomic/configure
+++ b/libatomic/configure
@@ -657,6 +657,8 @@ LIBAT_BUILD_VERSIONED_SHLIB_TRUE
  OPT_LDFLAGS
  SECTION_LDFLAGS
  SYSROOT_CFLAGS_FOR_TARGET
+ARCH_AARCH64_HAVE_LSE128_FALSE
+ARCH_AARCH64_HAVE_LSE128_TRUE
  enable_aarch64_lse
  libtool_VERSION
  ENABLE_DARWIN_AT_RPATH_FALSE
@@ -11456,7 +11458,7 @@ else
    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
    lt_status=$lt_dlunknown
    cat > conftest.$ac_ext <<_LT_EOF
-#line 11459 "configure"
+#line 11461 "configure"
  #include "confdefs.h"
#if HAVE_DLFCN_H
@@ -11562,7 +11564,7 @@ else
    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
    lt_status=$lt_dlunknown
    cat > conftest.$ac_ext <<_LT_EOF
-#line 11565 "configure"
+#line 11567 "configure"
  #include "confdefs.h"
#if HAVE_DLFCN_H
@@ -11926,6 +11928,55 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS 
$LDFLAGS conftest.$ac_ext $
  ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for armv9.4-a LSE128 insn 
support" >&5
+$as_echo_n "checking for armv9.4-a LSE128 insn support... " >&6; }
+if ${libat_cv_have_feat_lse128+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+asm(".arch armv9-a+lse128")
+  ;
+  return 0;
+}
+_ACEOF
+    if { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_link\""; } >&5
+  (eval $ac_link) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; then
+      eval libat_cv_have_feat_lse128=yes
+    else
+      eval libat_cv_have_feat_lse128=no
+    fi
+    rm -f conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libat_cv_have_feat_lse128" 
>&5
+$as_echo "$libat_cv_have_feat_lse128" >&6; }
+
+  yesno=`echo $libat_cv_have_feat_lse128 | tr 'yesno' '1  0 '`
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_FEAT_LSE128 $yesno
+_ACEOF
+
+
+   if test x$libat_cv_have_feat_lse128 = xyes; then
+  ARCH_AARCH64_HAVE_LSE128_TRUE=
+  ARCH_AARCH64_HAVE_LSE128_FALSE='#'
+else
+  ARCH_AARCH64_HAVE_LSE128_TRUE='#'
+  ARCH_AARCH64_HAVE_LSE128_FALSE=
+fi
+
+
      ;;
  esac
@@ -15989,6 +16040,10 @@ if test -z "${ENABLE_DARWIN_AT_RPATH_TRUE}" && test -z "${ENABLE_DARWIN_AT_RPATH
    as_fn_error $? "conditional \"ENABLE_DARWIN_AT_RPATH\" was never defined.
  Usually this means the macro was only invoked conditionally." "$LINENO" 5
  fi
+if test -z "${ARCH_AARCH64_HAVE_LSE128_TRUE}" && test -z 
"${ARCH_AARCH64_HAVE_LSE128_FALSE}"; then
+  as_fn_error $? "conditional \"ARCH_AARCH64_HAVE_LSE128\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
if test -z "${LIBAT_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBAT_BUILD_VERSIONED_SHLIB_FALSE}"; then
    as_fn_error $? "conditional \"LIBAT_BUILD_VERSIONED_SHLIB\" was never 
defined.
diff --git a/libatomic/configure.ac b/libatomic/configure.ac
index 5f2821ac3f4..b2fe68d7d0f 100644
--- a/libatomic/configure.ac
+++ b/libatomic/configure.ac
@@ -169,6 +169,7 @@ AC_MSG_RESULT([$target_thread_file])
  case "$target" in
   *aarch64*)
      ACX_PROG_CC_WARNING_OPTS([-march=armv8-a+lse],[enable_aarch64_lse])
+    LIBAT_TEST_FEAT_LSE128()
      ;;
  esac

R.

Reply via email to