[PATCH v2] Arm: Fix ldrd offset range [PR115153]

2024-06-11 Thread Wilco Dijkstra
v2: use a new arm_arch_v7ve_neon, fix use of DImode in output_move_neon

The valid offset range of LDRD in arm_legitimate_index_p is increased to
-1024..1020 if NEON is enabled since VALID_NEON_DREG_MODE includes DImode.
Fix this by moving the LDRD check earlier.
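
For reference, a minimal example of the kind of access involved (it mirrors
the f1 test from the v1 patch; the v2 test may differ slightly):

/* With NEON enabled the DImode address below (offset 256) was wrongly
   accepted even though LDRD immediates are limited to -255..255; after the
   fix the offset is added to the base register first, as in the v1 test's
   expected "add r0, r0, #256; ldrd r0, r1, [r0]" sequence.  */
long long
f1 (long long *p)
{
  return __atomic_load_n (p + 32, __ATOMIC_RELAXED);
}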

Passes bootstrap & regress, OK for commit?

gcc:
PR target/115153
* config/arm/arm.cc (arm_legitimate_index_p): Move LDRD case before NEON.
(thumb2_legitimate_index_p): Update comments.
(output_move_neon): Use DFmode for vldr/vstr.

gcc/testsuite:
PR target/115153
* gcc.target/arm/pr115153.c: Add new test.
* lib/target-supports.exp: Add arm_arch_v7ve_neon target support.

---

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
ea0c963a4d67ecd70e1571624e84dfe46d757df9..7dec0254f5a953050c9c52aa297fad7f3dfb6c74
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -8852,6 +8852,28 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
+  if (arm_address_register_rtx_p (index, strict_p)
+  && (GET_MODE_SIZE (mode) <= 4))
+return 1;
+
+  /* This handles DFmode only if !TARGET_HARD_FLOAT.  */
+  if (mode == DImode || mode == DFmode)
+{
+  if (code == CONST_INT)
+   {
+ HOST_WIDE_INT val = INTVAL (index);
+
+ /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+If vldr is selected it uses arm_coproc_mem_operand.  */
+ if (TARGET_LDRD)
+   return val > -256 && val < 256;
+ else
+   return val > -4096 && val < 4092;
+   }
+
+  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
+}
+
   /* For quad modes, we restrict the constant offset to be slightly less
  than what the instruction format permits.  We do this because for
  quad mode moves, we will actually decompose them into two separate
@@ -8864,7 +8886,7 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -8877,27 +8899,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
-  if (arm_address_register_rtx_p (index, strict_p)
-  && (GET_MODE_SIZE (mode) <= 4))
-return 1;
-
-  if (mode == DImode || mode == DFmode)
-{
-  if (code == CONST_INT)
-   {
- HOST_WIDE_INT val = INTVAL (index);
-
- /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
-If vldr is selected it uses arm_coproc_mem_operand.  */
- if (TARGET_LDRD)
-   return val > -256 && val < 256;
- else
-   return val > -4096 && val < 4092;
-   }
-
-  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
-}
-
   if (GET_MODE_SIZE (mode) <= 4
   && ! (arm_arch4
&& (mode == HImode
@@ -9000,7 +9001,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -9011,6 +9012,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
   && (GET_MODE_SIZE (mode) <= 4))
 return 1;
 
+  /* This handles DImode if !TARGET_NEON, and DFmode if !TARGET_VFP_BASE.  */
   if (mode == DImode || mode == DFmode)
 {
   if (code == CONST_INT)
@@ -20854,7 +20856,7 @@ output_move_neon (rtx *operands)
/* We're only using DImode here because it's a convenient
   size.  */
ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
-   ops[1] = adjust_address (mem, DImode, 8 * i);
+   ops[1] = adjust_address (mem, DFmode, 8 * i);
if (reg_overlap_mentioned_p (ops[0], mem))
  {
gcc_assert (overlap == -1);
@@ -20872,7 +20874,7 @@ output_move_neon (rtx *operands)
if (overlap != -1)
  {
ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * overlap);
-   ops[1] = adjust_address (mem, SImode, 8 * overlap);
+   ops[1] = adjust_address (mem, DFmode, 8 * overlap);
if (TARGET_HAVE_MVE && LABEL_REF_P (addr))
  sprintf (buff, "v%sr.32\t%%P0, %%1", load ? "ld" : "st");
else
diff --git a/gcc/testsuite/gcc.target/arm/pr115153.c 

[PATCH v2] Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

2024-06-11 Thread Wilco Dijkstra
Hi Christophe,

>  PR target/115153
I guess this is a typo (should be 115188)?

Correct.

> +/* { dg-options "-O2 -mthumb" } */
> -mthumb is included in arm_arch_v6m, so I think you don't need to add it
> here?

Indeed, it's not strictly necessary. Fixed in v2:

A Thumb-1 memory operand allows single-register LDMIA/STMIA. This doesn't get
printed as LDR/STR with writeback in unified syntax, resulting in strange
assembler errors if writeback is selected.  To work around this, use the 'Uw'
constraint that blocks writeback.
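
A rough sketch of the situation (illustrative only, not the exact RTL or
assembler output from the PR):

@ Thumb-1 can only express a single-register post-increment access via
@ LDMIA/STMIA, e.g.:
	stmia	r3!, {r2}
@ The plain "m" constraint accepts such writeback addresses, but the atomic
@ patterns print a plain ldr/str template, which does not render them as the
@ LDM/STM form in unified syntax, so the assembler rejects the output.
@ The 'Uw' constraint simply excludes writeback addresses for the 'l'
@ (low-register) alternative.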

Passes bootstrap & regress, OK for commit and backport?

gcc:
PR target/115188
* config/arm/sync.md (arm_atomic_load): Use 'Uw' constraint.
(arm_atomic_store): Likewise.

gcc/testsuite:
PR target/115188
* gcc.target/arm/pr115188.c: Add new test.

---

diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index 
df8dbe170cacb6b60d56a6f19aadd5a6c9c51f7a..e856ee51d9ae7b945c4d1e9d1f08afeedc95707a
 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -65,7 +65,7 @@
 (define_insn "arm_atomic_load"
   [(set (match_operand:QHSI 0 "register_operand" "=r,l")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "memory_operand" "m,m")]
+  [(match_operand:QHSI 1 "memory_operand" "m,Uw")]
   VUNSPEC_LDR))]
   ""
   "ldr\t%0, %1"
@@ -81,7 +81,7 @@
 )
 
 (define_insn "arm_atomic_store"
-  [(set (match_operand:QHSI 0 "memory_operand" "=m,m")
+  [(set (match_operand:QHSI 0 "memory_operand" "=m,Uw")
 (unspec_volatile:QHSI
   [(match_operand:QHSI 1 "register_operand" "r,l")]
   VUNSPEC_STR))]
diff --git a/gcc/testsuite/gcc.target/arm/pr115188.c 
b/gcc/testsuite/gcc.target/arm/pr115188.c
new file mode 100644
index 
..9a4022b56796d6962bb3f22e40bac4b81eb78ccf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr115188.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_arch_v6m_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_arch_v6m } */
+
+void init (int *p, int n)
+{
+  for (int i = 0; i < n; i++)
+__atomic_store_4 (p + i, 0, __ATOMIC_RELAXED);
+}



Re: [PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-05 Thread Wilco Dijkstra
Hi Richard,

>> Essentially anything covered by HWCAP doesn't need an explicit check. So I
>> kept the LS64 and PREDRES checks since they don't have a HWCAP allocated
>> (I'm not entirely convinced we need these, let alone having 3 individual
>> bits for LS64, but that's something for the ACLE spec to sort out). The
>> goal here is to fix all obvious bugs so one can use FMV as intended.
>
> Didn't we take the opposite approach for libatomic though?

We started the work before LSE128/RCPC3 HWCAPs were added, so there was no
alternative at the time. Checking both means a higher QoI, but once most distros
use modern kernels, the CPUID checks become unnecessary and will be removed.

> I suppose one difference is that the libatomic code is gating a
> choice between a well-defined, curated set of routines, whereas the
> libgcc code is providing a general user-facing feature.  So maybe
> libgcc should be more conservative for that reason?

Indeed. Using HWCAP means it's trivially correct and working identically
between GCC and LLVM.

I don't rule out adding extra CPUID checks for some features. However unlike
libatomic, the selected features are very user visible, so we would need to
specify for which features this is both useful and correct, and make sure GCC
and LLVM behave in the same way.

Cheers,
Wilco


Re: [PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Wilco Dijkstra
Hi Richard,

I've reworded the commit message a bit:

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.

> What criteria did you use for choosing whether to keep or remove
> the system register checks?

Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
entirely convinced we need these, let alone having 3 individual bits for LS64,
but that's something for the ACLE spec to sort out). The goal here is to fix
all obvious bugs so one can use FMV as intended.

> Passes regress, OK for commit and backport?
>
> libgcc:
> PR target/115342
> * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
> Use HWCAP where possible.  Use atomic write for initialization.

> It'd be good to mention the fix for the FEAT_PREDRES system register check
> as well.

Done, see below.

Cheers,
Wilco


v2: Update commit message and mention PREDRES.

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.
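
A hedged sketch of the intended shape of the fix (identifiers follow the diff
below; this is not the complete libgcc code):

static void
__init_cpu_features_constructor (unsigned long hwcap, const __ifunc_arg_t *arg)
{
  /* Collect feature bits in a local variable first...  */
  unsigned long feat = 0;
  unsigned long hwcap2 = 0;
  if (hwcap & _IFUNC_ARG_HWCAP)
    hwcap2 = arg->_hwcap2;
  if (hwcap & HWCAP_AES)
    feat |= 1UL << FEAT_AES;
  /* ...further HWCAP/HWCAP2 checks...  */

  /* ...then publish the mask with a single atomic store, so a concurrent
     resolver never observes a partially updated value (the old code did
     non-atomic read-modify-write updates of the global for every bit).  */
  __atomic_store_n (&__aarch64_cpu_features.features, feat, __ATOMIC_RELAXED);
}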

Passes regress, OK for commit and backport?

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
Fix FEAT_PREDRES comparison.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 
4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9
 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 

[PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Wilco Dijkstra

Fix CPU features initialization.  Use HWCAP rather than explicit accesses
to CPUID registers.  Perform the initialization atomically to avoid multi-
threading issues.

Passes regress, OK for commit and backport?

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 
4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9
 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 setCPUFeature(FEAT_SVE_PMULL128);
-  }
   if (hwcap2 & HWCAP2_SVEBITPERM)
 setCPUFeature(FEAT_SVE_BITPERM);
   if (hwcap2 & HWCAP2_SVESHA3)
@@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_WFXT);
   if (hwcap2 & HWCAP2_SME)
 setCPUFeature(FEAT_SME);
+  if (hwcap2 & HWCAP2_SME2)
+setCPUFeature(FEAT_SME2);
   if (hwcap2 & HWCAP2_SME_I16I64)
 setCPUFeature(FEAT_SME_I64);
   if (hwcap2 & HWCAP2_SME_F64F64)
 setCPUFeature(FEAT_SME_F64);
-  if (hwcap & HWCAP_CPUID) {
-unsigned long ftr;
-getCPUFeature(ID_AA64PFR1_EL1, ftr);
-/* ID_AA64PFR1_EL1.MTE >= 0b0001  */
-if (extractBits(ftr, 8, 4) >= 0x1)
-  setCPUFeature(FEAT_MEMTAG);
-/* ID_AA64PFR1_EL1.SSBS == 0b0001  */
-if (extractBits(ftr, 4, 4) == 0x1)
-  setCPUFeature(FEAT_SSBS);
-/* ID_AA64PFR1_EL1.SME == 0b0010  */
-if (extractBits(ftr, 24, 4) == 0x2)
-  setCPUFeature(FEAT_SME2);
-getCPUFeature(ID_AA64PFR0_EL1, ftr);
-/* ID_AA64PFR0_EL1.FP != 0b  */
-if (extractBits(ftr, 16, 4) != 0xF) {
-  setCPUFeature(FEAT_FP);
-  /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP  */
-  setCPUFeature(FEAT_SIMD);
-}
-/* ID_AA64PFR0_EL1.SVE != 0b  */
-if (extractBits(ftr, 32, 4) != 0x0) {
-  /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only  */
-  getCPUFeature(S3_0_C0_C4_4, ftr);
-  /* ID_AA64ZFR0_EL1.SVEver == 0b  */
-  if (extractBits(ftr, 0, 4) == 0x0)
-   setCPUFeature(FEAT_SVE);
- 

[PATCH] Arm: Fix disassembly error in Thumb-1 relaxed load/store [PR115188]

2024-06-03 Thread Wilco Dijkstra
A Thumb-1 memory operand allows single-register LDMIA/STMIA. This doesn't get
printed as LDR/STR with writeback in unified syntax, resulting in strange
assembler errors if writeback is selected.  To work around this, use the 'Uw'
constraint that blocks writeback.

Passes bootstrap & regress, OK for commit?

gcc:
PR target/115153
* config/arm/sync.md (arm_atomic_load): Use 'Uw' constraint.
(arm_atomic_store): Likewise.

gcc/testsuite:
PR target/115188
* gcc.target/arm/pr115188.c: Add new test.

---

diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index 
df8dbe170cacb6b60d56a6f19aadd5a6c9c51f7a..e856ee51d9ae7b945c4d1e9d1f08afeedc95707a
 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -65,7 +65,7 @@
 (define_insn "arm_atomic_load"
   [(set (match_operand:QHSI 0 "register_operand" "=r,l")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "memory_operand" "m,m")]
+  [(match_operand:QHSI 1 "memory_operand" "m,Uw")]
   VUNSPEC_LDR))]
   ""
   "ldr\t%0, %1"
@@ -81,7 +81,7 @@
 )
 
 (define_insn "arm_atomic_store"
-  [(set (match_operand:QHSI 0 "memory_operand" "=m,m")
+  [(set (match_operand:QHSI 0 "memory_operand" "=m,Uw")
 (unspec_volatile:QHSI
   [(match_operand:QHSI 1 "register_operand" "r,l")]
   VUNSPEC_STR))]
diff --git a/gcc/testsuite/gcc.target/arm/pr115188.c 
b/gcc/testsuite/gcc.target/arm/pr115188.c
new file mode 100644
index 
..ef40d7732b77936c845707989465a01ecca5adb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr115188.c
@@ -0,0 +1,10 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_arch_v6m_ok } */
+/* { dg-options "-O2 -mthumb" } */
+/* { dg-add-options arm_arch_v6m } */
+
+void init (int *p, int n)
+{
+  for (int i = 0; i < n; i++)
+__atomic_store_4 (p + i, 0, __ATOMIC_RELAXED);
+}



[PATCH] Arm: Fix ldrd offset range [PR115153]

2024-06-03 Thread Wilco Dijkstra

The valid offset range of LDRD in arm_legitimate_index_p is increased to
-1024..1020 if NEON is enabled since VALID_NEON_DREG_MODE includes DImode.
Fix this by moving the LDRD check earlier.

Passes bootstrap & regress, OK for commit?

gcc:
PR target/115153
* config/arm/arm.cc (arm_legitimate_index_p): Move LDRD case before 
NEON.
(thumb2_legitimate_index_p): Update comments.

gcc/testsuite:
PR target/115153
* gcc.target/arm/pr115153.c: Add new test.

---

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
ea0c963a4d67ecd70e1571624e84dfe46d757df9..d260ebe0734d424942a773386986a02fe6d1803c
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -8852,6 +8852,28 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
+  if (arm_address_register_rtx_p (index, strict_p)
+  && (GET_MODE_SIZE (mode) <= 4))
+return 1;
+
+  /* This handles DFmode only if !TARGET_HARD_FLOAT.  */
+  if (mode == DImode || mode == DFmode)
+{
+  if (code == CONST_INT)
+   {
+ HOST_WIDE_INT val = INTVAL (index);
+
+ /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+If vldr is selected it uses arm_coproc_mem_operand.  */
+ if (TARGET_LDRD)
+   return val > -256 && val < 256;
+ else
+   return val > -4096 && val < 4092;
+   }
+
+  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
+}
+
   /* For quad modes, we restrict the constant offset to be slightly less
  than what the instruction format permits.  We do this because for
  quad mode moves, we will actually decompose them into two separate
@@ -8864,7 +8886,7 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -8877,27 +8899,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
-  if (arm_address_register_rtx_p (index, strict_p)
-  && (GET_MODE_SIZE (mode) <= 4))
-return 1;
-
-  if (mode == DImode || mode == DFmode)
-{
-  if (code == CONST_INT)
-   {
- HOST_WIDE_INT val = INTVAL (index);
-
- /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
-If vldr is selected it uses arm_coproc_mem_operand.  */
- if (TARGET_LDRD)
-   return val > -256 && val < 256;
- else
-   return val > -4096 && val < 4092;
-   }
-
-  return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p);
-}
-
   if (GET_MODE_SIZE (mode) <= 4
   && ! (arm_arch4
&& (mode == HImode
@@ -9000,7 +9001,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
&& (INTVAL (index) & 3) == 0);
 
   /* We have no such constraint on double mode offsets, so we permit the
- full range of the instruction format.  */
+ full range of the instruction format.  Note DImode is included here.  */
   if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
 return (code == CONST_INT
&& INTVAL (index) < 1024
@@ -9011,6 +9012,7 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, 
int strict_p)
   && (GET_MODE_SIZE (mode) <= 4))
 return 1;
 
+  /* This handles DImode if !TARGET_NEON, and DFmode if !TARGET_VFP_BASE.  */
   if (mode == DImode || mode == DFmode)
 {
   if (code == CONST_INT)
diff --git a/gcc/testsuite/gcc.target/arm/pr115153.c 
b/gcc/testsuite/gcc.target/arm/pr115153.c
new file mode 100644
index 
..db1cd93b3d31b33a9800dac0d8dbe73e058e4073
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr115153.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -marm" } */
+/* { dg-require-effective-target arm_v8_neon_ok } */
+/* { dg-add-options arm_v8_neon } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+** add r0, r0, #256
** ldrd r0, r1, \[r0\]
+** bx  lr
+*/
+long long f1 (long long *p)
+{
+  return __atomic_load_n (p + 32, __ATOMIC_RELAXED);
+}



Re: [PATCH] AArch64: Add ACLE MOPS support

2024-05-31 Thread Wilco Dijkstra
Hi Richard,

> I think this should be in a push_options/pop_options block, as for other
> intrinsics that require certain features.

But then the intrinsic would always be defined, which is contrary to what the
ACLE spec demands - it would not give a compilation error at the callsite
but give assembler errors (potentially in different functions after inlining).

> What was the reason for using an inline asm rather than a builtin?
> Feels a bit old school. :)  Using a builtin should mean that the
> RTL optimisers see the extent of the write.

Given this intrinsic will be used very rarely, if ever, it does not make sense
to provide anything more than the basic functionality.

Cheers,
Wilco

[PATCH] AArch64: Add ACLE MOPS support

2024-05-31 Thread Wilco Dijkstra

Add __ARM_FEATURE_MOPS predefine.  Add support for ACLE __arm_mops_memset_tag.

Passes regress, OK for commit?

gcc:
* config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins):
Add __ARM_FEATURE_MOPS predefine.
* config/aarch64/arm_acle.h: Add __arm_mops_memset_tag().

gcc/testsuite:
* gcc.target/aarch64/acle/memtag_5.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index 
fe1a20e4e546a68e5f7eddff3bbb0d3e831fbd9b..884a7ba5d10b58fbe182a765041cf80bdaec9615
 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -260,6 +260,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_SME_I16I64, "__ARM_FEATURE_SME_I16I64", pfile);
   aarch64_def_or_undef (TARGET_SME_F64F64, "__ARM_FEATURE_SME_F64F64", pfile);
   aarch64_def_or_undef (TARGET_SME2, "__ARM_FEATURE_SME2", pfile);
+  aarch64_def_or_undef (TARGET_MOPS, "__ARM_FEATURE_MOPS", pfile);
 
   /* Not for ACLE, but required to keep "float.h" correct if we switch
  target between implementations that do or do not support ARMv8.2-A
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 
2aa681090fa205449cf1ac63151565f960716189..22ee4b211a55ca6537a1d9e3bf4dad09585071fb
 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -344,6 +344,21 @@ __rndrrs (uint64_t *__res)
 
 #pragma GCC pop_options
 
+#if defined (__ARM_FEATURE_MOPS) && defined (__ARM_FEATURE_MEMORY_TAGGING)
+__extension__ extern __inline void *
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_mops_memset_tag (void *__ptr, int __val, size_t __size)
+{
+  void *__ptr2 = __ptr;
+  __asm volatile ("setgp\t[%0]!, %1!, %x2\n\t"
+ "setgm\t[%0]!, %1!, %x2\n\t"
+ "setge\t[%0]!, %1!, %x2"
+ : "+r" (__ptr2), "+r" (__size)
+ : "rZ" (__val) : "cc", "memory");
+  return __ptr;
+}
+#endif
+
 #define __arm_rsr(__regname) \
   __builtin_aarch64_rsr (__regname)
 
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/memtag_5.c 
b/gcc/testsuite/gcc.target/aarch64/acle/memtag_5.c
new file mode 100644
index 
..79ba1eb39d7c6d577fbe98a3285f8cc618428823
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/acle/memtag_5.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.8-a+memtag -O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include "arm_acle.h"
+
+#ifndef __ARM_FEATURE_MOPS
+# error __ARM_FEATURE_MOPS not defined!
+#endif
+
+/*
+** set_tag:
+** mov (x[0-9]+), x0
+** setgp   \[\1\]\!, x1\!, xzr
+** setgm   \[\1\]\!, x1\!, xzr
+** setge   \[\1\]\!, x1\!, xzr
+** ret
+*/
+void *set_tag (void *p, size_t size)
+{
+  return __arm_mops_memset_tag (p, 0, size);
+}




[PATCH] testsuite: Improve check-function-bodies

2024-05-31 Thread Wilco Dijkstra
Improve check-function-bodies by allowing single-character function names.
Also skip '#' comments which may be emitted from inline assembler.
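
For illustration, the sort of test the old regexps could not match - a
single-character function name, since "([a-zA-Z_]\S+)" requires at least two
characters before the colon:

/* { dg-final { check-function-bodies "**" "" "" } } */
/*
** f:
**	ret
*/
void f (void) { }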

Passes regress, OK for commit?

gcc/testsuite:
* lib/scanasm.exp (configure_check-function-bodies): Allow single-char
function names.  Skip '#' comments.

---

diff --git a/gcc/testsuite/lib/scanasm.exp b/gcc/testsuite/lib/scanasm.exp
index 
6cf9997240deec274a191103d21690d80e34ba95..0e461ef260b7a6fee5a9c60d0571e46468f752c0
 100644
--- a/gcc/testsuite/lib/scanasm.exp
+++ b/gcc/testsuite/lib/scanasm.exp
@@ -869,15 +869,15 @@ proc configure_check-function-bodies { config } {
 # Regexp for the start of a function definition (name in \1).
 if { [istarget nvptx*-*-*] } {
set up_config(start) {
-   {^// BEGIN(?: GLOBAL|) FUNCTION DEF: ([a-zA-Z_]\S+)$}
+   {^// BEGIN(?: GLOBAL|) FUNCTION DEF: ([a-zA-Z_]\S*)$}
}
 } elseif { [istarget *-*-darwin*] } {
set up_config(start) {
-   {^_([a-zA-Z_]\S+):$}
+   {^_([a-zA-Z_]\S*):$}
{^LFB[0-9]+:}
}
 } else {
-   set up_config(start) {{^([a-zA-Z_]\S+):$}}
+   set up_config(start) {{^([a-zA-Z_]\S*):$}}
 }
 
 # Regexp for the end of a function definition.
@@ -899,9 +899,9 @@ proc configure_check-function-bodies { config } {
 } else {
# Skip lines beginning with labels ('.L[...]:') or other directives
# ('.align', '.cfi_startproc', '.quad [...]', '.text', etc.), '//' or
-   # '@' comments ('-fverbose-asm' or ARM-style, for example), or empty
-   # lines.
-   set up_config(fluff) {^\s*(?:\.|//|@|$)}
+   # '@' or '#' comments ('-fverbose-asm' or ARM-style, for example), or
+   # empty lines.
+   set up_config(fluff) {^\s*(?:\.|//|@|#|$)}
 }
 
 # Regexp for expected output lines prefix.



[PATCH v3] aarch64: Fix normal returns inside functions which use eh_returns [PR114843]

2024-05-20 Thread Wilco Dijkstra
Hi Andrew,

A few comments on the implementation, I think it can be simplified a lot:

> +++ b/gcc/config/aarch64/aarch64.h
> @@ -700,8 +700,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = 
> AARCH64_FL_SM_OFF;
> #define DWARF2_UNWIND_INFO 1
>  
>  /* Use R0 through R3 to pass exception handling information.  */
> +#define EH_RETURN_DATA_REGISTERS_N 4
>  #define EH_RETURN_DATA_REGNO(N) \
> -  ((N) < 4 ? ((unsigned int) R0_REGNUM + (N)) : INVALID_REGNUM)
> +  ((N) < EH_RETURN_DATA_REGISTERS_N ? ((unsigned int) R0_REGNUM + (N)) : 
> INVALID_REGNUM)
 
It would be useful to add a macro IS_EH_RETURN_REGNUM(regnum) that just checks
the range R0_REGNUM to R0_REGNUM + EH_RETURN_DATA_REGISTERS_N.
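
Something like this minimal sketch (assuming the EH_RETURN_DATA_REGISTERS_N
definition from the patch; the exact name and placement are of course up to
you):

/* True if REGNO is one of the EH return data registers X0...X3.  */
#define IS_EH_RETURN_REGNUM(REGNO) \
  ((unsigned) ((REGNO) - R0_REGNUM) < (unsigned) EH_RETURN_DATA_REGISTERS_N)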

> @@ -929,6 +928,7 @@ struct GTY (()) aarch64_frame
>  outgoing arguments) of each register save slot, or -2 if no save is
>  needed.  */
>   poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
> +  bool eh_return_allocated[EH_RETURN_DATA_REGISTERS_N];

This doesn't make much sense - besides X0-X3, we also need X5 and X6 for
eh_return.  If these or any of the other temporaries used by the epilogue are
callee-saved somehow, things are going horribly wrong already... So what do we
gain by doing this?


> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -7792,6 +7792,7 @@ aarch64_layout_frame (void)
> 
>  #define SLOT_NOT_REQUIRED (-2)
>  #define SLOT_REQUIRED (-1)
> +#define SLOT_EH_RETURN_REQUIRED (-3)
 
I don't see a need for this.


> @@ -7949,6 +7950,18 @@ aarch64_layout_frame (void)
> stopping it from being individually shrink-wrapped.  */
>  allocate_gpr_slot (R30_REGNUM);
>  
> +  /* Allocate the eh_return first. */
> +  if (crtl->calls_eh_return)
> +for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
> +  {
> + int realregno = EH_RETURN_DATA_REGNO (regno);
> + if (known_eq (frame.reg_offset[realregno], SLOT_EH_RETURN_REQUIRED))
> +   {
> + frame.eh_return_allocated[regno] = true;
> + allocate_gpr_slot (realregno);
> +   }
> +  }

This change is unnecessary if we just mark the slots with SLOT_REQUIRED.


> @@ -8035,6 +8048,23 @@ aarch64_layout_frame (void)
>   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
>   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
>  
> +  /* EH data registers are not pop canidates. */
> +  if (crtl->calls_eh_return)
> +for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; 
> regno++)> 
> +  {
> + if (frame.eh_return_allocated[regno]
> + && frame.wb_pop_candidate1 == EH_RETURN_DATA_REGNO (regno))
> + {
> +   frame.wb_pop_candidate1 = frame.wb_pop_candidate2;
> +   frame.wb_pop_candidate2 = INVALID_REGNUM;
> + }
> + if (frame.eh_return_allocated[regno]
> + && frame.wb_pop_candidate2 == EH_RETURN_DATA_REGNO (regno))
> + {
> +   frame.wb_pop_candidate2 = INVALID_REGNUM;
> + }
> +  }

This is unnecessary since we can just avoid making them push candidates
if there is no frame chain, eg:

if ((!crtl->calls_eh_return || frame.emit_frame_chain) && !push_regs.empty ()
  && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))


@@ -8681,6 +8712,20 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
   if (frame.is_scs_enabled && regno == LR_REGNUM)
return true;
 
+  /* Skip the eh return data registers if we are
+returning normally rather than via eh_return. */
+  if (!was_eh_return && crtl->calls_eh_return)
+   {
+ for (unsigned ehregno = 0;
+  EH_RETURN_DATA_REGNO (ehregno) != INVALID_REGNUM;
+  ehregno++)
+   {
+ if (EH_RETURN_DATA_REGNO (ehregno) == regno
+ && frame.eh_return_allocated[ehregno])
+   return true;
+   }
+   }
+

So this could be something like:

  if (!was_eh_return && crtl->calls_eh_return && IS_EH_RETURN_REGNUM (regno))
    return true;
 
Cheers,
Wilco

Re: [PATCH] AArch64: Improve costing of ctz

2024-05-15 Thread Wilco Dijkstra
Hi Andrew,

> I should note popcount has a similar issue which I hope to fix next week.
> Popcount cost is used during expand so it is very useful to be slightly more 
> correct.

It's useful to set the cost so that all of the special cases still apply - even
if popcount is relatively fast, it's still better to use ALU ops with higher
throughput whenever possible.

Cheers,
Wilco

[PATCH] AArch64: Improve costing of ctz

2024-05-15 Thread Wilco Dijkstra
Improve costing of ctz - both TARGET_CSSC and vector cases were not handled yet.

Passes regress & bootstrap - OK for commit?

gcc:
* config/aarch64/aarch64.cc (aarch64_rtx_costs): Improve CTZ costing.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
fe13c9a0d4863041eb9101882ea57c2094240d16..2a6f76f4008839bf0aa158504430af9b971c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14309,10 +14309,24 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int 
outer ATTRIBUTE_UNUSED,
   return false;
 
 case CTZ:
-  *cost = COSTS_N_INSNS (2);
-
-  if (speed)
-   *cost += extra_cost->alu.clz + extra_cost->alu.rev;
+  if (VECTOR_MODE_P (mode))
+   {
+ *cost = COSTS_N_INSNS (3);
+ if (speed)
+   *cost += extra_cost->vect.alu * 3;
+   }
+  else if (TARGET_CSSC)
+   {
+ *cost = COSTS_N_INSNS (1);
+ if (speed)
+   *cost += extra_cost->alu.clz;
+   }
+  else
+   {
+ *cost = COSTS_N_INSNS (2);
+ if (speed)
+   *cost += extra_cost->alu.clz + extra_cost->alu.rev;
+   }
   return false;
 
 case COMPARE:



[PATCH] AArch64: Fix printing of 2-instruction alternatives

2024-05-15 Thread Wilco Dijkstra
Add missing '\' in 2-instruction movsi/di alternatives so that they are
printed on separate lines.

Passes bootstrap and regress, OK for commit once stage 1 reopens?

gcc:
* config/aarch64/aarch64.md (movsi_aarch64): Use '\;' to force
newline in 2-instruction pattern.
(movdi_aarch64): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
1a2e01284249223565cd12cf1bfd5db5475e56fb..5416c2e3b2002d0e53baf23e7c0048ddf683
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1447,7 +1447,7 @@ (define_insn_and_split "*movsi_aarch64"
  [w  , m  ; load_4   , fp  , 4] ldr\t%s0, %1
  [m  , r Z; store_4  , *   , 4] str\t%w1, %0
  [m  , w  ; store_4  , fp  , 4] str\t%s1, %0
- [r  , Usw; load_4   , *   , 8] adrp\t%x0, %A1;ldr\t%w0, [%x0, %L1]
+ [r  , Usw; load_4   , *   , 8] adrp\t%x0, %A1\;ldr\t%w0, [%x0, %L1]
  [r  , Usa; adr  , *   , 4] adr\t%x0, %c1
  [r  , Ush; adr  , *   , 4] adrp\t%x0, %A1
  [w  , r Z; f_mcr, fp  , 4] fmov\t%s0, %w1
@@ -1484,7 +1484,7 @@ (define_insn_and_split "*movdi_aarch64"
  [w, m  ; load_8   , fp  , 4] ldr\t%d0, %1
  [m, r Z; store_8  , *   , 4] str\t%x1, %0
  [m, w  ; store_8  , fp  , 4] str\t%d1, %0
- [r, Usw; load_8   , *   , 8] << TARGET_ILP32 ? "adrp\t%0, %A1;ldr\t%w0, 
[%0, %L1]" : "adrp\t%0, %A1;ldr\t%0, [%0, %L1]";
+ [r, Usw; load_8   , *   , 8] << TARGET_ILP32 ? "adrp\t%0, %A1\;ldr\t%w0, 
[%0, %L1]" : "adrp\t%0, %A1\;ldr\t%0, [%0, %L1]";
  [r, Usa; adr  , *   , 4] adr\t%x0, %c1
  [r, Ush; adr  , *   , 4] adrp\t%x0, %A1
  [w, r Z; f_mcr, fp  , 4] fmov\t%d0, %x1




[PATCH] AArch64: Use LDP/STP for large struct types

2024-05-15 Thread Wilco Dijkstra
Use LDP/STP for large struct types as they have useful immediate offsets and
are typically faster.  This removes differences between little and big endian
and allows use of LDP/STP without UNSPEC.
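
Illustrative codegen difference for a 2 x 128-bit struct copy (register
numbers invented; the simd-abi-8.c update below checks the new form):

	// before: multi-register ld1/st1, no immediate offset forms
	ld1	{v0.16b - v1.16b}, [x0]
	st1	{v0.16b - v1.16b}, [x1]
	// after: LDP/STP, which also allow offsets such as [x0, #32]
	ldp	q0, q1, [x0]
	stp	q0, q1, [x1]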

Passes regress and bootstrap, OK for commit?

gcc:
* config/aarch64/aarch64.cc (aarch64_classify_address): Treat SIMD 
structs identically
in little and bigendian.
* config/aarch64/aarch64.md (aarch64_mov): Remove VSTRUCT 
instructions.
(aarch64_be_mov): Allow little-endian, rename to 
aarch64_mov.
(aarch64_be_movoi): Allow little-endian, rename to aarch64_movoi.
(aarch64_be_movci): Allow little-endian, rename to aarch64_movci.
(aarch64_be_movxi): Allow little-endian, rename to aarch64_movxi.
Remove big-endian special case in define_split variants.

gcc/testsuite:
* gcc.target/aarch64/torture/simd-abi-8.c: Update to check for LDP/STP.

---

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
16b7445d9f72f77a98ab262e21fd24e6cc97eba0..bb8b6963fd5117be82afe6ccd7154ae5302c3691
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7917,32 +7917,6 @@
   [(set_attr "type" "neon_store1_4reg")]
 )
 
-(define_insn "*aarch64_mov"
-  [(set (match_operand:VSTRUCT_QD 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VSTRUCT_QD 1 "aarch64_simd_general_operand"))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN
-   && (register_operand (operands[0], mode)
-   || register_operand (operands[1], mode))"
-  {@ [ cons: =0 , 1   ; attrs: type, length]
- [ w, w   ; multiple   ,   ] #
- [ Utv  , w   ; neon_store_reg_q , 4 ] 
st1\t{%S1. - %1.}, %0
- [ w, Utv ; neon_load_reg_q  , 4 ] 
ld1\t{%S0. - %0.}, %1
-  }
-)
-
-(define_insn "*aarch64_mov"
-  [(set (match_operand:VSTRUCT 0 "aarch64_simd_nonimmediate_operand")
-   (match_operand:VSTRUCT 1 "aarch64_simd_general_operand"))]
-  "TARGET_SIMD && !BYTES_BIG_ENDIAN
-   && (register_operand (operands[0], mode)
-   || register_operand (operands[1], mode))"
-  {@ [ cons: =0 , 1   ; attrs: type, length]
- [ w, w   ; multiple   ,   ] #
- [ Utv  , w   ; neon_store_reg_q , 4 ] 
st1\t{%S1.16b - %1.16b}, %0
- [ w, Utv ; neon_load_reg_q  , 4 ] 
ld1\t{%S0.16b - %0.16b}, %1
-  }
-)
-
 (define_insn "*aarch64_movv8di"
   [(set (match_operand:V8DI 0 "nonimmediate_operand" "=r,m,r")
(match_operand:V8DI 1 "general_operand" " r,r,m"))]
@@ -7972,11 +7946,10 @@
   [(set_attr "type" "neon_store1_1reg")]
 )
 
-(define_insn "*aarch64_be_mov"
+(define_insn "*aarch64_mov"
   [(set (match_operand:VSTRUCT_2D 0 "nonimmediate_operand")
(match_operand:VSTRUCT_2D 1 "general_operand"))]
   "TARGET_FLOAT
-   && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
&& (register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   {@ [ cons: =0 , 1 ; attrs: type , length ]
@@ -7986,11 +7959,10 @@
   }
 )
 
-(define_insn "*aarch64_be_mov"
+(define_insn "*aarch64_mov"
   [(set (match_operand:VSTRUCT_2Q 0 "nonimmediate_operand")
(match_operand:VSTRUCT_2Q 1 "general_operand"))]
   "TARGET_FLOAT
-   && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
&& (register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   {@ [ cons: =0 , 1 ; attrs: type , arch , length ]
@@ -8000,11 +7972,10 @@
   }
 )
 
-(define_insn "*aarch64_be_movoi"
+(define_insn "*aarch64_movoi"
   [(set (match_operand:OI 0 "nonimmediate_operand")
(match_operand:OI 1 "general_operand"))]
   "TARGET_FLOAT
-   && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
&& (register_operand (operands[0], OImode)
|| register_operand (operands[1], OImode))"
   {@ [ cons: =0 , 1 ; attrs: type , arch , length ]
@@ -8014,11 +7985,10 @@
   }
 )
 
-(define_insn "*aarch64_be_mov"
+(define_insn "*aarch64_mov"
   [(set (match_operand:VSTRUCT_3QD 0 "nonimmediate_operand" "=w,o,w")
(match_operand:VSTRUCT_3QD 1 "general_operand"  " w,w,o"))]
   "TARGET_FLOAT
-   && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
&& (register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
   "#"
@@ -8027,11 +7997,10 @@
(set_attr "length" "12,8,8")]
 )
 
-(define_insn "*aarch64_be_movci"
+(define_insn "*aarch64_movci"
   [(set (match_operand:CI 0 "nonimmediate_operand" "=w,o,w")
(match_operand:CI 1 "general_operand"  " w,w,o"))]
   "TARGET_FLOAT
-   && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
&& (register_operand (operands[0], CImode)
|| register_operand (operands[1], CImode))"
   "#"
@@ -8040,11 +8009,10 @@
(set_attr "length" "12,8,8")]
 )
 
-(define_insn "*aarch64_be_mov"
+(define_insn "*aarch64_mov"
   [(set (match_operand:VSTRUCT_4QD 0 "nonimmediate_operand" "=w,o,w")
(match_operand:VSTRUCT_4QD 1 


[PATCH] AArch64: Use UZP1 instead of INS

2024-05-15 Thread Wilco Dijkstra
Use UZP1 instead of INS when combining low and high halves of vectors.
UZP1 has 3 operands which improves register allocation, and is faster on
some microarchitectures.
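
For example (in the spirit of the pr109072_1.c update below), combining two
scalars into a 2-lane vector:

	// before: INS ties the destination to the first source register
	ins	v0.s[1], v1.s[0]
	// after: UZP1 is a full three-operand form, so the register allocator
	// is free to pick a different destination
	uzp1	v0.2s, v0.2s, v1.2s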

Passes regress & bootstrap, OK for commit?

gcc:
* config/aarch64/aarch64-simd.md (aarch64_combine_internal):
Use UZP1 instead of INS.
(aarch64_combine_internal_be): Likewise.

gcc/testsuite:  
* gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1.
* gcc.target/aarch64/pr109072_1.c: Likewise.
* gcc.target/aarch64/vec-init-14.c: Likewise.
* gcc.target/aarch64/vec-init-9.c: Likewise.

---

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
f8bb973a278c7964f3e3a4f7154a0ab62214b7cf..16b7445d9f72f77a98ab262e21fd24e6cc97eba0
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4388,7 +4388,7 @@
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type   , arch  ]
- [ w, 0  , w   ; neon_ins, simd  ] 
ins\t%0.[1], %2.[0]
+ [ w, w  , w   ; neon_permute, simd  ] 
uzp1\t%0.2, %1.2, %2.2
  [ w, 0  , ?r  ; neon_from_gp, simd  ] 
ins\t%0.[1], %2
  [ w, 0  , ?r  ; f_mcr , * ] 
fmov\t%0.d[1], %2
  [ w, 0  , Utv ; neon_load1_one_lane , simd  ] 
ld1\t{%0.}[1], %2
@@ -4407,7 +4407,7 @@
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
   {@ [ cons: =0 , 1  , 2   ; attrs: type   , arch  ]
- [ w, 0  , w   ; neon_ins, simd  ] 
ins\t%0.[1], %2.[0]
+ [ w, w  , w   ; neon_permute, simd  ] 
uzp1\t%0.2, %1.2, %2.2
  [ w, 0  , ?r  ; neon_from_gp, simd  ] 
ins\t%0.[1], %2
  [ w, 0  , ?r  ; f_mcr , * ] 
fmov\t%0.d[1], %2
  [ w, 0  , Utv ; neon_load1_one_lane , simd  ] 
ld1\t{%0.}[1], %2
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c 
b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index 
f1f46e051a86d160a7f7f14872108da87b444ca1..95835aa2eb41c289e7b74f19bb56cf6fa23a3045
 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -80,16 +80,16 @@ CONS2_FN (2, float);
 
 /*
 ** cons2_4_float:  { target aarch64_little_endian }
-** ins v0.s\[1\], v1.s\[0\]
-** stp d0, d0, \[x0\]
-** stp d0, d0, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v0\.2s, v1\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
 ** ret
 */
 /*
 ** cons2_4_float:  { target aarch64_big_endian }
-** ins v1.s\[1\], v0.s\[0\]
-** stp d1, d1, \[x0\]
-** stp d1, d1, \[x0, #?16\]
+** uzp1 v([0-9])\.2s, v1\.2s, v0\.2s
+** stp d\1, d\1, \[x0\]
+** stp d\1, d\1, \[x0, #?16\]
 ** ret
 */
 CONS2_FN (4, float);
@@ -125,8 +125,8 @@ CONS4_FN (2, float);
 
 /*
 ** cons4_4_float:
-** ins v[0-9]+\.s[^\n]+
-** ins v[0-9]+\.s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
+** uzp1 v[0-9]+\.2s[^\n]+
 ** zip1v([0-9]+).4s, [^\n]+
 ** stp q\1, q\1, \[x0\]
 ** stp q\1, q\1, \[x0, #?32\]
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c 
b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
index 
6c1d2b0bdccfb74b80d938a0d94413f0f9dda5ab..0fc195a598f3b82ff188b3151e77e1272254b78c
 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -54,7 +54,7 @@ f32x2_1 (float32_t x)
 
 /*
 ** f32x2_2:
-** ins v0\.s\[1\], v1.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
 ** ret
 */
 float32x2_t
@@ -165,7 +165,7 @@ f64x2_1 (float64_t x)
 
 /*
 ** f64x2_2:
-** ins v0\.d\[1\], v1.d\[0\]
+** uzp1 v0\.2d, v0\.2d, v1\.2d
 ** ret
 */
 float64x2_t
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
index 
02875088cd98833882cdf15b14dcb426951e428f..1a2cc9fbf473ad0de2d8ef97d7efdbe40d959866
 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c
@@ -67,7 +67,7 @@ int32x2_t s32_6(int32_t a0, int32_t a1) {
 
 /*
 ** f32_1:
-** ins v0\.s\[1\], v1\.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
 ** ret
 */
 float32x2_t f32_1(float32_t a0, float32_t a1) {
@@ -90,7 +90,7 @@ float32x2_t f32_2(float32_t a0, float32_t *ptr) {
 /*
 ** f32_3:
 ** ldr s0, \[x0\]
-** ins v0\.s\[1\], v1\.s\[0\]
+** uzp1 v0\.2s, v0\.2s, v1\.2s
 ** ret
 */
 float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) {
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
index 
8f68e06a55925b973a87723c7b5924264382e4b0..3cf05cf865e21fad482e5ffc8c769d0f15a57e74
 

[PATCH] regalloc: Ignore '^' in early costing [PR114766]

2024-04-29 Thread Wilco Dijkstra

According to documentation, '^' should only have an effect during reload.
However ira-costs.cc treats it in the same way as '?' during early costing.
As a result using '^' can accidentally disable valid alternatives and cause
significant regressions (see PR114741).  Avoid this by ignoring '^' during
costing.
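
For context, a hedged illustration of the difference (not a real pattern from
the tree):

;; '?' slightly disparages this alternative in all passes, including IRA's
;; early costing; '^' is documented to disparage it only when the operand
;; needs a reload, so it should not affect the early cost at all.
(match_operand:DI 0 "register_operand" "=r,?w,^w")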

Passes bootstrap and regress, OK for commit?

gcc:
PR rtl-optimization/114766
* ira-costs.cc (record_reg_classes): Ignore '^' during costing.

---

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index 
c86c5a16563aeefac9d4fa72839bee8d95409f4b..04d2f21b023f3456ba6f8c16c2418d7313965b2f
 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -771,10 +771,6 @@ record_reg_classes (int n_alts, int n_ops, rtx *ops,
  c = *++p;
  break;
 
-   case '^':
- alt_cost += 2;
- break;
-
case '?':
  alt_cost += 2;
  break;



[PATCH] libgcc: Add missing HWCAP entries to aarch64/cpuinfo.c

2024-04-02 Thread Wilco Dijkstra

A few HWCAP entries are missing from aarch64/cpuinfo.c.  This results in build
errors on older machines.

This counts as a trivial build fix, but since it's late in stage 4 I'll let
maintainers chip in.  OK for commit?

libgcc/
* config/aarch64/cpuinfo.c: Add HWCAP_EVTSTRM, HWCAP_CRC32, HWCAP_CPUID,
HWCAP_PACA and HWCAP_PACG.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 
3c6fb8a575b423c2aff71a1a9f40812b154ee284..4b94fca869507145ec690c825f637abbc82a3493
 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -52,15 +52,15 @@ struct {
 #ifndef AT_HWCAP
 #define AT_HWCAP 16
 #endif
-#ifndef HWCAP_CPUID
-#define HWCAP_CPUID (1 << 11)
-#endif
 #ifndef HWCAP_FP
 #define HWCAP_FP (1 << 0)
 #endif
 #ifndef HWCAP_ASIMD
 #define HWCAP_ASIMD (1 << 1)
 #endif
+#ifndef HWCAP_EVTSTRM
+#define HWCAP_EVTSTRM (1 << 2)
+#endif
 #ifndef HWCAP_AES
 #define HWCAP_AES (1 << 3)
 #endif
@@ -73,6 +73,9 @@ struct {
 #ifndef HWCAP_SHA2
 #define HWCAP_SHA2 (1 << 6)
 #endif
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
 #ifndef HWCAP_ATOMICS
 #define HWCAP_ATOMICS (1 << 8)
 #endif
@@ -82,6 +85,9 @@ struct {
 #ifndef HWCAP_ASIMDHP
 #define HWCAP_ASIMDHP (1 << 10)
 #endif
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
 #ifndef HWCAP_ASIMDRDM
 #define HWCAP_ASIMDRDM (1 << 12)
 #endif
@@ -133,6 +139,12 @@ struct {
 #ifndef HWCAP_SB
 #define HWCAP_SB (1 << 29)
 #endif
+#ifndef HWCAP_PACA
+#define HWCAP_PACA (1 << 30)
+#endif
+#ifndef HWCAP_PACG
+#define HWCAP_PACG (1UL << 31)
+#endif
 
 #ifndef HWCAP2_DCPODP
 #define HWCAP2_DCPODP (1 << 0)



[PATCH] libatomic: Cleanup macros in atomic_16.S

2024-03-26 Thread Wilco Dijkstra

As mentioned in https://gcc.gnu.org/pipermail/gcc-patches/2024-March/648397.html,
do some additional cleanup of the macros and aliases:

Cleanup the macros to add the libat_ prefixes in atomic_16.S.  Emit the
alias to __atomic_ when ifuncs are not enabled in the ENTRY macro.

Passes regress and bootstrap, OK for commit?

libatomic:
* config/linux/aarch64/atomic_16.S: Add __libat_ prefix in the
LSE2/LSE128/CORE macros, remove elsewhere.  Add ATOMIC macro.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
4e3fa870b0338da4cfcdb0879ab8bed8d041a0a3..d0343507120c06a483ffdae1a793b6b5263cfe98
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -45,7 +45,7 @@
 # define HAVE_FEAT_LSE128 0
 #endif
 
-#define HAVE_FEAT_LSE2  HAVE_IFUNC
+#define HAVE_FEAT_LSE2 HAVE_IFUNC
 
 #if HAVE_FEAT_LSE128
.arch   armv9-a+lse128
@@ -53,31 +53,37 @@
.arch   armv8-a+lse
 #endif
 
-#define LSE128(NAME)   NAME##_i1
-#define LSE2(NAME) NAME##_i2
-#define CORE(NAME) NAME
+#define LSE128(NAME)   libat_##NAME##_i1
+#define LSE2(NAME) libat_##NAME##_i2
+#define CORE(NAME) libat_##NAME
+#define ATOMIC(NAME)   __atomic_##NAME
 
-#define ENTRY_FEAT(NAME, FEAT)  \
-   ENTRY (FEAT (NAME))
+#if HAVE_IFUNC
+# define ENTRY(NAME)   ENTRY2 (CORE (NAME), )
+#else
+/* Emit __atomic_* entrypoints if no ifuncs.  */
+# define ENTRY(NAME)   ENTRY2 (CORE (NAME), ALIAS (NAME, ATOMIC, CORE))
+#endif
+#define ENTRY_FEAT(NAME, FEAT) ENTRY2 (FEAT (NAME), )
+
+#define END(NAME)  END2 (CORE (NAME))
+#define END_FEAT(NAME, FEAT)   END2 (FEAT (NAME))
 
-#define ENTRY(NAME)\
+#define ENTRY2(NAME, ALIASES)  \
.global NAME;   \
.hidden NAME;   \
.type NAME,%function;   \
.p2align 4; \
+   ALIASES;\
 NAME:  \
-   .cfi_startproc; \
-   hint34  // bti c
-
-#define END_FEAT(NAME, FEAT)   \
-   END (FEAT (NAME))
+   .cfi_startproc; \
+   hint34; // bti c
 
-#define END(NAME)  \
+#define END2(NAME) \
.cfi_endproc;   \
.size NAME, .-NAME;
 
-#define ALIAS(NAME, FROM, TO)  ALIAS1 (FROM (NAME),TO (NAME))
-#define ALIAS2(NAME)   ALIAS1 (__atomic_##NAME, libat_##NAME)
+#define ALIAS(NAME, FROM, TO)  ALIAS1 (FROM (NAME), TO (NAME))
 
 #define ALIAS1(ALIAS, NAME)\
.global ALIAS;  \
@@ -116,7 +122,7 @@ NAME:   \
 #define SEQ_CST 5
 
 
-ENTRY (libat_load_16)
+ENTRY (load_16)
mov x5, x0
cbnzw1, 2f
 
@@ -131,11 +137,11 @@ ENTRY (libat_load_16)
stxpw4, res0, res1, [x5]
cbnzw4, 2b
ret
-END (libat_load_16)
+END (load_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT (libat_load_16, LSE2)
+ENTRY_FEAT (load_16, LSE2)
cbnzw1, 1f
 
/* RELAXED.  */
@@ -155,11 +161,11 @@ ENTRY_FEAT (libat_load_16, LSE2)
ldp res0, res1, [x0]
dmb ishld
ret
-END_FEAT (libat_load_16, LSE2)
+END_FEAT (load_16, LSE2)
 #endif
 
 
-ENTRY (libat_store_16)
+ENTRY (store_16)
cbnzw4, 2f
 
/* RELAXED.  */
@@ -173,11 +179,11 @@ ENTRY (libat_store_16)
stlxp   w4, in0, in1, [x0]
cbnzw4, 2b
ret
-END (libat_store_16)
+END (store_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT (libat_store_16, LSE2)
+ENTRY_FEAT (store_16, LSE2)
cbnzw4, 1f
 
/* RELAXED.  */
@@ -189,11 +195,11 @@ ENTRY_FEAT (libat_store_16, LSE2)
stlxp   w4, in0, in1, [x0]
cbnzw4, 1b
ret
-END_FEAT (libat_store_16, LSE2)
+END_FEAT (store_16, LSE2)
 #endif
 
 
-ENTRY (libat_exchange_16)
+ENTRY (exchange_16)
mov x5, x0
cbnzw4, 2f
 
@@ -217,11 +223,11 @@ ENTRY (libat_exchange_16)
stlxp   w4, in0, in1, [x5]
cbnzw4, 4b
ret
-END (libat_exchange_16)
+END (exchange_16)
 
 
 #if HAVE_FEAT_LSE128
-ENTRY_FEAT (libat_exchange_16, LSE128)
+ENTRY_FEAT (exchange_16, LSE128)
mov tmp0, x0
mov res0, in0
mov res1, in1
@@ -241,11 +247,11 @@ ENTRY_FEAT (libat_exchange_16, LSE128)
/* RELEASE/ACQ_REL/SEQ_CST.  */
 2: swppal  res0, res1, [tmp0]
ret
-END_FEAT (libat_exchange_16, LSE128)
+END_FEAT (exchange_16, LSE128)
 #endif
 
 
-ENTRY (libat_compare_exchange_16)
+ENTRY (compare_exchange_16)
ldp exp0, exp1, [x1]
cbz w4, 3f
cmp w4, RELEASE
@@ -289,11 +295,11 @@ ENTRY (libat_compare_exchange_16)
stp tmp0, tmp1, [x1]
 6: csetx0, eq
ret
-END (libat_compare_exchange_16)
+END (compare_exchange_16)
 
 
 #if HAVE_FEAT_LSE2
-ENTRY_FEAT (libat_compare_exchange_16, LSE2)
+ENTRY_FEAT (compare_exchange_16, LSE2)
ldp exp0, exp1, [x1]
mov tmp0, exp0
mov tmp1, exp1
@@ 

Re: [PATCH] libatomic: Fix build for --disable-gnu-indirect-function [PR113986]

2024-03-26 Thread Wilco Dijkstra
Hi Richard,

> This description is too brief for me.  Could you say in detail how the
> new scheme works?  E.g. the description doesn't explain:
>
> -if ARCH_AARCH64_HAVE_LSE128
> -AM_CPPFLAGS   = -DHAVE_FEAT_LSE128
> -endif

That is not needed because we can include auto-config.h in atomic_16.S. I needed
that include for HAVE_IFUNC, but then the makefile's -DHAVE_FEAT_LSE128 would
redefine the HAVE_FEAT_LSE128 that auto-config.h already provides...

> And what's the purpose of ARCH_AARCH64_HAVE_LSE128 after this change?

None. I've removed the makefile leftovers in v2.

> Is the indirection via ALIAS2 necessary?  Couldn't ENTRY just define
> the __atomic_* symbols directly, as non-hidden, if we remove the
> libat_ prefix?  That would make it easier to ensure that the lists
> are kept up-to-date.

Yes, we need both the libat_ symbol as well as the __atomic_ variant in this
case. One is for internal calls, the other for external. I have a separate
cleanup patch which hides the extra alias in ENTRY and removes all the libat
prefixes. However while trivial, that feels more like a stage 1 patch.

> Shouldn't we skip the ENTRY_FEAT functions and existing aliases
> if !HAVE_IFUNC?

Yes, that's relatively easy, I've added HAVE_FEAT_LSE2 for that. Also we skip
the aliases at the end.

> I think it'd be worth (as a prepatch) splitting the file into two
> #included subfiles, one that contains the base AArch64 routines and one
> that contains the optimised versions.  The former would then be #included
> for all builds while the latter would be specific to HAVE_IFUNC.

That sounds like a complete rewrite. We might as well emit our own ifuncs at
that point and avoid all of the workarounds needed to fit in the framework of
libatomic.

So for v2 I have kept things simple and just focus on fixing the bug.

Cheers,
Wilco


v2: 

Fix libatomic build to support --disable-gnu-indirect-function on AArch64.
Always build atomic_16.S and add aliases to the __atomic_ functions if !HAVE_IFUNC.
Include auto-config.h in atomic_16.S to avoid having to pass defines via makefiles.
Fix build if HWCAP_ATOMICS/CPUID are not defined.
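
For reference, the HWCAP fallback boils down to something like the following
in host-config.h (a sketch using the Linux AArch64 hwcap ABI values; the exact
guards in the patch may differ):

/* Older system headers may not define these hwcap bits; provide the
   architectural values so the ifunc selector still builds.  */
#ifndef HWCAP_ATOMICS
# define HWCAP_ATOMICS  (1 << 8)
#endif
#ifndef HWCAP_CPUID
# define HWCAP_CPUID    (1 << 11)
#endif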

Passes regress and bootstrap, OK for commit?

libatomic:
PR target/113986
* Makefile.in: Regenerated.
* Makefile.am: Make atomic_16.S not depend on HAVE_IFUNC.
Remove predefine of HAVE_FEAT_LSE128.
* acinclude.m4: Remove ARCH_AARCH64_HAVE_LSE128.
* configure: Regenerated.
* config/linux/aarch64/atomic_16.S: Add __atomic_ aliases if 
!HAVE_IFUNC.   
* config/linux/aarch64/host-config.h: Correctly handle !HAVE_IFUNC.  Add
defines for HWCAP_ATOMICS and HWCAP_CPUID.

---

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index 
d49c44c7d5fbe83061fddd1f8ef4813a39eb1b8b..980677f353345c050f6cef2d57090360216c56cf
 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -130,12 +130,8 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix 
_$(s)_.lo,$(SIZEOBJS)))
 ## On a target-specific basis, include alternates to be selected by IFUNC.
 if HAVE_IFUNC
 if ARCH_AARCH64_LINUX
-if ARCH_AARCH64_HAVE_LSE128
-AM_CPPFLAGS = -DHAVE_FEAT_LSE128
-endif
 IFUNC_OPTIONS   = -march=armv8-a+lse
 libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix 
_$(s)_1_.lo,$(SIZEOBJS)))
-libatomic_la_SOURCES += atomic_16.S
 
 endif
 if ARCH_ARM_LINUX
@@ -155,6 +151,10 @@ libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
 endif
 endif
 
+if ARCH_AARCH64_LINUX
+libatomic_la_SOURCES += atomic_16.S
+endif
+
 libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES)
 libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD)
 
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index 
11c8ec7ba15ba7da5ef55e90bd836317bc270061..d9d529bc502d4ce7b9997640d5f40f5d5cc1232c
 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -90,17 +90,17 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach 
s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
-@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
+@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ s,$(SIZES),$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _$(s)_1_.lo,$(SIZEOBJS))) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ $(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _8_2_.lo,$(SIZEOBJS)) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ tas_1_2_.lo
-@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
+@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
 @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@   $(addsuffix 
_16_2_.lo,$(SIZEOBJS))
 

[COMMITTED] ARM: Fix builtin-bswap-1.c test [PR113915]

2024-03-08 Thread Wilco Dijkstra
On Thumb-2 the use of CBZ blocks conditional execution, so change the
test to compare with a non-zero value.

gcc/testsuite/ChangeLog:
PR target/113915
* gcc.target/arm/builtin-bswap.x: Fix test to avoid emitting CBZ.

---

diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap.x 
b/gcc/testsuite/gcc.target/arm/builtin-bswap.x
index 
c96dbe6329c4dc648fd0bcc972ad494c7d6dc6e5..dc8f910e0007a67ae5cb5100c98101c7b199b5ca
 100644
--- a/gcc/testsuite/gcc.target/arm/builtin-bswap.x
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap.x
@@ -10,7 +10,7 @@ extern short foos16 (short);
 short swaps16_cond (short x, int y)
 {
   short z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap16 (x);
   return foos16 (z);
 }
@@ -27,7 +27,7 @@ extern unsigned short foou16 (unsigned short);
 unsigned short swapu16_cond (unsigned short x, int y)
 {
   unsigned short z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap16 (x);
   return foou16 (z);
 }
@@ -43,7 +43,7 @@ extern int foos32 (int);
 int swaps32_cond (int x, int y)
 {
   int z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap32 (x);
   return foos32 (z);
 }
@@ -60,7 +60,7 @@ extern unsigned int foou32 (unsigned int);
 unsigned int swapsu2 (unsigned int x, int y)
 {
   int z = x;
-  if (y)
+  if (y != 2)
 z = __builtin_bswap32 (x);
   return foou32 (z);
 }



Re: [PATCH] ARM: Fix conditional execution [PR113915]

2024-02-26 Thread Wilco Dijkstra
Hi Richard,

> Did you test this on a thumb1 target?  It seems to me that the target parts
> that you've removed were likely related to that.  In fact, I don't see why
> this test would need to be changed at all.

The testcase explicitly forces a Thumb-2 target (arm_arch_v6t2). The patterns
were wrong for Thumb-2 indeed, and the testcase was explicitly testing for this.
There is a separate builtin-bswap-2.c for Thumb-1 target (arm_arch_v6m).

Cheers,
Wilco


[PATCH] libatomic: Fix build for --disable-gnu-indirect-function [PR113986]

2024-02-23 Thread Wilco Dijkstra

Fix libatomic build to support --disable-gnu-indirect-function on AArch64.
Always build atomic_16.S and add aliases to the __atomic_* functions if
!HAVE_IFUNC.

Passes regress and bootstrap, OK for commit?

libatomic:
PR target/113986
* Makefile.in: Regenerated.
* Makefile.am: Make atomic_16.S not depend on HAVE_IFUNC.
Remove predefine of HAVE_FEAT_LSE128.
* config/linux/aarch64/atomic_16.S: Add __atomic_ aliases if 
!HAVE_IFUNC.   
* config/linux/aarch64/host-config.h: Correctly handle !HAVE_IFUNC.

---

diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
index 
d49c44c7d5fbe83061fddd1f8ef4813a39eb1b8b..980677f353345c050f6cef2d57090360216c56cf
 100644
--- a/libatomic/Makefile.am
+++ b/libatomic/Makefile.am
@@ -130,12 +130,8 @@ libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix 
_$(s)_.lo,$(SIZEOBJS)))
 ## On a target-specific basis, include alternates to be selected by IFUNC.
 if HAVE_IFUNC
 if ARCH_AARCH64_LINUX
-if ARCH_AARCH64_HAVE_LSE128
-AM_CPPFLAGS = -DHAVE_FEAT_LSE128
-endif
 IFUNC_OPTIONS   = -march=armv8-a+lse
 libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix 
_$(s)_1_.lo,$(SIZEOBJS)))
-libatomic_la_SOURCES += atomic_16.S
 
 endif
 if ARCH_ARM_LINUX
@@ -155,6 +151,10 @@ libatomic_la_LIBADD += $(addsuffix _16_1_.lo,$(SIZEOBJS)) \
 endif
 endif
 
+if ARCH_AARCH64_LINUX
+libatomic_la_SOURCES += atomic_16.S
+endif
+
 libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES)
 libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD)
 
diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
index 
11c8ec7ba15ba7da5ef55e90bd836317bc270061..d9d529bc502d4ce7b9997640d5f40f5d5cc1232c
 100644
--- a/libatomic/Makefile.in
+++ b/libatomic/Makefile.in
@@ -90,17 +90,17 @@ build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach 
s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
-@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
+@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ s,$(SIZES),$(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _$(s)_1_.lo,$(SIZEOBJS))) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ $(addsuffix \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ _8_2_.lo,$(SIZEOBJS)) \
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@ tas_1_2_.lo
-@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
-@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
+@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix 
_8_1_.lo,$(SIZEOBJS))
+@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
_16_1_.lo,$(SIZEOBJS)) \
 @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@   $(addsuffix 
_16_2_.lo,$(SIZEOBJS))
 
+@ARCH_AARCH64_LINUX_TRUE@am__append_5 = atomic_16.S
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -156,8 +156,7 @@ am__uninstall_files_from_dir = { \
   }
 am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(noinst_LTLIBRARIES) $(toolexeclib_LTLIBRARIES)
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__objects_1 =  \
-@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@ atomic_16.lo
+@ARCH_AARCH64_LINUX_TRUE@am__objects_1 = atomic_16.lo
 am_libatomic_la_OBJECTS = gload.lo gstore.lo gcas.lo gexch.lo \
glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo \
$(am__objects_1)
@@ -425,7 +424,7 @@ libatomic_la_LDFLAGS = $(libatomic_version_info) 
$(libatomic_version_script) \
$(lt_host_flags) $(libatomic_darwin_rpath)
 
 libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c \
-   init.c fenv.c fence.c flag.c $(am__append_2)
+   init.c fenv.c fence.c flag.c $(am__append_5)
 SIZEOBJS = load store cas exch fadd fsub fand fior fxor fnand tas
 EXTRA_libatomic_la_SOURCES = $(addsuffix _n.c,$(SIZEOBJS))
 libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) $(libatomic_version_dep)
@@ -451,9 +450,8 @@ all_c_files := $(foreach dir,$(search_path),$(wildcard 
$(dir)/*.c))
 # Then sort through them to find the one we want, and select the first.
 M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
 libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
-   _$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
-   $(am__append_4) $(am__append_5)
-@ARCH_AARCH64_HAVE_LSE128_TRUE@@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@AM_CPPFLAGS
 = -DHAVE_FEAT_LSE128
+   _$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_2) \
+   $(am__append_3) $(am__append_4)
 @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
 @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp 
-DHAVE_KERNEL64
 @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
diff --git 

Re: [PATCH] ARM: Fix conditional execution [PR113915]

2024-02-23 Thread Wilco Dijkstra
Hi Richard,

> This bit isn't.  The correct fix here is to fix the pattern(s) concerned to 
> add the missing predicate.
>
> Note that builtin-bswap.x explicitly mentions predicated mnemonics in the 
> comments.

I fixed the patterns in v2. There are likely some more, plus we could likely
merge many t1 and t2 patterns where the only difference is predication. But
those cleanups are for another time...

Cheers,
Wilco

v2: Add predicable to the rev patterns.

By default most patterns can be conditionalized on Arm targets.  However
Thumb-2 predication requires the "predicable" attribute be explicitly
set to "yes".  Most patterns are shared between Arm and Thumb(-2) and are
marked with "predicable".  Given this sharing, it does not make sense to
use a different default for Arm.  So only consider conditional execution
of instructions that have the predicable attribute set to yes.  This ensures
that patterns not explicitly marked as such are never conditionally executed.

Passes regress and bootstrap, OK for commit?

gcc/ChangeLog:
PR target/113915
* config/arm/arm.md (NOCOND): Improve comment.
(arm_rev*): Add predicable.
* config/arm/arm.cc (arm_final_prescan_insn): Add check for
PREDICABLE_YES.

gcc/testsuite/ChangeLog:
PR target/113915
* gcc.target/arm/builtin-bswap-1.c: Fix test.

---

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
1cd69268ee986a0953cc85ab259355d2191250ac..6a35fe44138135998877a9fb74c2a82a7f99dcd5
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -25613,11 +25613,12 @@ arm_final_prescan_insn (rtx_insn *insn)
  break;
 
case INSN:
- /* Instructions using or affecting the condition codes make it
-fail.  */
+ /* Check the instruction is explicitly marked as predicable.
+Instructions using or affecting the condition codes are not.  
*/
  scanbody = PATTERN (this_insn);
  if (!(GET_CODE (scanbody) == SET
|| GET_CODE (scanbody) == PARALLEL)
+ || get_attr_predicable (this_insn) != PREDICABLE_YES
  || get_attr_conds (this_insn) != CONDS_NOCOND)
fail = TRUE;
  break;
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 
5816409f86f1106b410c5e21d77e599b485f85f2..81237a61d4a2ebcfb77e47c2bd29137aba28a521
 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -307,6 +307,8 @@
 ;
 ; NOCOND means that the instruction does not use or alter the condition
 ;   codes but can be converted into a conditionally exectuted instruction.
+;   Given that NOCOND is the default for most instructions if omitted,
+;   the attribute predicable must be set to yes as well.
 
 (define_attr "conds" "use,set,clob,unconditional,nocond"
(if_then_else
@@ -12547,6 +12549,7 @@
   revsh%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12560,6 +12563,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12584,6 +12588,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
@@ -12619,6 +12624,7 @@
rev16%?\t%0, %1"
   [(set_attr "arch" "t1,t2,32")
(set_attr "length" "2,2,4")
+   (set_attr "predicable" "no,yes,yes")
(set_attr "type" "rev")]
 )
 
diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c 
b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
index 
c1e7740d14d3ca4e93a71e38b12f82c19791a204..1a311a6a5af647d40abd553e5d0ba1273c76d288
 100644
--- a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
@@ -5,14 +5,11 @@
of the instructions.  Add an -mtune option known to facilitate that.  */
 /* { dg-additional-options "-O2 -mtune=cortex-a53" } */
 /* { dg-final { scan-assembler-not "orr\[ \t\]" } } */
-/* { dg-final { scan-assembler-times "revsh\\t" 1 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "revshne\\t" 1 { target { arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "revsh\\t" 2 { target { ! arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 1 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "rev16ne\\t" 1 { target { arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 2 { target { ! arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev\\t" 2 { target { arm_nothumb } } } } 
 */
-/* { dg-final { scan-assembler-times "revne\\t" 2 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "rev\\t" 4 { target { ! arm_nothumb } } } 
}  */
+/* { dg-final { scan-assembler-times "revsh\\t" 1 } }  */
+/* { 

Re: [PATCH] AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

2024-02-22 Thread Wilco Dijkstra
Hi Richard,

> It looks like this is really doing two things at once: disabling the
> direct emission of LDP/STP Qs, and switching the GPR handling from using
> pairs of DImode moves to single TImode moves.  At least, that seems to be
> the effect of...

No it still uses TImode for the !TARGET_SIMD case.

> +   if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
> + mode = mode_iter.require ();

> ...hard-coding 16 here and...

This only affects the Q register case.

> -  if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
> +  if (size > 0 && size < 16 && !STRICT_ALIGNMENT)

> ...changing this limit from 8 to 16 for non-SIMD copies.
>
> Is that deliberate?  If so, please mention that kind of thing in the
> covering note.  It sounded like this was intended to change the handling
> of vector moves only.

Yes it's deliberate. It now basically treats everything as blocks of 16 bytes
which has a nice simplifying effect. I've added a note.

> This means that, for GPRs, we are now effectively using the double-word
> move patterns to get an LDP/STP indirectly, rather than directly as before.

No, there is no difference here.

> That seems OK, and I suppose might be slightly preferable to the current
> code for things like:
>
>  char a[31], b[31];
>  void f() { __builtin_memcpy(a, b, 31); }

Yes, an unaligned tail improves slightly by using blocks of 16 bytes.
It's a very rare case anyway: -mgeneral-regs-only is rarely used, and most
fixed-size copies are a nice multiple of 8.
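
To make that concrete, this is the shape of copy I mean (illustrative only;
the expected expansion is an assumption, not verified codegen): with 16-byte
blocks the 31-byte copy below becomes one aligned 16-byte block plus a single
overlapping unaligned 16-byte tail, instead of a string of 8/4/2/1-byte
accesses.

char a[31], b[31];

/* With -O2 -mgeneral-regs-only the tail is one overlapping access at
   offset 15, assuming !STRICT_ALIGNMENT.  */
void f (void) { __builtin_memcpy (a, b, 31); }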

> But that raises the question: should we do the same thing for Q registers
> and V2x16QImode?

I don't believe it makes sense to use those complex types. And it likely
blocks optimizations in a similar way as UNSPEC does.

> If emitting individual vector loads and stores is better than using
> V2x16QI (and I can see that it might be), then why isn't the same
> true for GPRs and DImode vs TImode?

It might be feasible to do the same for scalar copies. But given that
using TImode works fine, there is no regression here, and use of
-mgeneral-regs-only is rare, what would the benefit be of doing that?

> I think the final version of this patch should go in ahead of the
> clean-up patch.  As I mentioned in the other review, I think the
> clean-up should wait for GCC 15.

I've rebased it to the trunk.

Cheers,
Wilco


v2: Rebase to trunk

The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
Given the new LDP fusion pass is good at finding LDP opportunities, change the
memcpy, memmove and memset expansions to emit single vector loads/stores.
This fixes the regression and enables more RTL optimization on the standard
memory accesses.  Handling of unaligned tail of memcpy/memmove is improved
with -mgeneral-regs-only.  SPEC2017 performance improves slightly.  Codesize
is a bit worse due to missed LDP opportunities as discussed in the PR.

Passes regress, OK for commit?

gcc/ChangeLog:
PR target/113618
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
(aarch64_expand_cpymem): Emit single load/store only.
(aarch64_set_one_block): Emit single stores only.

gcc/testsuite/ChangeLog:
PR target/113618
* gcc.target/aarch64/pr113618.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
16318bf925883ecedf9345e53fc0824a553b2747..0a28e033088a00818c6ed9fa8c15ecdee5a86c35
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26465,33 +26465,6 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-   int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-{
-  mode = V4SImode;
-  rtx src1 = adjust_address (src, mode, offset);
-  rtx dst1 = adjust_address (dst, mode, offset);
-  rtx reg1 = gen_reg_rtx (mode);
-  rtx reg2 = gen_reg_rtx (mode);
-  rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-  rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-  ops.safe_push ({ load, store });
-  return;
-}
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
@@ -26527,7 +26500,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx 

[PATCH] ARM: Fix conditional execution [PR113915]

2024-02-21 Thread Wilco Dijkstra

By default most patterns can be conditionalized on Arm targets.  However
Thumb-2 predication requires the "predicable" attribute be explicitly
set to "yes".  Most patterns are shared between Arm and Thumb(-2) and are
marked with "predicable".  Given this sharing, it does not make sense to
use a different default for Arm.  So only consider conditional execution
of instructions that have the predicable attribute set to yes.  This ensures
that patterns not explicitly marked as such are never accidentally 
conditionally executed like in the PR.

GLIBC codesize was ~0.014% worse due to atomic operations now being
unconditional and a small number of patterns not setting "predicable".
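
As an illustration of the kind of code affected (a sketch along the lines of
the builtin-bswap tests, not part of the patch): the conditional byte swap
below may only be if-converted into a predicated rev inside an IT block on
Thumb-2 when the rev pattern is marked predicable; otherwise a branch must be
used.

extern unsigned int foou32 (unsigned int);

unsigned int swap_cond (unsigned int x, int y)
{
  unsigned int z = x;
  if (y != 2)                    /* compare with non-zero to avoid CBZ  */
    z = __builtin_bswap32 (x);   /* may become "revne" when predicable  */
  return foou32 (z);
}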

Passes regress and bootstrap, OK for commit?

gcc/ChangeLog:
PR target/113915
* config/arm/arm.md (NOCOND): Improve comment.
* config/arm/arm.cc (arm_final_prescan_insn): Add check for
PREDICABLE_YES.

gcc/testsuite/ChangeLog:
PR target/113915
* gcc.target/arm/builtin-bswap-1.c: Fix test.

---

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
c44047c377a802d0c1dc1406df1b88a6b079607b..29771d284831a995adcf9adbb525396fbabb1ea2
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -25610,11 +25610,12 @@ arm_final_prescan_insn (rtx_insn *insn)
  break;
 
case INSN:
- /* Instructions using or affecting the condition codes make it
-fail.  */
+ /* Check the instruction is explicitly marked as predicable.
+Instructions using or affecting the condition codes are not.  
*/
  scanbody = PATTERN (this_insn);
  if (!(GET_CODE (scanbody) == SET
|| GET_CODE (scanbody) == PARALLEL)
+ || get_attr_predicable (this_insn) != PREDICABLE_YES
  || get_attr_conds (this_insn) != CONDS_NOCOND)
fail = TRUE;
  break;
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 
5816409f86f1106b410c5e21d77e599b485f85f2..671f093862259c2c0df93a986fc22fa56a8ea6c7
 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -307,6 +307,8 @@
 ;
 ; NOCOND means that the instruction does not use or alter the condition
 ;   codes but can be converted into a conditionally exectuted instruction.
+;   Given that NOCOND is the default for most instructions if omitted,
+;   the attribute predicable must be set to yes as well.
 
 (define_attr "conds" "use,set,clob,unconditional,nocond"
(if_then_else
diff --git a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c 
b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
index 
c1e7740d14d3ca4e93a71e38b12f82c19791a204..3de7cea81c1128c2fe5a9e1216e6b027d26bcab9
 100644
--- a/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
+++ b/gcc/testsuite/gcc.target/arm/builtin-bswap-1.c
@@ -5,14 +5,8 @@
of the instructions.  Add an -mtune option known to facilitate that.  */
 /* { dg-additional-options "-O2 -mtune=cortex-a53" } */
 /* { dg-final { scan-assembler-not "orr\[ \t\]" } } */
-/* { dg-final { scan-assembler-times "revsh\\t" 1 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "revshne\\t" 1 { target { arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "revsh\\t" 2 { target { ! arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 1 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "rev16ne\\t" 1 { target { arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev16\\t" 2 { target { ! arm_nothumb } } 
} }  */
-/* { dg-final { scan-assembler-times "rev\\t" 2 { target { arm_nothumb } } } } 
 */
-/* { dg-final { scan-assembler-times "revne\\t" 2 { target { arm_nothumb } } } 
}  */
-/* { dg-final { scan-assembler-times "rev\\t" 4 { target { ! arm_nothumb } } } 
}  */
+/* { dg-final { scan-assembler-times "revsh\\t" 2 } }  */
+/* { dg-final { scan-assembler-times "rev16\\t" 2 } }  */
+/* { dg-final { scan-assembler-times "rev\\t" 4 } }  */
 
 #include "builtin-bswap.x"



[PATCH] AArch64: memcpy/memset expansions should not emit LDP/STP [PR113618]

2024-02-01 Thread Wilco Dijkstra

The new RTL introduced for LDP/STP results in regressions due to use of UNSPEC.
Given the new LDP fusion pass is good at finding LDP opportunities, change the
memcpy, memmove and memset expansions to emit single vector loads/stores.
This fixes the regression and enables more RTL optimization on the standard
memory accesses.  SPEC2017 performance improves slightly.  Codesize is a bit
worse due to missed LDP opportunities as discussed in the PR.

Passes regress, OK for commit?

gcc/ChangeLog:
PR target/113618
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Remove. 
(aarch64_expand_cpymem): Emit single load/store only.
(aarch64_set_one_block): Remove.
(aarch64_expand_setmem): Emit single stores only.

gcc/testsuite/ChangeLog:
PR target/113618
* gcc.target/aarch64/pr113618.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
d17198b4a5f73f8be8aeca3258b81809ffb48eac..2194441b949a53f181fe373e07bc18341c014918
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26376,33 +26376,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
-
-/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
-static void
-aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
-   int offset, machine_mode mode)
-{
-  /* Emit explict load/store pair instructions for 32-byte copies.  */
-  if (known_eq (GET_MODE_SIZE (mode), 32))
-{
-  mode = V4SImode;
-  rtx src1 = adjust_address (src, mode, offset);
-  rtx dst1 = adjust_address (dst, mode, offset);
-  rtx reg1 = gen_reg_rtx (mode);
-  rtx reg2 = gen_reg_rtx (mode);
-  rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
-  rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
-  ops.safe_push ({ load, store });
-  return;
-}
-
-  rtx reg = gen_reg_rtx (mode);
-  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
-  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
-  ops.safe_push ({ load, store });
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
@@ -26438,7 +26411,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
@@ -26465,7 +26438,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
  ??? Although it would be possible to use LDP/STP Qn in streaming mode
  (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
  whether that would improve performance.  */
-  unsigned copy_max = (size <= 24 || !TARGET_SIMD) ? 16 : 32;
+  bool use_qregs = size > 24 && TARGET_SIMD;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -26473,7 +26446,7 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  copy_ops ops;
+  auto_vec<std::pair<rtx, rtx>, 16> ops;
   int offset = 0;
 
   while (size > 0)
@@ -26482,23 +26455,27 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 or writing.  */
   opt_scalar_int_mode mode_iter;
   FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-   if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
- cur_mode = mode_iter.require ();
+   if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, 16))
+ mode = mode_iter.require ();
+
+  gcc_assert (mode != BLKmode);
 
-  gcc_assert (cur_mode != BLKmode);
+  mode_bytes = GET_MODE_SIZE (mode).to_constant ();
 
-  mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+  /* Prefer Q-register accesses.  */
+  if (mode_bytes == 16 && use_qregs)
+   mode = V4SImode;
 
-  /* Prefer Q-register accesses for the last bytes.  */
-  if (mode_bytes == 16 && copy_max == 32)
-   cur_mode = V4SImode;
-  aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+  rtx reg = gen_reg_rtx (mode);
+  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+  ops.safe_push ({ load, store });
   size -= mode_bytes;
   offset += mode_bytes;
 
   /* Emit trailing copies using overlapping unaligned accesses
 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-  if (size > 0 && size < copy_max / 2 && 

Re: [PATCH v4] AArch64: Cleanup memset expansion

2024-01-30 Thread Wilco Dijkstra
Hi Richard,

>> That tune is only used by an obsolete core. I ran the memcpy and memset
>> benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
>> There is no measurable penalty for using LDP/STP. I'm not sure why it was
>> ever added given it does not do anything useful. I'll post a separate patch
>> to remove it to reduce the maintenance overhead.

Patch: https://gcc.gnu.org/pipermail/gcc-patches/2024-January/62.html

> Is that enough to justify removing it though?  It sounds from:
>
>  https://gcc.gnu.org/pipermail/gcc-patches/2018-June/500017.html
>
> like the problem was in more balanced code, rather than memory-limited
> things like memset/memcpy.
>
> But yeah, I'm not sure if the intuition was supported by numbers
> in the end.  If SPEC also shows no change then we can probably drop it
> (unless someone objects).

SPECINT didn't show any difference either, so LDP doesn't have a measurable
penalty. It doesn't look like the original commit was ever backed up by 
benchmarks...

> Let's leave this patch until that's resolved though, since I think as it
> stands the patch does leave -Os -mtune=xgene1 worse off (bigger code).
> Handling the tune in the meantime would also be OK.

Note it was incorrectly handling -Os: it should still form LDP in that case
and take advantage of longer and faster inlined memcpy/memset instead of
calling a library function.

>    /* Default the maximum to 256-bytes when considering only libcall vs
>   SIMD broadcast sequence.  */

> ...this comment should be deleted along with the code it's describing.
> Don't respin just for that though :)

I've fixed that locally.

Cheers,
Wilco

[PATCH] AArch64: Remove AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS

2024-01-30 Thread Wilco Dijkstra

(follow-on based on review comments on
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/641913.html)


Remove the tune AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS since it is only
used by an old core and doesn't properly support -Os.  SPECINT_2017
shows that removing it has no performance difference, while codesize
is reduced by 0.07%.

Passes regress, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_mode_valid_for_sched_fusion_p):
Remove check for AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
(aarch64_advsimd_ldp_stp_p): Likewise.
(aarch64_stp_sequence_cost): Likewise.
(aarch64_expand_cpymem): Likewise.
(aarch64_expand_setmem): Likewise.
* config/aarch64/aarch64-ldp-fusion.cc (ldp_operand_mode_ok_p): 
Likewise.   
* config/aarch64/aarch64-ldpstp.md: Likewise.
* config/aarch64/aarch64-tuning-flags.def: Remove NO_LDP_STP_QREGS.
* config/aarch64/tuning_models/emag.h: Likewise.
* config/aarch64/tuning_models/xgene1.h: Likewise.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ldp_stp_q_disable.c: Remove test.

---

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc 
b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index 
22ed95eb743c9ee44e745560b207d389c8fca03b..de6685f75a2650d9a7d39fe6781ec57214092eb1
 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -315,17 +315,9 @@ any_post_modify_p (rtx x)
 static bool
 ldp_operand_mode_ok_p (machine_mode mode)
 {
-  const bool allow_qregs
-= !(aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
-
   if (!aarch64_ldpstp_operand_mode_p (mode))
 return false;
 
-  const auto size = GET_MODE_SIZE (mode).to_constant ();
-  if (size == 16 && !allow_qregs)
-return false;
-
   // We don't pair up TImode accesses before RA because TImode is
   // special in that it can be allocated to a pair of GPRs or a single
   // FPR, and the RA is best placed to make that decision.
diff --git a/gcc/config/aarch64/aarch64-ldpstp.md 
b/gcc/config/aarch64/aarch64-ldpstp.md
index 
b7c0bf05cd18c971955d667bae91d7c3dc3f512e..7890a8cc32b24f8e1bc29cb722b10e511e7881ab
 100644
--- a/gcc/config/aarch64/aarch64-ldpstp.md
+++ b/gcc/config/aarch64/aarch64-ldpstp.md
@@ -96,9 +96,7 @@ (define_peephole2
(set (match_operand:VQ2 2 "register_operand" "")
(match_operand:VQ2 3 "memory_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, true)
-   && (aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, true)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, true);
@@ -111,9 +109,7 @@ (define_peephole2
(set (match_operand:VQ2 2 "memory_operand" "")
(match_operand:VQ2 3 "register_operand" ""))]
   "TARGET_FLOAT
-   && aarch64_operands_ok_for_ldpstp (operands, false)
-   && (aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+   && aarch64_operands_ok_for_ldpstp (operands, false)"
   [(const_int 0)]
 {
   aarch64_finish_ldpstp_peephole (operands, false);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 
d917da720b22ed6aaf360dc4ebbe8efc4a3185f2..d5bcaebce770f0b217aac783063d39135f754c77
 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -36,9 +36,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", 
RENAME_FMA_REGS)
are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
-/* Disallow load/store pair instructions on Q-registers.  */
-AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
-
 AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
433c160cba22374f6b7a3445c0202789927abd25..d7e8379b2eb90eccb8608a15cc8d11cc2187a9e7
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -10335,9 +10335,7 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode 
mode)
 || mode == SDmode || mode == DDmode
 || (aarch64_vector_mode_supported_p (mode)
 && (known_eq (GET_MODE_SIZE (mode), 8)
-|| (known_eq (GET_MODE_SIZE (mode), 16)
-   && (aarch64_tune_params.extra_tuning_flags
-   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
+|| known_eq (GET_MODE_SIZE (mode), 16)));
 }
 
 /* Return true if REGNO is a virtual pointer register, or an eliminable
@@ -16448,10 +16446,6 @@ aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt 
kind,
   return false;
 }
 
-  if (aarch64_tune_params.extra_tuning_flags
-  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-return 

Re: [PATCH] AArch64: Add -mcpu=cobalt-100

2024-01-25 Thread Wilco Dijkstra
Hi,

>> Add support for -mcpu=cobalt-100 (Neoverse N2 with a different implementer
>> ID).
>> 
>> Passes regress, OK for commit?
>
> Ok.

Also OK to backport to GCC 13, 12 and 11?

Cheers,
Wilco

[PATCH] AArch64: Add -mcpu=cobalt-100

2024-01-16 Thread Wilco Dijkstra

Add support for -mcpu=cobalt-100 (Neoverse N2 with a different implementer ID).

Passes regress, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64-cores.def (AARCH64_CORE): Add 'cobalt-100' CPU.
* config/aarch64/aarch64-tune.md: Regenerated.
* doc/invoke.texi (-mcpu): Add cobalt-100 core.

---

diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index 
054862f37bc8738e7193348d01f485a46a9a36e3..7ebefcf543b6f84b3df22ab836728111b56fa76f
 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -186,6 +186,7 @@ AARCH64_CORE("cortex-x3",  cortexx3, cortexa57, V9A,  
(SVE2_BITPERM, MEMTAG, I8M
 AARCH64_CORE("cortex-x4",  cortexx4, cortexa57, V9_2A,  (SVE2_BITPERM, MEMTAG, 
PROFILE), neoversen2, 0x41, 0xd81, -1)
 
 AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, V9A, (I8MM, BF16, 
SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversen2, 0x41, 0xd49, -1)
+AARCH64_CORE("cobalt-100",   cobalt100, cortexa57, V9A, (I8MM, BF16, 
SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversen2, 0x6d, 0xd49, -1)
 
 AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, V9A, (I8MM, BF16, 
SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
 AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, 
RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
diff --git a/gcc/config/aarch64/aarch64-tune.md 
b/gcc/config/aarch64/aarch64-tune.md
index 
98e6882d4324d81268e28810b305b87c63bba22d..abd3c9e0822eeb1652f4856cde591ac175ac0a4a
 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a"
+   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexx2,cortexx3,cortexx4,neoversen2,cobalt100,neoversev2,demeter,generic,generic_armv8_a,generic_armv9_a"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 
216e2f594d1cbc139c7e0125d9579c6924d23443..a25362b8c157f67d68b19f94cc2d64bd09505bdc
 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21163,7 +21163,7 @@ performance of the code.  Permissible values for this 
option are:
 @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
 @samp{cortex-x3}, @samp{cortex-x4}, @samp{cortex-a510}, @samp{cortex-a520},
 @samp{cortex-a710}, @samp{cortex-a715}, @samp{cortex-a720}, @samp{ampere1},
-@samp{ampere1a}, @samp{ampere1b}, and @samp{native}.
+@samp{ampere1a}, @samp{ampere1b}, @samp{cobalt-100} and @samp{native}.
 
 The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
 @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},



Re: [PATCH] AArch64: Reassociate CONST in address expressions [PR112573]

2024-01-16 Thread Wilco Dijkstra
Hi Richard,

>> +  rtx base = strip_offset_and_salt (XEXP (x, 1), &offset);
>
> This should be just strip_offset, so that we don't lose the salt
> during optimisation.

Fixed.

> +
> +  if (offset.is_constant ())

> I'm not sure this is really required.  Logically the same thing
> would apply to SVE, although admittedly:

It's not needed indeed, I've committed it with the if removed.

However I believe CONST only allows immediate offsets here so that it can
be used in const data. Building SPEC with gcc_assert (!offset.is_constant ()) 
doesn't ever trigger it.

Cheers,
Wilco

[PATCH] AArch64: Reassociate CONST in address expressions [PR112573]

2024-01-10 Thread Wilco Dijkstra
GCC tends to optimistically create CONST of globals with an immediate offset.
However it is almost always better to CSE addresses of globals and add immediate
offsets separately (the offset could be merged later in single-use cases).
Splitting CONST expressions with an index in aarch64_legitimize_address fixes
part of PR112573.
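
As a sketch of the rewrite (values illustrative): an address of the form
x + CONST (base, 3) is split into (base + x) + 3, so the register holding
base + x can be CSEd across neighbouring accesses and the small offset folds
into the load/store addressing mode.

extern char a[];

char g (long y)
{
  /* Both loads can now share one "a + y" base register.  */
  return a[y + 3] + a[y + 2];
}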

Passes regress & bootstrap, OK for commit?

gcc/ChangeLog:
PR target/112573
* config/aarch64/aarch64.cc (aarch64_legitimize_address): Reassociate badly
formed CONST expressions.

gcc/testsuite/ChangeLog:
PR target/112573
* gcc.target/aarch64/pr112573.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
0909b319d16b9a1587314bcfda0a8112b42a663f..9fbc8b62455f48baec533d3dd5e2d9ea995d5a8f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -12608,6 +12608,20 @@ aarch64_legitimize_address (rtx x, rtx /* orig_x  */, 
machine_mode mode)
  not to split a CONST for some forms of address expression, otherwise
  it will generate sub-optimal code.  */
 
+  /* First split X + CONST (base, offset) into (base + X) + offset.  */
+  if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
+{
+  poly_int64 offset;
+  rtx base = strip_offset_and_salt (XEXP (x, 1), &offset);
+
+  if (offset.is_constant ())
+  {
+ base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
+  NULL_RTX, true, OPTAB_DIRECT);
+ x = plus_constant (Pmode, base, offset);
+  }
+}
+
   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
 {
   rtx base = XEXP (x, 0);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr112573.c 
b/gcc/testsuite/gcc.target/aarch64/pr112573.c
new file mode 100644
index 
..be04c0ca86ad9f33975a85f497549955d6d1236d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112573.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-section-anchors" } */
+
+char a[100];
+
+void f1 (int x, int y)
+{
+  *((a + y) + 3) = x;
+  *((a + y) + 2) = x;
+  *((a + y) + 1) = x;
+  *((a + y) + 0) = x;
+}
+
+/* { dg-final { scan-assembler-times "strb" 4 } } */
+/* { dg-final { scan-assembler-times "adrp" 1 } } */



Re: [PATCH v4] AArch64: Cleanup memset expansion

2024-01-09 Thread Wilco Dijkstra
Hi Richard,

>> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
>
> Since this isn't (AFAIK) a standard macro, there doesn't seem to be
> any need to put it in the header file.  It could just go at the head
> of aarch64.cc instead.

Sure, I've moved it in v4.

>> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
>> +   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
>> +    set_max = 16;
>
> I think we should take the tuning parameter into account when applying
> the MAX_SET_SIZE limit for -Os.  Shouldn't it be 48 rather than 96 in
> that case?  (Alternatively, I suppose it would make sense to ignore
> the param for -Os, although we don't seem to do that elsewhere.)

That tune is only used by an obsolete core. I ran the memcpy and memset
benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
There is no measurable penalty for using LDP/STP. I'm not sure why it was
ever added given it does not do anything useful. I'll post a separate patch
to remove it to reduce the maintenance overhead.

Cheers,
Wilco


Here is v4 (move MAX_SET_SIZE definition to aarch64.cc):

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.
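
For example (an assumption based on the MAX_SET_SIZE comment in the patch,
not measured output): at -Os the inline limit becomes 96 bytes, which is
expected to expand to one MOVI/DUP plus three STP of Q registers, i.e. about
the same size as a libcall.

/* Largest memset still inlined when optimizing for size.  */
void clear96 (char *p)
{
  __builtin_memset (p, 0, 96);
}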

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
(aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
a5a6b52730d6c5013346d128e89915883f1707ae..62f4eee429c1c5195d54604f1d341a8a5a499d89
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -101,6 +101,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
with its callers.
 
@@ -26321,15 +26325,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
 
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
@@ -26484,45 +26479,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (*dst, src, src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
+  mode = V16QImode;
+  rtx dst1 = adjust_address (dst, mode, offset);
+  emit_insn (aarch64_gen_store_pair (dst1, src, src));
   return;
 }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  

Re: [PATCH v3 2/3] libatomic: Enable LSE128 128-bit atomics for armv9.4-a

2024-01-08 Thread Wilco Dijkstra
Hi Richard,

>> Benchmarking showed that LSE and LSE2 RMW atomics have similar performance
>> once the atomic is acquire, release or both. Given there is already a
>> significant overhead due to the function call, PLT indirection and argument
>> setup, it doesn't make sense to add extra taken branches that may mispredict
>> or cause extra fetch cycles...
>
> Thanks for the extra context, especially wrt the LSE/LSE2 benchmarking.
> If there isn't any difference for acquire vs. the rest, is there a
> justification we can use for keeping the acquire branch, rather than
> using SWPAL for everything except relaxed?

The results showed that acquire is typically slightly faster than release
(5-10%), so for the most frequently used atomics (CAS and SWP) it makes sense
to add support for acquire.  In most cases once you have release semantics,
adding acquire didn't make things slower, so combining release/acq_rel/seq_cst
avoids unnecessary extra branches and keeps the code small.
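
A sketch of the resulting dispatch (illustrative C, not the actual atomic_16.S
code; the LSE128 swap mnemonics are swpp/swppa/swppal): only RELAXED and
ACQUIRE keep their own paths, while RELEASE, ACQ_REL and SEQ_CST share the
fully-ordered sequence.

/* Memory model encoding as used in atomic_16.S.  */
#define RELAXED 0
#define CONSUME 1
#define ACQUIRE 2
#define RELEASE 3
#define ACQ_REL 4
#define SEQ_CST 5

static int swp_variant (int model)
{
  if (model == RELAXED)
    return 0;   /* swpp  */
  if (model == ACQUIRE || model == CONSUME)
    return 1;   /* swppa  */
  return 2;     /* swppal: RELEASE/ACQ_REL/SEQ_CST merged  */
}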

> If so, then Victor, could you include that in the explanation above and
> add it as a source comment?  Although maybe tone down "doesn't make
> sense to add" to something like "doesn't seem worth adding". :)

Yes it's worth adding a comment to this effect.

Cheers,
Wilco

Re: [PATCH v3 2/3] libatomic: Enable LSE128 128-bit atomics for armv9.4-a

2024-01-08 Thread Wilco Dijkstra
Hi,

>> Is there no benefit to using SWPPL for RELEASE here?  Similarly for the
>> others.
>
> We started off implementing all possible memory orderings available. 
> Wilco saw value in merging less restricted orderings into more 
> restricted ones - mainly to reduce codesize in less frequently used atomics.
> 
> This saw us combine RELEASE and ACQ_REL/SEQ_CST cases to make functions 
> a little smaller.

Benchmarking showed that LSE and LSE2 RMW atomics have similar performance once
the atomic is acquire, release or both. Given there is already a significant
overhead due to the function call, PLT indirection and argument setup, it
doesn't make sense to add extra taken branches that may mispredict or cause
extra fetch cycles...

The goal for next GCC is to inline these instructions directly to avoid these 
overheads.

Cheers,
Wilco

Re: [PATCH v3] AArch64: Cleanup memset expansion

2023-12-22 Thread Wilco Dijkstra
v3: rebased to latest trunk

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress & bootstrap.

gcc/ChangeLog:
* config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1178,6 +1178,10 @@ typedef struct
mode that should actually be used.  We allow pairs of registers.  */
 #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
 
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
@@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (*dst, src, src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
+  mode = V16QImode;
+  rtx dst1 = adjust_address (dst, mode, offset);
+  emit_insn (aarch64_gen_store_pair (dst1, src, src));
   return;
 }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
   || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 

Re: [PATCH v2] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-12-04 Thread Wilco Dijkstra
Hi Richard,

>> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible
>> with existing binaries, gives better performance than locking atomics and is
>> what most users expect.
>
> Please add a justification for why it's backwards compatible, rather
> than just stating that it's so.

This isn't any different from the LSE2 support, which also switches some CPUs to
lock-free implementations - this is basically switching the rest.  It trivially
follows from the fact that GCC always calls libatomic, so you switch all atomics
in a process.  I'll add that to the description.

Note the compatibility story is even better than this. We are also compatible
with LLVM and future GCC versions which may inline these sequences.

> Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
> suggests that it's still an open question whether this is a correct thing
> to do, but it sounds from Joseph's comment that he isn't sure whether
> atomic loads from read-only data are valid.

Yes, it's not useful to do an atomic read if it is a read-only value... It should
be feasible to mark atomic types as mutable to force them into .data (see e.g.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659 and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553).
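
As a hypothetical illustration of that corner case (my own example, not from
the PRs above): a const-qualified 16-byte object may end up in .rodata, while
an LL/SC-based atomic load of it performs an implicit store and would fault:

/* Hypothetical example: the compiler may place this in a read-only section,
   but a load implemented with LDXP/STXP writes the location.  */
static const __int128 table_entry = 42;

__int128
read_entry (void)
{
  return __atomic_load_n (&table_entry, __ATOMIC_RELAXED);
}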

> Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
> suggests that a reasonable compromise might be to use a storing
> implementation but not advertise that it is lock-free.  Also,
> the comment above libat_is_lock_free says:
>
> /* Note that this can return that a size/alignment is not lock-free even if
>    all the operations that we use to implement the respective accesses provide
>    lock-free forward progress as specified in C++14:  Users likely expect
>    "lock-free" to also mean "fast", which is why we do not return true if, for
>    example, we implement loads with this size/alignment using a CAS.  */

I don't believe lying about being lock-free like that is a good idea. When
you use a faster lock-free implementation, you want to tell users about it
(so they aren't forced to use nasty inline assembler hacks for example).

> We don't use a CAS for the fallbacks, but like you say, we do use a
> load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16    1

That would result in __atomic_is_lock_free incorrectly returning false.
Note that __atomic_always_lock_free remains false for 128-bit since there
is no inlining in the compiler, but __atomic_is_lock_free should be true.
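
To make the distinction concrete, a small example (illustrative only, assuming
AArch64 with this patch applied):

#include <stdbool.h>

_Atomic __int128 v;

/* Compile-time query: stays false since the compiler never inlines 16-byte
   atomics and always calls libatomic.  */
bool compile_time (void) { return __atomic_always_lock_free (16, 0); }

/* Runtime query: answered by libatomic, which now returns true.  */
bool run_time (void) { return __atomic_is_lock_free (16, &v); }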

> -   /* RELEASE.  */
> -5: ldxp    res0, res1, [x5]
> +   /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4: ldaxp   res0, res1, [x5]
>  stlxp   w4, in0, in1, [x5]
> -   cbnz    w4, 5b
> +   cbnz    w4, 4b
>  ret
> +END (libat_exchange_16)

> Please explain (here and in the commit message) why you're adding
> acquire semantics to the RELEASE case.

That merges the RELEASE case with the ACQ_REL/SEQ_CST cases to keep the code
short and simple, like much of the rest of the code.  I've added a note to the
commit message.

Cheers,
Wilco

Here is v2 - this also incorporates the PR111404 fix to compare-exchange:

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries (as for these GCC always calls into libatomic, so all 128-bit
atomic uses in a process are switched), gives better performance than locking
atomics and is what most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not
supported.  This results in an implicit store which is invisible to software as
long as the given address is writeable (which will be true when using atomics
in actual code).

Passes regress, OK for commit?

libatomic/
* config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
(libat_exchange_16): Merge RELEASE and ACQ_REL/SEQ_CST cases.
* config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..a099037179b3f1210145baea02a9d43418629813
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,22 @@
.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries (as we swap all uses
+   of 128-bit atomics via an ifunc) and gives better performance than locking
+   atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   

Re: [PATCH v3] AArch64: Add inline memmove expansion

2023-12-01 Thread Wilco Dijkstra
Hi Richard,

> +  rtx load[max_ops], store[max_ops];
>
> Please either add a comment explaining why 40 is guaranteed to be
> enough, or (my preference) use:
>
>  auto_vec<std::pair<rtx, rtx>, ...> ops;

I've changed to using auto_vec since that should help reduce conflicts
with Alex's LDP changes.  I double-checked the maximum number of instructions;
with a minor tweak to handle AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
it can now be limited to 12 if you also select -mstrict-align.

v3: update after review, use auto_vec, tweak max_copy_size, add another test.

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather than
being interleaved.  The maximum size is 256 bytes, which requires at most 16
registers.
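
As a usage illustration (not part of the patch), a small overlapping move that
benefits: all 32 bytes are loaded into registers before any store, so no
runtime overlap check is needed:

void
shift_down (char *buf)
{
  /* Overlapping copy: the inline expansion loads buf[1..32] first, then
     stores to buf[0..31].  */
  __builtin_memmove (buf, buf + 1, 32);
}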

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
* config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
Change default.
* config/aarch64/aarch64.md (cpymemdi): Add a parameter.
(movmemdi): Call aarch64_expand_cpymem.
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
simplify, support storing generated loads/stores. 
(aarch64_expand_cpymem): Support expansion of memmove.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
* gcc.target/aarch64/memmove.c: Add new test.
* gcc.target/aarch64/memmove.c: Likewise.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
d2718cc87b306e9673b166cc40e0af2ba72aa17b..d958b181d79440ab1b4f274cc188559edc04c628
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
748b313092c5af452e9526a0c6747c51e598e4ca..26d1485ff6b977caeeb780dfaee739069742e638
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23058,51 +23058,41 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
+
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
+   int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
- address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explicit load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
   mode = V4SImode;
+  rtx src1 = adjust_address (src, mode, offset);
+  rtx src2 = adjust_address (src, mode, offset + 16);
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
   rtx reg1 = gen_reg_rtx (mode);
   rtx reg2 = gen_reg_rtx (mode);
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-   aarch64_progress_pointer (*src)));
-  emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-aarch64_progress_pointer (*dst), 
reg2));
-  /* Move the pointers forward.  */
-  *src = aarch64_move_pointer (*src, 32);
-  *dst = aarch64_move_pointer (*dst, 32);
+  rtx load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+  rtx store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
+  ops.safe_push ({ load, store });
   return;
 }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+  ops.safe_push ({ load, store });
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken

Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-11-30 Thread Wilco Dijkstra
Hi Richard,

Thanks for the review, now committed.

> The new aarch64_split_compare_and_swap code looks a bit twisty.
> The approach in lse.S seems more obvious.  But I'm guessing you
> didn't want to spend any time restructuring the pre-LSE
> -mno-outline-atomics code, and I agree the patch in its current
> form is more backportable.

Indeed this code needs cleaning up - all the complex speculation stuff
should be behind a simple interface. I was thinking of emitting CSEL
here but it would require adding new TI mode patterns or manually
splitting into low/high parts and emitting CSEL.

> I suppose we might want to backport this after it has been in trunk
> for a bit.

Yes that was my plan.

Cheers,
Wilco

Re: [PATCH v2] AArch64: Cleanup memset expansion

2023-11-14 Thread Wilco Dijkstra
Hi Richard,

> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

> So it looks like this assumes we have AdvSIMD.  What about 
> -mgeneral-regs-only?

After my strictalign bugfix
(https://gcc.gnu.org/pipermail/gcc-patches/2023-November/635309.html)
aarch64_expand_setmem starts with:

  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
  || (STRICT_ALIGNMENT && align < 16))
return aarch64_expand_setmem_mops (operands);

Generating perfect code for every STRICT_ALIGNMENT x TARGET_SIMD
x AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS x speed/size combination
would require a huge rewrite - and that's not the goal of this patch.

Cheers,
Wilco

Re: [PATCH v2] AArch64: Cleanup memset expansion

2023-11-14 Thread Wilco Dijkstra
Hi,

>>> I checked codesize on SPECINT2017, and 96 had practically identical size.
>>> Using 128 would also be a reasonable Os value with a very slight size
>>> increase,
>>> and 384 looks good for O2 - however I didn't want to tune these values
>>> as this
>>> is a cleanup patch.
>>>
>>> Cheers,
>>> Wilco
>>
>> Shouldn't this be a param then?  Also, manifest constants in the middle
>> of code are a potential nightmare, please move it to a #define (even if
>> that's then used as the default value for the param).
> 
> I agree on making this a #define but I wouldn't insist on a param.
> Code size IMO has a much more consistent right or wrong answer as it's 
> statically determinable.
> It this was a speed-related param then I'd expect the flexibility for the 
> power user to override such heuristics would be more widely useful.
> But for code size the compiler should always be able to get it right.
> 
> If Richard would still like the param then I'm fine with having the param, 
> but I'd be okay with the comment above and making this a #define.

> I don't immediately have a feel for how sensitive code would be to the 
> precise value here.  Is this value something that might affect 
> individual benchmarks in different ways?  Or something where a future 
> architecture might want a different value?  For either of those reasons 
> a param might be useful, but if this is primarily a code size trade off 
> and the variation in performance is small, then it's probably not 
> worthwhile having an additional hook.

These are just settings that are good for -Os and -O2. I might tune them once
every 5 years or so, but that's all that is needed. I don't believe there is any
value in giving users too many unnecessary options. Adding the configurable
MOPS settings introduced several bugs that went unnoticed despite multiple
code reviews, so doing this creates extra testing and maintenance overheads.

Cheers,
Wilco

---
v2: Add define MAX_SET_SIZE

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
2f0777a37acccb787200d15ae89ec186b4221748..1d98b48db43e09ecf8c4289a8cd4fc55cc2c8a26
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -998,6 +998,10 @@ typedef struct
mode that should actually be used.  We allow pairs of registers.  */
 #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5a22b576710e29795d65ddf3face9e8587b1df88..83a18b35729ddd07a1925f53a77bc21c9ac7ca36
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25415,8 +25415,7 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
@@ -25597,46 +25596,22 @@ aarch64_expand_cpymem (rtx *operands)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-aarch64_progress_pointer (*dst), src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if 

Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-11-10 Thread Wilco Dijkstra
Hi Kyrill,

> +  if (!(hwcap & HWCAP_CPUID))
> +    return false;
> +
> +  unsigned long midr;
> +  asm volatile ("mrs %0, midr_el1" : "=r" (midr));

> From what I recall that midr_el1 register is emulated by the kernel and so 
> userspace software
> has to check that the kernel supports that emulation through hwcaps before 
> reading it.
> According to 
> https://www.kernel.org/doc/html/v5.8/arm64/cpu-feature-registers.html you
> need to check getauxval(AT_HWCAP) & HWCAP_CPUID) before doing that read.

That's why I do that immediately before reading midr_el1 - see above.
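
For reference, a minimal standalone sketch of the same check (the ifunc in the
patch receives hwcap as an argument; this version fetches it via getauxval):

#include <stdbool.h>
#include <sys/auxv.h>	/* getauxval, AT_HWCAP.  */
#include <asm/hwcap.h>	/* HWCAP_USCAT, HWCAP_CPUID on AArch64 Linux.  */

static bool
have_lock_free_16byte_ldst (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  if (hwcap & HWCAP_USCAT)
    return true;
  /* Only read the kernel-emulated MIDR_EL1 once HWCAP_CPUID confirms the
     kernel traps and emulates this access for userspace.  */
  if (!(hwcap & HWCAP_CPUID))
    return false;
  unsigned long midr;
  asm volatile ("mrs %0, midr_el1" : "=r" (midr));
  /* Neoverse N1: implementer 'A' (Arm), part number 0xd0c.  */
  return ((midr >> 24) & 255) == 'A' && ((midr >> 4) & 0xfff) == 0xd0c;
}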

Cheers,
Wilco

Re: [PATCH] AArch64: Cleanup memset expansion

2023-11-10 Thread Wilco Dijkstra
Hi Kyrill,

> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_set_size = 96;
> +

>  This is a new "magic" number in this code. It looks sensible, but how 
> did you arrive at it?

We need 1 instruction to create the value to store (DUP or MOVI) and 1 STP
for every 32 bytes, so the 96 means 4 instructions for typical sizes (sizes not
a multiple of 16 can add one extra instruction).
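
As an illustration of that count (expected shape only, not verified compiler
output), a 96-byte clear with -Os comes to one MOVI plus three STPs:

void
clear96 (char *p)
{
  __builtin_memset (p, 0, 96);
}

/* Roughly:
	movi	v0.16b, 0
	stp	q0, q0, [x0]
	stp	q0, q0, [x0, 32]
	stp	q0, q0, [x0, 64]
	ret  */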

I checked codesize on SPECINT2017, and 96 had practically identical size.
Using 128 would also be a reasonable Os value with a very slight size increase, 
and 384 looks good for O2 - however I didn't want to tune these values as this
is a cleanup patch.

Cheers,
Wilco

Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-11-06 Thread Wilco Dijkstra


ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not
supported.  This results in an implicit store which is invisible to software as
long as the given address is writeable (which will be true when using atomics
in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
    PR target/110061
    * config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
    * config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
    State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat__16 entry points are ARMv8.0.
+   The libat__16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0

Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-11-06 Thread Wilco Dijkstra
 

ping


From: Wilco Dijkstra
Sent: 04 August 2023 16:05
To: GCC Patches ; Richard Sandiford 

Cc: Kyrylo Tkachov 
Subject: [PATCH] libatomic: Improve ifunc selection on AArch64 
 

Add support for ifunc selection based on CPUID register.  Neoverse N1 supports
atomic 128-bit load/store, so use the FEAT_USCAT ifunc like newer Neoverse
cores.

Passes regress, OK for commit?

libatomic/
    config/linux/aarch64/host-config.h (ifunc1): Use CPUID in ifunc
    selection.

---

diff --git a/libatomic/config/linux/aarch64/host-config.h 
b/libatomic/config/linux/aarch64/host-config.h
index 
851c78c01cd643318aaa52929ce4550266238b79..e5dc33c030a4bab927874fa6c69425db463fdc4b
 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -26,7 +26,7 @@
 
 #ifdef HWCAP_USCAT
 # if N == 16
-#  define IFUNC_COND_1 (hwcap & HWCAP_USCAT)
+#  define IFUNC_COND_1 ifunc1 (hwcap)
 # else
 #  define IFUNC_COND_1  (hwcap & HWCAP_ATOMICS)
 # endif
@@ -50,4 +50,28 @@
 #undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
 #define MAYBE_HAVE_ATOMIC_EXCHANGE_16   1
 
+#ifdef HWCAP_USCAT
+
+#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)
+#define MIDR_PARTNUM(midr) (((midr) >> 4) & 0xfff)
+
+static inline bool
+ifunc1 (unsigned long hwcap)
+{
+  if (hwcap & HWCAP_USCAT)
+    return true;
+  if (!(hwcap & HWCAP_CPUID))
+    return false;
+
+  unsigned long midr;
+  asm volatile ("mrs %0, midr_el1" : "=r" (midr));
+
+  /* Neoverse N1 supports atomic 128-bit load/store.  */
+  if (MIDR_IMPLEMENTOR (midr) == 'A' && MIDR_PARTNUM(midr) == 0xd0c)
+    return true;
+
+  return false;
+}
+#endif
+
 #include_next 

Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-11-06 Thread Wilco Dijkstra

 
ping
 

__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails.  In this case the value
returned is not read atomically.
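
For illustration (not part of the patch), a use that hits this: when the
comparison fails, the value returned below must still be a single atomic
snapshot of *p:

__int128
cas_observe (__int128 *p, __int128 expected, __int128 desired)
{
  /* On failure the old contents are returned; without the store in the
     LL/SC loop that read-back is not guaranteed to be atomic.  */
  return __sync_val_compare_and_swap (p, expected, desired);
}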

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
    PR target/111404
    * config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
    For 128-bit store the loaded value and loop if needed.

libgcc/ChangeLog/
    PR target/111404
    * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
    either new value or loaded value.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
   mem = operands[1];
   oldval = operands[2];
   newval = operands[3];
-  is_weak = (operands[4] != const0_rtx);
   model_rtx = operands[5];
   scratch = operands[7];
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));
+  is_weak = operands[4] != const0_rtx && mode != TImode;
 
   /* When OLDVAL is zero and we want the strong version we can emit a tighter
 loop:
@@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
   else
 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
 
+  /* 128-bit LDAXP is not atomic unless STLXP succeeds.  So for a mismatch,
+ store the returned value and loop if the STLXP fails.  */
+  if (mode == TImode)
+    {
+  rtx_code_label *label3 = gen_label_rtx ();
+  emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+  emit_barrier ();
+
+  emit_label (label2);
+  aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+  if (aarch64_track_speculation)
+   {
+ /* Emit an explicit compare instruction, so that we can correctly
+    track the condition codes.  */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+   }
+  else
+   x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+   gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+  label2 = label3;
+    }
+
   emit_label (label2);
 
   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define tmp0    16
 #define tmp1    17
 #define tmp2    15
+#define tmp3   14
+#define tmp4   13
 
 #define BTI_C   hint    34
 
@@ -233,10 +235,11 @@ STARTFN   NAME(cas)
 0:  LDXP    x0, x1, [x4]
 cmp x0, x(tmp0)
 ccmp    x1, x(tmp1), #0, eq
-   bne 1f
-   STXP    w(tmp2), x2, x3, [x4]
-   cbnz    w(tmp2), 0b
-1: BARRIER
+   csel    x(tmp2), x2, x0, eq
+   csel    x(tmp3), x3, x1, eq
+   STXP    w(tmp4), x(tmp2), x(tmp3), [x4]
+   cbnz    w(tmp4), 0b
+   BARRIER
 ret
 
 #endif

Re: [PATCH] AArch64: Cleanup memset expansion

2023-11-06 Thread Wilco Dijkstra
ping
 
Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
    * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
    (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
    (aarch64_expand_setmem): Clean up implementation, use byte offsets,
    simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e19592553fc873b3e73f9f218
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25229,15 +25229,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
 next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
@@ -25393,46 +25384,22 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-    aarch64_progress_pointer (*dst), src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
+{
+  /* Emit explicit store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
+  mode = V16QImode;
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
+  emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
   return;
 }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25461,7 +25428,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -25474,104 +25441,70 @@ aarch64_expand_setmem (rtx *operands)
   || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_set_size = 96;
+
   len = UINTVAL (operands[1]);
 
   /* Large memset uses MOPS when available or a library call.  */
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
 return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
- 3 instructions for the memory storing
- + 1 to move the constant size into a reg
- + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the 

Re: [PATCH v2] AArch64: Add inline memmove expansion

2023-11-06 Thread Wilco Dijkstra
ping
 
v2: further cleanups, improved comments

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather than
being interleaved.  The maximum size is 256 bytes, which requires at most 16
registers.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
    * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
    Change default.
    * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
    (movmemdi): Call aarch64_expand_cpymem.
    * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
    simplify, support storing generated loads/stores. 
    (aarch64_expand_cpymem): Support expansion of memmove.
    * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
    * gcc.target/aarch64/memmove.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+   int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
- address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explicit load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
   mode = V4SImode;
+  rtx src1 = adjust_address (src, mode, offset);
+  rtx src2 = adjust_address (src, mode, offset + 16);
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
   rtx reg1 = gen_reg_rtx (mode);
   rtx reg2 = gen_reg_rtx (mode);
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-   aarch64_progress_pointer (*src)));
-  emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-    aarch64_progress_pointer (*dst), 
reg2));
-  /* Move the pointers forward.  */
-  *src = aarch64_move_pointer (*src, 32);
-  *dst = aarch64_move_pointer (*dst, 32);
+  *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+  *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
   return;
 }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool 
is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a 

Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-11-06 Thread Wilco Dijkstra

ping
 
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the condition for when to use MOPS.
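
An example of a copy that is now routed to MOPS or a libcall (hypothetical,
for illustration only): with -mstrict-align and a destination only known to be
1-byte aligned, the inline Q-register expansion would emit unaligned accesses:

void
copy64 (char *dst, const char *src)
{
  /* align == 1 here, so with -mstrict-align the inline expansion must
     not be used.  */
  __builtin_memcpy (dst, src, 64);
}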
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
    PR target/103100
    * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
    (setmemdi): Likewise.
    * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
    strict-align.  Cleanup condition for using MOPS.
    (aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do certain trailing copies as overlapping if it's going to be
-    cheaper.  i.e. less instructions to do so.  For instance doing a 15
-    byte copy it's more efficient to do two overlapping 8 byte copies than
-    8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+  /* Emit trailing writes using overlapping unaligned accesses
+   (when 

[PATCH v2] AArch64: Improve immediate generation

2023-10-24 Thread Wilco Dijkstra
v2: Use check-function-bodies in tests

Further improve immediate generation by adding support for 2-instruction
MOV/EOR bitmask immediates.  This reduces the number of 3/4-instruction
immediates in SPECCPU2017 by ~2%.
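
For illustration, a standalone sketch of the search (a simplified
re-implementation of the idea, not GCC's aarch64_bitmask_imm): replicate each
16-bit chunk of the value across the register and accept the split when both
the replicated value and the XOR remainder are valid logical (bitmask)
immediates:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified logical-immediate test: the value must be a repetition of a
   2/4/8/16/32/64-bit element that is a rotated contiguous run of ones.  */
static bool
is_bitmask_imm (uint64_t v)
{
  if (v == 0 || ~v == 0)
    return false;
  for (int size = 2; size <= 64; size *= 2)
    {
      uint64_t mask = size == 64 ? ~0ULL : (1ULL << size) - 1;
      uint64_t elt = v & mask;
      bool replicated = true;
      for (int i = size; i < 64; i += size)
	if (((v >> i) & mask) != elt)
	  {
	    replicated = false;
	    break;
	  }
      if (!replicated || elt == 0 || elt == mask)
	continue;
      /* A rotated run of ones has exactly one 0->1 transition circularly.  */
      uint64_t rot = ((elt << 1) | (elt >> (size - 1))) & mask;
      return __builtin_popcountll (elt ^ rot) == 2;
    }
  return false;
}

int
main (void)
{
  /* 0x9999999999999999 ^ 0x8000000000000001, cf. f4 in the new test.  */
  uint64_t val = 0x1999999999999998;
  for (int i = 0; i < 64; i += 16)
    {
      uint64_t val2 = (val >> i) & 0xffff;
      val2 |= val2 << 16;
      val2 |= val2 << 32;
      if (is_bitmask_imm (val2) && is_bitmask_imm (val ^ val2))
	{
	  /* Prints mov 0x9999999999999999; eor 0x8000000000000001.  */
	  printf ("mov 0x%016llx; eor 0x%016llx\n",
		  (unsigned long long) val2,
		  (unsigned long long) (val ^ val2));
	  break;
	}
    }
  return 0;
}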

Passes regress, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
Add support for immediates using MOV/EOR bitmask.

gcc/testsuite:
* gcc.target/aarch64/imm_choice_comparison.c: Change tests.
* gcc.target/aarch64/moveor_imm.c: Add new test.
* gcc.target/aarch64/pr106583.c: Change tests.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
578a253d6e0e133e19592553fc873b3e73f9f218..ed5be2b64c9a767d74e9d78415da964c669001aa
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5748,6 +5748,26 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
}
  return 2;
}
+
+  /* Try 2 bitmask immediates which are xor'd together. */
+  for (i = 0; i < 64; i += 16)
+   {
+ val2 = (val >> i) & mask;
+ val2 |= val2 << 16;
+ val2 |= val2 << 32;
+ if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
+   break;
+   }
+
+  if (i != 64)
+   {
+ if (generate)
+   {
+ emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+ emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
+   }
+ return 2;
+   }
 }
 
   /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c 
b/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
index 
ebc44d6dbc7287d907603d77d7b54496de177c4b..a1fc90ad73411ae8ed848fa321586afcb8d710aa
 100644
--- a/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
+++ b/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
@@ -1,32 +1,64 @@
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
 
 /* Go from four moves to two.  */
 
+/*
+** foo:
+** mov w[0-9]+, 2576980377
+** movkx[0-9]+, 0x, lsl 32
+** ...
+*/
+
 int
 foo (long long x)
 {
-  return x <= 0x1998;
+  return x <= 0x9998;
 }
 
+/*
+** GT:
+** mov w[0-9]+, -16777217
+** ...
+*/
+
 int
 GT (unsigned int x)
 {
   return x > 0xfefe;
 }
 
+/*
+** LE:
+** mov w[0-9]+, -16777217
+** ...
+*/
+
 int
 LE (unsigned int x)
 {
   return x <= 0xfefe;
 }
 
+/*
+** GE:
+** mov w[0-9]+, 4278190079
+** ...
+*/
+
 int
 GE (long long x)
 {
   return x >= 0xff00;
 }
 
+/*
+** LT:
+** mov w[0-9]+, -16777217
+** ...
+*/
+
 int
 LT (int x)
 {
@@ -35,6 +67,13 @@ LT (int x)
 
 /* Optimize the immediate in conditionals.  */
 
+/*
+** check:
+** ...
+** mov w[0-9]+, -16777217
+** ...
+*/
+
 int
 check (int x, int y)
 {
@@ -44,11 +83,15 @@ check (int x, int y)
   return x;
 }
 
+/*
+** tern:
+** ...
+** mov w[0-9]+, -16777217
+** ...
+*/
+
 int
 tern (int x)
 {
   return x >= 0xff00 ? 5 : -3;
 }
-
-/* baz produces one movk instruction.  */
-/* { dg-final { scan-assembler-times "movk" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/moveor_imm.c 
b/gcc/testsuite/gcc.target/aarch64/moveor_imm.c
new file mode 100644
index 
..1c0c3f3bf8c588f9661112a8b3f9a72c5ddff95c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/moveor_imm.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** f1:
+** mov x0, -6148914691236517206
+** eor x0, x0, -9223372036854775807
+** ret
+*/
+
+long f1 (void)
+{
+  return 0x2aab;
+}
+
+/*
+** f2:
+** mov x0, -1085102592571150096
+** eor x0, x0, -2305843009213693951
+** ret
+*/
+
+long f2 (void)
+{
+  return 0x10f0f0f0f0f0f0f1;
+}
+
+/*
+** f3:
+** mov x0, -3689348814741910324
+** eor x0, x0, -4611686018427387903
+** ret
+*/
+
+long f3 (void)
+{
+  return 0xccd;
+}
+
+/*
+** f4:
+** mov x0, -7378697629483820647
+** eor x0, x0, -9223372036854775807
+** ret
+*/
+
+long f4 (void)
+{
+  return 0x1998;
+}
+
+/*
+** f5:
+** mov x0, 3689348814741910323
+** eor x0, x0, 864691128656461824
+** ret
+*/
+
+long f5 (void)
+{
+  return 0x3f333f33;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c 
b/gcc/testsuite/gcc.target/aarch64/pr106583.c
index 
0f931580817d78dc1cc58f03b251bd21bec71f59..63df7395edf9491720e3601848e15aa773c51e6d
 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr106583.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -1,41 +1,94 @@
-/* { dg-do assemble } */
-/* { dg-options "-O2 --save-temps" } */
+/* { dg-do compile } */
+/* { dg-options "-O2" } 

[PATCH] AArch64: Cleanup memset expansion

2023-10-19 Thread Wilco Dijkstra
Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
(aarch64_expand_setmem): Clean up implementation, use byte offsets,
simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e19592553fc873b3e73f9f218
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25229,15 +25229,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
@@ -25393,46 +25384,22 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-   machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
- the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-{
-  mode = GET_MODE (src);
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-aarch64_progress_pointer (*dst), src));
-
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 32);
-  return;
-}
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
+{
+  /* Emit explicit store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, GET_MODE (src), 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, src);
-  /* Move the pointers forward.  */
-  *dst = aarch64_move_pointer (*dst, 16);
+  mode = V16QImode;
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
+  emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
   return;
 }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25461,7 +25428,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -25474,104 +25441,70 @@ aarch64_expand_setmem (rtx *operands)
   || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+max_set_size = 96;
+
   len = UINTVAL (operands[1]);
 
   /* Large memset uses MOPS when available or a library call.  */
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
 return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
- 3 instructions for the memory storing
- + 1 to move the constant size into a reg
- + 1 if VAL is a non-zero constant to move into a reg
-(zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst 

[PATCH] AArch64: Improve immediate generation

2023-10-19 Thread Wilco Dijkstra
Further improve immediate generation by adding support for 2-instruction
MOV/EOR bitmask immediates.  This reduces the number of 3/4-instruction
immediates in SPECCPU2017 by ~2%.

Passes regress, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate)
Add support for immediates using MOV/EOR bitmask.

gcc/testsuite:
* gcc.target/aarch64/imm_choice_comparison.c: Fix test.
* gcc.target/aarch64/moveor_imm.c: Add new test.
* gcc.target/aarch64/pr106583.c: Fix test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
578a253d6e0e133e19592553fc873b3e73f9f218..ed5be2b64c9a767d74e9d78415da964c669001aa
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5748,6 +5748,26 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
}
  return 2;
}
+
+  /* Try 2 bitmask immediates which are xor'd together. */
+  for (i = 0; i < 64; i += 16)
+   {
+ val2 = (val >> i) & mask;
+ val2 |= val2 << 16;
+ val2 |= val2 << 32;
+ if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
+   break;
+   }
+
+  if (i != 64)
+   {
+ if (generate)
+   {
+ emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
+ emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
+   }
+ return 2;
+   }
 }
 
   /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c 
b/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
index 
ebc44d6dbc7287d907603d77d7b54496de177c4b..2434ca380ca2cad3e1e4181deeaad680f518b866
 100644
--- a/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
+++ b/gcc/testsuite/gcc.target/aarch64/imm_choice_comparison.c
@@ -6,7 +6,7 @@
 int
 foo (long long x)
 {
-  return x <= 0x1998;
+  return x <= 0x9998;
 }
 
 int
diff --git a/gcc/testsuite/gcc.target/aarch64/moveor_imm.c 
b/gcc/testsuite/gcc.target/aarch64/moveor_imm.c
new file mode 100644
index 
..5f4997b50398fdda5924610959e0c54967ad0735
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/moveor_imm.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x2aab;
+}
+
+long f2 (void)
+{
+  return 0x10f0f0f0f0f0f0f1;
+}
+
+long f3 (void)
+{
+  return 0xccd;
+}
+
+long f4 (void)
+{
+  return 0x1998;
+}
+
+long f5 (void)
+{
+  return 0x3f333f33;
+}
+
+/* { dg-final { scan-assembler-not {\tmovk\t} } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 5 } } */
+/* { dg-final { scan-assembler-times {\teor\t} 5 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr106583.c 
b/gcc/testsuite/gcc.target/aarch64/pr106583.c
index 
0f931580817d78dc1cc58f03b251bd21bec71f59..79ada5160ce059d66eeaee407ca02488b2a1f114
 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr106583.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr106583.c
@@ -3,7 +3,7 @@
 
 long f1 (void)
 {
-  return 0x7efefefefefefeff;
+  return 0x75fefefefefefeff;
 }
 
 long f2 (void)



Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-10-16 Thread Wilco Dijkstra
Hi Ramana,

> I remember this to be the previous discussions and common understanding.
>
> https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html
>
> and here
> 
> https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html
>
> Can you point any discussion recently that shows this has changed and
> point me at that discussion if any anywhere ? I can't find it in my
> searches . Perhaps you've had the discussion some place to show it has
> changed.

Here are some more recent discussions about atomics; e.g. this one has good
arguments from developers wanting lock-free atomics:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80878

We also had some discussion about how we could handle the read-only corner
case by either giving a warning/error on const pointers to atomics or
ensuring _Atomic variables are writeable:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553

My conclusion from that is that nobody cared enough to fix this for x86
in all these years, so it's not seen as an important issue.

We've had several internal design discussions to figure out how to fix the ABI
issues. The conclusion was that this is the only possible solution that makes
GCC and LLVM compatible without breaking backwards compatibility. It also
allows use of newer atomic instructions (which people want inlined).

Cheers,
Wilco

Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-10-16 Thread Wilco Dijkstra
ping
 

__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails.  In this case the value
returned is not read atomically.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
    PR target/111404
    * config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
    For 128-bit store the loaded value and loop if needed.

libgcc/ChangeLog/
    PR target/111404
    * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
    either new value or loaded value.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
   mem = operands[1];
   oldval = operands[2];
   newval = operands[3];
-  is_weak = (operands[4] != const0_rtx);
   model_rtx = operands[5];
   scratch = operands[7];
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));
+  is_weak = operands[4] != const0_rtx && mode != TImode;
 
   /* When OLDVAL is zero and we want the strong version we can emit a tighter
 loop:
@@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
   else
 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
 
+  /* 128-bit LDAXP is not atomic unless STLXP succeeds.  So for a mismatch,
+ store the returned value and loop if the STLXP fails.  */
+  if (mode == TImode)
+    {
+  rtx_code_label *label3 = gen_label_rtx ();
+  emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+  emit_barrier ();
+
+  emit_label (label2);
+  aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+  if (aarch64_track_speculation)
+   {
+ /* Emit an explicit compare instruction, so that we can correctly
+    track the condition codes.  */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+   }
+  else
+   x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+   gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+  label2 = label3;
+    }
+
   emit_label (label2);
 
   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define tmp0    16
 #define tmp1    17
 #define tmp2    15
+#define tmp3   14
+#define tmp4   13
 
 #define BTI_C   hint    34
 
@@ -233,10 +235,11 @@ STARTFN   NAME(cas)
 0:  LDXP    x0, x1, [x4]
 cmp x0, x(tmp0)
 ccmp    x1, x(tmp1), #0, eq
-   bne 1f
-   STXP    w(tmp2), x2, x3, [x4]
-   cbnz    w(tmp2), 0b
-1: BARRIER
+   csel    x(tmp2), x2, x0, eq
+   csel    x(tmp3), x3, x1, eq
+   STXP    w(tmp4), x(tmp2), x(tmp3), [x4]
+   cbnz    w(tmp4), 0b
+   BARRIER
 ret
 
 #endif


Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-10-16 Thread Wilco Dijkstra
 

ping


From: Wilco Dijkstra
Sent: 04 August 2023 16:05
To: GCC Patches ; Richard Sandiford 

Cc: Kyrylo Tkachov 
Subject: [PATCH] libatomic: Improve ifunc selection on AArch64 
 

Add support for ifunc selection based on CPUID register.  Neoverse N1 supports
atomic 128-bit load/store, so use the FEAT_USCAT ifunc like newer Neoverse
cores.
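
To make the MIDR decoding below easier to follow, here is the same field
extraction as a stand-alone sketch (the helper name is made up; the shifts and
the Neoverse N1 values are the ones used in the patch):

/* Illustrative only: the implementer is MIDR_EL1 bits [31:24], the part
   number is bits [15:4].  Neoverse N1 is implementer 'A' (Arm, 0x41)
   with part number 0xd0c.  */
static inline int
midr_is_neoverse_n1 (unsigned long midr)
{
  unsigned implementer = (midr >> 24) & 255;
  unsigned partnum = (midr >> 4) & 0xfff;
  return implementer == 'A' && partnum == 0xd0c;
}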

Passes regress, OK for commit?

libatomic/
    config/linux/aarch64/host-config.h (ifunc1): Use CPUID in ifunc
    selection.

---

diff --git a/libatomic/config/linux/aarch64/host-config.h 
b/libatomic/config/linux/aarch64/host-config.h
index 
851c78c01cd643318aaa52929ce4550266238b79..e5dc33c030a4bab927874fa6c69425db463fdc4b
 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -26,7 +26,7 @@
 
 #ifdef HWCAP_USCAT
 # if N == 16
-#  define IFUNC_COND_1 (hwcap & HWCAP_USCAT)
+#  define IFUNC_COND_1 ifunc1 (hwcap)
 # else
 #  define IFUNC_COND_1  (hwcap & HWCAP_ATOMICS)
 # endif
@@ -50,4 +50,28 @@
 #undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
 #define MAYBE_HAVE_ATOMIC_EXCHANGE_16   1
 
+#ifdef HWCAP_USCAT
+
+#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)
+#define MIDR_PARTNUM(midr) (((midr) >> 4) & 0xfff)
+
+static inline bool
+ifunc1 (unsigned long hwcap)
+{
+  if (hwcap & HWCAP_USCAT)
+    return true;
+  if (!(hwcap & HWCAP_CPUID))
+    return false;
+
+  unsigned long midr;
+  asm volatile ("mrs %0, midr_el1" : "=r" (midr));
+
+  /* Neoverse N1 supports atomic 128-bit load/store.  */
+  if (MIDR_IMPLEMENTOR (midr) == 'A' && MIDR_PARTNUM(midr) == 0xd0c)
+    return true;
+
+  return false;
+}
+#endif
+
 #include_next <host-config.h>

Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-10-16 Thread Wilco Dijkstra
 

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.
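
As a usage sketch (not part of the patch), the kind of code that benefits is any
16-byte atomic serviced by libatomic, for example:

/* Hypothetical example: 16-byte atomics handled by libatomic.  With this
   patch these become lock-free on all AArch64 cores; without LSE2 the
   load uses an LDXP/STXP loop, which implies a store to the location.  */
#include <stdatomic.h>

_Atomic __int128 shared;

__int128 read_shared (void)
{
  return atomic_load_explicit (&shared, memory_order_acquire);
}

void write_shared (__int128 v)
{
  atomic_store_explicit (&shared, v, memory_order_release);
}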

Passes regress, OK for commit?

libatomic/
    PR target/110061
    config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
    config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
    State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0

Re: [PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-10-16 Thread Wilco Dijkstra
ping
 
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.
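
As an illustration of the affected case (hypothetical, not a new testcase): with
-mstrict-align and only byte alignment known, a copy like the one below can no
longer be expanded with unaligned LDP/STP and now goes through MOPS or a memcpy
call:

/* Hypothetical example for -mstrict-align: the packed struct gives the
   pointers no usable alignment, so the inline unaligned load/store
   sequence is not legal and the expansion falls back to MOPS/libcall.  */
#include <string.h>

struct __attribute__((packed)) blob { char data[64]; };

void
copy_blob (struct blob *dst, const struct blob *src)
{
  memcpy (dst, src, sizeof (struct blob));
}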
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
    PR target/103100
    * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
    (setmemdi): Likewise.
    * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
    strict-align.  Cleanup condition for using MOPS.
    (aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do certain trailing copies as overlapping if it's going to be
-    cheaper.  i.e. less instructions to do so.  For instance doing a 15
-    byte copy it's more efficient to do two overlapping 8 byte copies than
-    8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+  /* Emit trailing writes using overlapping unaligned accesses
+   (when 

[PATCH v2] AArch64: Add inline memmove expansion

2023-10-16 Thread Wilco Dijkstra
v2: further cleanups, improved comments

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather than
being interleaved.  The maximum size is 256 bytes, which requires at most 16
registers.
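
As a sketch of what now gets expanded inline (hypothetical; the real coverage is
in the new gcc.target/aarch64/memmove.c test), a small fixed-size overlapping copy
such as:

/* Hypothetical example: a 32-byte memmove expanded inline as LDP/STP,
   with both loads emitted before the stores so the overlap is safe.  */
#include <string.h>

void
shift_down (char *buf)
{
  memmove (buf, buf + 8, 32);
}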

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
* config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
Change default.
* config/aarch64/aarch64.md (cpymemdi): Add a parameter.
(movmemdi): Call aarch64_expand_cpymem.
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
simplify, support storing generated loads/stores. 
(aarch64_expand_cpymem): Support expansion of memmove.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
* gcc.target/aarch64/memmove.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+   int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
- address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explict load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
 {
   mode = V4SImode;
+  rtx src1 = adjust_address (src, mode, offset);
+  rtx src2 = adjust_address (src, mode, offset + 16);
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
   rtx reg1 = gen_reg_rtx (mode);
   rtx reg2 = gen_reg_rtx (mode);
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-   aarch64_progress_pointer (*src)));
-  emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-aarch64_progress_pointer (*dst), 
reg2));
-  /* Move the pointers forward.  */
-  *src = aarch64_move_pointer (*src, 32);
-  *dst = aarch64_move_pointer (*dst, 32);
+  *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+  *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
   return;
 }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool 
is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to

Re: [PATCH v2] ARM: Block predication on atomics [PR111235]

2023-10-02 Thread Wilco Dijkstra
Hi Ramana,

>> I used --target=arm-none-linux-gnueabihf --host=arm-none-linux-gnueabihf
>> --build=arm-none-linux-gnueabihf --with-float=hard. However it seems that the
>> default armhf settings are incorrect. I shouldn't need the --with-float=hard 
>> since
>> that is obviously implied by armhf, and they should also imply armv7-a with 
>> vfpv3
>> according to documentation. It seems to get confused and skip some tests. I 
>> tried
>> using --with-fpu=auto, but that doesn't work at all, so in the end I forced 
>> it like:
>> --with-arch=armv8-a --with-fpu=neon-fp-armv8. With this it runs a few more 
>> tests.
> 
> Yeah that's a wart that I don't like.
> 
> armhf just implies the hard float ABI and came into being to help
> distinguish from the Base PCS for some of the distros at the time
> (2010s). However we didn't want to set a baseline arch at that time
> given the imminent arrival of v8-a and thus the specification of
> --with-arch , --with-fpu and --with-float became second nature to many
> of us working on it at that time.

Looking at it, the default is indeed incorrect, you get:
'-mcpu=arm10e' '-mfloat-abi=hard' '-marm' '-march=armv5te+fp'

That's like 25 years out of date!

However all the armhf distros have Armv7-a as the baseline and use Thumb-2:
'-mfloat-abi=hard' '-mthumb' '-march=armv7-a+fp'

So the issue is that dg-require-effective-target arm_arch_v7a_ok doesn't work on
armhf. It seems that if you specify an architecture even with hard-float configured,
it turns off FP and then complains because hard-float implies you must have FP...

So in most configurations (including the one used by distro compilers) we basically
skip lots of tests for no apparent reason...

> Ok, thanks for promising to do so - I trust you to get it done. Please
> try out various combinations of -march v7ve, v7-a , v8-a with the tool
> as each of them have slightly different rules. For instance v7ve
> allows LDREXD and STREXD to be single copy atomic for 64 bit loads
> whereas v7-a did not .

You mean LDRD may be generated on CPUs with LPAE. We use LDREXD by
default since that is always atomic on v7-a.

> Ok if no regressions but as you might get nagged by the post commit CI ...

Thanks, I've committed it. Those links don't show anything concrete; however, I do
note that the CI didn't pick up v2.

Btw you're happy with backports if there are no issues reported for a few days?

Cheers,
Wilco

Re: [PATCH v2] ARM: Block predication on atomics [PR111235]

2023-09-27 Thread Wilco Dijkstra
Hi Ramana,

> Hope this helps.

Yes definitely!

>> Passes regress/bootstrap, OK for commit?
>
> Target ? armhf ? --with-arch , -with-fpu , -with-float parameters ?
> Please be specific.

I used --target=arm-none-linux-gnueabihf --host=arm-none-linux-gnueabihf
--build=arm-none-linux-gnueabihf --with-float=hard. However it seems that the
default armhf settings are incorrect. I shouldn't need the --with-float=hard since
that is obviously implied by armhf, and they should also imply armv7-a with vfpv3
according to documentation. It seems to get confused and skip some tests. I tried
using --with-fpu=auto, but that doesn't work at all, so in the end I forced it like:
--with-arch=armv8-a --with-fpu=neon-fp-armv8. With this it runs a few more tests.

> Since these patterns touch armv8m.baseline can you find all the
> testcases in the testsuite and ensure no change in code for
> armv8m.baseline as that's unpredicated already and this patch brings
> this in line with the same ? Does the testsuite already cover these
> arch variants and are you satisfied that the tests in the testsuite
> can catch / don't make any additional code changes to the other
> architectures affected by this ?

There are various v8-m(.base/.main) tests and they all pass. The generated
code is generally unchanged if there was no conditional execution. I made
the new UNSPEC_LDR/STR patterns support offsets so there is no difference
in generated code for relaxed loads/stores (since they used to use a plain
load/store which has an immediate offset).

>> * onfig/arm/sync.md (arm_atomic_load): Add new pattern.
>
> Nit: s/onfig/config

Fixed.

>> (atomic_load): Always expand atomic loads explicitly.
>> (atomic_store): Always expand atomic stores explicitly.
>
> Nit: Change message to :
> Switch patterns to define_expand.

Fixed.

> Largely looks ok though I cannot work out tonight if we need more v8-a
> or v8m-baseline specific tests for scan-assembler patterns.
>
> Clearly our testsuite doesn't catch it , so perhaps the OP could help
> validate this patch with their formal models to see if this fixes
> these set of issues and creates no new regressions ? Is that feasible
> to do ?

Disabling conditional execution avoids the issue. It's trivial to verify that
atomics can no longer be conditionally executed (no "%?"). When this is
committed, we can run the random testing again to confirm the issue
is no longer present.

> -(define_insn "atomic_load"
> -  [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
> +(define_insn "arm_atomic_load"
> +  [(set (match_operand:QHSI 0 "register_operand" "=r,l")
>  (unspec_volatile:QHSI
> -  [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
> -   (match_operand:SI 2 "const_int_operand" "n,Pf,n")]  ;; model
> +  [(match_operand:QHSI 1 "memory_operand" "m,m")]
>
> Remind me again why is it safe to go from the Q constraint to the m
> constraint here and everywhere else you've done this ?

That's because the relaxed loads/stores use LDR/STR wrapped in an
UNSPEC. To avoid regressions we have to use 'm' so that an immediate
offset can be merged into the memory access.
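
A small illustration of why that matters (hypothetical example, not from the
testsuite): a relaxed atomic load of a struct field should keep its offset folded
into the LDR rather than needing a separate address computation:

/* Hypothetical example: with the 'm' constraint the offset of 'count'
   is merged into the load, e.g. "ldr r0, [r0, #4]", instead of an add
   followed by a zero-offset load as the 'Q' constraint would force.  */
#include <stdatomic.h>

struct stats { int id; atomic_int count; };

int
get_count (struct stats *s)
{
  return atomic_load_explicit (&s->count, memory_order_relaxed);
}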

>> -  VUNSPEC_LDA  ; Represent a store-register-acquire.
>> +  VUNSPEC_LDR  ; Represent a load-register-relaxed.
>> +  VUNSPEC_LDA  ; Represent a load-register-acquire.
>
> Nit: LDA before LDR ? Though I suspect this list can be alphabetically
> ordered at another point of time.

Swapped.

> There are new tests added for v7-a , what happens with the output for
> v8-a and the changes for ldacq and other such instructions ?

v7-a and v8-a generate the same instructions for relaxed load/store.
The acquire/release versions are identical except they are no longer
predicated. Basically the new patterns are not only significantly simpler,
they are now the same between the many ARM/Thumb-2/v7-a/v8-m/v8-a
combinations, so test coverage is much higher now. This is how these
patterns should have been designed all along.

v2 follows below.

Cheers,
Wilco


[PATCH v2] ARM: Block predication on atomics [PR111235]

The v7 memory ordering model allows reordering of conditional atomic
instructions.  To avoid this, make all atomic patterns unconditional.
Expand atomic loads and stores for all architectures so the memory access
can be wrapped into an UNSPEC.
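
A sketch of the kind of source that could previously end up with a predicated
atomic (hypothetical example): if-conversion could turn the branch below into
conditional execution of the store-release, which the v7 memory model allows to be
reordered:

/* Hypothetical example: the release store must not be emitted as a
   conditionally executed instruction.  With this patch the atomic
   patterns are unconditional, so if-conversion cannot predicate them.  */
#include <stdatomic.h>

void
maybe_publish (atomic_int *flag, int cond)
{
  if (cond)
    atomic_store_explicit (flag, 1, memory_order_release);
}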

gcc/ChangeLog/
PR target/111235
* config/arm/constraints.md: Remove Pf constraint.
* config/arm/sync.md (arm_atomic_load): Add new pattern.
(arm_atomic_load_acquire): Likewise.
(arm_atomic_store): Likewise.
(arm_atomic_store_release): Likewise.
(atomic_load): Switch patterns to define_expand.
(atomic_store): Likewise.
(arm_atomic_loaddi2_ldrd): Remove predication.
(arm_load_exclusive): Likewise.
(arm_load_acquire_exclusive): Likewise.
(arm_load_exclusivesi): Likewise.
(arm_load_acquire_exclusivesi): Likewise.

[PATCH] AArch64: Remove BTI from outline atomics

2023-09-26 Thread Wilco Dijkstra

The outline atomic functions have hidden visibility and can only be called
directly.  Therefore we can remove the BTI at function entry.  This improves
security by reducing the number of indirect entry points in a binary.
The BTI markings on the objects are still emitted.
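
For context (illustrative only, not part of the patch): with outline atomics the
compiler emits direct BL calls to these helpers, so there is never an indirect
branch to them and no landing pad is needed:

/* Hypothetical example: this compiles to a direct call to a libgcc
   helper such as __aarch64_ldadd4_acq_rel when outline atomics are
   enabled, never to an indirect branch.  */
int
add_fetch (int *p, int v)
{
  return __atomic_fetch_add (p, v, __ATOMIC_ACQ_REL);
}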

Passes regress, OK for commit?

libgcc/ChangeLog:
    * config/aarch64/lse.S (BTI_C): Remove define.

---

diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
ba05047ff02b6fc5752235bffa924fc4a2f48c04..dbfb83fb09083641bf06c50b631a5f27bdf61b80
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -163,8 +163,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define tmp3    14
 #define tmp4    13
 
-#define BTI_C  hint    34
-
 /* Start and end a function.  */
 .macro  STARTFN name
 .text
@@ -174,7 +172,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 .type   \name, %function
 .cfi_startproc
 \name:
-   BTI_C
 .endm
 
 .macro  ENDFN name


Re: [PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-09-25 Thread Wilco Dijkstra
Hi Ramana,

>> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
>> outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic 
>> if
>> the value is stored successfully using STXP, but the current implementations
>> do not perform the store if the comparison fails.  In this case the value 
>> returned
>> is not read atomically.
>
> IIRC, the previous discussions in this space revolved around the
> difficulty with the store writing to readonly memory which is why I
> think we went with LDXP in this form.

That's not related to this patch - this fixes a serious atomicity bug that may
affect the Linux kernel since it uses the older sync primitives. Given that LDXP
is not atomic on its own, you have to execute the STXP even in the failure case.
Note that you can't rely on a compare-exchange not writing memory: load-exclusive
loops may either always write or avoid writes in the failure case if the load is
atomic. CAS instructions always write.

> Has something changed from then ?

Yes, we now know that using locking atomics was a bad decision. Developers
actually require efficient and lock-free atomics. Since we didn't support them,
many applications were forced to add their own atomic implementations using
hacky inline assembler. It also resulted in a nasty ABI incompatibility between
GCC and LLVM. Yes - atomics are part of the ABI!

All that is much worse than worrying about a theoretical corner case that
can't happen in real applications - atomics only work on writeable memory
since their purpose is to synchronize reads with writes.

Cheers,
Wilco


[PATCH] AArch64: Add inline memmove expansion

2023-09-21 Thread Wilco Dijkstra

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather than
being interleaved.  The maximum size is 256 bytes, which requires at most 16
registers.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
* config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
Change default.
* config/aarch64/aarch64.md (cpymemdi): Add a parameter.
(movmemdi): Call aarch64_expand_cpymem.
* config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
simplify, support storing generated loads/stores. 
(aarch64_expand_cpymem): Support expansion of memmove.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
* gcc.target/aarch64/memmove.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
e8d91cba30e32e03c4794ccc24254691d135f2dd..e224218600969d9d052128790f1524414bbab5c6
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -766,7 +766,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
8a12894d6b80de1031d6e7d02dca680c57bce136..a573e3bded2736f5108ad2d4004f530e0f32c99c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25191,48 +25191,35 @@ aarch64_progress_pointer (rtx pointer)
MODE bytes.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
- machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+   int offset, machine_mode mode)
 {
   /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
  address copies using V4SImode so that we can use Q registers.  */
   if (known_eq (GET_MODE_BITSIZE (mode), 256))
 {
   mode = V4SImode;
+  rtx src1 = adjust_address (src, mode, offset);
+  rtx src2 = adjust_address (src, mode, offset + 16);
+  rtx dst1 = adjust_address (dst, mode, offset);
+  rtx dst2 = adjust_address (dst, mode, offset + 16);
   rtx reg1 = gen_reg_rtx (mode);
   rtx reg2 = gen_reg_rtx (mode);
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-   aarch64_progress_pointer (*src)));
-  emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-aarch64_progress_pointer (*dst), 
reg2));
-  /* Move the pointers forward.  */
-  *src = aarch64_move_pointer (*src, 32);
-  *dst = aarch64_move_pointer (*dst, 32);
+  *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+  *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
   return;
 }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25251,12 +25238,12 @@ aarch64_expand_cpymem_mops (rtx *operands, bool 
is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)

[PATCH v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-09-21 Thread Wilco Dijkstra
v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Cleanup condition for using MOPS.
(aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do certain trailing copies as overlapping if it's going to be
-cheaper.  i.e. less instructions to do so.  For instance doing a 15
-byte copy it's more efficient to do two overlapping 8 byte copies than
-8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+  /* Emit trailing writes using overlapping unaligned accesses
+   (when !STRICT_ALIGNMENT) - 

Re: [PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-09-20 Thread Wilco Dijkstra
Hi Richard,

> * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

> Shouldn't this be a separate patch?  It's not immediately obvious that this 
> is a necessary part of this change.

You mean this?

@@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""

Yes that's necessary since that is the bug.

> +  unsigned align = INTVAL (operands[3]);
>
>This should read the value with UINTVAL.  Given the useful range of the 
>alignment, it should be OK that we're not using unsigned HWI.

I'll fix that.

> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>  return aarch64_expand_cpymem_mops (operands);
>
> So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we 
> want a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too 
> eagerly?

The goal was to fix the issue in a way that is both obvious and can be easily
backported.
Further improvements can be made to handle other alignments, but it is
slightly tricky (eg. align == 4 won't emit LDP/STP directly using current code
and thus would need additional work to generalize the LDP path).
  
>> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
>
>I find this name slightly confusing.  Surely it's min_mops_size (since above 
>that we want to use MOPS rather than inlined loads/stores).  But why not just 
>use aarch64_mops_memcpy_size_threshold directly in the one place it's used?

The reason is that in a follow-on patch I check 
aarch64_mops_memcpy_size_threshold
too, so for now this acts as a shortcut for the ridiculously long name.

> Are there any additional tests for this?

There are existing tests that check the expansion which fail if you completely
block expansions with STRICT_ALIGNMENT.

Cheers,
Wilco

[PATCH v2] AArch64: Fix memmove operand corruption [PR111121]

2023-09-20 Thread Wilco Dijkstra
A MOPS memmove may corrupt registers since there is no copy of the input
operands to temporary registers.  Fix this by calling
aarch64_expand_cpymem_mops.
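
A minimal illustration of the failure mode (hypothetical; the real regression tests
are the memmove cases added to mops_4.c below): the cpyp/cpym/cpye sequence
post-increments its address and size registers, so if the original operands are
used in place, a later use of the same pointer reads a clobbered value:

/* Hypothetical example: without copying the operands to temporaries
   first, 'x' could be corrupted by the MOPS sequence before the final
   store of it to '*res'.  */
void
move_and_record (int *x, int *y, long n, int **res)
{
  __builtin_memmove (x, y, n);
  *res = x;
}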

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/21
* config/aarch64/aarch64.md (aarch64_movmemdi): Add new expander.
(movmemdi): Call aarch64_expand_cpymem_mops for correct expansion.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add 
support
for memmove.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem_mops): Add 
new
function.

gcc/testsuite/ChangeLog/
PR target/21
* gcc.target/aarch64/mops_4.c: Add memmove testcases.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
70303d6fd953e0c397b9138ede8858c2db2e53db..e8d91cba30e32e03c4794ccc24254691d135f2dd
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -765,6 +765,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
+bool aarch64_expand_cpymem_mops (rtx *, bool);
 bool aarch64_expand_cpymem (rtx *);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
219c4ee6d4cd7522f6ad634c794485841e5d08fa..dd6874d13a75f20d10a244578afc355b25c73da2
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25228,10 +25228,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx 
*src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
-/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
-   from the cpymem pattern.  Return true iff we succeeded.  */
-static bool
-aarch64_expand_cpymem_mops (rtx *operands)
+/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
+   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
+   rather than memcpy.  Return true iff we succeeded.  */
+bool
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25243,8 +25244,10 @@ aarch64_expand_cpymem_mops (rtx *operands)
   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
   rtx src_mem = replace_equiv_address (operands[1], src_addr);
   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
-  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
-
+  if (is_memmove)
+emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
+  else
+emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
   return true;
 }
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
60133b541e9289610ce58116b0258a61f29bdc00..6d0f072a9dd6d094e8764a513222a9129d8296fa
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1635,7 +1635,22 @@ (define_expand "cpymemdi"
 }
 )
 
-(define_insn "aarch64_movmemdi"
+(define_expand "aarch64_movmemdi"
+  [(parallel
+ [(set (match_operand 2) (const_int 0))
+  (clobber (match_dup 3))
+  (clobber (match_dup 4))
+  (clobber (reg:CC CC_REGNUM))
+  (set (match_operand 0)
+  (unspec:BLK [(match_operand 1) (match_dup 2)] UNSPEC_MOVMEM))])]
+  "TARGET_MOPS"
+  {
+operands[3] = XEXP (operands[0], 0);
+operands[4] = XEXP (operands[1], 0);
+  }
+)
+
+(define_insn "*aarch64_movmemdi"
   [(parallel [
(set (match_operand:DI 2 "register_operand" "+") (const_int 0))
(clobber (match_operand:DI 0 "register_operand" "+"))
@@ -1668,17 +1683,9 @@ (define_expand "movmemdi"
&& INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
  FAIL;
 
-   rtx addr_dst = XEXP (operands[0], 0);
-   rtx addr_src = XEXP (operands[1], 0);
-
-   if (!REG_P (sz_reg))
- sz_reg = force_reg (DImode, sz_reg);
-   if (!REG_P (addr_dst))
- addr_dst = force_reg (DImode, addr_dst);
-   if (!REG_P (addr_src))
- addr_src = force_reg (DImode, addr_src);
-   emit_insn (gen_aarch64_movmemdi (addr_dst, addr_src, sz_reg));
-   DONE;
+  if (aarch64_expand_cpymem_mops (operands, true))
+DONE;
+  FAIL;
 }
 )
 
diff --git a/gcc/testsuite/gcc.target/aarch64/mops_4.c 
b/gcc/testsuite/gcc.target/aarch64/mops_4.c
index 
1b87759cb5e8bbcbb58cf63404d1d579d44b2818..dd796115cb4093251964d881e93bf4b98ade0c32
 100644
--- a/gcc/testsuite/gcc.target/aarch64/mops_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/mops_4.c
@@ -50,6 +50,54 @@ copy3 (int *x, int *y, long z, long *res)
   *res = z;
 }
 
+/*
+** move1:
+** mov (x[0-9]+), x0
+** cpyp\[\1\]!, \[x1\]!, x2!
+** cpym\[\1\]!, \[x1\]!, x2!
+** cpye\[\1\]!, \[x1\]!, x2!
+** str x0, \[x3\]
+** ret
+*/
+void
+move1 (int *x, int *y, long z, int **res)
+{
+  __builtin_memmove (x, y, z);
+  *res = x;
+}
+
+/*
+** 

[PATCH] AArch64: Fix strict-align cpymem/setmem [PR103100]

2023-09-20 Thread Wilco Dijkstra

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
Clean up the conditions for when to use MOPS.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/103100
* config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
(setmemdi): Likewise.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
strict-align.  Cleanup condition for using MOPS.
(aarch64_expand_setmem): Likewise.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_cpymem_mops (operands);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
-
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
- It should be a win even for size optimization in the general case.
- For speed optimization the choice between MOPS and the SIMD sequence
- depends on the size of the copy, rather than number of instructions,
- alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
 return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = INTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
- inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+  || (STRICT_ALIGNMENT && align < 16))
 return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
  SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
 
   len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-return false;
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
+return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
  the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
- when available.  */
-  if (TARGET_MOPS
-  && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
  Count the number of operations involved to see if it's worth it
  against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
   simd_ops++;
   n -= mode_bits;
 
-  /* Do certain trailing copies as overlapping if it's going to be
-cheaper.  i.e. less instructions to do so.  For instance doing a 15
-byte copy it's more efficient to do two overlapping 8 byte copies than
-8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+  /* Emit trailing writes using overlapping unaligned accesses
+   (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
   if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
{
  next_mode = 

Re: [PATCH] AArch64: Improve immediate expansion [PR105928]

2023-09-19 Thread Wilco Dijkstra
Hi Richard,

>> Note that aarch64_internal_mov_immediate may be called after reload,
>> so it would end up even more complex.
>
> The sequence I quoted was supposed to work before and after reload.  The:
>
>    rtx tmp = aarch64_target_reg (dest, DImode);
>
> would create a fresh temporary before reload and reuse dest otherwise.
> So the sequence after reload would be the same as in your patch,
> but the sequence before reload would use a temporary.

aarch64_target_reg just returns the input register so it won't do that.
Also the movsi/movdi patterns only split if the destination register is physical.
That's typically after register allocation but not uniformly so (eg. immediates in
returns will get split early), which is inconsistent. Given we always emit register
notes it's not obvious whether splitting early or late is better overall.

Cheers,
Wilco

Re: [PATCH] AArch64: Improve immediate expansion [PR105928]

2023-09-18 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> I was worried that reusing "dest" for intermediate results would
> prevent CSE for cases like:
>
> void g (long long, long long);
> void
> f (long long *ptr)
> {
>   g (0xee11ee22ee11ee22LL, 0xdc23dc44ee11ee22LL);
> }

Note that aarch64_internal_mov_immediate may be called after reload,
so it would end up even more complex. This should be done as a
dedicated mid-end optimization similar to TARGET_CONST_ANCHOR.
However the number of 3/4-instruction immediates is so small that
sharable cases would be very rare, so I don't believe it is worth it.

Cheers,
Wilco


[PATCH] AArch64: Improve immediate expansion [PR105928]

2023-09-14 Thread Wilco Dijkstra via Gcc-patches

Support expansion of immediates which can be created from 2 MOVKs
and a shifted ORR or BIC instruction.  Change aarch64_split_dimode_const_store
to apply if we save one instruction.

This reduces the number of 4-instruction immediates in SPECINT/FP by 5%.
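
As a worked example (the constant is taken from the new pr105928.c test below): a
64-bit immediate whose top half repeats the bottom half can now be built from a
MOV/MOVK of the low 32 bits followed by a shifted ORR, three instructions instead
of four:

/* Example: 0x1234567812345678 is 0x12345678 replicated, so it can be
   materialised roughly as
     mov  w0, 0x5678
     movk w0, 0x1234, lsl 16
     orr  x0, x0, x0, lsl 32
   rather than one MOV plus three MOVKs.  */
long
repeated_imm (void)
{
  return 0x1234567812345678;
}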

Passes regress, OK for commit?

gcc/ChangeLog:
PR target/105928
* config/aarch64/aarch64.cc (aarch64_internal_mov_immediate):
Add support for immediates using shifted ORR/BIC.
(aarch64_split_dimode_const_store): Apply if we save one instruction.
* config/aarch64/aarch64.md (_3): 
Make pattern global.

gcc/testsuite:
PR target/105928
* gcc.target/aarch64/pr105928.c: Add new test.
* gcc.target/aarch64/vect-cse-codegen.c: Fix test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
c44c0b979d0cc3755c61dcf566cfddedccebf1ea..832f8197ac8d1a04986791e6f3e51861e41944b2
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5639,7 +5639,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
machine_mode mode)
 {
   int i;
-  unsigned HOST_WIDE_INT val, val2, mask;
+  unsigned HOST_WIDE_INT val, val2, val3, mask;
   int one_match, zero_match;
   int num_insns;
 
@@ -5721,6 +5721,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool 
generate,
}
  return 3;
}
+
+  /* Try shifting and inserting the bottom 32-bits into the top bits.  */
+  val2 = val & 0xffffffff;
+  val3 = 0xffffffff;
+  val3 = val2 | (val3 << 32);
+  for (i = 17; i < 48; i++)
+   if ((val2 | (val2 << i)) == val)
+ {
+   if (generate)
+ {
+   emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
+   emit_insn (gen_insv_immdi (dest, GEN_INT (16),
+  GEN_INT (val2 >> 16)));
+   emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
+ }
+   return 3;
+ }
+   else if ((val3 & ~(val3 << i)) == val)
+ {
+   if (generate)
+ {
+   emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
+   emit_insn (gen_insv_immdi (dest, GEN_INT (16),
+  GEN_INT (val2 >> 16)));
+   emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
+ dest));
+ }
+   return 3;
+ }
 }
 
   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
@@ -25506,8 +25535,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
   rtx lo = gen_lowpart (SImode, src);
   rtx hi = gen_highpart_mode (SImode, DImode, src);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   if (!rtx_equal_p (lo, hi))
 return false;
 
@@ -25526,14 +25553,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
  MOV   w1, 49370
  MOVK  w1, 0x140, lsl 16
  STP   w1, w1, [x0]
-   So we want to perform this only when we save two instructions
-   or more.  When optimizing for size, however, accept any code size
-   savings we can.  */
-  if (size_p && orig_cost <= lo_cost)
-return false;
-
-  if (!size_p
-  && (orig_cost <= lo_cost + 1))
+   So we want to perform this when we save at least one instruction.  */
+  if (orig_cost <= lo_cost)
 return false;
 
   rtx mem_lo = adjust_address (dst, SImode, 0);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
97f70d39cc0ddeb330e044bae0544d85a695567d..932d4d47a5db1a74e0d0565b565afbd769090853
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4618,7 +4618,7 @@ (define_insn "*and_si3_compare0_uxtw"
   [(set_attr "type" "logics_shift_imm")]
 )
 
-(define_insn "*_3"
+(define_insn "_3"
   [(set (match_operand:GPI 0 "register_operand" "=r")
(LOGICAL:GPI (SHIFT:GPI
  (match_operand:GPI 1 "register_operand" "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr105928.c 
b/gcc/testsuite/gcc.target/aarch64/pr105928.c
new file mode 100644
index 
..ab52247df66020d0b8fe70bc81f572e8b64c2bb5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr105928.c
@@ -0,0 +1,43 @@
+/* { dg-do assemble } */
+/* { dg-options "-O2 --save-temps" } */
+
+long f1 (void)
+{
+  return 0x80402010080400;
+}
+
+long f2 (void)
+{
+  return 0x1234567812345678;
+}
+
+long f3 (void)
+{
+  return 0x4567800012345678;
+}
+
+long f4 (void)
+{
+  return 0x3ecd3ecd;
+}
+
+long f5 (void)
+{
+  return 0x38e38e38e38e38e;
+}
+
+long f6 (void)
+{
+  return 0x1745d1745d1745d;
+}
+
+void f7 (long *p)
+{
+  *p = 0x1234567812345678;
+}
+
+/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */
+/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */
+/* { dg-final { 

Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-09-13 Thread Wilco Dijkstra via Gcc-patches

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
    PR target/110061
    config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
    config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
    State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat__16 entry points are ARMv8.0.
+   The libat__16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0

Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-09-13 Thread Wilco Dijkstra via Gcc-patches

ping


From: Wilco Dijkstra
Sent: 04 August 2023 16:05
To: GCC Patches ; Richard Sandiford 

Cc: Kyrylo Tkachov 
Subject: [PATCH] libatomic: Improve ifunc selection on AArch64 
 



[PATCH] AArch64: Fix __sync_val_compare_and_swap [PR111404]

2023-09-13 Thread Wilco Dijkstra via Gcc-patches

__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop.  On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails.  In this case the value
returned is not read atomically.

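To make the affected operation concrete, here is a minimal usage sketch (my
example, not a testcase from the patch):

/* 16-byte __sync CAS: the value returned must be read atomically even
   when the comparison fails and the new value is never stored.  */
__int128 cas (__int128 *p, __int128 expected, __int128 desired)
{
  return __sync_val_compare_and_swap (p, expected, desired);
}

The fix below makes the failure path store back the value it just loaded, so
the LDXP/STXP pair always completes and the read is atomic.
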
Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/111404
* config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
For 128-bit store the loaded value and loop if needed.

libgcc/ChangeLog/
PR target/111404
* config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
either new value or loaded value.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
   mem = operands[1];
   oldval = operands[2];
   newval = operands[3];
-  is_weak = (operands[4] != const0_rtx);
   model_rtx = operands[5];
   scratch = operands[7];
   mode = GET_MODE (mem);
   model = memmodel_from_int (INTVAL (model_rtx));
+  is_weak = operands[4] != const0_rtx && mode != TImode;
 
   /* When OLDVAL is zero and we want the strong version we can emit a tighter
 loop:
@@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
   else
 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
 
+  /* 128-bit LDAXP is not atomic unless STLXP succeeds.  So for a mismatch,
+ store the returned value and loop if the STLXP fails.  */
+  if (mode == TImode)
+{
+  rtx_code_label *label3 = gen_label_rtx ();
+  emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+  emit_barrier ();
+
+  emit_label (label2);
+  aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+  if (aarch64_track_speculation)
+   {
+ /* Emit an explicit compare instruction, so that we can correctly
+track the condition codes.  */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+   }
+  else
+   x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+   gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+  label2 = label3;
+}
+
   emit_label (label2);
 
   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #define tmp0   16
 #define tmp1   17
 #define tmp2   15
+#define tmp3   14
+#define tmp4   13
 
#define BTI_C  hint 34
 
@@ -233,10 +235,11 @@ STARTFN   NAME(cas)
 0: LDXP    x0, x1, [x4]
    cmp     x0, x(tmp0)
    ccmp    x1, x(tmp1), #0, eq
-   bne     1f
-   STXP    w(tmp2), x2, x3, [x4]
-   cbnz    w(tmp2), 0b
-1: BARRIER
+   csel    x(tmp2), x2, x0, eq
+   csel    x(tmp3), x3, x1, eq
+   STXP    w(tmp4), x(tmp2), x(tmp3), [x4]
+   cbnz    w(tmp4), 0b
+   BARRIER
    ret
 
 #endif



[PATCH] AArch64: List official cores before codenames

2023-09-13 Thread Wilco Dijkstra via Gcc-patches
List official cores first so that -mcpu=native does not show a codename with
-v or in errors/warnings.

Passes regress, OK for commit?

gcc/ChangeLog:
* config/aarch64/aarch64-cores.def (neoverse-n1): Place before ares.
(neoverse-v1): Place before zeus.
(neoverse-v2): Place before demeter.
* config/aarch64/aarch64-tune.md: Regenerate.

---

diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index 
dbac497ef3aab410eb81db185b2e9532186888bb..3894f2afc27e71523e5a413fa45c144222082934
 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -115,8 +115,8 @@ AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  
(F16, RCPC, DOTPROD, S
 AARCH64_CORE("cortex-a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
 AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
 AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, 
SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
-AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), 
neoversen1, 0x41, 0xd0c, -1)
 AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC, 
DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
+AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), 
neoversen1, 0x41, 0xd0c, -1)
 AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC, 
DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
 
 /* Cavium ('C') cores. */
@@ -143,8 +143,8 @@ AARCH64_CORE("thunderx3t110",  thunderx3t110,  
thunderx3t110, V8_3A,  (CRYPTO, S
 /* ARMv8.4-A Architecture Processors.  */
 
 /* Arm ('A') cores.  */
-AARCH64_CORE("zeus", zeus, cortexa57, V8_4A,  (SVE, I8MM, BF16, PROFILE, SSBS, 
RNG), neoversev1, 0x41, 0xd40, -1)
 AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, V8_4A,  (SVE, I8MM, BF16, 
PROFILE, SSBS, RNG), neoversev1, 0x41, 0xd40, -1)
+AARCH64_CORE("zeus", zeus, cortexa57, V8_4A,  (SVE, I8MM, BF16, PROFILE, SSBS, 
RNG), neoversev1, 0x41, 0xd40, -1)
 AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, V8_4A,  (SVE, I8MM, 
BF16, PROFILE, SSBS, RNG), neoverse512tvb, INVALID_IMP, INVALID_CORE, -1)
 
 /* Qualcomm ('Q') cores. */
@@ -182,7 +182,7 @@ AARCH64_CORE("cortex-x3",  cortexx3, cortexa57, V9A,  
(SVE2_BITPERM, MEMTAG, I8M
 
 AARCH64_CORE("neoverse-n2", neoversen2, cortexa57, V9A, (I8MM, BF16, 
SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversen2, 0x41, 0xd49, -1)
 
-AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, 
RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
 AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, V9A, (I8MM, BF16, 
SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
+AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, 
RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
 
 #undef AARCH64_CORE
diff --git a/gcc/config/aarch64/aarch64-tune.md 
b/gcc/config/aarch64/aarch64-tune.md
index 
2170980dddb0d5d410a49631ad26ff2e346b39dd..69e5357fa814e4733b05f7164bfa11e4aa04
 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexx2,cortexx3,neoversen2,demeter,neoversev2"
+   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexx2,cortexx3,neoversen2,neoversev2,demeter"
(const (symbol_ref "((enum 

[PATCH] ARM: Block predication on atomics [PR111235]

2023-09-07 Thread Wilco Dijkstra via Gcc-patches
The v7 memory ordering model allows reordering of conditional atomic
instructions.  To avoid this, make all atomic patterns unconditional.  Expand
atomic loads and stores for all architectures so the memory access can be
wrapped into an UNSPEC.

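As an illustration (my sketch, not the pr111235 testcase), if-conversion could
previously turn an atomic access like the one below into a conditional
instruction inside an IT block, which the v7 memory model permits to be
reordered:

int f (int *p, int cond)
{
  int r = 0;
  if (cond)
    r = __atomic_load_n (p, __ATOMIC_ACQUIRE);
  return r;
}

Expanding the load through an UNSPEC and dropping the predication keeps the
access unconditional.
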
Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/111235
* config/arm/constraints.md: Remove Pf constraint.
* config/arm/sync.md (arm_atomic_load): Add new pattern.
(arm_atomic_load_acquire): Likewise.
(arm_atomic_store): Likewise.
(arm_atomic_store_release): Likewise.
(atomic_load): Always expand atomic loads explicitly.
(atomic_store): Always expand atomic stores explicitly.
(arm_atomic_loaddi2_ldrd): Remove predication.
(arm_load_exclusive): Likewise.
(arm_load_acquire_exclusive): Likewise.
(arm_load_exclusivesi): Likewise.
(arm_load_acquire_exclusivesi): Likewise.
(arm_load_exclusivedi): Likewise.
(arm_load_acquire_exclusivedi): Likewise.
(arm_store_exclusive): Likewise.
(arm_store_release_exclusivedi): Likewise.
(arm_store_release_exclusive): Likewise.
* config/arm/unspecs.md: Add VUNSPEC_LDR and VUNSPEC_STR.

gcc/testsuite/ChangeLog/
PR target/111235
* gcc.target/arm/pr111235.c: Add new test.

---

diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 
05a4ebbdd67601d7b92aa44a619d17634cc69f17..d7c4a1b0cd785f276862048005e6cfa57cdcb20d
 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -36,7 +36,7 @@
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
 ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, 
Ra,
 ;;  Rg, Ri
-;; in all states: Pf, Pg
+;; in all states: Pg
 
 ;; The following memory constraints have been used:
 ;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us, Up, Uf, Ux, Ul
@@ -239,13 +239,6 @@ (define_constraint "Pe"
   (and (match_code "const_int")
(match_test "TARGET_THUMB1 && ival >= 256 && ival <= 510")))
 
-(define_constraint "Pf"
-  "Memory models except relaxed, consume or release ones."
-  (and (match_code "const_int")
-   (match_test "!is_mm_relaxed (memmodel_from_int (ival))
-   && !is_mm_consume (memmodel_from_int (ival))
-   && !is_mm_release (memmodel_from_int (ival))")))
-
 (define_constraint "Pg"
   "@internal In Thumb-2 state a constant in range 1 to 32"
   (and (match_code "const_int")
diff --git a/gcc/config/arm/sync.md b/gcc/config/arm/sync.md
index 
7626bf3c443285dc63b4c4367b11a879a99c93c6..2210810f67f37ce043b8fdc73b4f21b54c5b1912
 100644
--- a/gcc/config/arm/sync.md
+++ b/gcc/config/arm/sync.md
@@ -62,68 +62,110 @@ (define_insn "*memory_barrier"
(set_attr "conds" "unconditional")
(set_attr "predicable" "no")])
 
-(define_insn "atomic_load"
-  [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
+(define_insn "arm_atomic_load"
+  [(set (match_operand:QHSI 0 "register_operand" "=r,l")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
-   (match_operand:SI 2 "const_int_operand" "n,Pf,n")]  ;; model
+  [(match_operand:QHSI 1 "memory_operand" "m,m")]
+  VUNSPEC_LDR))]
+  ""
+  "ldr\t%0, %1"
+  [(set_attr "arch" "32,any")])
+
+(define_insn "arm_atomic_load_acquire"
+  [(set (match_operand:QHSI 0 "register_operand" "=r")
+(unspec_volatile:QHSI
+  [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q")]
   VUNSPEC_LDA))]
   "TARGET_HAVE_LDACQ"
-  {
-if (aarch_mm_needs_acquire (operands[2]))
-  {
-   if (TARGET_THUMB1)
- return "lda\t%0, %1";
-   else
- return "lda%?\t%0, %1";
-  }
-else
-  {
-   if (TARGET_THUMB1)
- return "ldr\t%0, %1";
-   else
- return "ldr%?\t%0, %1";
-  }
-  }
-  [(set_attr "arch" "32,v8mb,any")
-   (set_attr "predicable" "yes")])
+  "lda\t%0, %C1"
+)
 
-(define_insn "atomic_store"
-  [(set (match_operand:QHSI 0 "memory_operand" "=Q,Q,Q")
+(define_insn "arm_atomic_store"
+  [(set (match_operand:QHSI 0 "memory_operand" "=m,m")
+(unspec_volatile:QHSI
+  [(match_operand:QHSI 1 "register_operand" "r,l")]
+  VUNSPEC_STR))]
+  ""
+  "str\t%1, %0";
+  [(set_attr "arch" "32,any")])
+
+(define_insn "arm_atomic_store_release"
+  [(set (match_operand:QHSI 0 "arm_sync_memory_operand" "=Q")
 (unspec_volatile:QHSI
-  [(match_operand:QHSI 1 "general_operand" "r,r,l")
-   (match_operand:SI 2 "const_int_operand" "n,Pf,n")]  ;; model
+  [(match_operand:QHSI 1 "register_operand" "r")]
   VUNSPEC_STL))]
   "TARGET_HAVE_LDACQ"
-  {
-if (aarch_mm_needs_release (operands[2]))
-  {
-   if (TARGET_THUMB1)
- return "stl\t%1, %0";
-   else
- return "stl%?\t%1, %0";
-  }
-else
-  {
-   if (TARGET_THUMB1)
- return "str\t%1, %0";
-   else
- return "str%?\t%1, %0";
-  }
- 

Re: [PATCH] AArch64: Fix MOPS memmove operand corruption [PR111121]

2023-08-23 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

(that's quick!)

> +  if (size > max_copy_size || size > max_mops_size)
> +return aarch64_expand_cpymem_mops (operands, is_memmove);
>
> Could you explain this a bit more?  If I've followed the logic correctly,
> max_copy_size will always be 0 for movmem, so this "if" condition will
> always be true for movmem (given that the caller can be relied on to
> optimise away zero-length copies).  So doesn't this function reduce to:

In this patch it is zero, yes, but there is no real reason for that. The goal
is to share as much code as possible. I have a patch that inlines memmove like
memcpy.

> when is_memmove is true?  If so, I think it would be clearer to do that
> directly, rather than go through aarch64_expand_cpymem.  max_copy_size
> is really an optimisation threshold, whereas the above seems to be
> leaning on it for correctness.

In principle we could for the time being add an assert (!is_memmove) if that
makes it clearer memmove isn't yet handled.

> ...I think we might as well keep this pattern conditional on TARGET_MOPS.

But then we have inconsistencies in the conditions of the expanders, which
is what led to all these bugs in the first place (I lost count, there are 4 or 5
different bugs I fixed). Ensuring everything is 100% identical between
memcpy and memmove makes the code much easier to follow.

> I think we can then also split:
>
>   /* All three registers are changed by the instruction, so each one
>  must be a fresh pseudo.  */
>   rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
>   rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
>   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
>   rtx src_mem = replace_equiv_address (operands[1], src_addr);
>   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
>
> out of aarch64_expand_cpymem_mops into a new function (say
> aarch64_prepare_mops_operands) and call it from the movmemdi
> expander.  There should then be no need for the extra staging
> expander (aarch64_movmemdi).

So you're saying we could remove aarch64_cpymemdi/movmemdi if
aarch64_expand_cpymem_mops did massage the operands in the
right way so that we can immediately match the underlying instruction?

Hmm, does that actually work, as in we don't lose the extra alias info that
gets lost in the current memmove expander? (another bug/inconsistency)

And the MOPS code would be separated from aarch64_expand_cpymem
so we'd do all the MOPS size tests inside aarch64_expand_cpymem_mops
and the expander tries using MOPS first and if it fails try inline expansion?

So something like:

(define_expand "movmemdi"

  if (aarch64_try_mops_expansion (operands, is_memmove))
DONE;
  if (aarch64_try_inline_copy_expansion (operands, is_memmove))
DONE;
  FAIL;
)

> IMO the STRICT_ALIGNMENT stuff should be a separate patch,
> with its own testcases.

We will need backports to fix all these bugs, so the question is whether it
is worth doing a lot of cleanups now?

Cheers,
Wilco


[PATCH] AArch64: Fix MOPS memmove operand corruption [PR111121]

2023-08-23 Thread Wilco Dijkstra via Gcc-patches

A MOPS memmove may corrupt registers since there is no copy of the input
operands to temporary registers.  Fix this by calling aarch64_expand_cpymem
which does this.  Also fix an issue with STRICT_ALIGNMENT being ignored if
TARGET_MOPS is true, and avoid crashing or generating a huge expansion if
aarch64_mops_memcpy_size_threshold is large.

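For reference, a trivial example of the affected kind of copy (mine, not one
of the new mops_4.c cases): with -march=armv8.8-a the expander emits the MOPS
sequence inline, and CPYP/CPYM/CPYE modify all three operand registers, which
is why fresh copies are required:

#include <string.h>

void move_bytes (void *dst, void *src, unsigned long n)
{
  memmove (dst, src, n);  /* Expands to CPYP/CPYM/CPYE under TARGET_MOPS.  */
}
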
Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
PR target/111121
* config/aarch64/aarch64.md (cpymemdi): Remove STRICT_ALIGNMENT, add 
param for memmove.
(aarch64_movmemdi): Add new expander similar to aarch64_cpymemdi.
(movmemdi): Like cpymemdi call aarch64_expand_cpymem for correct 
expansion.
* config/aarch64/aarch64.cc (aarch64_expand_cpymem_mops): Add support 
for memmove.
(aarch64_expand_cpymem): Add support for memmove. Handle 
STRICT_ALIGNMENT correctly.
Handle TARGET_MOPS size selection correctly.
* config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Update 
prototype. 

gcc/testsuite/ChangeLog/
PR target/111121
* gcc.target/aarch64/mops_4.c: Add memmove testcases.

---
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
70303d6fd953e0c397b9138ede8858c2db2e53db..97375e81cbda078847af83bf5dd4e0d7673d6af4
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -765,7 +765,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
eba5d4a7e04b7af82437453a691d5607d98133c9..5e8d0a0c91bc7719de2a8c5627b354cf905a4db0
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25135,10 +25135,11 @@ aarch64_copy_one_block_and_progress_pointers (rtx 
*src, rtx *dst,
   *dst = aarch64_progress_pointer (*dst);
 }
 
-/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
-   from the cpymem pattern.  Return true iff we succeeded.  */
+/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
+   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
+   rather than memcpy.  Return true iff we succeeded.  */
 static bool
-aarch64_expand_cpymem_mops (rtx *operands)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
 return false;
@@ -25150,17 +25151,19 @@ aarch64_expand_cpymem_mops (rtx *operands)
   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
   rtx src_mem = replace_equiv_address (operands[1], src_addr);
   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
-  emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
-
+  if (is_memmove)
+emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
+  else
+emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
   int mode_bits;
   rtx dst = operands[0];
@@ -25168,25 +25171,23 @@ aarch64_expand_cpymem (rtx *operands)
   rtx base;
   machine_mode cur_mode = BLKmode;
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
-return aarch64_expand_cpymem_mops (operands);
+  /* Variable-sized or strict align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || STRICT_ALIGNMENT)
+return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-= TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned HOST_WIDE_INT max_copy_size = is_memmove ? 0 : 256;
+  unsigned HOST_WIDE_INT max_mops_size = max_copy_size;
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  if (TARGET_MOPS)
+max_mops_size = is_memmove ? aarch64_mops_memmove_size_threshold
+  : aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when 

Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-08-10 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

>>> Answering my own question, N1 does not officially have FEAT_LSE2.
>> 
>> It doesn't indeed. However most cores support atomic 128-bit load/store
>> (part of LSE2), so we can still use the LSE2 ifunc for those cores. Since 
>> there
>> isn't a feature bit for this in the CPU or HWCAP, I check the CPUID register.
>
> That would be a really nice bit to add to HWCAP, then, to consolidate this 
> knowledge in 
> one place.  Certainly I would use it in QEMU as well.

Yes, this was suggested by a colleague as well. I'll ask and see whether the
kernel guys like the idea. It would take some time to get added, so we still
need this for the time being.

Cheers,
Wilco

Re: [PATCH] libatomic: Improve ifunc selection on AArch64

2023-08-10 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

>> Why would HWCAP_USCAT not be set by the kernel?
>> 
>> Failing that, I would think you would check ID_AA64MMFR2_EL1.AT.
>>
> Answering my own question, N1 does not officially have FEAT_LSE2.

It doesn't indeed. However most cores support atomic 128-bit load/store
(part of LSE2), so we can still use the LSE2 ifunc for those cores. Since there
isn't a feature bit for this in the CPU or HWCAP, I check the CPUID register.

Cheers,
Wilco

Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-08-04 Thread Wilco Dijkstra via Gcc-patches
ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 


[PATCH] libatomic: Improve ifunc selection on AArch64

2023-08-04 Thread Wilco Dijkstra via Gcc-patches

Add support for ifunc selection based on CPUID register.  Neoverse N1 supports
atomic 128-bit load/store, so use the FEAT_USCAT ifunc like newer Neoverse
cores.

Passes regress, OK for commit?

libatomic/
* config/linux/aarch64/host-config.h (ifunc1): Use CPUID in ifunc
selection.

---

diff --git a/libatomic/config/linux/aarch64/host-config.h 
b/libatomic/config/linux/aarch64/host-config.h
index 
851c78c01cd643318aaa52929ce4550266238b79..e5dc33c030a4bab927874fa6c69425db463fdc4b
 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -26,7 +26,7 @@
 
 #ifdef HWCAP_USCAT
 # if N == 16
-#  define IFUNC_COND_1 (hwcap & HWCAP_USCAT)
+#  define IFUNC_COND_1 ifunc1 (hwcap)
 # else
 #  define IFUNC_COND_1 (hwcap & HWCAP_ATOMICS)
 # endif
@@ -50,4 +50,28 @@
 #undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
 #define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
 
+#ifdef HWCAP_USCAT
+
+#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)
+#define MIDR_PARTNUM(midr) (((midr) >> 4) & 0xfff)
+
+static inline bool
+ifunc1 (unsigned long hwcap)
+{
+  if (hwcap & HWCAP_USCAT)
+return true;
+  if (!(hwcap & HWCAP_CPUID))
+return false;
+
+  unsigned long midr;
+  asm volatile ("mrs %0, midr_el1" : "=r" (midr));
+
+  /* Neoverse N1 supports atomic 128-bit load/store.  */
+  if (MIDR_IMPLEMENTOR (midr) == 'A' && MIDR_PARTNUM(midr) == 0xd0c)
+return true;
+
+  return false;
+}
+#endif
+
#include_next <host-config.h>



Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-07-05 Thread Wilco Dijkstra via Gcc-patches

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 


Re: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-06-16 Thread Wilco Dijkstra via Gcc-patches

ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 
[PR110061] 
 


[PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

2023-06-02 Thread Wilco Dijkstra via Gcc-patches

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not
supported.  This results in an implicit store which is invisible to software
as long as the given address is writeable (which will be true when using
atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

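For illustration (my example, not part of the patch), code like this now
resolves to the new lock-free entry points on any ARMv8.0 core instead of
taking a locking fallback:

#include <stdatomic.h>

_Atomic __int128 v;

__int128 read_v (void)
{
  return atomic_load_explicit (&v, memory_order_acquire);
}

void write_v (__int128 x)
{
  atomic_store_explicit (&v, x, memory_order_release);
}
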
Passes regress, OK for commit?

libatomic/
PR target/110061
* config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
* config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
   <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat__16 entry points are ARMv8.0.
+   The libat__16_i1 entry points are used when LSE2 is available.  */
+
+
 .arch   armv8-a+lse
 
 #define ENTRY(name) \
@@ -37,6 +52,10 @@ name:    \
 .cfi_endproc;   \
 .size name, .-name;
 
+#define ALIAS(alias,name)  \
+   .global alias;  \
+   .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:    \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+   mov x5, x0
+   cbnz    w1, 2f
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 1b
+   ret
+
+   /* ACQUIRE/CONSUME/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
+   stxp    w4, res0, res1, [x5]
+   cbnz    w4, 2b
+   ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+   cbnz    w4, 2f
+
+   /* RELAXED.  */
+1: ldxp    xzr, tmp0, [x0]
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   ret
+
+   /* RELEASE/SEQ_CST.  */
+2: ldxp    xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 2b
+   ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
 ret
 
 /* RELEASE/SEQ_CST.  */
-1: ldaxp   xzr, tmp0, [x0]
+1: ldxp    xzr, tmp0, [x0]
 stlxp   w4, in0, in1, [x0]
 cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 mov x5, x0
 cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
-4:
-   cmp w4, RELEASE
-   b.ne    6f
 
-   /* RELEASE.  */
-5: ldxp    res0, res1, [x5]
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 5b
+   cbnz    w4, 4b
 ret
+END (libat_exchange_16)
 
-   /* ACQ_REL/SEQ_CST.  */
-6: ldaxp   res0, res1, [x5]
-   stlxp   w4, in0, in1, [x5]
-   cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+   ldp exp0, exp1, [x1]
+   cbz w4, 3f
+   cmp w4, RELEASE
+   b.hs    4f
+
+   /* ACQUIRE/CONSUME.  */
+1: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2f
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 1b
+   mov x0, 1
 ret
-END (libat_exchange_16_i1)
+
+2: stp tmp0, tmp1, [x1]
+   mov x0, 0
+   ret
+
+   /* RELAXED.  */
+3: ldxp    tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stxp    w4, in0, in1, [x0]
+   cbnz    w4, 3b
+   mov x0, 1
+   ret
+
+   /* RELEASE/ACQ_REL/SEQ_CST.  */
+4: ldaxp   tmp0, tmp1, [x0]
+   cmp tmp0, exp0
+   ccmp    tmp1, exp1, 0, eq
+   bne 2b
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 4b
+   mov x0, 1
+   ret
+END (libat_compare_exchange_16)
 
 
 

Re: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]

2023-03-16 Thread Wilco Dijkstra via Gcc-patches
ping


From: Wilco Dijkstra
Sent: 23 February 2023 15:11
To: GCC Patches 
Cc: Richard Sandiford ; Kyrylo Tkachov 

Subject: [PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891] 
 


[PATCH] libatomic: Fix SEQ_CST 128-bit atomic load [PR108891]

2023-02-23 Thread Wilco Dijkstra via Gcc-patches

The LSE2 ifunc for 16-byte atomic load requires a barrier before the LDP -
without it, it effectively has Load-AcquirePC semantics similar to LDAPR,
which is less restrictive than what __ATOMIC_SEQ_CST requires.  This patch
fixes this and adds comments to make it easier to see which sequence is
used for each case.  Use a load/store exclusive loop for store to simplify
testing that memory ordering is correct (it is slightly faster too).

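A minimal example of the affected operation (mine, not from the patch):

#include <stdatomic.h>

__int128 load_seq_cst (_Atomic __int128 *p)
{
  /* Before this fix, the LSE2 path used LDP without a preceding barrier,
     giving Load-AcquirePC semantics (like LDAPR), which is weaker than
     memory_order_seq_cst requires.  */
  return atomic_load_explicit (p, memory_order_seq_cst);
}
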
Passes regress, OK for commit?

libatomic/
PR libgcc/108891
* config/linux/aarch64/atomic_16.S: Fix libat_load_16_i1.
Add comments describing the memory order.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
b/libatomic/config/linux/aarch64/atomic_16.S
index 
732c3534a06678664a252bdbc53652eeab0af506..05439ce394b9653c9bcb582761ff7aaa7c8f9643
 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -72,33 +72,38 @@ name:   \
 
 ENTRY (libat_load_16_i1)
 cbnz    w1, 1f
+
+   /* RELAXED.  */
 ldp res0, res1, [x0]
 ret
 1:
-   cmp w1, ACQUIRE
-   b.hi    2f
+   cmp w1, SEQ_CST
+   b.eq    2f
+
+   /* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
 ldp res0, res1, [x0]
 dmb ishld
 ret
-2:
+
+   /* SEQ_CST.  */
+2: ldar    tmp0, [x0]  /* Block reordering with Store-Release instr.  */
 ldp res0, res1, [x0]
-   dmb ish
+   dmb ishld
 ret
 END (libat_load_16_i1)
 
 
 ENTRY (libat_store_16_i1)
 cbnz    w4, 1f
+
+   /* RELAXED.  */
 stp in0, in1, [x0]
 ret
-1:
-   dmb ish
-   stp in0, in1, [x0]
-   cmp w4, SEQ_CST
-   beq 2f
-   ret
-2:
-   dmb ish
+
+   /* RELEASE/SEQ_CST.  */
+1: ldaxp   xzr, tmp0, [x0]
+   stlxp   w4, in0, in1, [x0]
+   cbnz    w4, 1b
 ret
 END (libat_store_16_i1)
 
@@ -106,29 +111,33 @@ END (libat_store_16_i1)
 ENTRY (libat_exchange_16_i1)
 mov x5, x0
 cbnz    w4, 2f
-1:
-   ldxp    res0, res1, [x5]
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 1b
 ret
 2:
 cmp w4, ACQUIRE
 b.hi    4f
-3:
-   ldaxp   res0, res1, [x5]
+
+   /* ACQUIRE/CONSUME.  */
+3: ldaxp   res0, res1, [x5]
 stxp    w4, in0, in1, [x5]
 cbnz    w4, 3b
 ret
 4:
 cmp w4, RELEASE
 b.ne    6f
-5:
-   ldxp    res0, res1, [x5]
+
+   /* RELEASE.  */
+5: ldxp    res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
 cbnz    w4, 5b
 ret
-6:
-   ldaxp   res0, res1, [x5]
+
+   /* ACQ_REL/SEQ_CST.  */
+6: ldaxp   res0, res1, [x5]
 stlxp   w4, in0, in1, [x5]
 cbnz    w4, 6b
 ret
@@ -142,6 +151,8 @@ ENTRY (libat_compare_exchange_16_i1)
 cbz w4, 2f
 cmp w4, RELEASE
 b.hs    3f
+
+   /* ACQUIRE/CONSUME.  */
 caspa   exp0, exp1, in0, in1, [x0]
 0:
 cmp exp0, tmp0
@@ -153,15 +164,18 @@ ENTRY (libat_compare_exchange_16_i1)
 stp exp0, exp1, [x1]
 mov x0, 0
 ret
-2:
-   casp    exp0, exp1, in0, in1, [x0]
+
+   /* RELAXED.  */
+2: casp    exp0, exp1, in0, in1, [x0]
 b   0b
-3:
-   b.hi    4f
+
+   /* RELEASE.  */
+3: b.hi    4f
 caspl   exp0, exp1, in0, in1, [x0]
 b   0b
-4:
-   caspal  exp0, exp1, in0, in1, [x0]
+
+   /* ACQ_REL/SEQ_CST.  */
+4: caspal  exp0, exp1, in0, in1, [x0]
 b   0b
 END (libat_compare_exchange_16_i1)
 
@@ -169,15 +183,17 @@ END (libat_compare_exchange_16_i1)
 ENTRY (libat_fetch_add_16_i1)
 mov x5, x0
 cbnz    w4, 2f
-1:
-   ldxp    res0, res1, [x5]
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
 adds    tmplo, reslo, inlo
 adc tmphi, reshi, inhi
 stxp    w4, tmp0, tmp1, [x5]
 cbnz    w4, 1b
 ret
-2:
-   ldaxp   res0, res1, [x5]
+
+   /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
 adds    tmplo, reslo, inlo
 adc tmphi, reshi, inhi
 stlxp   w4, tmp0, tmp1, [x5]
@@ -189,15 +205,17 @@ END (libat_fetch_add_16_i1)
 ENTRY (libat_add_fetch_16_i1)
 mov x5, x0
 cbnz    w4, 2f
-1:
-   ldxp    res0, res1, [x5]
+
+   /* RELAXED.  */
+1: ldxp    res0, res1, [x5]
 adds    reslo, reslo, inlo
 adc reshi, reshi, inhi
 stxp    w4, res0, res1, [x5]
 cbnz    w4, 1b
 ret
-2:
-   ldaxp   res0, res1, [x5]
+
+   /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2: ldaxp   res0, res1, [x5]
 adds    reslo, reslo, inlo
 adc reshi, reshi, inhi
 stlxp   w4, res0, res1, [x5]
@@ -209,15 +227,17 @@ END (libat_add_fetch_16_i1)
 ENTRY (libat_fetch_sub_16_i1)
 mov x5, x0

Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2023-01-18 Thread Wilco Dijkstra via Gcc-patches
Hi,

>> +  /* Return-address signing state is toggled by DW_CFA_GNU_window_save 
>> (where
>> + REG_UNDEFINED means enabled), or set by a DW_CFA_expression.  */
>
> Needs updating to REG_UNSAVED_ARCHEXT.
> 
> OK with that changes, thanks, and sorry for the delays & runaround.

Thanks, I've improved the comment and it has been committed to trunk now.

Cheers,
Wilco

Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2023-01-17 Thread Wilco Dijkstra via Gcc-patches
Hi,

> @Wilco, can you please send the rebased patch for patch review? We would
> need in out openSUSE package soon.

Here is an updated and rebased version:

Cheers,
Wilco

v4: rebase and add REG_UNSAVED_ARCHEXT.

A recent change only initializes the regs.how[] during Dwarf unwinding
which resulted in an uninitialized offset used in return address signing
and random failures during unwinding.  The fix is to encode the return
address signing state in REG_UNSAVED and a new state REG_UNSAVED_ARCHEXT.
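
To make the new encoding easier to follow, here is a minimal standalone
sketch of the state machine (a simplified paraphrase of the patch below,
not libgcc code):

#include <stdbool.h>

/* Sketch only: the RA_STATE pseudo register starts as REG_UNSAVED in
   every frame.  DW_CFA_GNU_window_save toggles it between REG_UNSAVED
   (signing disabled) and REG_UNSAVED_ARCHEXT (signing enabled), while a
   DW_CFA_expression stores an explicit value whose bit 0 gives the
   state.  */
enum reg_how { REG_UNSAVED, REG_SAVED_EXP, REG_UNSAVED_ARCHEXT };

struct ra_state { enum reg_how how; unsigned long value; };

/* What DW_CFA_GNU_window_save does.  */
static void toggle_ra_state (struct ra_state *s)
{
  s->how = (s->how == REG_UNSAVED) ? REG_UNSAVED_ARCHEXT : REG_UNSAVED;
}

/* The decision aarch64_demangle_return_addr makes: authenticate the
   return address only when signing is known to be enabled.  */
static bool ra_is_signed (const struct ra_state *s)
{
  if (s->how == REG_UNSAVED)
    return false;
  return s->how == REG_UNSAVED_ARCHEXT || (s->value & 1) != 0;
}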

Passes bootstrap & regress, OK for commit?

libgcc/
PR target/107678
* unwind-dw2.h (REG_UNSAVED_ARCHEXT): Add new enum.
* unwind-dw2.c (uw_update_context_1): Add REG_UNSAVED_ARCHEXT case.
* unwind-dw2-execute_cfa.h: Use REG_UNSAVED_ARCHEXT/REG_UNSAVED to 
encode the return address signing state.
* config/aarch64/aarch64-unwind.h (aarch64_demangle_return_addr):
Check current return address signing state.
(aarch64_frob_update_context): Remove.

---
diff --git a/libgcc/config/aarch64/aarch64-unwind.h 
b/libgcc/config/aarch64/aarch64-unwind.h
index 
874cf6c3e77fb72d999f51b636d74cb0b5728bbd..727c27ba5da983958b3134715d9d4d7c0af5c1e2
 100644
--- a/libgcc/config/aarch64/aarch64-unwind.h
+++ b/libgcc/config/aarch64/aarch64-unwind.h
@@ -29,8 +29,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 
 #define MD_DEMANGLE_RETURN_ADDR(context, fs, addr) \
   aarch64_demangle_return_addr (context, fs, addr)
-#define MD_FROB_UPDATE_CONTEXT(context, fs) \
-  aarch64_frob_update_context (context, fs)
 
 static inline int
 aarch64_cie_signed_with_b_key (struct _Unwind_Context *context)
@@ -55,42 +53,27 @@ aarch64_cie_signed_with_b_key (struct _Unwind_Context 
*context)
 
 static inline void *
 aarch64_demangle_return_addr (struct _Unwind_Context *context,
- _Unwind_FrameState *fs ATTRIBUTE_UNUSED,
+ _Unwind_FrameState *fs,
  _Unwind_Word addr_word)
 {
   void *addr = (void *)addr_word;
-  if (context->flags & RA_SIGNED_BIT)
+  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
+
+  if (fs->regs.how[reg] == REG_UNSAVED)
+return addr;
+
+  /* Return-address signing state is toggled by DW_CFA_GNU_window_save (where
+ REG_UNDEFINED means enabled), or set by a DW_CFA_expression.  */
+  if (fs->regs.how[reg] == REG_UNSAVED_ARCHEXT
+  || (_Unwind_GetGR (context, reg) & 0x1) != 0)
 {
   _Unwind_Word salt = (_Unwind_Word) context->cfa;
   if (aarch64_cie_signed_with_b_key (context) != 0)
return __builtin_aarch64_autib1716 (addr, salt);
   return __builtin_aarch64_autia1716 (addr, salt);
 }
-  else
-return addr;
-}
-
-/* Do AArch64 private initialization on CONTEXT based on frame info FS.  Mark
-   CONTEXT as return address signed if bit 0 of DWARF_REGNUM_AARCH64_RA_STATE 
is
-   set.  */
-
-static inline void
-aarch64_frob_update_context (struct _Unwind_Context *context,
-_Unwind_FrameState *fs)
-{
-  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
-  int ra_signed;
-  if (fs->regs.how[reg] == REG_UNSAVED)
-ra_signed = fs->regs.reg[reg].loc.offset & 0x1;
-  else
-ra_signed = _Unwind_GetGR (context, reg) & 0x1;
-  if (ra_signed)
-/* The flag is used for re-authenticating EH handler's address.  */
-context->flags |= RA_SIGNED_BIT;
-  else
-context->flags &= ~RA_SIGNED_BIT;
 
-  return;
+  return addr;
 }
 
 #endif /* defined AARCH64_UNWIND_H && defined __ILP32__ */
diff --git a/libgcc/unwind-dw2-execute_cfa.h b/libgcc/unwind-dw2-execute_cfa.h
index 
264c11c03ec4a09cac2c19a241c5b110b1b6b602..aef377092ceede6bdda8532679f9b081c98fadce
 100644
--- a/libgcc/unwind-dw2-execute_cfa.h
+++ b/libgcc/unwind-dw2-execute_cfa.h
@@ -278,10 +278,15 @@
case DW_CFA_GNU_window_save:
 #if defined (__aarch64__) && !defined (__ILP32__)
  /* This CFA is multiplexed with Sparc.  On AArch64 it's used to toggle
-return address signing status.  */
+return address signing status.  REG_UNSAVED/REG_UNSAVED_ARCHEXT
+mean RA signing is disabled/enabled.  */
  reg = DWARF_REGNUM_AARCH64_RA_STATE;
- gcc_assert (fs->regs.how[reg] == REG_UNSAVED);
- fs->regs.reg[reg].loc.offset ^= 1;
+ gcc_assert (fs->regs.how[reg] == REG_UNSAVED
+ || fs->regs.how[reg] == REG_UNSAVED_ARCHEXT);
+ if (fs->regs.how[reg] == REG_UNSAVED)
+   fs->regs.how[reg] = REG_UNSAVED_ARCHEXT;
+ else
+   fs->regs.how[reg] = REG_UNSAVED;
 #else
  /* ??? Hardcoded for SPARC register window configuration.  */
  if (__LIBGCC_DWARF_FRAME_REGISTERS__ >= 32)
diff --git a/libgcc/unwind-dw2.h b/libgcc/unwind-dw2.h
index 
e2f81983e1dcf3df6aebde2454630b7bee87d597..53e1b183c7d60112a14411d3356c49cb39cd0de7
 100644
--- a/libgcc/unwind-dw2.h
+++ b/libgcc/unwind-dw2.h
@@ -29,6 +29,7 @@ enum {
   REG_SAVED_EXP,
   

Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2023-01-11 Thread Wilco Dijkstra via Gcc-patches
Hi,

> On 1/10/23 19:12, Jakub Jelinek via Gcc-patches wrote:
>> Anyway, the sooner this makes it into gcc trunk, the better, it breaks quite
>> a lot of stuff.
>
> Yep, please, we're also waiting for this patch for pushing to our gcc13 
> package.

Well I'm waiting for an OK from a maintainer... I believe Jakub can approve it 
as well.

Cheers,
Wilco

Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2023-01-10 Thread Wilco Dijkstra via Gcc-patches
Hi Szabolcs,

> i would keep the assert: how[reg] must be either UNSAVED or UNDEFINED
> here, other how[reg] means the toggle cfi instruction is mixed with
> incompatible instructions for the pseudo reg.
>
> and i would add a comment about this e.g. saying that UNSAVED/UNDEFINED
> how[reg] is used for tracking the return address signing status and
> other how[reg] is not allowed here.

I've added the assert back and updated the comment.

Cheers,
Wilco

v3: Improve comments, add assert.

A recent change only initializes the regs.how[] during Dwarf unwinding
which resulted in an uninitialized offset used in return address signing
and random failures during unwinding.  The fix is to encode the return
address signing state in REG_UNSAVED and REG_UNDEFINED.

Passes bootstrap & regress, OK for commit?

libgcc/
PR target/107678
* unwind-dw2.c (execute_cfa_program): Use REG_UNSAVED/UNDEFINED
to encode return address signing state.
* config/aarch64/aarch64-unwind.h (aarch64_demangle_return_addr):
Check current return address signing state.
(aarch64_frob_update_context): Remove.

---

diff --git a/libgcc/config/aarch64/aarch64-unwind.h 
b/libgcc/config/aarch64/aarch64-unwind.h
index 
26db9cbd9e5c526e0c410a4fc6be2bedb7d261cf..1afc3f9d308b95bc787398263e629bab226ff1ba
 100644
--- a/libgcc/config/aarch64/aarch64-unwind.h
+++ b/libgcc/config/aarch64/aarch64-unwind.h
@@ -29,8 +29,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 
 #define MD_DEMANGLE_RETURN_ADDR(context, fs, addr) \
   aarch64_demangle_return_addr (context, fs, addr)
-#define MD_FROB_UPDATE_CONTEXT(context, fs) \
-  aarch64_frob_update_context (context, fs)
 
 static inline int
 aarch64_cie_signed_with_b_key (struct _Unwind_Context *context)
@@ -55,42 +53,27 @@ aarch64_cie_signed_with_b_key (struct _Unwind_Context 
*context)
 
 static inline void *
 aarch64_demangle_return_addr (struct _Unwind_Context *context,
- _Unwind_FrameState *fs ATTRIBUTE_UNUSED,
+ _Unwind_FrameState *fs,
  _Unwind_Word addr_word)
 {
   void *addr = (void *)addr_word;
-  if (context->flags & RA_SIGNED_BIT)
+  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
+
+  if (fs->regs.how[reg] == REG_UNSAVED)
+return addr;
+
+  /* Return-address signing state is toggled by DW_CFA_GNU_window_save (where
+ REG_UNDEFINED means enabled), or set by a DW_CFA_expression.  */
+  if (fs->regs.how[reg] == REG_UNDEFINED
+  || (_Unwind_GetGR (context, reg) & 0x1) != 0)
 {
   _Unwind_Word salt = (_Unwind_Word) context->cfa;
   if (aarch64_cie_signed_with_b_key (context) != 0)
return __builtin_aarch64_autib1716 (addr, salt);
   return __builtin_aarch64_autia1716 (addr, salt);
 }
-  else
-return addr;
-}
-
-/* Do AArch64 private initialization on CONTEXT based on frame info FS.  Mark
-   CONTEXT as return address signed if bit 0 of DWARF_REGNUM_AARCH64_RA_STATE 
is
-   set.  */
-
-static inline void
-aarch64_frob_update_context (struct _Unwind_Context *context,
-_Unwind_FrameState *fs)
-{
-  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
-  int ra_signed;
-  if (fs->regs.how[reg] == REG_UNSAVED)
-ra_signed = fs->regs.reg[reg].loc.offset & 0x1;
-  else
-ra_signed = _Unwind_GetGR (context, reg) & 0x1;
-  if (ra_signed)
-/* The flag is used for re-authenticating EH handler's address.  */
-context->flags |= RA_SIGNED_BIT;
-  else
-context->flags &= ~RA_SIGNED_BIT;
 
-  return;
+  return addr;
 }
 
 #endif /* defined AARCH64_UNWIND_H && defined __ILP32__ */
diff --git a/libgcc/unwind-dw2.c b/libgcc/unwind-dw2.c
index 
eaceace20298b9b13344aff9d1fe9ee5f9c7bd73..55fe35520106e848c5d4aea4e7104bf4a0c14891
 100644
--- a/libgcc/unwind-dw2.c
+++ b/libgcc/unwind-dw2.c
@@ -139,7 +139,6 @@ struct _Unwind_Context
 #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1)
   /* Bit reserved on AArch64, return address has been signed with A or B
  key.  */
-#define RA_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1)
   _Unwind_Word flags;
   /* 0 for now, can be increased when further fields are added to
  struct _Unwind_Context.  */
@@ -1204,10 +1203,15 @@ execute_cfa_program (const unsigned char *insn_ptr,
case DW_CFA_GNU_window_save:
 #if defined (__aarch64__) && !defined (__ILP32__)
  /* This CFA is multiplexed with Sparc.  On AArch64 it's used to toggle
-return address signing status.  */
+return address signing status.  The REG_UNDEFINED/UNSAVED states
+mean RA signing is enabled/disabled.  */
  reg = DWARF_REGNUM_AARCH64_RA_STATE;
- gcc_assert (fs->regs.how[reg] == REG_UNSAVED);
- fs->regs.reg[reg].loc.offset ^= 1;
+ gcc_assert (fs->regs.how[reg] == REG_UNSAVED
+ || fs->regs.how[reg] == REG_UNDEFINED);
+ if (fs->regs.how[reg] == REG_UNSAVED)
+   

Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2023-01-03 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> Hmm, but the point of the original patch was to support code generators
> that emit DW_CFA_val_expression instead of DW_CFA_AARCH64_negate_ra_state.
> Doesn't this patch undo that?

Well it wasn't clear from the code or comments that this was supported. I've
added that back in v2.

> Also, if I understood correctly, the reason we use REG_UNSAVED is to
> ensure that state from one frame isn't carried across to a parent frame,
> in cases where the parent frame lacks any signing.  That is, each frame
> should start out with a zero bit even if a child frame is unwound while
> it has a set bit.

This works fine since all registers are initialized to REG_UNSAVED every frame.

In v2 I've removed some clutter and encoded the signing state in REG_UNSAVED/
REG_UNDEFINED.

Cheers,
Wilco

v2: Further cleanup, support DW_CFA_expression.

A recent change only initializes the regs.how[] during Dwarf unwinding
which resulted in an uninitialized offset used in return address signing
and random failures during unwinding.  The fix is to encode the return
address signing state in REG_UNSAVED and REG_UNDEFINED.

Passes bootstrap & regress, OK for commit?

libgcc/
PR target/107678
* unwind-dw2.c (execute_cfa_program): Use REG_UNSAVED/UNDEFINED
to encode return address signing state.
* config/aarch64/aarch64-unwind.h (aarch64_demangle_return_addr):
Check current return address signing state.
(aarch64_frob_update_context): Remove.

---
diff --git a/libgcc/config/aarch64/aarch64-unwind.h 
b/libgcc/config/aarch64/aarch64-unwind.h
index 
26db9cbd9e5c526e0c410a4fc6be2bedb7d261cf..1afc3f9d308b95bc787398263e629bab226ff1ba
 100644
--- a/libgcc/config/aarch64/aarch64-unwind.h
+++ b/libgcc/config/aarch64/aarch64-unwind.h
@@ -29,8 +29,6 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 
 #define MD_DEMANGLE_RETURN_ADDR(context, fs, addr) \
   aarch64_demangle_return_addr (context, fs, addr)
-#define MD_FROB_UPDATE_CONTEXT(context, fs) \
-  aarch64_frob_update_context (context, fs)
 
 static inline int
 aarch64_cie_signed_with_b_key (struct _Unwind_Context *context)
@@ -55,42 +53,27 @@ aarch64_cie_signed_with_b_key (struct _Unwind_Context 
*context)
 
 static inline void *
 aarch64_demangle_return_addr (struct _Unwind_Context *context,
- _Unwind_FrameState *fs ATTRIBUTE_UNUSED,
+ _Unwind_FrameState *fs,
  _Unwind_Word addr_word)
 {
   void *addr = (void *)addr_word;
-  if (context->flags & RA_SIGNED_BIT)
+  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
+
+  if (fs->regs.how[reg] == REG_UNSAVED)
+return addr;
+
+  /* Return-address signing state is toggled by DW_CFA_GNU_window_save (where
+ REG_UNDEFINED means enabled), or set by a DW_CFA_expression.  */
+  if (fs->regs.how[reg] == REG_UNDEFINED
+  || (_Unwind_GetGR (context, reg) & 0x1) != 0)
 {
   _Unwind_Word salt = (_Unwind_Word) context->cfa;
   if (aarch64_cie_signed_with_b_key (context) != 0)
return __builtin_aarch64_autib1716 (addr, salt);
   return __builtin_aarch64_autia1716 (addr, salt);
 }
-  else
-return addr;
-}
-
-/* Do AArch64 private initialization on CONTEXT based on frame info FS.  Mark
-   CONTEXT as return address signed if bit 0 of DWARF_REGNUM_AARCH64_RA_STATE 
is
-   set.  */
-
-static inline void
-aarch64_frob_update_context (struct _Unwind_Context *context,
-_Unwind_FrameState *fs)
-{
-  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
-  int ra_signed;
-  if (fs->regs.how[reg] == REG_UNSAVED)
-ra_signed = fs->regs.reg[reg].loc.offset & 0x1;
-  else
-ra_signed = _Unwind_GetGR (context, reg) & 0x1;
-  if (ra_signed)
-/* The flag is used for re-authenticating EH handler's address.  */
-context->flags |= RA_SIGNED_BIT;
-  else
-context->flags &= ~RA_SIGNED_BIT;
 
-  return;
+  return addr;
 }
 
 #endif /* defined AARCH64_UNWIND_H && defined __ILP32__ */
diff --git a/libgcc/unwind-dw2.c b/libgcc/unwind-dw2.c
index 
eaceace20298b9b13344aff9d1fe9ee5f9c7bd73..7c200cb6e730c5d63cf200ebe8a903f858e79d07
 100644
--- a/libgcc/unwind-dw2.c
+++ b/libgcc/unwind-dw2.c
@@ -139,7 +139,6 @@ struct _Unwind_Context
 #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1)
   /* Bit reserved on AArch64, return address has been signed with A or B
  key.  */
-#define RA_SIGNED_BIT ((~(_Unwind_Word) 0 >> 3) + 1)
   _Unwind_Word flags;
   /* 0 for now, can be increased when further fields are added to
  struct _Unwind_Context.  */
@@ -1206,8 +1205,10 @@ execute_cfa_program (const unsigned char *insn_ptr,
  /* This CFA is multiplexed with Sparc.  On AArch64 it's used to toggle
 return address signing status.  */
  reg = DWARF_REGNUM_AARCH64_RA_STATE;
- gcc_assert (fs->regs.how[reg] == REG_UNSAVED);
- fs->regs.reg[reg].loc.offset ^= 1;
+ if (fs->regs.how[reg] == REG_UNSAVED)
+  

[PATCH] AArch64: Enable TARGET_CONST_ANCHOR

2022-12-09 Thread Wilco Dijkstra via Gcc-patches
Enable TARGET_CONST_ANCHOR to allow complex constants to be created via
immediate add.  Use a 24-bit range as that enables a 3 or 4-instruction
immediate to be replaced by 2 additions.  Fix the costing of immediate add
to support 24-bit immediate and 12-bit shifted immediates.  The generated
code for the testcase is now the same or better than LLVM.  It also results
in a small codesize reduction on SPEC.
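
As a rough illustration of what the anchor buys (constants taken from the
new movk_3.c test below; the exact sequences depend on the rest of the
patch):

/* f48_1 from movk_3.c: without an anchor each 48-bit constant needs
   MOV + 2x MOVK.  With a 24-bit anchor range the two constants share an
   anchor, so the second one is formed with a single ADD off the value
   already in a register.  */
void f48_1 (long *p)
{
  p[0] = 0x123456789abc;           /* mov + movk + movk */
  p[2] = 0x123456789abc + 0xfff;   /* add #0xfff off the anchor */
}

Here 0xfff is still a plain 12-bit ADD immediate, so the second constant
costs only a single extra instruction.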

Passes bootstrap and regress, OK for commit?

gcc/
* config/aarch64/aarch64.cc (aarch64_rtx_costs): Add correct costs for
24-bit immediate add and 12-bit high immediate add.
(TARGET_CONST_ANCHOR): Define.

gcc/testsuite/
* gcc.target/aarch64/movk_3.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
e97f3b32f7c7f43564d6a4207eae5a34b9e9bfe7..a73741800c963ee6605fd2cfa918f4399da4bfdf
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14257,6 +14257,16 @@ cost_plus:
return true;
  }
 
+   if (aarch64_pluslong_immediate (op1, mode))
+ {
+   /* 24-bit add in 2 instructions or 12-bit shifted add.  */
+   if ((INTVAL (op1) & 0xfff) != 0)
+ *cost += COSTS_N_INSNS (1);
+
+   *cost += rtx_cost (op0, mode, PLUS, 0, speed);
+   return true;
+ }
+
*cost += rtx_cost (op1, mode, PLUS, 1, speed);
 
/* Look for ADD (extended register).  */
@@ -28051,6 +28061,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_HAVE_SHADOW_CALL_STACK
 #define TARGET_HAVE_SHADOW_CALL_STACK true
 
+#undef TARGET_CONST_ANCHOR
+#define TARGET_CONST_ANCHOR 0x1000000
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/movk_3.c 
b/gcc/testsuite/gcc.target/aarch64/movk_3.c
new file mode 100644
index 
..9e8c0c42671bef3f63028b4e51d0bd78c9903994
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/movk_3.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps" } */
+
+
+/* 2 MOV */
+void f16 (long *p)
+{
+  p[0] = 0x1234;
+  p[2] = 0x1235;
+}
+
+/* MOV, MOVK and ADD */
+void f32_1 (long *p)
+{
+  p[0] = 0x12345678;
+  p[2] = 0x12345678 + 0xfff;
+}
+
+/* 2 MOV, 2 MOVK */
+void f32_2 (long *p)
+{
+  p[0] = 0x12345678;
+  p[2] = 0x12345678 + 0x55;
+}
+
+/* MOV, MOVK and ADD */
+void f32_3 (long *p)
+{
+  p[0] = 0x12345678;
+  p[2] = 0x12345678 + 0x999000;
+}
+
+/* MOV, 2 MOVK and ADD */
+void f48_1 (long *p)
+{
+  p[0] = 0x123456789abc;
+  p[2] = 0x123456789abc + 0xfff;
+}
+
+/* MOV, 2 MOVK and 2 ADD */
+void f48_2 (long *p)
+{
+  p[0] = 0x123456789abc;
+  p[2] = 0x123456789abc + 0x66;
+}
+
+/* 2 MOV, 4 MOVK */
+void f48_3 (long *p)
+{
+  p[0] = 0x123456789abc;
+  p[2] = 0x123456789abc + 0x166;
+}
+
+/* { dg-final { scan-assembler-times "mov\tx\[0-9\]+, \[0-9\]+" 10 } } */
+/* { dg-final { scan-assembler-times "movk\tx\[0-9\]+, 0x\[0-9a-f\]+" 12 } } */
+/* { dg-final { scan-assembler-times "add\tx\[0-9\]+, x\[0-9\]+, \[0-9\]+" 5 } 
} */



Re: [PATCH][AArch64] Cleanup move immediate code

2022-12-07 Thread Wilco Dijkstra via Gcc-patches
Hi Andreas,

Thanks for the report, I've committed the fix: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108006

Cheers,
Wilco


[COMMITTED] AArch64: Fix assert in aarch64_move_imm [PR108006]

2022-12-07 Thread Wilco Dijkstra via Gcc-patches
Ensure we only pass SI/DImode which fixes the assert.

Committed as obvious.

gcc/
        PR target/108006
* config/aarch64/aarch64.cc (aarch64_expand_sve_const_vector):
        Fix call to aarch64_move_imm to use SI/DI.
---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
89bf0dff904b6b52b71841aec299541f01884f3d..27a814d862101ce244c52d4863c6158cf549f066
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6513,7 +6513,8 @@ aarch64_expand_sve_const_vector (rtx target, rtx src)
  /* If the integer can be moved into a general register by a
 single instruction, do that and duplicate the result.  */
  if (CONST_INT_P (elt_value)
- && aarch64_move_imm (INTVAL (elt_value), elt_mode))
+ && aarch64_move_imm (INTVAL (elt_value),
+  encoded_bits <= 32 ? SImode : DImode))
{
  elt_value = force_reg (elt_mode, elt_value);
  return expand_vector_broadcast (mode, elt_value);



Re: [PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2022-12-06 Thread Wilco Dijkstra via Gcc-patches
Hi,

> i don't think how[*RA_STATE] can ever be set to REG_SAVED_OFFSET,
> this pseudo reg is not spilled to the stack, it is reset to 0 in
> each frame and then toggled within a frame.

It's just a state; we can use any state we want since it is a pseudo reg.
These registers are global and shared across all functions in an unwind,
so their state or value isn't reset for each frame. So if we want to reset
it in each frame then using a virtual register to hold per-function data
seems like a bad design. I'm surprised nobody has ever tested it...

Cheers,
Wilco

Re: [PATCH][AArch64] Cleanup move immediate code

2022-12-05 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> -  scalar_int_mode imode = (mode == HFmode
> -    ? SImode
> -    : int_mode_for_mode (mode).require ());
> +  machine_mode imode = (mode == DFmode) ? DImode : SImode;

> It looks like this might mishandle DDmode, if not now then in the future.
> Seems safer to use known_eq (GET_MODE_SIZE (mode), 8)

I've changed that, but it does not matter for the narrow modes as the result
will be identical - only DDmode might get costed incorrectly.

> Sorry for not noticing last time, but: rather than have
> aarch64_zeroextended_move_imm (which is quite a complicated test),
> could we just add an extra (default off) parameter to aarch64_move_imm
> that suppresses the (val >> 32) == 0 test?

That makes things more complicated again - ultimately I'd like to get rid of the
mode parameter since most callers use a fixed mode, and ones that don't are
now creating and passing fake modes... I've changed it like aarch64_move_imm
and call aarch64_is_movz twice to check it is not a 64-bit MOVZ/MOVN.

Cheers,
Wilco

v3: Use aarch64_is_movz, use known_eq

Simplify, refactor and improve various move immediate functions.
Allow 32-bit MOVI/N as a valid 64-bit immediate which removes special
cases in aarch64_internal_mov_immediate.  Add new constraint so the movdi
pattern only needs a single alternative for move immediate.

Passes bootstrap and regress, OK for commit?

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_bitmask_imm): Use unsigned type.
(aarch64_zeroextended_move_imm): New function.
(aarch64_move_imm): Refactor, assert mode is SImode or DImode.
(aarch64_internal_mov_immediate): Assert mode is SImode or DImode.
Simplify special cases.
(aarch64_uimm12_shift): Simplify code.
(aarch64_clamp_to_uimm12_shift): Likewise.
(aarch64_movw_imm): Rename to aarch64_is_movz.
(aarch64_float_const_rtx_p): Pass either SImode or DImode to
aarch64_internal_mov_immediate.
(aarch64_rtx_costs): Likewise.
* config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
constraints into single 'O'.
(mov_aarch64): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
(aarch64_bitmask_imm): Likewise.
(aarch64_uimm12_shift): Likewise.
(aarch64_zeroextended_move_imm): New prototype.
* config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
limit 'N' to 64-bit only moves.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
4be93c93c26e091f878bc8e4cf06e90888405fb2..8bce6ec7599edcc2e6a1d8006450f35c0ce7f61f
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -756,7 +756,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
mode);
@@ -793,7 +793,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -843,8 +843,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
a73741800c963ee6605fd2cfa918f4399da4bfdf..00269632eeb52c29ba2011c4c82274968b850d71
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5625,12 +5625,10 @@ aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
 
 /* Return true if VAL is a valid bitmask immediate for MODE.  */
 bool
-aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT 

[PATCH] libgcc: Fix uninitialized RA signing on AArch64 [PR107678]

2022-12-01 Thread Wilco Dijkstra via Gcc-patches
A recent change only initializes the regs.how[] during Dwarf unwinding
which resulted in an uninitialized offset used in return address signing
and random failures during unwinding.  The fix is to use REG_SAVED_OFFSET
as the state where the return address signing bit is valid, and if the
state is REG_UNSAVED, initialize it to 0.

Passes bootstrap & regress, OK for commit?

libgcc/
PR target/107678
* unwind-dw2.c (execute_cfa_program): Initialize offset of
DWARF_REGNUM_AARCH64_RA_STATE if in REG_UNSAVED state.
* config/aarch64/aarch64-unwind.h (aarch64_frob_update_context):
Check state is REG_SAVED_OFFSET before using offset for RA state.

---

diff --git a/libgcc/config/aarch64/aarch64-unwind.h 
b/libgcc/config/aarch64/aarch64-unwind.h
index 
26db9cbd9e5c526e0c410a4fc6be2bedb7d261cf..597133b3d708a50a366c8bfeff57475f5522b3f6
 100644
--- a/libgcc/config/aarch64/aarch64-unwind.h
+++ b/libgcc/config/aarch64/aarch64-unwind.h
@@ -71,21 +71,15 @@ aarch64_demangle_return_addr (struct _Unwind_Context 
*context,
 }
 
 /* Do AArch64 private initialization on CONTEXT based on frame info FS.  Mark
-   CONTEXT as return address signed if bit 0 of DWARF_REGNUM_AARCH64_RA_STATE 
is
-   set.  */
+   CONTEXT as having a signed return address if DWARF_REGNUM_AARCH64_RA_STATE
+   is initialized (REG_SAVED_OFFSET state) and the offset has bit 0 set.  */
 
 static inline void
 aarch64_frob_update_context (struct _Unwind_Context *context,
 _Unwind_FrameState *fs)
 {
-  const int reg = DWARF_REGNUM_AARCH64_RA_STATE;
-  int ra_signed;
-  if (fs->regs.how[reg] == REG_UNSAVED)
-ra_signed = fs->regs.reg[reg].loc.offset & 0x1;
-  else
-ra_signed = _Unwind_GetGR (context, reg) & 0x1;
-  if (ra_signed)
-/* The flag is used for re-authenticating EH handler's address.  */
+  if (fs->regs.how[DWARF_REGNUM_AARCH64_RA_STATE] == REG_SAVED_OFFSET
+  && (fs->regs.reg[DWARF_REGNUM_AARCH64_RA_STATE].loc.offset & 1) != 0)
 context->flags |= RA_SIGNED_BIT;
   else
 context->flags &= ~RA_SIGNED_BIT;
diff --git a/libgcc/unwind-dw2.c b/libgcc/unwind-dw2.c
index 
eaceace20298b9b13344aff9d1fe9ee5f9c7bd73..87f2ae065b67982ce48f74e45523d9c754a7661c
 100644
--- a/libgcc/unwind-dw2.c
+++ b/libgcc/unwind-dw2.c
@@ -1203,11 +1203,16 @@ execute_cfa_program (const unsigned char *insn_ptr,
 
case DW_CFA_GNU_window_save:
 #if defined (__aarch64__) && !defined (__ILP32__)
- /* This CFA is multiplexed with Sparc.  On AArch64 it's used to toggle
-return address signing status.  */
- reg = DWARF_REGNUM_AARCH64_RA_STATE;
- gcc_assert (fs->regs.how[reg] == REG_UNSAVED);
- fs->regs.reg[reg].loc.offset ^= 1;
+/* This CFA is multiplexed with Sparc.  On AArch64 it's used to toggle
+   the return address signing status.  It is initialized at the first
+   use and the state is stored in bit 0 of the offset.  */
+reg = DWARF_REGNUM_AARCH64_RA_STATE;
+if (fs->regs.how[reg] == REG_UNSAVED)
+  {
+fs->regs.how[reg] = REG_SAVED_OFFSET;
+fs->regs.reg[reg].loc.offset = 0;
+  }
+fs->regs.reg[reg].loc.offset ^= 1;
 #else
  /* ??? Hardcoded for SPARC register window configuration.  */
  if (__LIBGCC_DWARF_FRAME_REGISTERS__ >= 32)



Re: [PATCH][AArch64] Cleanup move immediate code

2022-11-29 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

> Just to make sure I understand: isn't it really just MOVN?  I would have
> expected a 32-bit MOVZ to be equivalent to (and add no capabilities over)
> a 64-bit MOVZ.

The 32-bit MOVZ immediates are equivalent, MOVN never overlaps, and
MOVI has some overlaps.  Since we allow all 3 variants, the 2 alternatives
in the movdi pattern are overlapping for MOVZ and MOVI immediates.
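
For example (an illustration, not from the patch): the 32-bit form
movn w0, #1 yields 0x00000000fffffffe, a value no single 64-bit MOVZ or
MOVN can produce, so accepting the 32-bit variants for DImode genuinely
widens the set of one-instruction immediates:

#include <stdint.h>

/* The same encoded immediate gives different results in each width:
     movn w0, #1  ->  0x00000000fffffffe   (upper 32 bits zero)
     movn x0, #1  ->  0xfffffffffffffffe   (upper 32 bits set)
   The first value is neither a shifted 16-bit pattern nor the inverse
   of one, so it needs the 32-bit encoding.  */
uint64_t from_movn_w = 0xfffffffeu;
uint64_t from_movn_x = ~(uint64_t) 1;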

> I agree the ctz trick is more elegant than (and an improvement over)
> the current approach to testing for movz.  But I think the overall logic
> is harder to follow than it was in the original version.  Initially
> canonicalising val2 based on the sign bit seems unintuitive since we
> still need to handle all four combinations of (top bit set, top bit clear)
> x (low 48 bits set, low 48 bits clear).  I preferred the original
> approach of testing once with the original value (for MOVZ) and once
> with the inverted value (for MOVN).

Yes, the canonicalization on the sign ends up requiring 2 special cases.
Handling the MOVZ case first and then MOVN does avoid that, and makes
things simpler overall, so I've used that approach in v2.
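
For readers following along, a minimal sketch of what the ctz-based MOVZ
test amounts to (an assumed simplification, not the exact code in the
patch): a value is a single MOVZ immediate when all of its set bits fit
in one 16-bit field whose offset is a multiple of 16.

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the ctz trick: round the trailing-zero count down to a
   multiple of 16, shift the candidate field to the bottom and check
   that nothing remains above bit 15.  */
static bool is_movz_imm (uint64_t val)
{
  if (val == 0)
    return true;                            /* movz reg, #0 */
  int shift = __builtin_ctzll (val) & 48;   /* 0, 16, 32 or 48 */
  return (val >> shift) < 65536;
}

The MOVN side is then the same check applied to the inverted value, which
is the structure v2 keeps.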

> Don't the new cases boil down to: if mode is DImode and the upper 32 bits
> are clear, we can test based on SImode instead?  In other words, couldn't
> the "(val >> 32) == 0" part of the final test be done first, with the
> effect of changing the mode to SImode?  Something like:

Yes that works. I used masking of the top bits to avoid repeatedly testing the
same condition. The new version removes most special cases and ends up
both smaller and simpler:


v2: Simplify the special cases in aarch64_move_imm, use aarch64_is_movz.

Simplify, refactor and improve various move immediate functions.
Allow 32-bit MOVZ/I/N as a valid 64-bit immediate which removes special
cases in aarch64_internal_mov_immediate.  Add new constraint so the movdi
pattern only needs a single alternative for move immediate.

Passes bootstrap and regress, OK for commit?

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_bitmask_imm): Use unsigned type.
(aarch64_zeroextended_move_imm): New function.
(aarch64_move_imm): Refactor, assert mode is SImode or DImode.
(aarch64_internal_mov_immediate): Assert mode is SImode or DImode.
Simplify special cases.
(aarch64_uimm12_shift): Simplify code.
(aarch64_clamp_to_uimm12_shift): Likewise.
(aarch64_movw_imm): Rename to aarch64_is_movz.
(aarch64_float_const_rtx_p): Pass either SImode or DImode to
aarch64_internal_mov_immediate.
(aarch64_rtx_costs): Likewise.
* config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
constraints into single 'O'.
(mov_aarch64): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
(aarch64_bitmask_imm): Likewise.
(aarch64_uimm12_shift): Likewise.
(aarch64_zeroextended_move_imm): New prototype.
* config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
limit 'N' to 64-bit only moves.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
4be93c93c26e091f878bc8e4cf06e90888405fb2..8bce6ec7599edcc2e6a1d8006450f35c0ce7f61f
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -756,7 +756,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
mode);
@@ -793,7 +793,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -843,8 +843,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool 
