On AArch64, can_change_mode_class and modes_tieable_p are
mostly answering the same questions:

(a) Do two modes have the same layout for the bytes that are
    common to both modes?

(b) Do all valid subregs involving the two modes behave as
    GCC would expect?

(c) Is there at least one register that can hold both modes?

These questions involve no class-dependent tests, and the relationship
is symmetrical.  This means we can do most of the checks in a common
subroutine.

can_change_mode_class is the hook that matters for correctness,
while modes_tieable_p is more for optimisation.  It was therefore
can_change_mode_class that had the more accurate tests.
modes_tieable_p was looser in some ways (e.g. it missed some
big-endian tests) and overly strict in others (it didn't allow
ties between a vector structure mode and the mode of a single lane).
The overly strict part caused a missed combination in the testcase.

I think the can_change_mode_class logic also needed some tweaks,
as described in the changelog.

Tested on aarch64-linux-gnu.  I'll leave it a day or so for comments,
and to give the CI testers a chance to try it.

Richard


gcc/
        PR target/112105
        * config/aarch64/aarch64.cc (aarch64_modes_compatible_p): New
        function, with the core logic extracted from...
        (aarch64_can_change_mode_class): ...here.  Extend the previous rules
        to allow changes between partial SVE modes and other modes if
        the other mode is no bigger than an element, and if no other rule
        prevents it.  Use the aarch64_modes_tieable_p handling of
        partial Advanced SIMD structure modes.
        (aarch64_modes_tieable_p): Use aarch64_modes_compatible_p.
        Allow all vector mode ties that it allows.

gcc/testsuite/
        PR target/112105
        * gcc.target/aarch64/pr112105.c: New test.
        * gcc.target/aarch64/sve/pcs/struct_3_128.c: Expect a 32-bit spill
        rather than a 16-bit spill.
---
 gcc/config/aarch64/aarch64.cc                 | 223 +++++++++---------
 gcc/testsuite/gcc.target/aarch64/pr112105.c   |  31 +++
 .../gcc.target/aarch64/sve/pcs/struct_3_128.c |   4 +-
 3 files changed, 147 insertions(+), 111 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr112105.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5fd7063663c..cb65ccc8465 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25215,53 +25215,131 @@ aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
 }
 
-/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
-   true.  However due to issues with register allocation it is preferable
-   to avoid tieing integer scalar and FP scalar modes.  Executing integer
-   operations in general registers is better than treating them as scalar
-   vector operations.  This reduces latency and avoids redundant int<->FP
-   moves.  So tie modes if they are either the same class, or vector modes
-   with other vector modes, vector structs or any scalar mode.  */
+/* Return true if:
+
+   (a) MODE1 and MODE2 use the same layout for bytes that are common
+       to both modes;
+
+   (b) subregs involving the two modes behave as the target-independent
+       subreg rules require; and
+
+   (c) there is at least one register that can hold both modes.
+
+   Return false otherwise.  */
 
 static bool
-aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
+aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
 {
-  if ((aarch64_advsimd_partial_struct_mode_p (mode1)
-       != aarch64_advsimd_partial_struct_mode_p (mode2))
+  unsigned int flags1 = aarch64_classify_vector_mode (mode1);
+  unsigned int flags2 = aarch64_classify_vector_mode (mode2);
+
+  bool sve1_p = (flags1 & VEC_ANY_SVE);
+  bool sve2_p = (flags2 & VEC_ANY_SVE);
+
+  bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
+  bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
+
+  bool pred1_p = (flags1 & VEC_SVE_PRED);
+  bool pred2_p = (flags2 & VEC_SVE_PRED);
+
+  bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
+                                              | VEC_PARTIAL));
+  bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
+                                              | VEC_PARTIAL));
+
+  /* Don't allow changes between predicate modes and other modes.
+     Only predicate registers can hold predicate modes and only
+     non-predicate registers can hold non-predicate modes, so any
+     attempt to mix them would require a round trip through memory.  */
+  if (pred1_p != pred2_p)
+    return false;
+
+  /* The contents of partial SVE modes are distributed evenly across
+     the register, whereas GCC expects them to be clustered together.
+     We therefore need to be careful about mode changes involving them.  */
+  if (partial_sve1_p && partial_sve2_p)
+    {
+      /* Reject changes between partial SVE modes that have different
+        patterns of significant and insignificant bits.  */
+      if ((aarch64_sve_container_bits (mode1)
+          != aarch64_sve_container_bits (mode2))
+         || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
+       return false;
+    }
+  else if (partial_sve1_p)
+    {
+      /* The first lane of MODE1 is where GCC expects it, but anything
+        bigger than that is not.  */
+      if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
+       return false;
+    }
+  else if (partial_sve2_p)
+    {
+      /* Similarly in reverse.  */
+      if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
+       return false;
+    }
+
+  /* Don't allow changes between partial Advanced SIMD structure modes
+     and other modes that are bigger than 8 bytes.  E.g. V16QI and V2x8QI
+     are the same size, but the former occupies one Q register while the
+     latter occupies two D registers.  */
+  if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
       && maybe_gt (GET_MODE_SIZE (mode1), 8)
       && maybe_gt (GET_MODE_SIZE (mode2), 8))
     return false;
 
-  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
-    return true;
+  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
+    {
+      /* Don't allow changes between SVE modes and other modes that might
+        be bigger than 128 bits.  In particular, OImode, CImode and XImode
+        divide into 128-bit quantities while SVE modes divide into
+        BITS_PER_SVE_VECTOR quantities.  */
+      if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
+       return false;
+      if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
+       return false;
+    }
 
-  /* Allow changes between scalar modes if both modes fit within 64 bits.
-     This is because:
+  if (BYTES_BIG_ENDIAN)
+    {
+      /* Don't allow changes between SVE data modes and non-SVE modes.
+        See the comment at the head of aarch64-sve.md for details.  */
+      if (sve1_p != sve2_p)
+       return false;
 
-     - We allow all such modes for both FPRs and GPRs.
-     - They occupy a single register for both FPRs and GPRs.
-     - We can reinterpret one mode as another in both types of register.  */
-  if (is_a<scalar_mode> (mode1)
-      && is_a<scalar_mode> (mode2)
-      && known_le (GET_MODE_SIZE (mode1), 8)
-      && known_le (GET_MODE_SIZE (mode2), 8))
-    return true;
+      /* Don't allow changes in element size: lane 0 of the new vector
+        would not then be lane 0 of the old vector.  See the comment
+        above aarch64_maybe_expand_sve_subreg_move for a more detailed
+        description.
 
-  /* We specifically want to allow elements of "structure" modes to
-     be tieable to the structure.  This more general condition allows
-     other rarer situations too.  The reason we don't extend this to
-     predicate modes is that there are no predicate structure modes
-     nor any specific instructions for extracting part of a predicate
-     register.  */
-  if (aarch64_vector_data_mode_p (mode1)
-      && aarch64_vector_data_mode_p (mode2))
-    return true;
+        In the worst case, this forces a register to be spilled in
+        one mode and reloaded in the other, which handles the
+        endianness correctly.  */
+      if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
+       return false;
+    }
+  return true;
+}
 
-  /* Also allow any scalar modes with vectors.  */
-  if (aarch64_vector_mode_supported_p (mode1)
-      || aarch64_vector_mode_supported_p (mode2))
-    return true;
+/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always defer
+   to aarch64_modes_compatible_p.  However due to issues with register
+   allocation it is preferable to avoid tieing integer scalar and FP
+   scalar modes.  Executing integer operations in general registers is
+   better than treating them as scalar vector operations.  This reduces
+   latency and avoids redundant int<->FP moves.  So tie modes if they
+   are either the same class, or one of them is a vector mode.  */
 
+static bool
+aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
+{
+  if (aarch64_modes_compatible_p (mode1, mode2))
+    {
+      if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
+       return true;
+      if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
+       return true;
+    }
   return false;
 }
 
@@ -27294,80 +27372,7 @@ static bool
 aarch64_can_change_mode_class (machine_mode from,
                               machine_mode to, reg_class_t)
 {
-  unsigned int from_flags = aarch64_classify_vector_mode (from);
-  unsigned int to_flags = aarch64_classify_vector_mode (to);
-
-  bool from_sve_p = (from_flags & VEC_ANY_SVE);
-  bool to_sve_p = (to_flags & VEC_ANY_SVE);
-
-  bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
-  bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
-
-  bool from_pred_p = (from_flags & VEC_SVE_PRED);
-  bool to_pred_p = (to_flags & VEC_SVE_PRED);
-
-  bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
-                                                  | VEC_PARTIAL));
-  bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
-                                                  | VEC_PARTIAL));
-
-  /* Don't allow changes between predicate modes and other modes.
-     Only predicate registers can hold predicate modes and only
-     non-predicate registers can hold non-predicate modes, so any
-     attempt to mix them would require a round trip through memory.  */
-  if (from_pred_p != to_pred_p)
-    return false;
-
-  /* Don't allow changes between partial SVE modes and other modes.
-     The contents of partial SVE modes are distributed evenly across
-     the register, whereas GCC expects them to be clustered together.  */
-  if (from_partial_sve_p != to_partial_sve_p)
-    return false;
-
-  /* Similarly reject changes between partial SVE modes that have
-     different patterns of significant and insignificant bits.  */
-  if (from_partial_sve_p
-      && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
-         || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
-    return false;
-
-  /* Don't allow changes between partial and other registers only if
-     one is a normal SIMD register, allow only if not larger than 64-bit.  */
-  if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
-      && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (to), 8)))
-    return false;
-
-  if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
-    {
-      /* Don't allow changes between SVE modes and other modes that might
-        be bigger than 128 bits.  In particular, OImode, CImode and XImode
-        divide into 128-bit quantities while SVE modes divide into
-        BITS_PER_SVE_VECTOR quantities.  */
-      if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
-       return false;
-      if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
-       return false;
-    }
-
-  if (BYTES_BIG_ENDIAN)
-    {
-      /* Don't allow changes between SVE data modes and non-SVE modes.
-        See the comment at the head of aarch64-sve.md for details.  */
-      if (from_sve_p != to_sve_p)
-       return false;
-
-      /* Don't allow changes in element size: lane 0 of the new vector
-        would not then be lane 0 of the old vector.  See the comment
-        above aarch64_maybe_expand_sve_subreg_move for a more detailed
-        description.
-
-        In the worst case, this forces a register to be spilled in
-        one mode and reloaded in the other, which handles the
-        endianness correctly.  */
-      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
-       return false;
-    }
-  return true;
+  return aarch64_modes_compatible_p (from, to);
 }
 
 /* Implement TARGET_EARLY_REMAT_MODES.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr112105.c b/gcc/testsuite/gcc.target/aarch64/pr112105.c
new file mode 100644
index 00000000000..1368ea3f784
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112105.c
@@ -0,0 +1,31 @@
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+typedef struct {
+  float re;
+  float im;
+} cmplx_f32_t;
+
+void test2x2_f32(const cmplx_f32_t *p_src_a,
+             const cmplx_f32_t *p_src_b,
+             cmplx_f32_t *p_dst) {
+  const float32_t *a_ptr = (const float32_t *)p_src_a;
+  const float32_t *b_ptr = (const float32_t *)p_src_b;
+  float32_t *out_ptr = (float32_t *)p_dst;
+
+  float32x2x2_t a_col[2];
+  float32x2x2_t b[2];
+  float32x2x2_t result[2];
+
+  a_col[0] = vld2_f32(a_ptr);
+  b[0] = vld2_f32(b_ptr);
+
+  result[0].val[0] = vmul_lane_f32(a_col[0].val[0], b[0].val[0], 0);
+  result[0].val[1] = vmul_lane_f32(a_col[0].val[1], b[0].val[0], 0);
+
+  vst2_f32(out_ptr, result[0]);
+  out_ptr = out_ptr + 4;
+}
+
+/* { dg-final { scan-assembler-not {\tdup\t} } } */
+/* { dg-final { scan-assembler-times {(?n)\tfmul\t.*v[0-9]+\.s\[0\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
index 443ce4cca6e..f6d78469aa5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
@@ -908,8 +908,8 @@ SEL2 (union, nonpst3)
 /*
 ** test_nonpst3:
 **     sub     sp, sp, #16
-**     strh    w0, \[sp, #?6\]
-**     ldr     p0, \[sp, #3, mul vl\]
+**     str     w0, \[sp, #?8\]
+**     ldr     p0, \[sp, #4, mul vl\]
 **     add     sp, sp, #?16
 **     ret
 */
-- 
2.25.1

Reply via email to