This patch fixes the dupq_* testsuite failures.  The tests were
introduced with r15-3669-ga92f54f580c3 (which was a nice improvement)
and Pengxuan originally had a follow-on patch to recognise INDEX
constants during vec_init.

I'd originally wanted to solve this a different way, using wildcards
when building a vector and letting vector_builder::finalize find the
best way of filling them in.  I no longer think that's the best
approach though.  Stepped constants are likely to be more expensive
than unstepped constants, so we should first try finding an unstepped
constant that is valid, even if it has a longer representation than
the stepped version.

This patch therefore uses a variant of Pengxuan's idea.

While there, I noticed that the (old) code for finding an unstepped
constant only tried varying one bit at a time.  So for index 0 in a
16-element constant, the code would try picking a constant from indices 8,
4, 2, and then 1.  But since the goal is to create "fewer, larger,
repeating parts", it would be better to iterate over a bit-reversed
increment, so that after trying an XOR with 0 and 8, we try adding 4
to each previous attempt, then 2 to each previous attempt, and so on.
In the previous example this would give 8, 4, 12, 2, 10, 6, 14, ...
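
In case it helps, here's a minimal standalone sketch (not part of the
patch) of that bit-reversed increment, assuming n_elts is a power of two;
for n_elts == 16 it prints 8, 4, 12, 2, 10, 6, 14, 1, ...:

  #include <cstdio>

  int
  main ()
  {
    const unsigned int n_elts = 16;
    unsigned int j = 0;
    /* Count through the nonzero values of j in bit-reversed order.  */
    for (unsigned int count = 1; count < n_elts; ++count)
      {
        /* Add 1 to a bit-reversed counter: flip bits from the top down,
           stopping at the first bit that flips to 1.  */
        for (unsigned int bit = n_elts / 2; bit > 0; bit /= 2)
          {
            j ^= bit;
            if (j & bit)
              break;
          }
        printf ("%u ", j);
      }
    printf ("\n");
    return 0;
  }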

The test shows an example of this for 8 shorts.

I'll push this on Monday evening UTC if there are no comments
before then.

Richard

gcc/
        * config/aarch64/aarch64.cc (aarch64_choose_vector_init_constant):
        New function, split out from...
        (aarch64_expand_vector_init_fallback): ...here.  Use a bit-
        reversed increment to find a constant index.  Add support for
        stepped constants.

gcc/testsuite/
        * gcc.target/aarch64/sve/acle/general/dupq_12.c: New test.
---
 gcc/config/aarch64/aarch64.cc                 | 110 +++++++++++++-----
 .../aarch64/sve/acle/general/dupq_12.c        |  13 +++
 2 files changed, 95 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index be99137b052..16754fa9e7b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24036,6 +24036,85 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* VALS is a PARALLEL rtx that contains element values for a vector of
+   mode MODE.  Return a constant that contains all the CONST_INT and
+   CONST_DOUBLE elements of VALS, using any convenient values for the
+   other elements.  */
+
+static rtx
+aarch64_choose_vector_init_constant (machine_mode mode, rtx vals)
+{
+  unsigned int n_elts = XVECLEN (vals, 0);
+
+  /* We really don't care what goes into the parts we will overwrite, but we're
+     more likely to be able to load the constant efficiently if it has fewer,
+     larger, repeating parts (see aarch64_simd_valid_imm).  */
+  rtvec copy = shallow_copy_rtvec (XVEC (vals, 0));
+  for (unsigned int i = 0; i < n_elts; ++i)
+    {
+      rtx x = RTVEC_ELT (copy, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+       continue;
+      /* This is effectively a bit-reversed increment, e.g.: 8, 4, 12,
+        2, 10, 6, 14, ... for n_elts == 16.  The early break makes the
+        outer "i" loop O(n_elts * log(n_elts)).  */
+      unsigned int j = 0;
+      for (;;)
+       {
+         for (unsigned int bit = n_elts / 2; bit > 0; bit /= 2)
+           {
+             j ^= bit;
+             if (j & bit)
+               break;
+           }
+         rtx test = XVECEXP (vals, 0, i ^ j);
+         if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
+           {
+             RTVEC_ELT (copy, i) = test;
+             break;
+           }
+         gcc_assert (j != 0);
+       }
+    }
+
+  rtx c = gen_rtx_CONST_VECTOR (mode, copy);
+  if (aarch64_simd_valid_mov_imm (c))
+    return c;
+
+  /* Try generating a stepped sequence.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    for (unsigned int i = 0; i < n_elts; ++i)
+      if (CONST_INT_P (XVECEXP (vals, 0, i)))
+       {
+         auto base = UINTVAL (XVECEXP (vals, 0, i));
+         for (unsigned int j = i + 1; j < n_elts; ++j)
+           if (CONST_INT_P (XVECEXP (vals, 0, j)))
+             {
+               /* It doesn't matter whether this division is exact.
+                  All that matters is whether the constant we produce
+                  is valid.  */
+               HOST_WIDE_INT diff = UINTVAL (XVECEXP (vals, 0, j)) - base;
+               unsigned HOST_WIDE_INT step = diff / int (j - i);
+               rtx_vector_builder builder (mode, n_elts, 1);
+               for (unsigned int k = 0; k < n_elts; ++k)
+                 {
+                   rtx x = XVECEXP (vals, 0, k);
+                   if (!CONST_INT_P (x))
+                     x = gen_int_mode (int (k - i) * step + base,
+                                       GET_MODE_INNER (mode));
+                   builder.quick_push (x);
+                 }
+               rtx step_c = builder.build ();
+               if (aarch64_simd_valid_mov_imm (step_c))
+                 return step_c;
+               break;
+             }
+         break;
+       }
+
+  return c;
+}
+
 /* A subroutine of aarch64_expand_vector_init, with the same interface.
    The caller has already tried a divide-and-conquer approach, so do
    not consider that case here.  */
@@ -24049,7 +24128,6 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
   int n_elts = XVECLEN (vals, 0);
   /* The number of vector elements which are not constant.  */
   int n_var = 0;
-  rtx any_const = NULL_RTX;
   /* The first element of vals.  */
   rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
@@ -24075,8 +24153,6 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
       rtx x = XVECEXP (vals, 0, i);
       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
        ++n_var;
-      else
-       any_const = x;
 
       all_same &= rtx_equal_p (x, v0);
     }
@@ -24220,31 +24296,9 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
      can.  */
   if (n_var != n_elts)
     {
-      rtx copy = copy_rtx (vals);
-
-      /* Load constant part of vector.  We really don't care what goes into the
-        parts we will overwrite, but we're more likely to be able to load the
-        constant efficiently if it has fewer, larger, repeating parts
-        (see aarch64_simd_valid_imm).  */
-      for (int i = 0; i < n_elts; i++)
-       {
-         rtx x = XVECEXP (vals, 0, i);
-         if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-           continue;
-         rtx subst = any_const;
-         for (int bit = n_elts / 2; bit > 0; bit /= 2)
-           {
-             /* Look in the copied vector, as more elements are const.  */
-             rtx test = XVECEXP (copy, 0, i ^ bit);
-             if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
-               {
-                 subst = test;
-                 break;
-               }
-           }
-         XVECEXP (copy, 0, i) = subst;
-       }
-      aarch64_expand_vector_init_fallback (target, copy);
+      /* Load the constant part of the vector.  */
+      rtx constant = aarch64_choose_vector_init_constant (mode, vals);
+      emit_move_insn (target, constant);
     }
 
   /* Insert the variable lanes directly.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c
new file mode 100644
index 00000000000..690cb134ad5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#include <arm_sve.h>
+
+svint16_t
+dupq (int x)
+{
+  return svdupq_s16 (x, 0, x, 0, x, 0, 11, 0);
+}
+
+/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4s, #?(?:0xb|11)} } } */
-- 
2.25.1
