https://gcc.gnu.org/g:470411f44f51d9ef85bfcf3a8f9cb25344dd243f

commit r16-5042-g470411f44f51d9ef85bfcf3a8f9cb25344dd243f
Author: Artemiy Volkov <[email protected]>
Date:   Sat Nov 1 17:17:15 2025 +0000

    forwprop: allow subvectors in simplify_vector_constructor ()
    
    This is an attempt to fix
    https://gcc.gnu.org/pipermail/gcc-patches/2025-October/697879.html in the
    middle-end; the motivation in that patch was to teach gcc to compile:
    
    int16x8_t foo (int16x8_t x)
    {
      return vcombine_s16 (vget_high_s16 (x), vget_low_s16 (x));
    }
    
    into one instruction:
    
    foo:
            ext     v0.16b, v0.16b, v0.16b, #8
            ret
    
    rather than the two we are generating now:
    
    foo:
            dup     d31, v0.d[1]
            uzp1    v0.2d, v31.2d, v0.2d
            ret
    
    Instead of adding a define_insn in the backend, this patch relaxes the
    precondition of tree-ssa-forwprop.cc:simplify_vector_constructor () to
    accept subvectors as constructor elements.  During initial argument
    processing (ll. 3817-3916), subvectors are decomposed into individual
    elements before populating the ELTS array; this allows the rest of the
    function to remain unchanged.  Special handling is also implemented for
    constant and splat subvector elements of a constructor (the latter with
    the use of ssa_uniform_vector_p () from tree-vect-generic.cc, which this
    patch moves to tree.cc).
    
    Add GIMPLE tests to gcc.dg/tree-ssa demonstrating the intended behavior
    with various combinations of subvectors as constructor arguments,
    including constant and splat subvectors; also add some aarch64-specific
    tests to show that the change leads to us picking the "ext" instruction
    for the resulting VEC_PERM_EXPR.
    
    Bootstrapped and regtested on aarch64 and x86_64, regtested on aarch64_be.
    
    gcc/ChangeLog:
    
        * tree-ssa-forwprop.cc (simplify_vector_constructor): Support
        subvectors as constructor elements.
            * tree-vect-generic.cc (ssa_uniform_vector_p): Make non-static and
            move ...
            * tree.cc (ssa_uniform_vector_p): ... here.
            * tree.h (ssa_uniform_vector_p): Declare it.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/tree-ssa/forwprop-43.c: New test.
            * gcc.target/aarch64/simd/combine_ext.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c        | 169 +++++++++++++++++++++
 .../gcc.target/aarch64/simd/combine_ext.c          |  46 ++++++
 gcc/tree-ssa-forwprop.cc                           |  53 +++++--
 gcc/tree-vect-generic.cc                           |  18 ---
 gcc/tree.cc                                        |  18 +++
 gcc/tree.h                                         |   4 +
 6 files changed, 279 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c
new file mode 100644
index 000000000000..f0f6170648a3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c
@@ -0,0 +1,169 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-forwprop1" } */
+/* { dg-additional-options "-fgimple" } */
+
+#include <stdint.h>
+
+typedef int32_t int32x4_t __attribute__((vector_size(16)));
+typedef int32_t int32x2_t __attribute__((vector_size(8)));
+typedef int32_t int32x1_t __attribute__((vector_size(4)));
+
+int32x4_t __GIMPLE (ssa)
+foo (int32x4_t x)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = __BIT_FIELD_REF <int32x2_t> (x, 64, 0);
+  _6 = _Literal (int32x4_t) { _1, _2 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo2 (int32x4_t x)
+{
+  int32x1_t _1;
+  int32x1_t _2;
+  int32x1_t _3;
+  int32x1_t _4;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
+  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
+  _3 = __BIT_FIELD_REF <int32x1_t> (x, 32, 0);
+  _4 = __BIT_FIELD_REF <int32x1_t> (x, 32, 32);
+  _6 = _Literal (int32x4_t) { _1, _2, _3, _4 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo3 (int32x4_t x, int32x4_t y)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = __BIT_FIELD_REF <int32x2_t> (y, 64, 0);
+  _6 = _Literal (int32x4_t) { _1, _2 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo4 (int32x4_t x, int32x4_t y)
+{
+  int32x1_t _1;
+  int32x1_t _2;
+  int32x1_t _3;
+  int32x1_t _4;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
+  _2 = __BIT_FIELD_REF <int32x1_t> (y, 32, 96);
+  _3 = __BIT_FIELD_REF <int32x1_t> (x, 32, 0);
+  _4 = __BIT_FIELD_REF <int32x1_t> (y, 32, 32);
+  _6 = _Literal (int32x4_t) { _1, _2, _3, _4 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo5 (int32x4_t x)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = _Literal (int32x2_t) { 1, 2 };
+  _6 = _Literal (int32x4_t) { _1, _2 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo6 (int32x4_t x, int32_t y)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = _Literal (int32x2_t) { y, y };
+  _6 = _Literal (int32x4_t) { _1, _2 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo7 (int32x4_t x)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = _Literal (int32x2_t) { 1, 2 };
+  _6 = _Literal (int32x4_t) { _2, _1 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo8 (int32x4_t x, int32_t y)
+{
+  int32x2_t _1;
+  int32x2_t _2;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
+  _2 = _Literal (int32x2_t) { y, y };
+  _6 = _Literal (int32x4_t) { _2, _1 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo9 (int32x4_t x)
+{
+  int32x1_t _1;
+  int32x1_t _2;
+  int32x1_t _3;
+  int32x1_t _4;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
+  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
+  _3 = _Literal (int32x1_t) { 1 };
+  _4 = _Literal (int32x1_t) { 1 };
+  _6 = _Literal (int32x4_t) { _3, _4, _1, _2 };
+  return _6;
+}
+
+int32x4_t __GIMPLE (ssa)
+foo10 (int32x4_t x, int32_t y)
+{
+  int32x1_t _1;
+  int32x1_t _2;
+  int32x1_t _3;
+  int32x1_t _4;
+  int32x4_t _6;
+
+__BB(2):
+  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
+  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
+  _3 = _Literal (int32x1_t) { y };
+  _4 = _Literal (int32x1_t) { y };
+  _6 = _Literal (int32x4_t) { _3, _4, _1, _2 };
+
+  return _6;
+}
+
+
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 10 "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
new file mode 100644
index 000000000000..f10a2c6ff240
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-optimized" } */
+
+#include <arm_neon.h>
+
+#ifndef TEST_COMBINE_HIGH_LOW_1
+#define TEST_COMBINE_HIGH_LOW_1(TYPE, SUFF)                            \
+  TYPE rev_##TYPE##_1 (TYPE x)                                         \
+  {                                                                    \
+    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (x)); \
+  }
+#endif
+
+#ifndef TEST_COMBINE_HIGH_LOW_2
+#define TEST_COMBINE_HIGH_LOW_2(TYPE, SUFF)                            \
+  TYPE rev_##TYPE##_2 (TYPE x, TYPE y)                                 \
+  {                                                                    \
+    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (y)); \
+  }
+#endif
+
+TEST_COMBINE_HIGH_LOW_1 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_1 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_1 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_1 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_1 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_1 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_1 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_1 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_1 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_1 (float32x4_t, f32)
+
+TEST_COMBINE_HIGH_LOW_2 (int8x16_t, s8)
+TEST_COMBINE_HIGH_LOW_2 (int16x8_t, s16)
+TEST_COMBINE_HIGH_LOW_2 (int32x4_t, s32)
+TEST_COMBINE_HIGH_LOW_2 (int64x2_t, s64)
+TEST_COMBINE_HIGH_LOW_2 (uint8x16_t, u8)
+TEST_COMBINE_HIGH_LOW_2 (uint16x8_t, u16)
+TEST_COMBINE_HIGH_LOW_2 (uint32x4_t, u32)
+TEST_COMBINE_HIGH_LOW_2 (uint64x2_t, u64)
+TEST_COMBINE_HIGH_LOW_2 (float16x8_t, f16)
+TEST_COMBINE_HIGH_LOW_2 (float32x4_t, f32)
+
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 20 "optimized" } } */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v0.16b, #8} 10 } } */
+/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v1.16b, #8} 10 } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 67deecaf0044..ae7f0e770ba2 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -3807,13 +3807,16 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   bool maybe_blend[2] = { true, true };
   tree one_constant = NULL_TREE;
   tree one_nonconstant = NULL_TREE;
+  tree subelt;
   auto_vec<tree> constants;
   constants.safe_grow_cleared (nelts, true);
   auto_vec<std::pair<unsigned, unsigned>, 64> elts;
+  unsigned int tsubelts = 0;
   FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
     {
       tree ref, op1;
-      unsigned int elem;
+      unsigned int elem, src_elem_size;
+      unsigned HOST_WIDE_INT nsubelts = 1;
 
       if (i >= nelts)
        return false;
@@ -3824,10 +3827,16 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
       if (op1
          && TREE_CODE ((ref = TREE_OPERAND (op1, 0))) == SSA_NAME
          && VECTOR_TYPE_P (TREE_TYPE (ref))
-         && useless_type_conversion_p (TREE_TYPE (op1),
+         && (useless_type_conversion_p (TREE_TYPE (op1),
                                        TREE_TYPE (TREE_TYPE (ref)))
-         && constant_multiple_p (bit_field_offset (op1),
-                                 bit_field_size (op1), &elem)
+             || (VECTOR_TYPE_P (TREE_TYPE (op1))
+                 && useless_type_conversion_p (TREE_TYPE (TREE_TYPE (op1)),
+                                               TREE_TYPE (TREE_TYPE (ref)))
+                 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (op1))
+                       .is_constant (&nsubelts)))
+         && constant_multiple_p (bit_field_size (op1), nsubelts,
+                                 &src_elem_size)
+         && constant_multiple_p (bit_field_offset (op1), src_elem_size, &elem)
          && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts))
        {
          unsigned int j;
@@ -3851,7 +3860,9 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
                maybe_ident = false;
              if (elem != i)
                maybe_blend[j] = false;
-             elts.safe_push (std::make_pair (j, elem));
+             for (unsigned int k = 0; k < nsubelts; ++k)
+               elts.safe_push (std::make_pair (j, elem + k));
+             tsubelts += nsubelts;
              continue;
            }
          /* Else fallthru.  */
@@ -3863,27 +3874,47 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
          && orig[1] != error_mark_node)
        return false;
       orig[1] = error_mark_node;
+      if (VECTOR_TYPE_P (TREE_TYPE (elt->value))
+         && !TYPE_VECTOR_SUBPARTS (TREE_TYPE (elt->value))
+                       .is_constant (&nsubelts))
+       return false;
       if (CONSTANT_CLASS_P (elt->value))
        {
          if (one_nonconstant)
            return false;
          if (!one_constant)
-           one_constant = elt->value;
-         constants[i] = elt->value;
+           one_constant = TREE_CODE (elt->value) == VECTOR_CST
+                          ? VECTOR_CST_ELT (elt->value, 0)
+                          : elt->value;
+         if (TREE_CODE (elt->value) == VECTOR_CST)
+           {
+             for (unsigned int k = 0; k < nsubelts; k++)
+               constants[tsubelts + k] = VECTOR_CST_ELT (elt->value, k);
+           }
+         else
+           constants[tsubelts] = elt->value;
        }
       else
        {
          if (one_constant)
            return false;
+         subelt = VECTOR_TYPE_P (TREE_TYPE (elt->value))
+                  ? ssa_uniform_vector_p (elt->value)
+                  : elt->value;
+         if (!subelt)
+           return false;
          if (!one_nonconstant)
-           one_nonconstant = elt->value;
-         else if (!operand_equal_p (one_nonconstant, elt->value, 0))
+           one_nonconstant = subelt;
+         else if (!operand_equal_p (one_nonconstant, subelt, 0))
            return false;
        }
-      elts.safe_push (std::make_pair (1, i));
+      for (unsigned int k = 0; k < nsubelts; ++k)
+       elts.safe_push (std::make_pair (1, tsubelts + k));
+      tsubelts += nsubelts;
       maybe_ident = false;
     }
-  if (i < nelts)
+
+  if (elts.length () < nelts)
     return false;
 
   if (! orig[0]
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index b8e6a7168ff4..29d97cff8156 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1619,24 +1619,6 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
   update_stmt (gsi_stmt (*gsi));
 }
 
-/* If OP is a uniform vector return the element it is a splat from.  */
-
-static tree
-ssa_uniform_vector_p (tree op)
-{
-  if (TREE_CODE (op) == VECTOR_CST
-      || TREE_CODE (op) == VEC_DUPLICATE_EXPR
-      || TREE_CODE (op) == CONSTRUCTOR)
-    return uniform_vector_p (op);
-  if (TREE_CODE (op) == SSA_NAME)
-    {
-      gimple *def_stmt = SSA_NAME_DEF_STMT (op);
-      if (gimple_assign_single_p (def_stmt))
-       return uniform_vector_p (gimple_assign_rhs1 (def_stmt));
-    }
-  return NULL_TREE;
-}
-
 /* Return the type that should be used to implement OP on type TYPE.
    This is TYPE itself if the target can do the operation directly,
    otherwise it is a scalar type or a smaller vector type.  */
diff --git a/gcc/tree.cc b/gcc/tree.cc
index 446261a8a8c8..298784e69605 100644
--- a/gcc/tree.cc
+++ b/gcc/tree.cc
@@ -10823,6 +10823,24 @@ uniform_vector_p (const_tree vec)
   return NULL_TREE;
 }
 
+/* If OP is a uniform vector return the element it is a splat from.  */
+
+tree
+ssa_uniform_vector_p (tree op)
+{
+  if (TREE_CODE (op) == VECTOR_CST
+      || TREE_CODE (op) == VEC_DUPLICATE_EXPR
+      || TREE_CODE (op) == CONSTRUCTOR)
+    return uniform_vector_p (op);
+  if (TREE_CODE (op) == SSA_NAME)
+    {
+      gimple *def_stmt = SSA_NAME_DEF_STMT (op);
+      if (gimple_assign_single_p (def_stmt))
+       return uniform_vector_p (gimple_assign_rhs1 (def_stmt));
+    }
+  return NULL_TREE;
+}
+
 /* If the argument is INTEGER_CST, return it.  If the argument is vector
    with all elements the same INTEGER_CST, return that INTEGER_CST.  Otherwise
    return NULL_TREE.
diff --git a/gcc/tree.h b/gcc/tree.h
index 4a4b8ef7f0a7..70a5ece48ef8 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -5303,6 +5303,10 @@ extern tree vector_cst_elt (const_tree, unsigned int);
 
 extern tree uniform_vector_p (const_tree);
 
+/* Same as above, but if VEC is an SSA_NAME, inspect its definition.  */
+
+extern tree ssa_uniform_vector_p (tree);
+
 /* If the argument is INTEGER_CST, return it.  If the argument is vector
    with all elements the same INTEGER_CST, return that INTEGER_CST.  Otherwise
    return NULL_TREE.  */

Reply via email to