On Tue, 4 Nov 2025, Artemiy Volkov wrote:

> This is an attempt to fix
> https://gcc.gnu.org/pipermail/gcc-patches/2025-October/697879.html in the
> middle-end; the motivation in that patch was to teach gcc to compile:
> 
> int16x8_t foo (int16x8_t x)
> {
>   return vcombine_s16 (vget_high_s16 (x), vget_low_s16 (x));
> }
> 
> into one instruction:
> 
> foo:
>         ext     v0.16b, v0.16b, v0.16b, #8
>         ret
> 
> rather than the two we are generating now:
> 
> foo:
>         dup     d31, v0.d[1]
>         uzp1    v0.2d, v31.2d, v0.2d
>         ret
> 
> Instead of adding a define_insn in the backend, this patch relaxes the
> precondition of tree-ssa-forwprop.cc:simplify_vector_constructor () to
> accept subvectors as constructor elements.  During initial argument
> processing (ll. 3817-3916), subvectors are decomposed into individual
> elements before populating the ELTS array; this allows the rest of the
> function to remain unchanged.  Special handling is also implemented for
> constant and splat subvector elements of a constructor (the latter with
> the use of ssa_uniform_vector_p () from tree-vect-generic.cc).
> 
> Add GIMPLE tests to gcc.dg/tree-ssa demonstrating the intended behavior
> with various combinations of subvectors as constructor arguments,
> including constant and splat subvectors; also add some aarch64-specific
> tests to show that the change leads to us picking the "ext" instruction
> for the resulting VEC_PERM_EXPR.
> 
> Bootstrapped and regtested on aarch64 and x86_64, regtested on aarch64_be.

Please move ssa_uniform_vector_p from tree-vect-generic.cc to
tree.cc after the existing uniform_vector_p.

OK with that change.
Richard.

> gcc/ChangeLog:
> 
>       * tree-ssa-forwprop.cc (simplify_vector_constructor): Support
>       subvectors as vector constructor elements.
>       * tree-vect-generic.cc (ssa_uniform_vector_p): Make non-static.
>       * tree.h (ssa_uniform_vector_p): Declare it.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/tree-ssa/forwprop-43.c: New test.
>       * gcc.target/aarch64/simd/combine_ext.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c   | 169 ++++++++++++++++++
>  .../gcc.target/aarch64/simd/combine_ext.c     |  46 +++++
>  gcc/tree-ssa-forwprop.cc                      |  53 ++++--
>  gcc/tree-vect-generic.cc                      |   2 +-
>  gcc/tree.h                                    |   4 +
>  5 files changed, 262 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c
> new file mode 100644
> index 00000000000..f0f6170648a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-43.c
> @@ -0,0 +1,169 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> +/* { dg-additional-options "-fgimple" } */
> +
> +#include <stdint.h>
> +
> +typedef int32_t int32x4_t __attribute__((vector_size(16)));
> +typedef int32_t int32x2_t __attribute__((vector_size(8)));
> +typedef int32_t int32x1_t __attribute__((vector_size(4)));
> +
> +int32x4_t __GIMPLE (ssa)
> +foo (int32x4_t x)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = __BIT_FIELD_REF <int32x2_t> (x, 64, 0);
> +  _6 = _Literal (int32x4_t) { _1, _2 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo2 (int32x4_t x)
> +{
> +  int32x1_t _1;
> +  int32x1_t _2;
> +  int32x1_t _3;
> +  int32x1_t _4;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
> +  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
> +  _3 = __BIT_FIELD_REF <int32x1_t> (x, 32, 0);
> +  _4 = __BIT_FIELD_REF <int32x1_t> (x, 32, 32);
> +  _6 = _Literal (int32x4_t) { _1, _2, _3, _4 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo3 (int32x4_t x, int32x4_t y)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = __BIT_FIELD_REF <int32x2_t> (y, 64, 0);
> +  _6 = _Literal (int32x4_t) { _1, _2 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo4 (int32x4_t x, int32x4_t y)
> +{
> +  int32x1_t _1;
> +  int32x1_t _2;
> +  int32x1_t _3;
> +  int32x1_t _4;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
> +  _2 = __BIT_FIELD_REF <int32x1_t> (y, 32, 96);
> +  _3 = __BIT_FIELD_REF <int32x1_t> (x, 32, 0);
> +  _4 = __BIT_FIELD_REF <int32x1_t> (y, 32, 32);
> +  _6 = _Literal (int32x4_t) { _1, _2, _3, _4 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo5 (int32x4_t x)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = _Literal (int32x2_t) { 1, 2 };
> +  _6 = _Literal (int32x4_t) { _1, _2 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo6 (int32x4_t x, int32_t y)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = _Literal (int32x2_t) { y, y };
> +  _6 = _Literal (int32x4_t) { _1, _2 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo7 (int32x4_t x)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = _Literal (int32x2_t) { 1, 2 };
> +  _6 = _Literal (int32x4_t) { _2, _1 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo8 (int32x4_t x, int32_t y)
> +{
> +  int32x2_t _1;
> +  int32x2_t _2;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x2_t> (x, 64, 64);
> +  _2 = _Literal (int32x2_t) { y, y };
> +  _6 = _Literal (int32x4_t) { _2, _1 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo9 (int32x4_t x)
> +{
> +  int32x1_t _1;
> +  int32x1_t _2;
> +  int32x1_t _3;
> +  int32x1_t _4;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
> +  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
> +  _3 = _Literal (int32x1_t) { 1 };
> +  _4 = _Literal (int32x1_t) { 1 };
> +  _6 = _Literal (int32x4_t) { _3, _4, _1, _2 };
> +  return _6;
> +}
> +
> +int32x4_t __GIMPLE (ssa)
> +foo10 (int32x4_t x, int32_t y)
> +{
> +  int32x1_t _1;
> +  int32x1_t _2;
> +  int32x1_t _3;
> +  int32x1_t _4;
> +  int32x4_t _6;
> +
> +__BB(2):
> +  _1 = __BIT_FIELD_REF <int32x1_t> (x, 32, 96);
> +  _2 = __BIT_FIELD_REF <int32x1_t> (x, 32, 64);
> +  _3 = _Literal (int32x1_t) { y };
> +  _4 = _Literal (int32x1_t) { y };
> +  _6 = _Literal (int32x4_t) { _3, _4, _1, _2 };
> +
> +  return _6;
> +}
> +
> +
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 10 "forwprop1" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
> new file mode 100644
> index 00000000000..f10a2c6ff24
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/combine_ext.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-optimized" } */
> +
> +#include <arm_neon.h>
> +
> +#ifndef TEST_COMBINE_HIGH_LOW_1
> +#define TEST_COMBINE_HIGH_LOW_1(TYPE, SUFF)                          \
> +  TYPE rev_##TYPE##_1 (TYPE x)                                         \
> +  {                                                                  \
> +    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (x)); \
> +  }
> +#endif
> +
> +#ifndef TEST_COMBINE_HIGH_LOW_2
> +#define TEST_COMBINE_HIGH_LOW_2(TYPE, SUFF)                          \
> +  TYPE rev_##TYPE##_2 (TYPE x, TYPE y)                                 \
> +  {                                                                  \
> +    return vcombine_##SUFF (vget_high_##SUFF (x), vget_low_##SUFF (y)); \
> +  }
> +#endif
> +
> +TEST_COMBINE_HIGH_LOW_1 (int8x16_t, s8)
> +TEST_COMBINE_HIGH_LOW_1 (int16x8_t, s16)
> +TEST_COMBINE_HIGH_LOW_1 (int32x4_t, s32)
> +TEST_COMBINE_HIGH_LOW_1 (int64x2_t, s64)
> +TEST_COMBINE_HIGH_LOW_1 (uint8x16_t, u8)
> +TEST_COMBINE_HIGH_LOW_1 (uint16x8_t, u16)
> +TEST_COMBINE_HIGH_LOW_1 (uint32x4_t, u32)
> +TEST_COMBINE_HIGH_LOW_1 (uint64x2_t, u64)
> +TEST_COMBINE_HIGH_LOW_1 (float16x8_t, f16)
> +TEST_COMBINE_HIGH_LOW_1 (float32x4_t, f32)
> +
> +TEST_COMBINE_HIGH_LOW_2 (int8x16_t, s8)
> +TEST_COMBINE_HIGH_LOW_2 (int16x8_t, s16)
> +TEST_COMBINE_HIGH_LOW_2 (int32x4_t, s32)
> +TEST_COMBINE_HIGH_LOW_2 (int64x2_t, s64)
> +TEST_COMBINE_HIGH_LOW_2 (uint8x16_t, u8)
> +TEST_COMBINE_HIGH_LOW_2 (uint16x8_t, u16)
> +TEST_COMBINE_HIGH_LOW_2 (uint32x4_t, u32)
> +TEST_COMBINE_HIGH_LOW_2 (uint64x2_t, u64)
> +TEST_COMBINE_HIGH_LOW_2 (float16x8_t, f16)
> +TEST_COMBINE_HIGH_LOW_2 (float32x4_t, f32)
> +
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 20 "optimized" } } */
> +/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v0.16b, #8} 10 } } */
> +/* { dg-final { scan-assembler-times {ext\tv0.16b, v0.16b, v1.16b, #8} 10 } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 9a993ab04de..d99563f83a9 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3809,13 +3809,16 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    bool maybe_blend[2] = { true, true };
>    tree one_constant = NULL_TREE;
>    tree one_nonconstant = NULL_TREE;
> +  tree subelt;
>    auto_vec<tree> constants;
>    constants.safe_grow_cleared (nelts, true);
>    auto_vec<std::pair<unsigned, unsigned>, 64> elts;
> +  unsigned int tsubelts = 0;
>    FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt)
>      {
>        tree ref, op1;
> -      unsigned int elem;
> +      unsigned int elem, src_elem_size;
> +      unsigned HOST_WIDE_INT nsubelts = 1;
>  
>        if (i >= nelts)
>       return false;
> @@ -3826,10 +3829,16 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>        if (op1
>         && TREE_CODE ((ref = TREE_OPERAND (op1, 0))) == SSA_NAME
>         && VECTOR_TYPE_P (TREE_TYPE (ref))
> -       && useless_type_conversion_p (TREE_TYPE (op1),
> +       && (useless_type_conversion_p (TREE_TYPE (op1),
>                                       TREE_TYPE (TREE_TYPE (ref)))
> -       && constant_multiple_p (bit_field_offset (op1),
> -                               bit_field_size (op1), &elem)
> +           || (VECTOR_TYPE_P (TREE_TYPE (op1))
> +               && useless_type_conversion_p (TREE_TYPE (TREE_TYPE (op1)),
> +                                             TREE_TYPE (TREE_TYPE (ref)))
> +               && TYPE_VECTOR_SUBPARTS (TREE_TYPE (op1))
> +                     .is_constant (&nsubelts)))
> +       && constant_multiple_p (bit_field_size (op1), nsubelts,
> +                               &src_elem_size)
> +       && constant_multiple_p (bit_field_offset (op1), src_elem_size, &elem)
>         && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts))
>       {
>         unsigned int j;
> @@ -3853,7 +3862,9 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>               maybe_ident = false;
>             if (elem != i)
>               maybe_blend[j] = false;
> -           elts.safe_push (std::make_pair (j, elem));
> +           for (unsigned int k = 0; k < nsubelts; ++k)
> +             elts.safe_push (std::make_pair (j, elem + k));
> +           tsubelts += nsubelts;
>             continue;
>           }
>         /* Else fallthru.  */
> @@ -3865,27 +3876,47 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>         && orig[1] != error_mark_node)
>       return false;
>        orig[1] = error_mark_node;
> +      if (VECTOR_TYPE_P (TREE_TYPE (elt->value))
> +       && !TYPE_VECTOR_SUBPARTS (TREE_TYPE (elt->value))
> +                     .is_constant (&nsubelts))
> +     return false;
>        if (CONSTANT_CLASS_P (elt->value))
>       {
>         if (one_nonconstant)
>           return false;
>         if (!one_constant)
> -         one_constant = elt->value;
> -       constants[i] = elt->value;
> +         one_constant = TREE_CODE (elt->value) == VECTOR_CST
> +                        ? VECTOR_CST_ELT (elt->value, 0)
> +                        : elt->value;
> +       if (TREE_CODE (elt->value) == VECTOR_CST)
> +         {
> +           for (unsigned int k = 0; k < nsubelts; k++)
> +             constants[tsubelts + k] = VECTOR_CST_ELT (elt->value, k);
> +         }
> +       else
> +         constants[tsubelts] = elt->value;
>       }
>        else
>       {
>         if (one_constant)
>           return false;
> +       subelt = VECTOR_TYPE_P (TREE_TYPE (elt->value))
> +                ? ssa_uniform_vector_p (elt->value)
> +                : elt->value;
> +       if (!subelt)
> +         return false;
>         if (!one_nonconstant)
> -         one_nonconstant = elt->value;
> -       else if (!operand_equal_p (one_nonconstant, elt->value, 0))
> +         one_nonconstant = subelt;
> +       else if (!operand_equal_p (one_nonconstant, subelt, 0))
>           return false;
>       }
> -      elts.safe_push (std::make_pair (1, i));
> +      for (unsigned int k = 0; k < nsubelts; ++k)
> +     elts.safe_push (std::make_pair (1, tsubelts + k));
> +      tsubelts += nsubelts;
>        maybe_ident = false;
>      }
> -  if (i < nelts)
> +
> +  if (elts.length () < nelts)
>      return false;
>  
>    if (! orig[0]
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index b8e6a7168ff..9dcd2705a84 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1621,7 +1621,7 @@ lower_vec_perm (gimple_stmt_iterator *gsi)
>  
>  /* If OP is a uniform vector return the element it is a splat from.  */
>  
> -static tree
> +tree
>  ssa_uniform_vector_p (tree op)
>  {
>    if (TREE_CODE (op) == VECTOR_CST
> diff --git a/gcc/tree.h b/gcc/tree.h
> index 4a4b8ef7f0a..70a5ece48ef 100644
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -5303,6 +5303,10 @@ extern tree vector_cst_elt (const_tree, unsigned int);
>  
>  extern tree uniform_vector_p (const_tree);
>  
> +/* Same as above, but if VEC is an SSA_NAME, inspect its definition.  */
> +
> +extern tree ssa_uniform_vector_p (tree);
> +
>  /* If the argument is INTEGER_CST, return it.  If the argument is vector
>     with all elements the same INTEGER_CST, return that INTEGER_CST.  Otherwise
>     return NULL_TREE.  */
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to