On Wed, Feb 12, 2025 at 6:58 AM Richard Biener <rguent...@suse.de> wrote:
>
> For the testcase in question, which uses a fold-left vectorized
> reduction of a reverse iterating loop, we'd need two forwprop
> invocations to first bypass the permute emitted for the reverse
> iterating loop and then to decompose the vector load that only
> feeds element extracts.  The following moves the first transform
> to a match.pd pattern and makes sure we fold the element extracts
> when the vectorizer emits them, so that the single forwprop pass can
> then pick up the vector load decomposition, avoiding the forwarding
> failure this otherwise causes.
>
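> To illustrate (a hand-written sketch with made-up SSA names, not
> copied from the actual dumps), the reverse iterating loop leaves
> GIMPLE along these lines for the final sum:
>
>   vect_1 = MEM <vector(4) double> [(double *)&r];
>   vect_2 = VEC_PERM_EXPR <vect_1, vect_1, { 3, 2, 1, 0 }>;
>   _3 = BIT_FIELD_REF <vect_2, 64, 0>;
>   _4 = BIT_FIELD_REF <vect_2, 64, 64>;
>   ...
>
> The first transform rewrites each extract to read from vect_1
> directly, for example _3 = BIT_FIELD_REF <vect_1, 64, 192>, leaving
> the VEC_PERM_EXPR dead; the second then replaces the vector load
> feeding only such extracts by scalar loads.
>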
> Moving simplify_bitfield_ref to match.pd also lets forwprop remove
> the now-dead VEC_PERM_EXPR via the simple DCE it runs - that
> removal was previously missing.
>
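> As a worked example of the multi-element case the pattern retains
> (assuming V4SI operands, so nelts is 4, matching the example in the
> comment of the old code):
>
>   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
>   _11 = BIT_FIELD_REF <_7, 64, 0>;
>
> Here nelts_op is 2, the selected lanes 2 and 3 come from the same
> input, form a contiguous run and start on a multiple of nelts_op,
> so the extract simplifies to _11 = BIT_FIELD_REF <_1, 64, 64>.
>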
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

LGTM

>
> Thanks,
> Richard.
>
>         PR tree-optimization/90579
>         * tree-ssa-forwprop.cc (simplify_bitfield_ref): Move to
>         match.pd.
>         (pass_forwprop::execute): Adjust.
>         * match.pd (bit_field_ref (vec_perm ...)): New pattern
>         modeled after simplify_bitfield_ref.
>         * tree-vect-loop.cc (vect_expand_fold_left): Fold the
>         element extract stmt, combining it with the vector def.
>
>         * gcc.target/i386/pr90579.c: New testcase.
> ---
>  gcc/match.pd                            |  56 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr90579.c |  23 ++++++
>  gcc/tree-ssa-forwprop.cc                | 103 +-----------------------
>  gcc/tree-vect-loop.cc                   |   5 ++
>  4 files changed, 85 insertions(+), 102 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr90579.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 20b2aec6f37..ea44201f2eb 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9538,6 +9538,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / const_k)->value; }
>                        @1 { bitsize_int ((idx % const_k) * width); })))))))))
>
> +(simplify
> + (BIT_FIELD_REF (vec_perm@0 @1 @2 VECTOR_CST@3) @rsize @rpos)
> + (with
> +  {
> +    tree elem_type = TREE_TYPE (TREE_TYPE (@0));
> +    poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
> +    poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
> +    unsigned HOST_WIDE_INT nelts, idx;
> +    unsigned HOST_WIDE_INT nelts_op = 0;
> +  }
> +  (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), elem_size, &idx)
> +       && VECTOR_CST_NELTS (@3).is_constant (&nelts)
> +       && (known_eq (size, elem_size)
> +          || (constant_multiple_p (size, elem_size, &nelts_op)
> +              && pow2p_hwi (nelts_op))))
> +   (with
> +    {
> +      bool ok = true;
> +      /* One element.  */
> +      if (known_eq (size, elem_size))
> +        idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (@3, idx)) % (2 * nelts);
> +      else
> +        {
> +         /* Clamp vec_perm_expr index.  */
> +         unsigned start
> +           = TREE_INT_CST_LOW (vector_cst_elt (@3, idx)) % (2 * nelts);
> +         unsigned end
> +           = (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + nelts_op - 1))
> +              % (2 * nelts));
> +         /* Be in the same vector.  */
> +         if ((start < nelts) != (end < nelts))
> +           ok = false;
> +         else
> +           for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> +             {
> +               /* Continuous area.  */
> +               if ((TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i))
> +                    % (2 * nelts) - 1)
> +                   != (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i - 1))
> +                       % (2 * nelts)))
> +                 {
> +                   ok = false;
> +                   break;
> +                 }
> +             }
> +         /* Alignment not worse than before.  */
> +         if (start % nelts_op)
> +           ok = false;
> +         idx = start;
> +       }
> +    }
> +    (if (ok)
> +     (if (idx < nelts)
> +      (BIT_FIELD_REF @1 @rsize { bitsize_int (idx * elem_size); })
> +      (BIT_FIELD_REF @2 @rsize { bitsize_int ((idx - nelts) * elem_size); })))))))
> +
>  /* Simplify a bit extraction from a bit insertion for the cases with
>     the inserted element fully covering the extraction or the insertion
>     not touching the extraction.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
> new file mode 100644
> index 00000000000..ab48a44063c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr90579.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mfpmath=sse" } */
> +
> +extern double r[6];
> +extern double a[];
> +
> +double
> +loop (int k, double x)
> +{
> +  int i;
> +  double t=0;
> +  for (i=0;i<6;i++)
> +    r[i] = x * a[i + k];
> +  for (i=0;i<6;i++)
> +    t+=r[5-i];
> +  return t;
> +}
> +
> +/* Verify we end up with scalar loads from r for the final sum.  */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+40" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+32" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+24" } } */
> +/* { dg-final { scan-assembler "vaddsd\tr\\\+16" } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 9474682152a..fafc4d6b77a 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>
>  static bool forward_propagate_addr_expr (tree, tree, bool);
> +static void optimize_vector_load (gimple_stmt_iterator *);
>
>  /* Set to true if we delete dead edges during the optimization.  */
>  static bool cfg_changed;
> @@ -2481,106 +2482,6 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
>  }
>
>
> -/* Combine an element access with a shuffle.  Returns true if there were
> -   any changes made, else it returns false.  */
> -
> -static bool
> -simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> -{
> -  gimple *stmt = gsi_stmt (*gsi);
> -  gimple *def_stmt;
> -  tree op, op0, op1;
> -  tree elem_type, type;
> -  tree p, m, tem;
> -  unsigned HOST_WIDE_INT nelts, idx;
> -  poly_uint64 size, elem_size;
> -  enum tree_code code;
> -
> -  op = gimple_assign_rhs1 (stmt);
> -  gcc_checking_assert (TREE_CODE (op) == BIT_FIELD_REF);
> -
> -  op0 = TREE_OPERAND (op, 0);
> -  if (TREE_CODE (op0) != SSA_NAME
> -      || TREE_CODE (TREE_TYPE (op0)) != VECTOR_TYPE)
> -    return false;
> -
> -  def_stmt = get_prop_source_stmt (op0, false, NULL);
> -  if (!def_stmt || !can_propagate_from (def_stmt))
> -    return false;
> -
> -  op1 = TREE_OPERAND (op, 1);
> -  code = gimple_assign_rhs_code (def_stmt);
> -  elem_type = TREE_TYPE (TREE_TYPE (op0));
> -  type = TREE_TYPE (op);
> -  /* Also handle vector type.
> -     .i.e.
> -     _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
> -     _11 = BIT_FIELD_REF <_7, 64, 0>;
> -
> -     to
> -
> -     _11 = BIT_FIELD_REF <_1, 64, 64>.  */
> -
> -  size = tree_to_poly_uint64 (TYPE_SIZE (type));
> -  if (maybe_ne (bit_field_size (op), size))
> -    return false;
> -
> -  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
> -  if (code != VEC_PERM_EXPR
> -      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
> -    return false;
> -
> -  m = gimple_assign_rhs3 (def_stmt);
> -  if (TREE_CODE (m) != VECTOR_CST
> -      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> -    return false;
> -
> -  /* One element.  */
> -  if (known_eq (size, elem_size))
> -    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx)) % (2 * nelts);
> -  else
> -    {
> -      unsigned HOST_WIDE_INT nelts_op;
> -      if (!constant_multiple_p (size, elem_size, &nelts_op)
> -         || !pow2p_hwi (nelts_op))
> -       return false;
> -      /* Clamp vec_perm_expr index.  */
> -      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx)) % (2 * nelts);
> -      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1))
> -                    % (2 * nelts);
> -      /* Be in the same vector.  */
> -      if ((start < nelts) != (end < nelts))
> -       return false;
> -      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> -       {
> -         /* Continuous area.  */
> -         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) % (2 * nelts) - 1
> -             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1))
> -                % (2 * nelts))
> -           return false;
> -       }
> -      /* Alignment not worse than before.  */
> -      if (start % nelts_op)
> -       return false;
> -      idx = start;
> -    }
> -
> -  if (idx < nelts)
> -    p = gimple_assign_rhs1 (def_stmt);
> -  else
> -    {
> -      p = gimple_assign_rhs2 (def_stmt);
> -      idx -= nelts;
> -    }
> -
> -  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> -               p, op1, bitsize_int (idx * elem_size));
> -  gimple_assign_set_rhs1 (stmt, tem);
> -  fold_stmt (gsi);
> -  update_stmt (gsi_stmt (*gsi));
> -  return true;
> -}
> -
>  /* Determine whether applying the 2 permutations (mask1 then mask2)
>     gives back one of the input.  */
>
> @@ -4677,8 +4578,6 @@ pass_forwprop::execute (function *fun)
>                           cfg_changed = true;
>                         changed = did_something != 0;
>                       }
> -                   else if (code == BIT_FIELD_REF)
> -                     changed = simplify_bitfield_ref (&gsi);
>                     else if (code == CONSTRUCTOR
>                              && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
>                       changed = simplify_vector_constructor (&gsi);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index eea0b89db69..07b19a22d42 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7086,6 +7086,11 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
>        rhs = make_ssa_name (scalar_dest, stmt);
>        gimple_assign_set_lhs (stmt, rhs);
>        gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
> +      /* Fold the vector extract, combining it with a previous reversal
> +        as seen in PR90579.  */
> +      auto gsi2 = gsi_for_stmt (stmt);
> +      if (fold_stmt (&gsi2, follow_all_ssa_edges))
> +       update_stmt (gsi_stmt (gsi2));
>
>        stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
>        tree new_name = make_ssa_name (scalar_dest, stmt);
> --
> 2.43.0
