https://gcc.gnu.org/g:27653070db35216d5115cc25672fcc6a51203d26

commit r15-7520-g27653070db35216d5115cc25672fcc6a51203d26
Author: Richard Biener <rguent...@suse.de>
Date:   Wed Feb 12 14:18:06 2025 +0100

    tree-optimization/90579 - avoid STLF fail by better optimizing
    
    For the testcase in question, which uses a fold-left vectorized
    reduction of a reverse-iterating loop, we'd need two forwprop
    invocations: first to bypass the permute emitted for the
    reverse-iterating loop, and then to decompose the vector load that
    only feeds element extracts.  The following moves the first
    transform to a match.pd pattern and makes sure we fold the element
    extracts when the vectorizer emits them, so the single forwprop
    pass can then pick up the vector load decomposition, avoiding the
    store-to-load forwarding fail this would otherwise cause.
    
    Moving simplify_bitfield_ref also makes forwprop remove the dead
    VEC_PERM_EXPR via the simple-dce it uses - this was also
    previously missing.
    
            PR tree-optimization/90579
            * tree-ssa-forwprop.cc (simplify_bitfield_ref): Move to
            match.pd.
            (pass_forwprop::execute): Adjust.
            * match.pd (bit_field_ref (vec_perm ...)): New pattern
            modeled after simplify_bitfield_ref.
            * tree-vect-loop.cc (vect_expand_fold_left): Fold the
            element extract stmt, combining it with the vector def.
    
            * gcc.target/i386/pr90579.c: New testcase.

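To illustrate the first transform, here is a minimal GIMPLE sketch of the
fold that now lives in match.pd, taken from the comment of the removed
simplify_bitfield_ref (the SSA names are illustrative): an element or
sub-vector extract from a VEC_PERM_EXPR is rewritten into an extract from
the permute's input, leaving the permute dead:

    _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
    _11 = BIT_FIELD_REF <_7, 64, 0>;

    becomes

    _11 = BIT_FIELD_REF <_1, 64, 64>;
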
Diff:
---
 gcc/match.pd                            |  56 +++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr90579.c |  23 +++++++
 gcc/tree-ssa-forwprop.cc                | 103 +-------------------------------
 gcc/tree-vect-loop.cc                   |   5 ++
 4 files changed, 85 insertions(+), 102 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 97e0bafdda4b..5c679848bdf2 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9531,6 +9531,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / const_k)->value; }
                       @1 { bitsize_int ((idx % const_k) * width); })))))))))
 
+(simplify
+ (BIT_FIELD_REF (vec_perm@0 @1 @2 VECTOR_CST@3) @rsize @rpos)
+ (with
+  {
+    tree elem_type = TREE_TYPE (TREE_TYPE (@0));
+    poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
+    poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
+    unsigned HOST_WIDE_INT nelts, idx;
+    unsigned HOST_WIDE_INT nelts_op = 0;
+  }
+  (if (constant_multiple_p (tree_to_poly_uint64 (@rpos), elem_size, &idx)
+       && VECTOR_CST_NELTS (@3).is_constant (&nelts)
+       && (known_eq (size, elem_size)
+          || (constant_multiple_p (size, elem_size, &nelts_op)
+              && pow2p_hwi (nelts_op))))
+   (with
+    {
+      bool ok = true;
+      /* One element.  */
+      if (known_eq (size, elem_size))
+        idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (@3, idx)) % (2 * nelts);
+      else
+        {
+         /* Clamp vec_perm_expr index.  */
+         unsigned start
+           = TREE_INT_CST_LOW (vector_cst_elt (@3, idx)) % (2 * nelts);
+         unsigned end
+           = (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + nelts_op - 1))
+              % (2 * nelts));
+         /* Be in the same vector.  */
+         if ((start < nelts) != (end < nelts))
+           ok = false;
+         else
+           for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
+             {
+               /* Continuous area.  */
+               if ((TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i))
+                    % (2 * nelts) - 1)
+                   != (TREE_INT_CST_LOW (vector_cst_elt (@3, idx + i - 1))
+                       % (2 * nelts)))
+                 {
+                   ok = false;
+                   break;
+                 }
+             }
+         /* Alignment not worse than before.  */
+         if (start % nelts_op)
+           ok = false;
+         idx = start;
+       }
+    }
+    (if (ok)
+     (if (idx < nelts)
+      (BIT_FIELD_REF @1 @rsize { bitsize_int (idx * elem_size); })
+      (BIT_FIELD_REF @2 @rsize { bitsize_int ((idx - nelts) * elem_size); })))))))
+
 /* Simplify a bit extraction from a bit insertion for the cases with
    the inserted element fully covering the extraction or the insertion
    not touching the extraction.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr90579.c b/gcc/testsuite/gcc.target/i386/pr90579.c
new file mode 100644
index 000000000000..ab48a44063c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr90579.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mfpmath=sse" } */
+
+extern double r[6];
+extern double a[];
+
+double
+loop (int k, double x)
+{
+  int i;
+  double t=0;
+  for (i=0;i<6;i++)
+    r[i] = x * a[i + k];
+  for (i=0;i<6;i++)
+    t+=r[5-i];
+  return t;
+}
+
+/* Verify we end up with scalar loads from r for the final sum.  */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+40" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+32" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+24" } } */
+/* { dg-final { scan-assembler "vaddsd\tr\\\+16" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 9474682152a6..fafc4d6b77ab 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
 typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
 
 static bool forward_propagate_addr_expr (tree, tree, bool);
+static void optimize_vector_load (gimple_stmt_iterator *);
 
 /* Set to true if we delete dead edges during the optimization.  */
 static bool cfg_changed;
@@ -2481,106 +2482,6 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
 }
 
 
-/* Combine an element access with a shuffle.  Returns true if there were
-   any changes made, else it returns false.  */
-
-static bool
-simplify_bitfield_ref (gimple_stmt_iterator *gsi)
-{
-  gimple *stmt = gsi_stmt (*gsi);
-  gimple *def_stmt;
-  tree op, op0, op1;
-  tree elem_type, type;
-  tree p, m, tem;
-  unsigned HOST_WIDE_INT nelts, idx;
-  poly_uint64 size, elem_size;
-  enum tree_code code;
-
-  op = gimple_assign_rhs1 (stmt);
-  gcc_checking_assert (TREE_CODE (op) == BIT_FIELD_REF);
-
-  op0 = TREE_OPERAND (op, 0);
-  if (TREE_CODE (op0) != SSA_NAME
-      || TREE_CODE (TREE_TYPE (op0)) != VECTOR_TYPE)
-    return false;
-
-  def_stmt = get_prop_source_stmt (op0, false, NULL);
-  if (!def_stmt || !can_propagate_from (def_stmt))
-    return false;
-
-  op1 = TREE_OPERAND (op, 1);
-  code = gimple_assign_rhs_code (def_stmt);
-  elem_type = TREE_TYPE (TREE_TYPE (op0));
-  type = TREE_TYPE (op);
-  /* Also handle vector type.
-     .i.e.
-     _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
-     _11 = BIT_FIELD_REF <_7, 64, 0>;
-
-     to
-
-     _11 = BIT_FIELD_REF <_1, 64, 64>.  */
-
-  size = tree_to_poly_uint64 (TYPE_SIZE (type));
-  if (maybe_ne (bit_field_size (op), size))
-    return false;
-
-  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
-  if (code != VEC_PERM_EXPR
-      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
-    return false;
-
-  m = gimple_assign_rhs3 (def_stmt);
-  if (TREE_CODE (m) != VECTOR_CST
-      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
-    return false;
-
-  /* One element.  */
-  if (known_eq (size, elem_size))
-    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx)) % (2 * nelts);
-  else
-    {
-      unsigned HOST_WIDE_INT nelts_op;
-      if (!constant_multiple_p (size, elem_size, &nelts_op)
-         || !pow2p_hwi (nelts_op))
-       return false;
-      /* Clamp vec_perm_expr index.  */
-      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx)) % (2 * nelts);
-      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1))
-                    % (2 * nelts);
-      /* Be in the same vector.  */
-      if ((start < nelts) != (end < nelts))
-       return false;
-      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
-       {
-         /* Continuous area.  */
-         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) % (2 * nelts) - 1
-             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1))
-                % (2 * nelts))
-           return false;
-       }
-      /* Alignment not worse than before.  */
-      if (start % nelts_op)
-       return false;
-      idx = start;
-    }
-
-  if (idx < nelts)
-    p = gimple_assign_rhs1 (def_stmt);
-  else
-    {
-      p = gimple_assign_rhs2 (def_stmt);
-      idx -= nelts;
-    }
-
-  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
-               p, op1, bitsize_int (idx * elem_size));
-  gimple_assign_set_rhs1 (stmt, tem);
-  fold_stmt (gsi);
-  update_stmt (gsi_stmt (*gsi));
-  return true;
-}
-
 /* Determine whether applying the 2 permutations (mask1 then mask2)
    gives back one of the input.  */
 
@@ -4677,8 +4578,6 @@ pass_forwprop::execute (function *fun)
                          cfg_changed = true;
                        changed = did_something != 0;
                      }
-                   else if (code == BIT_FIELD_REF)
-                     changed = simplify_bitfield_ref (&gsi);
                    else if (code == CONSTRUCTOR
                             && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
                      changed = simplify_vector_constructor (&gsi);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index eea0b89db69e..07b19a22d42c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7086,6 +7086,11 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
       rhs = make_ssa_name (scalar_dest, stmt);
       gimple_assign_set_lhs (stmt, rhs);
       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+      /* Fold the vector extract, combining it with a previous reversal
+        like seen in PR90579.  */
+      auto gsi2 = gsi_for_stmt (stmt);
+      if (fold_stmt (&gsi2, follow_all_ssa_edges))
+       update_stmt (gsi_stmt  (gsi2));
 
       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
       tree new_name = make_ssa_name (scalar_dest, stmt);
