[gcc r12-10503] tree-optimization/111070 - fix ICE with recent ifcombine fix

2024-06-11 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:d73137ab352d654f50b703925bd92e021dce1cab

commit r12-10503-gd73137ab352d654f50b703925bd92e021dce1cab
Author: Richard Biener 
Date:   Mon Aug 21 09:01:00 2023 +0200

tree-optimization/111070 - fix ICE with recent ifcombine fix

We now got test coverage for non-SSA name bits so the following amends
the SSA_NAME_OCCURS_IN_ABNORMAL_PHI checks.

PR tree-optimization/111070
* tree-ssa-ifcombine.cc (ifcombine_ifandif): Check we have
an SSA name before checking SSA_NAME_OCCURS_IN_ABNORMAL_PHI.

* gcc.dg/pr111070.c: New testcase.

(cherry picked from commit 966b0a96523fb7adbf498ac71df5e033c70dc546)

Diff:
---
 gcc/testsuite/gcc.dg/pr111070.c | 20 
 gcc/tree-ssa-ifcombine.cc   |  9 ++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr111070.c b/gcc/testsuite/gcc.dg/pr111070.c
new file mode 100644
index 000..1ebc7adf782
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr111070.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+/* common */
+char c;
+/* arrays must be 8 byte aligned, regardless of size */
+char c_ary[1];
+
+/* data */
+char d = 1;
+char d_ary[1] = {1};
+
+int main ()
+{
+  if (((unsigned long)&c_ary[0] & 7) != 0)
+return 1;
+  if (((unsigned long)&d_ary[0] & 7) != 0)
+return 1;
+  return 0;
+}
diff --git a/gcc/tree-ssa-ifcombine.cc b/gcc/tree-ssa-ifcombine.cc
index b139328af22..dcfa92c0c82 100644
--- a/gcc/tree-ssa-ifcombine.cc
+++ b/gcc/tree-ssa-ifcombine.cc
@@ -415,7 +415,8 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool 
inner_inv,
 {
   tree t, t2;
 
-  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1))
+  if (TREE_CODE (name1) == SSA_NAME
+ && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1))
return false;
 
   /* Do it.  */
@@ -468,8 +469,10 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool 
inner_inv,
   gimple_stmt_iterator gsi;
   tree t;
 
-  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)
- || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2))
+  if ((TREE_CODE (name1) == SSA_NAME
+  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1))
+ || (TREE_CODE (name2) == SSA_NAME
+ && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2)))
return false;
 
   /* Find the common name which is bit-tested.  */


[gcc r15-1163] tree-optimization/115388 - wrong DSE in irreducible regions

2024-06-10 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:818e760528d436ea8f6c28ef620e2bb82d456ea1

commit r15-1163-g818e760528d436ea8f6c28ef620e2bb82d456ea1
Author: Richard Biener 
Date:   Mon Jun 10 11:29:43 2024 +0200

tree-optimization/115388 - wrong DSE in irreducible regions

The following fixes a latent bug in DSE with regard to variant
array accesses where the code avoiding bogus DSE in loops fails to
handle irreducible regions.  For those we need to make sure backedges
are marked and discover a header for the irreducible region to check
invariantness.

PR tree-optimization/115388
* tree-ssa-dse.cc (dse_classify_store): Handle irreducible
regions.
(pass_dse::execute): Make sure to mark backedges.

* gcc.dg/torture/pr115388.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr115388.c | 34 ++
 gcc/tree-ssa-dse.cc | 61 +
 2 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr115388.c 
b/gcc/testsuite/gcc.dg/torture/pr115388.c
new file mode 100644
index 000..c7c902888da
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115388.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+
+int printf(const char *, ...);
+int a[10], b, c, d[0], h, i, j, k, l;
+char e = -1, g;
+volatile int f;
+static void n() {
+  while (e >= 0)
+while (1)
+  ;
+  for (b = 2; b >= 0; b--) {
+for (k = 0; k < 4; k++) {
+  if (e || i)
+continue;
+  for (h = 0; h < 2; h++)
+f;
+}
+for (l = 2; l >= 0; l--)
+  g = 0;
+for (; g < 1; g++)
+  if (c)
+d[l] = 1;
+a[9] = 0;
+a[b] = 1;
+while (j)
+  printf("\n");
+  }
+}
+int main() {
+  n();
+  if (a[1] != 1)
+__builtin_abort();
+  return 0;
+}
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 9252ca34050..63bf4491cf6 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -1018,8 +1018,11 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
  if (defvar == stop_at_vuse)
return DSE_STORE_LIVE;
 
- FOR_EACH_IMM_USE_STMT (use_stmt, ui, defvar)
+ use_operand_p usep;
+ FOR_EACH_IMM_USE_FAST (usep, ui, defvar)
{
+ use_stmt = USE_STMT (usep);
+
  /* Limit stmt walking.  */
  if (++cnt > param_dse_max_alias_queries_per_store)
{
@@ -1031,31 +1034,43 @@ dse_classify_store (ao_ref *ref, gimple *stmt,
 have to be careful with loops and with memory references
 containing operands that are also operands of PHI nodes.
 See gcc.c-torture/execute/20051110-*.c.  */
- if (gimple_code (use_stmt) == GIMPLE_PHI)
+ if (gphi *phi = dyn_cast <gphi *> (use_stmt))
{
  /* Look through single-argument PHIs.  */
- if (gimple_phi_num_args (use_stmt) == 1)
-   worklist.safe_push (gimple_phi_result (use_stmt));
-
- /* If we already visited this PHI ignore it for further
-processing.  */
- else if (!bitmap_bit_p (visited,
- SSA_NAME_VERSION
-   (PHI_RESULT (use_stmt
+ if (gimple_phi_num_args (phi) == 1)
+   worklist.safe_push (gimple_phi_result (phi));
+ else
{
  /* If we visit this PHI by following a backedge then we
 have to make sure ref->ref only refers to SSA names
 that are invariant with respect to the loop
-represented by this PHI node.  */
- if (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt),
- gimple_bb (use_stmt))
- && !for_each_index (ref->ref ? &ref->ref : &ref->base,
- check_name, gimple_bb (use_stmt)))
-   return DSE_STORE_LIVE;
- defs.safe_push (use_stmt);
- if (!first_phi_def)
-   first_phi_def = as_a  (use_stmt);
- last_phi_def = as_a  (use_stmt);
+represented by this PHI node.  We handle irreducible
+regions by relying on backedge marking and identifying
+the head of the (sub-)region.  */
+ edge e = gimple_phi_arg_edge
+(phi, PHI_ARG_INDEX_FROM_USE (usep));
+ if (e->flags & EDGE_DFS_BACK)
+   {
+ basic_block rgn_head
+   = nearest_common_dominator (CDI_DOMINATORS,
+   gimple_bb (phi),
+

[gcc r15-1160] tree-optimization/115395 - wrong-code with SLP reduction in epilog

2024-06-10 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4ed9c5df7efeb98e190573cca42a4fd40666c45f

commit r15-1160-g4ed9c5df7efeb98e190573cca42a4fd40666c45f
Author: Richard Biener 
Date:   Mon Jun 10 10:12:52 2024 +0200

tree-optimization/115395 - wrong-code with SLP reduction in epilog

When we continue a non-SLP reduction from the main loop in the
epilog with a SLP reduction we currently fail to handle an
adjustment by the initial value because that's not a thing with SLP.
As long as we have the possibility to mix SLP and non-SLP we have
to handle it though.

PR tree-optimization/115395
* tree-vect-loop.cc (vect_create_epilog_for_reduction):
Handle STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT also for SLP
reductions of group_size one.

* gcc.dg/vect/pr115395.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115395.c | 27 +++
 gcc/tree-vect-loop.cc| 27 ---
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115395.c 
b/gcc/testsuite/gcc.dg/vect/pr115395.c
new file mode 100644
index 000..cd1cee9f3df
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115395.c
@@ -0,0 +1,27 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+struct {
+  long header_size;
+  long start_offset;
+  long end_offset;
+} myrar_dbo[5] = {{0, 87, 6980}, {0, 7087, 13980}, {0, 14087, 0}};
+
+int i;
+long offset;
+
+int main()
+{
+  check_vect ();
+
+  offset += myrar_dbo[0].start_offset;
+  while (i < 2) {
+i++;
+offset += myrar_dbo[i].start_offset - myrar_dbo[i - 1].end_offset;
+  }
+  if (offset != 301)
+abort();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 028692614bb..c471f1564a7 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6030,25 +6030,14 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
-  if (slp_node)
-{
-  /* Optimize: for induction condition reduction, if we can't use zero
-for induc_val, use initial_def.  */
-  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
-   induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
-  /* ???  Coverage for 'else' isn't clear.  */
-}
+  /* Optimize: for induction condition reduction, if we can't use zero
+ for induc_val, use initial_def.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
+induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
+  else if (double_reduc)
+;
   else
-{
-  /* Optimize: for induction condition reduction, if we can't use zero
- for induc_val, use initial_def.  */
-  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
-   induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
-  else if (double_reduc)
-   ;
-  else
-   adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
-}
+adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
 
   stmt_vec_info single_live_out_stmt[] = { stmt_info };
   array_slice live_out_stmts = single_live_out_stmt;
@@ -6873,7 +6862,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 
   if (adjustment_def)
 {
-  gcc_assert (!slp_reduc);
+  gcc_assert (!slp_reduc || group_size == 1);
   gimple_seq stmts = NULL;
   if (double_reduc)
{


[gcc r15-1126] tree-optimization/115383 - EXTRACT_LAST_REDUCTION with multiple stmt copies

2024-06-10 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c1429e3a8da0cdfe9391e1e9b2c7228d896a3a87

commit r15-1126-gc1429e3a8da0cdfe9391e1e9b2c7228d896a3a87
Author: Richard Biener 
Date:   Fri Jun 7 12:15:31 2024 +0200

tree-optimization/115383 - EXTRACT_LAST_REDUCTION with multiple stmt copies

The EXTRACT_LAST_REDUCTION code isn't ready to deal with multiple stmt
copies but SLP no longer checks for this.  The following adjusts
code generation to handle the situation.

PR tree-optimization/115383
* tree-vect-stmts.cc (vectorizable_condition): Handle
generating a chain of .FOLD_EXTRACT_LAST.

* gcc.dg/vect/pr115383.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115383.c | 20 
 gcc/tree-vect-stmts.cc   | 20 +++-
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115383.c 
b/gcc/testsuite/gcc.dg/vect/pr115383.c
new file mode 100644
index 000..92c24699146
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115383.c
@@ -0,0 +1,20 @@
+#include "tree-vect.h"
+
+int __attribute__((noipa))
+s331 (int i, int n)
+{
+  int j = 0;
+  for (; i < n; i++)
+if ((float)i < 0.)
+  j = i;
+  return j;
+}
+
+int main()
+{
+  check_vect ();
+  int j = s331(-13, 17);
+  if (j != -1)
+abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5098b7fab6a..05a169ecb2d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12415,6 +12415,9 @@ vectorizable_condition (vec_info *vinfo,
   reduction_type != EXTRACT_LAST_REDUCTION
   ? else_clause : NULL, vectype, &vec_oprnds3);
 
+  if (reduction_type == EXTRACT_LAST_REDUCTION)
+vec_else_clause = else_clause;
+
   /* Arguments are ready.  Create the new vector stmt.  */
   FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_cond_lhs)
 {
@@ -12557,17 +12560,24 @@ vectorizable_condition (vec_info *vinfo,
{
  gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
  tree lhs = gimple_get_lhs (old_stmt);
+ if ((unsigned)i != vec_oprnds0.length () - 1)
+   lhs = copy_ssa_name (lhs);
  if (len)
new_stmt = gimple_build_call_internal
-   (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
-vec_then_clause, len, bias);
+   (IFN_LEN_FOLD_EXTRACT_LAST, 5, vec_else_clause, vec_compare,
+vec_then_clause, len, bias);
  else
new_stmt = gimple_build_call_internal
-   (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
-vec_then_clause);
+   (IFN_FOLD_EXTRACT_LAST, 3, vec_else_clause, vec_compare,
+vec_then_clause);
  gimple_call_set_lhs (new_stmt, lhs);
  SSA_NAME_DEF_STMT (lhs) = new_stmt;
- if (old_stmt == gsi_stmt (*gsi))
+ if ((unsigned)i != vec_oprnds0.length () - 1)
+   {
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+ vec_else_clause = lhs;
+   }
+ else if (old_stmt == gsi_stmt (*gsi))
vect_finish_replace_stmt (vinfo, stmt_info, new_stmt);
  else
{


[gcc r15-1097] Fix fold-left reduction vectorization with multiple stmt copies

2024-06-07 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:dd6f942c266533b2f72610f354bc9184f8276beb

commit r15-1097-gdd6f942c266533b2f72610f354bc9184f8276beb
Author: Richard Biener 
Date:   Fri Jun 7 09:41:11 2024 +0200

Fix fold-left reduction vectorization with multiple stmt copies

There's a typo when code generating the mask operand for conditional
fold-left reductions in the case we have multiple stmt copies.  The
latter is now allowed for SLP and possibly disabled for non-SLP by
accident.

This fixes the observed run-FAIL for
gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c with AVX512
and 256bit sized vectors.

* tree-vect-loop.cc (vectorize_fold_left_reduction): Fix
mask vector operand indexing.

Diff:
---
 gcc/tree-vect-loop.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ceb92156b58..028692614bb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7217,7 +7217,7 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, 
i);
   else if (is_cond_op)
-   mask = vec_opmask[0];
+   mask = vec_opmask[i];
   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
{
  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,


[gcc r15-1056] Allow single-lane SLP in-order reductions

2024-06-06 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4653b682ef161c3c2fc7bf8462b8f9206a1349e6

commit r15-1056-g4653b682ef161c3c2fc7bf8462b8f9206a1349e6
Author: Richard Biener 
Date:   Tue Mar 5 15:46:24 2024 +0100

Allow single-lane SLP in-order reductions

The single-lane case isn't different from non-SLP, no re-association
implied.  But the transform stage cannot handle a conditional reduction
op which isn't checked during analysis - this makes it work, exercised
with a single-lane non-reduction-chain by gcc.target/i386/pr112464.c

* tree-vect-loop.cc (vectorizable_reduction): Allow
single-lane SLP in-order reductions.
(vectorize_fold_left_reduction): Handle SLP reduction with
conditional reduction op.

Diff:
---
 gcc/tree-vect-loop.cc | 48 +++-
 1 file changed, 19 insertions(+), 29 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b9e8e9b5559..ceb92156b58 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7139,56 +7139,46 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-{
-  if (is_cond_op)
-   {
- if (dump_enabled_p ())
-   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-"fold-left reduction on SLP not supported.\n");
- return false;
-   }
-
-  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-   TYPE_VECTOR_SUBPARTS (vectype_in)));
-}
+gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+ TYPE_VECTOR_SUBPARTS (vectype_in)));
 
   /* The operands either come from a binary operation or an IFN_COND operation.
  The former is a gimple assign with binary rhs and the latter is a
  gimple call with four arguments.  */
   gcc_assert (num_ops == 2 || num_ops == 4);
-  tree op0, opmask;
-  if (!is_cond_op)
-op0 = ops[1 - reduc_index];
-  else
-{
-  op0 = ops[2 + (1 - reduc_index)];
-  opmask = ops[0];
-  gcc_assert (!slp_node);
-}
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
   auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
 {
-  auto_vec<vec<tree> > vec_defs (2);
-  vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
-  vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
-  vec_defs[0].release ();
-  vec_defs[1].release ();
+  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
+ + (1 - reduc_index)],
+ &vec_oprnds0);
   group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
   scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+  /* For an IFN_COND_OP we also need the vector mask operand.  */
+  if (is_cond_op)
+ vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
 }
   else
 {
+  tree op0, opmask;
+  if (!is_cond_op)
+   op0 = ops[1 - reduc_index];
+  else
+   {
+ op0 = ops[2 + (1 - reduc_index)];
+ opmask = ops[0];
+   }
   vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
  op0, &vec_oprnds0);
   scalar_dest_def_info = stmt_info;
 
   /* For an IFN_COND_OP we also need the vector mask operand.  */
   if (is_cond_op)
- vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-opmask, _opmask);
+   vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+  opmask, &vec_opmask);
 }
 
   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
@@ -8210,7 +8200,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 }
 
   if (reduction_type == FOLD_LEFT_REDUCTION
-  && slp_node
+  && (slp_node && SLP_TREE_LANES (slp_node) > 1)
   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 {
   /* We cannot use in-order reductions in this case because there is


[gcc r15-1054] Allow single-lane COND_REDUCTION vectorization

2024-06-06 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:202a9c8fe7db9dd94e5a77f42e54ef3d966f88e8

commit r15-1054-g202a9c8fe7db9dd94e5a77f42e54ef3d966f88e8
Author: Richard Biener 
Date:   Fri Mar 1 14:39:08 2024 +0100

Allow single-lane COND_REDUCTION vectorization

The following enables single-lane COND_REDUCTION vectorization.

* tree-vect-loop.cc (vect_create_epilog_for_reduction):
Adjust for single-lane COND_REDUCTION SLP vectorization.
(vectorizable_reduction): Likewise.
(vect_transform_cycle_phi): Likewise.

Diff:
---
 gcc/tree-vect-loop.cc | 97 ++-
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 06292ed8bbe..ccd6acef5c5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6030,7 +6030,13 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
-;
+{
+  /* Optimize: for induction condition reduction, if we can't use zero
+for induc_val, use initial_def.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
+   induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
+  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+}
   else
 {
   /* Optimize: for induction condition reduction, if we can't use zero
@@ -6075,23 +6081,46 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
 {
   auto_vec<std::pair<tree, bool>, 2> ccompares;
-  stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
-  cond_info = vect_stmt_to_vectorize (cond_info);
-  while (cond_info != reduc_info)
+  if (slp_node)
{
- if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+ slp_tree cond_node = slp_node_instance->root;
+ while (cond_node != slp_node_instance->reduc_phis)
{
- gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
- gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
- ccompares.safe_push
-   (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
-STMT_VINFO_REDUC_IDX (cond_info) == 2));
+ stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt
+   = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ /* ???  We probably want to have REDUC_IDX on the SLP node?  */
+ cond_node = SLP_TREE_CHILDREN
+   (cond_node)[STMT_VINFO_REDUC_IDX (cond_info)];
}
- cond_info
-   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
-1 + STMT_VINFO_REDUC_IDX
-   (cond_info)));
+   }
+  else
+   {
+ stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
  cond_info = vect_stmt_to_vectorize (cond_info);
+ while (cond_info != reduc_info)
+   {
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ cond_info
+   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
+1 + STMT_VINFO_REDUC_IDX
+(cond_info)));
+ cond_info = vect_stmt_to_vectorize (cond_info);
+   }
}
   gcc_assert (ccompares.length () != 0);
 
@@ -7844,7 +7873,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   /* If we have a condition reduction, see if we can simplify it further.  */
   if (v_reduc_type == COND_REDUCTION)
 {
-  if (slp_node)
+  if (slp_node && SLP_TREE_LANES (slp_node) != 1)
return false;
 
   /* When the condition uses the reduction value in the condition, fail.  
*/
@@ -8050,6 +8079,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
 }
 
+  if ((reduction_type == COND_REDUCTION
+   || reduction_type == INTEGER_INDUC_COND_REDUCTION
+

[gcc r15-1055] Add double reduction support for SLP vectorization

2024-06-06 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:2ee41ef76a99ef5a8b62b351e2c01dad93f51b18

commit r15-1055-g2ee41ef76a99ef5a8b62b351e2c01dad93f51b18
Author: Richard Biener 
Date:   Tue Mar 5 15:28:58 2024 +0100

Add double reduction support for SLP vectorization

The following makes double reduction vectorization work when
using (single-lane) SLP vectorization.

* tree-vect-loop.cc (vect_analyze_scalar_cycles_1): Queue
double reductions in LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Remove asserts disabling
SLP for double reductions.
(vectorizable_reduction): Analyze SLP double reductions
only once and start off the correct places.
* tree-vect-slp.cc (vect_get_and_check_slp_defs): Allow
vect_double_reduction_def.
(vect_build_slp_tree_2): Fix condition for the ignored
reduction initial values.
* tree-vect-stmts.cc (vect_analyze_stmt): Allow
vect_double_reduction_def.

Diff:
---
 gcc/tree-vect-loop.cc  | 35 +--
 gcc/tree-vect-slp.cc   |  3 ++-
 gcc/tree-vect-stmts.cc |  4 
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ccd6acef5c5..b9e8e9b5559 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -685,6 +685,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, 
class loop *loop,
 
   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
+ /* Make it accessible for SLP vectorization.  */
+ LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
 }
   else
 {
@@ -5975,7 +5977,6 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   stmt_vec_info rdef_info = stmt_info;
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
-  gcc_assert (!slp_node);
   double_reduc = true;
   stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
(stmt_info->stmt, 0));
@@ -6020,7 +6021,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 {
   outer_loop = loop;
   loop = loop->inner;
-  gcc_assert (!slp_node && double_reduc);
+  gcc_assert (double_reduc);
 }
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
@@ -6035,7 +6036,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 for induc_val, use initial_def.  */
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
-  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+  /* ???  Coverage for 'else' isn't clear.  */
 }
   else
 {
@@ -7605,15 +7606,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
   return true;
 }
-  if (slp_node)
-{
-  slp_node_instance->reduc_phis = slp_node;
-  /* ???  We're leaving slp_node to point to the PHIs, we only
-need it to get at the number of vector stmts which wasn't
-yet initialized for the instance root.  */
-}
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
+  if (gimple_bb (stmt_info->stmt) != loop->header)
+   {
+ /* For SLP we arrive here for both the inner loop LC PHI and
+the outer loop PHI.  The latter is what we want to analyze
+the reduction with.  */
+ gcc_assert (slp_node);
+ return true;
+   }
   use_operand_p use_p;
   gimple *use_stmt;
   bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
@@ -7622,6 +7624,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   phi_info = loop_vinfo->lookup_stmt (use_stmt);
 }
 
+  if (slp_node)
+{
+  slp_node_instance->reduc_phis = slp_node;
+  /* ???  We're leaving slp_node to point to the PHIs, we only
+need it to get at the number of vector stmts which wasn't
+yet initialized for the instance root.  */
+}
+
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
@@ -7637,6 +7647,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
+  /* For double-reductions we start SLP analysis at the inner loop LC PHI
+ which is the def of the outer loop live stmt.  */
+  if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
+  && slp_node)
+slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
   while (reduc_def != PHI_RESULT (reduc_def_phi))
 {
   stmt_vec_info def = 

[gcc r15-1053] Relax COND_EXPR reduction vectorization SLP restriction

2024-06-06 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:28edeb1409a7b839407ec06031899b933390bff3

commit r15-1053-g28edeb1409a7b839407ec06031899b933390bff3
Author: Richard Biener 
Date:   Fri Feb 23 16:16:38 2024 +0100

Relax COND_EXPR reduction vectorization SLP restriction

Allow one-lane SLP but for the case where we need to swap the arms.

* tree-vect-stmts.cc (vectorizable_condition): Allow
single-lane SLP, but not when we need to swap then and
else clause.

Diff:
---
 gcc/tree-vect-stmts.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b26cc74f417..c82381e799e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12116,7 +12116,7 @@ vectorizable_condition (vec_info *vinfo,
 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
   if (for_reduction)
 {
-  if (slp_node)
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
return false;
   reduc_info = info_for_reduction (vinfo, stmt_info);
   reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -12205,6 +12205,10 @@ vectorizable_condition (vec_info *vinfo,
  cond_expr = NULL_TREE;
}
}
+  /* ???  The vectorized operand query below doesn't allow swapping
+this way for SLP.  */
+  if (slp_node)
+   return false;
   std::swap (then_clause, else_clause);
 }


[gcc r15-1006] Do single-lane SLP discovery for reductions

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:d93353e6423ecaaae9fa47d0935caafd9abfe4de

commit r15-1006-gd93353e6423ecaaae9fa47d0935caafd9abfe4de
Author: Richard Biener 
Date:   Fri Feb 23 11:45:50 2024 +0100

Do single-lane SLP discovery for reductions

The following performs single-lane SLP discovery for reductions.
It requires a fixup for outer loop vectorization where a check
for multiple types needs adjustments as otherwise bogus pointer
IV increments happen when there are multiple copies of vector stmts
in the inner loop.

For the reduction epilog handling this extends the optimized path
to cover the trivial single-lane SLP reduction case.

The fix for PR65518 implemented in vect_grouped_load_supported for
non-SLP needs a SLP counterpart that I put in get_group_load_store_type.

I've decided to adjust three testcases for appearing single-lane
SLP instances instead of not dumping "vectorizing stmts using SLP"
for single-lane instances as that also requires testsuite adjustments.

* tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane
discoveries are reduction chains and need special backedge
treatment.
(vect_analyze_slp): Fall back to single-lane SLP discovery
for reductions.  Make sure to try single-lane SLP reduction
for all reductions as fallback.
(vectorizable_load): Avoid outer loop SLP vectorization with
multi-copy vector stmts in the inner loop.
(vectorizable_store): Likewise.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow
direct opcode and shift reduction also for SLP reductions
with a single lane.
* tree-vect-stmts.cc (get_group_load_store_type): For SLP also
check for the PR65518 single-element interleaving case as done in
vect_grouped_load_supported.

* gcc.dg/vect/slp-24.c: Expect another SLP instance for the
reduction.
* gcc.dg/vect/slp-24-big-array.c: Likewise.
* gcc.dg/vect/slp-reduc-6.c: Remove scan for zero SLP instances.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-24-big-array.c |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-24.c   |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-reduc-6.c  |  1 -
 gcc/tree-vect-loop.cc|  4 +-
 gcc/tree-vect-slp.cc | 71 +---
 gcc/tree-vect-stmts.cc   | 24 +-
 6 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c 
b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
index 5eaea9600ac..63f744338a1 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
@@ -92,4 +92,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { 
vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c 
b/gcc/testsuite/gcc.dg/vect/slp-24.c
index 59178f2c0f2..7814d7c324e 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-24.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-24.c
@@ -78,4 +78,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { 
vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c 
b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
index 1fd15aa3c87..5566705a704 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
@@ -45,6 +45,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { 
vect_no_int_add || { ! { vect_unpack || vect_strided2 } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } 
} */
 /* { dg-final { scan-tree-dump-times "different interleaving chains in one 
node" 1 "vect" { target { ! vect_no_int_add } } } } */
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a08357acc11..06292ed8bbe 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6504,7 +6504,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   /* 2.3 Create the reduction code, using one of the three schemes described
  above. In SLP we simply need to extract all the elements from the 
  vector (without reducing them), so we use scalar shifts.  */
-  else if (reduc_fn != IFN_LAST && !slp_reduc)
+  else if 

[gcc r15-1005] Avoid inserting after a GIMPLE_COND with SLP and early break

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:0592000aeed84d47040946a125154b3c46d7c84f

commit r15-1005-g0592000aeed84d47040946a125154b3c46d7c84f
Author: Richard Biener 
Date:   Mon May 27 14:40:27 2024 +0200

Avoid inserting after a GIMPLE_COND with SLP and early break

When vectorizing an early break loop with LENs (do we miss some
check here to disallow this?) we can end up deciding to insert
stmts after a GIMPLE_COND when doing SLP scheduling and trying
to be conservative with placing of stmts only dependent on
the implicit loop mask/len.  The following avoids this, I guess
it's not perfect but it does the job fixing some observed
RISC-V regression.

* tree-vect-slp.cc (vect_schedule_slp_node): For mask/len
loops make sure to not advance the insertion iterator
beyond a GIMPLE_COND.

Diff:
---
 gcc/tree-vect-slp.cc | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bf1f467f53f..11ec82086fc 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -9650,7 +9650,12 @@ vect_schedule_slp_node (vec_info *vinfo,
   else
{
  si = gsi_for_stmt (last_stmt);
- gsi_next (&si);
+ /* When we're getting gsi_after_labels from the starting
+condition of a fully masked/len loop avoid insertion
+after a GIMPLE_COND that can appear as the only header
+stmt with early break vectorization.  */
+ if (gimple_code (last_stmt) != GIMPLE_COND)
+   gsi_next (&si);
}
 }


[gcc r12-10493] c++: Add testcase for this PR [PR97990]

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c7627054b9ee2ded8a22340a6a09bf9786afcafa

commit r12-10493-gc7627054b9ee2ded8a22340a6a09bf9786afcafa
Author: Andrew Pinski 
Date:   Fri Feb 16 10:55:43 2024 -0800

c++: Add testcase for this PR [PR97990]

This testcase was fixed by r14-5934-gf26d68d5d128c8 but we should add
one to make sure it does not regress again.

Committed as obvious after a quick test on the testcase.

PR c++/97990

gcc/testsuite/ChangeLog:

* g++.dg/torture/vector-struct-1.C: New test.

Signed-off-by: Andrew Pinski 
(cherry picked from commit 5f1438db419c9eb8901d1d1d7f98fb69082aec8e)

Diff:
---
 gcc/testsuite/g++.dg/torture/vector-struct-1.C | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/gcc/testsuite/g++.dg/torture/vector-struct-1.C 
b/gcc/testsuite/g++.dg/torture/vector-struct-1.C
new file mode 100644
index 000..e2747417e2d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/torture/vector-struct-1.C
@@ -0,0 +1,18 @@
+/* PR c++/97990 */
+/* This used to crash with lto and strict aliasing enabled as the
+   vector type variant still had TYPE_ALIAS_SET set on it. */
+
+typedef __attribute__((__vector_size__(sizeof(short)))) short TSimd;
+TSimd hh(int);
+struct y6
+{
+  TSimd VALUE;
+  ~y6();
+};
+template 
+auto f2(T1 p1, T2){
+  return hh(p1) <= 0;
+}
+void f1(){
+  f2(0, y6{});
+}


[gcc r12-10492] middle-end/112732 - stray TYPE_ALIAS_SET in type variant

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:b46486ef0316240eb3c173bda062b52333507e03

commit r12-10492-gb46486ef0316240eb3c173bda062b52333507e03
Author: Richard Biener 
Date:   Tue Nov 28 12:36:21 2023 +0100

middle-end/112732 - stray TYPE_ALIAS_SET in type variant

The following fixes a stray TYPE_ALIAS_SET in a type variant built
by build_opaque_vector_type which is diagnosed by type checking
enabled with -flto.

PR middle-end/112732
* tree.cc (build_opaque_vector_type): Reset TYPE_ALIAS_SET
of the newly built type.

(cherry picked from commit f26d68d5d128c86faaceeb81b1e8f22254ad53df)

Diff:
---
 gcc/tree.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/tree.cc b/gcc/tree.cc
index ead4c1421cd..6b28eb9f10d 100644
--- a/gcc/tree.cc
+++ b/gcc/tree.cc
@@ -10124,6 +10124,8 @@ build_opaque_vector_type (tree innertype, poly_int64 
nunits)
   TYPE_NEXT_VARIANT (cand) = TYPE_NEXT_VARIANT (t);
   TYPE_NEXT_VARIANT (t) = cand;
   TYPE_MAIN_VARIANT (cand) = TYPE_MAIN_VARIANT (t);
+  /* Type variants have no alias set defined.  */
+  TYPE_ALIAS_SET (cand) = -1;
   return cand;
 }


[gcc r12-10491] tree-optimization/110381 - preserve SLP permutation with in-order reductions

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:8f6d889a8e609710ecfd555778fbff602b2c7d74

commit r12-10491-g8f6d889a8e609710ecfd555778fbff602b2c7d74
Author: Richard Biener 
Date:   Mon Jun 26 12:51:37 2023 +0200

tree-optimization/110381 - preserve SLP permutation with in-order reductions

The following fixes a bug that manifests itself during fold-left
reduction transform in picking not the last scalar def to replace
and thus double-counting some elements.  But the underlying issue
is that we merge a load permutation into the in-order reduction
which is of course wrong.

Now, reduction analysis has not yet been performed when optimizing
permutations so we have to resort to check that ourselves.

PR tree-optimization/110381
* tree-vect-slp.cc (vect_optimize_slp_pass::start_choosing_layouts):
Materialize permutes before fold-left reductions.

* gcc.dg/vect/pr110381.c: New testcase.

(cherry picked from commit 53d6f57c1b20c6da52aefce737fb7d5263686ba3)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr110381.c | 44 
 gcc/tree-vect-slp.cc | 19 +---
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr110381.c 
b/gcc/testsuite/gcc.dg/vect/pr110381.c
new file mode 100644
index 000..278f4426c29
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr110381.c
@@ -0,0 +1,44 @@
+/* { dg-require-effective-target vect_float_strict } */
+
+#include "tree-vect.h"
+
+struct FOO {
+   double a;
+   double b;
+   double c;
+};
+
+double __attribute__((noipa))
+sum_8_foos(const struct FOO* foos)
+{
+  double sum = 0;
+
+  for (int i = 0; i < 8; ++i)
+{
+  struct FOO foo = foos[i];
+
+  /* Need to use an in-order reduction here, preserving
+ the load permutation.  */
+  sum += foo.a;
+  sum += foo.c;
+  sum += foo.b;
+}
+
+  return sum;
+}
+
+int main()
+{
+  struct FOO foos[8];
+
+  check_vect ();
+
+  __builtin_memset (foos, 0, sizeof (foos));
+  foos[0].a = __DBL_MAX__;
+  foos[0].b = 5;
+  foos[0].c = -__DBL_MAX__;
+
+  if (sum_8_foos (foos) != 5)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 54e6a9e4224..19cab93761c 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3733,9 +3733,8 @@ vect_optimize_slp (vec_info *vinfo)
   vertices[idx].perm_out = perms.length () - 1;
 }
 
-  /* In addition to the above we have to mark outgoing permutes facing
- non-reduction graph entries that are not represented as to be
- materialized.  */
+  /* We have to mark outgoing permutations facing non-associating-reduction
+ graph entries that are not represented as to be materialized.  */
   for (slp_instance instance : vinfo->slp_instances)
 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
   {
@@ -3744,6 +3743,20 @@ vect_optimize_slp (vec_info *vinfo)
vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
   }
+else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
+  {
+   stmt_vec_info stmt_info
+ = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
+   stmt_vec_info reduc_info = info_for_reduction (vinfo, stmt_info);
+   if (needs_fold_left_reduction_p (TREE_TYPE
+  (gimple_get_lhs (stmt_info->stmt)),
+STMT_VINFO_REDUC_CODE (reduc_info)))
+ {
+   unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
+   vertices[node_i].perm_in = 0;
+   vertices[node_i].perm_out = 0;
+ }
+  }
 
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;


[gcc r12-10490] tree-optimization/113910 - huge compile time during PTA

2024-06-04 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:db0f236aa1c30f703ff564960bd9f3dbd747ea7b

commit r12-10490-gdb0f236aa1c30f703ff564960bd9f3dbd747ea7b
Author: Richard Biener 
Date:   Wed Feb 14 12:33:13 2024 +0100

tree-optimization/113910 - huge compile time during PTA

For the testcase in PR113910 we spend a lot of time in PTA comparing
bitmaps for looking up equivalence class members.  This points to
the very weak bitmap_hash function which effectively hashes set
and a subset of not set bits.

The major problem with it is that it simply truncates the
BITMAP_WORD sized intermediate hash to hashval_t which is
unsigned int, effectively not hashing half of the bits.

This reduces the compile-time for the testcase from tens of minutes
to 42 seconds and PTA time from 99% to 46%.

PR tree-optimization/113910
* bitmap.cc (bitmap_hash): Mix the full element "hash" to
the hashval_t hash.

(cherry picked from commit ad7a365aaccecd23ea287c7faaab9c7bd50b944a)

Diff:
---
 gcc/bitmap.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/bitmap.cc b/gcc/bitmap.cc
index 88c329f9325..601c04e2e13 100644
--- a/gcc/bitmap.cc
+++ b/gcc/bitmap.cc
@@ -2673,7 +2673,7 @@ bitmap_hash (const_bitmap head)
   for (ix = 0; ix != BITMAP_ELEMENT_WORDS; ix++)
hash ^= ptr->bits[ix];
 }
-  return (hashval_t)hash;
+  return iterative_hash (&hash, sizeof (hash), 0);
 }


[gcc r15-991] testsuite/115304 - properly guard gcc.dg/vect/slp-gap-1.c

2024-06-03 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:ed8ba88074f3663f810ef2f07d79c3fcde5d9697

commit r15-991-ged8ba88074f3663f810ef2f07d79c3fcde5d9697
Author: Richard Biener 
Date:   Mon Jun 3 14:43:42 2024 +0200

testsuite/115304 - properly guard gcc.dg/vect/slp-gap-1.c

Testing on sparc shows we need vect_unpack and vect_perm.  This
isn't enough to resolve the GCN fail which ends up using interleaving.

PR testsuite/115304
* gcc.dg/vect/slp-gap-1.c: Require vect_unpack and vect_perm.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c 
b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
index 36463ca22c5..9856da7a7f4 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
@@ -15,4 +15,4 @@ void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, 
uint8_t *pix2) {
 /* We can vectorize this without peeling for gaps and thus without epilogue,
but the only thing we can reliably scan is the zero-padding trick for the
partial loads.  */
-/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target 
vect64 } } } */
+/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target { 
vect64 && { vect_unpack && vect_perm } } } } } */


[gcc r15-986] Adjust vector dump scans

2024-06-03 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:5b52517e22540874bac07e2499e9650a9e8278a4

commit r15-986-g5b52517e22540874bac07e2499e9650a9e8278a4
Author: Richard Biener 
Date:   Fri May 31 15:38:29 2024 +0200

Adjust vector dump scans

The following adjusts dump scanning for something followed by
successful vector analysis to more specifically look for
'Analysis succeeded' and not 'Analysis failed' because the
previous look for just 'succeeded' or 'failed' is easily confused
by SLP discovery dumping those words.

* tree-vect-loop.cc (vect_analyze_loop_1): Avoid extra space
before 'failed'.

* gcc.dg/vect/no-scevccp-outer-7.c: Adjust scanning for
succeeded analysis not interrupted by failure.
* gcc.dg/vect/no-scevccp-vect-iv-3.c: Likewise.
* gcc.dg/vect/vect-cond-reduc-4.c: Likewise.
* gcc.dg/vect/vect-live-2.c: Likewise.
* gcc.dg/vect/vect-outer-4c-big-array.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-s16a.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-s8a.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-s8b.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-u16a.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-u16b.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-u8a.c: Likewise.
* gcc.dg/vect/vect-reduc-dot-u8b.c: Likewise.
* gcc.dg/vect/vect-reduc-pattern-1a.c: Likewise.
* gcc.dg/vect/vect-reduc-pattern-1b-big-array.c: Likewise.
* gcc.dg/vect/vect-reduc-pattern-1c-big-array.c: Likewise.
* gcc.dg/vect/vect-reduc-pattern-2a.c: Likewise.
* gcc.dg/vect/vect-reduc-pattern-2b-big-array.c: Likewise.
* gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c  | 2 +-
 gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c| 2 +-
 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c   | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-live-2.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-outer-4c-big-array.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16a.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8a.c  | 4 ++--
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8b.c  | 4 ++--
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16a.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16b.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8a.c  | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8b.c  | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1a.c   | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1b-big-array.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1c-big-array.c | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2a.c   | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2b-big-array.c | 2 +-
 gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8b.c| 4 ++--
 gcc/tree-vect-loop.cc   | 2 +-
 19 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c 
b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c
index 87048422013..e796e6ba216 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-7.c
@@ -77,4 +77,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { 
target vect_widen_mult_hi_to_si } } } */
-/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: 
detected(?:(?!failed)(?!Re-trying).)*succeeded" 1 "vect" { target 
vect_widen_mult_hi_to_si } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: 
detected(?:(?!Analysis failed).)*Analysis succeeded" 1 "vect" { target 
vect_widen_mult_hi_to_si } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c 
b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c
index 6f2b2210b11..f268d4a5131 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-vect-iv-3.c
@@ -30,4 +30,4 @@ unsigned int main1 ()
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target 
vect_widen_sum_hi_to_si } } } */
-/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: 
detected(?:(?!failed)(?!Re-trying).)*succeeded" 1 "vect" { target 
vect_widen_sum_hi_to_si } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: 
detected(?:(?!Analysis failed).)*Analysis succeeded" 1 "vect" { target 
vect_widen_sum_hi_to_si } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c 
b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c
index 27f18dc5bda..e9d414287e8 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-4.c
@@ -42,6 +42,6 @@ main (void)
 }
 
 /* { dg-final { 

[gcc r15-985] Avoid ICE with pointer reduction

2024-06-03 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:a1810364cd0c36d6408a4c386bdc504a021d68c7

commit r15-985-ga1810364cd0c36d6408a4c386bdc504a021d68c7
Author: Richard Biener 
Date:   Fri May 31 15:17:10 2024 +0200

Avoid ICE with pointer reduction

There's another case where we can refer to neutral_op before
eventually converting it from pointer to integer so simply
do that unconditionally.

* tree-vect-loop.cc (get_initial_defs_for_reduction):
Always convert neutral_op.

Diff:
---
 gcc/tree-vect-loop.cc | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5b85cffb37f..b6e0b9616d5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5606,6 +5606,12 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
   gimple_seq ctor_seq = NULL;
+  if (neutral_op
+  && !useless_type_conversion_p (TREE_TYPE (vector_type),
+TREE_TYPE (neutral_op)))
+neutral_op = gimple_convert (&ctor_seq,
+TREE_TYPE (vector_type),
+neutral_op);
   for (j = 0; j < nunits * number_of_vectors; ++j)
 {
   tree op;
@@ -5614,14 +5620,7 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
   /* Get the def before the loop.  In reduction chain we have only
 one initial value.  Else we have as many as PHIs in the group.  */
   if (i >= initial_values.length () || (j > i && neutral_op))
-   {
- if (!useless_type_conversion_p (TREE_TYPE (vector_type),
- TREE_TYPE (neutral_op)))
-   neutral_op = gimple_convert (&ctor_seq,
-TREE_TYPE (vector_type),
-neutral_op);
- op = neutral_op;
-   }
+   op = neutral_op;
   else
{
  if (!useless_type_conversion_p (TREE_TYPE (vector_type),


[gcc r15-941] tree-optimization/115278 - fix DSE in if-conversion wrt volatiles

2024-05-31 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:65dbe0ab7cdaf2aa84b09a74e594f0faacf1945c

commit r15-941-g65dbe0ab7cdaf2aa84b09a74e594f0faacf1945c
Author: Richard Biener 
Date:   Fri May 31 10:14:25 2024 +0200

tree-optimization/115278 - fix DSE in if-conversion wrt volatiles

The following adds the missing guard for volatile stores to the
embedded DSE in the loop if-conversion pass.

PR tree-optimization/115278
* tree-if-conv.cc (ifcvt_local_dce): Do not DSE volatile stores.

* g++.dg/vect/pr115278.cc: New testcase.

Diff:
---
 gcc/testsuite/g++.dg/vect/pr115278.cc | 38 +++
 gcc/tree-if-conv.cc   |  4 +++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/g++.dg/vect/pr115278.cc 
b/gcc/testsuite/g++.dg/vect/pr115278.cc
new file mode 100644
index 000..331075fb278
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr115278.cc
@@ -0,0 +1,38 @@
+// { dg-do compile }
+// { dg-require-effective-target c++11 }
+// { dg-additional-options "-fdump-tree-optimized" }
+
+#include 
+
+const int runs = 92;
+
+union BitfieldStructUnion {
+struct {
+uint64_t a : 17;
+uint64_t padding: 39;
+uint64_t b : 8;
+} __attribute__((packed));
+
+struct {
+uint32_t value_low;
+uint32_t value_high;
+} __attribute__((packed));
+
+BitfieldStructUnion(uint32_t value_low, uint32_t value_high) : 
value_low(value_low), value_high(value_high) {}
+};
+
+volatile uint32_t *WRITE = (volatile unsigned*)0x42;
+
+void buggy() {
+for (int i = 0; i < runs; i++) {
+BitfieldStructUnion rt{*WRITE, *WRITE};
+
+rt.a = 99;
+rt.b = 1;
+
+*WRITE = rt.value_low;
+*WRITE = rt.value_high;
+}
+}
+
+// { dg-final { scan-tree-dump-times "\\\*WRITE\[^\r\n\]* ={v} " 2 "optimized" 
} }
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 09d99fb9dda..c4c3ed41a44 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -3381,7 +3381,9 @@ ifcvt_local_dce (class loop *loop)
   gimple_stmt_iterator gsiprev = gsi;
   gsi_prev (&gsi);
   stmt = gsi_stmt (gsi);
-  if (gimple_store_p (stmt) && gimple_vdef (stmt))
+  if (!gimple_has_volatile_ops (stmt)
+ && gimple_store_p (stmt)
+ && gimple_vdef (stmt))
{
  tree lhs = gimple_get_lhs (stmt);
  ao_ref write;


[gcc r15-896] tree-optimization/115252 - enhance peeling for gaps avoidance

2024-05-29 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:f46eaad445e680034df51bd0dec4e6c7b1f372a4

commit r15-896-gf46eaad445e680034df51bd0dec4e6c7b1f372a4
Author: Richard Biener 
Date:   Mon May 27 16:04:35 2024 +0200

tree-optimization/115252 - enhance peeling for gaps avoidance

Code generation for contiguous load vectorization can already deal
with generalized avoidance of loading from a gap.  The following
extends detection of peeling for gaps requirement with that,
gets rid of the old special casing of a half load and makes sure
when we do access the gap we have peeling for gaps enabled.

PR tree-optimization/115252
* tree-vect-stmts.cc (get_group_load_store_type): Enhance
detecting the number of cases where we can avoid accessing a gap
during code generation.
(vectorizable_load): Remove old half-vector peeling for gap
avoidance which is now redundant.  Add gap-aligned case where
it's OK to access the gap.  Add assert that we have peeling for
gaps enabled when we access a gap.

* gcc.dg/vect/slp-gap-1.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 18 +++
 gcc/tree-vect-stmts.cc| 58 +--
 2 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c 
b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
new file mode 100644
index 000..36463ca22c5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+typedef unsigned char uint8_t;
+typedef short int16_t;
+void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) {
+  for (int y = 0; y < 4; y++) {
+for (int x = 0; x < 4; x++)
+  diff[x + y * 4] = pix1[x] - pix2[x];
+pix1 += 16;
+pix2 += 32;
+  }
+}
+
+/* We can vectorize this without peeling for gaps and thus without epilogue,
+   but the only thing we can reliably scan is the zero-padding trick for the
+   partial loads.  */
+/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target 
vect64 } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4219ad832db..935d80f0e1b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2072,16 +2072,22 @@ get_group_load_store_type (vec_info *vinfo, 
stmt_vec_info stmt_info,
  dr_alignment_support alss;
  int misalign = dr_misalignment (first_dr_info, vectype);
  tree half_vtype;
+ poly_uint64 remain;
+ unsigned HOST_WIDE_INT tem, num;
  if (overrun_p
  && !masked_p
  && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
  vectype, misalign)))
   == dr_aligned
  || alss == dr_unaligned_supported)
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && (vector_vector_composition_type (vectype, 2, &half_vtype)
- != NULL_TREE))
+ && can_div_trunc_p (group_size
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
+ nunits, &tem, &remain)
+ && (known_eq (remain, 0u)
+ || (constant_multiple_p (nunits, remain, &num)
+ && (vector_vector_composition_type (vectype, num,
+ &half_vtype)
+ != NULL_TREE
overrun_p = false;
 
  if (overrun_p && !can_overrun_p)
@@ -11513,33 +11519,14 @@ vectorizable_load (vec_info *vinfo,
unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
unsigned int vect_align
  = vect_known_alignment_in_bytes (first_dr_info, vectype);
-   unsigned int scalar_dr_size
- = vect_get_scalar_dr_size (first_dr_info);
-   /* If there's no peeling for gaps but we have a gap
-  with slp loads then load the lower half of the
-  vector only.  See get_group_load_store_type for
-  when we apply this optimization.  */
-   if (slp
-   && loop_vinfo
-   && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
-   && known_eq (nunits, (group_size - gap) * 2)
-   && known_eq (nunits, group_size)
-   && gap >= (vect_align / scalar_dr_size))
- {
-   tree half_vtype;
-   new_vtype
- = vector_vector_composition_type (vectype, 2,
-   &half_vtype);
-   if (new_vtype != NULL_TREE)
- ltype = 

[gcc r15-895] tree-optimization/114435 - pcom left around copies confusing SLP

2024-05-29 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1065a7db6f2a69770a85b4d53b9123b090dd1771

commit r15-895-g1065a7db6f2a69770a85b4d53b9123b090dd1771
Author: Richard Biener 
Date:   Wed May 29 10:41:51 2024 +0200

tree-optimization/114435 - pcom left around copies confusing SLP

The following arranges for the pre-SLP vectorization scalar cleanup
to be run when predictive commoning was applied to a loop in the
function.  This is similar to the complete unroll situation and
facilitating SLP vectorization.  Avoiding the SSA copies in predictive
commoning itself isn't easy (and predcom also sometimes unrolls,
asking for scalar cleanup).

PR tree-optimization/114435
* tree-predcom.cc (tree_predictive_commoning): Queue
the next scalar cleanup sub-pipeline to be run when we
did something.

* gcc.dg/vect/bb-slp-pr114435.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c | 37 +
 gcc/tree-predcom.cc |  3 +++
 2 files changed, 40 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c
new file mode 100644
index 000..d1eecf7979a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr114435.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* Predictive commoning is supposed to happen.  */
+/* { dg-additional-options "-O3 -fdump-tree-pcom" } */
+
+struct res {
+double r0;
+double r1;
+double r2;
+double r3;
+};
+
+struct pxl {
+double v0;
+double v1;
+double v2;
+double v3;
+};
+
+#define IS_NAN(x) ((x) == (x))
+
+void fold(struct res *r, struct pxl *in, double k, int sz)
+{
+  int i;
+
+  for (i = 0; i < sz; i++) {
+  if (IS_NAN(k)) continue;
+  r->r0 += in[i].v0 * k;
+  r->r1 += in[i].v1 * k;
+  r->r2 += in[i].v2 * k;
+  r->r3 += in[i].v3 * k;
+  }
+}
+
+/* { dg-final { scan-tree-dump "# r__r0_lsm\[^\r\n\]* = PHI" "pcom" } } */
+/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp1" 
} } */
+/* { dg-final { scan-tree-dump "# vect\[^\r\n\]* = PHI" "slp1" } } */
diff --git a/gcc/tree-predcom.cc b/gcc/tree-predcom.cc
index 75a4c85164c..9844fee1e97 100644
--- a/gcc/tree-predcom.cc
+++ b/gcc/tree-predcom.cc
@@ -3522,6 +3522,9 @@ tree_predictive_commoning (bool allow_unroll_p)
}
 }
 
+  if (ret != 0)
+cfun->pending_TODOs |= PENDING_TODO_force_next_scalar_cleanup;
+
   return ret;
 }


[gcc r14-10256] tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution

2024-05-29 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:2a1fdd5fd0f6bc02d25da192c8fa6487d93d2d50

commit r14-10256-g2a1fdd5fd0f6bc02d25da192c8fa6487d93d2d50
Author: Richard Biener 
Date:   Thu May 23 14:36:39 2024 +0200

tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop 
distribution

Forgot a check for an SSA name before trying to replace a PHI arg with
its current definition.

PR tree-optimization/115197
* tree-loop-distribution.cc (copy_loop_before): Constant PHI
args remain the same.

* gcc.dg/pr115197.c: New testcase.

(cherry picked from commit 2b2476d4d18c92b8aba3567ebccd2100c2f7c258)

Diff:
---
 gcc/testsuite/gcc.dg/pr115197.c | 14 ++
 gcc/tree-loop-distribution.cc   |  7 +--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr115197.c b/gcc/testsuite/gcc.dg/pr115197.c
new file mode 100644
index 000..00d674b3bd9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115197.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fno-tree-scev-cprop -ftree-pre 
-ftree-loop-distribute-patterns" } */
+
+int a, b[2], c, d, e, f[2];
+int main() {
+  while (a)
+if (d) {
+  if (e)
+return 0;
+  for (; c; c++)
+f[c] = 0 < (b[c] = ~(f[c + 1] < a));
+}
+  return 0;
+}
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 45932bae5e7..c5a05ee151d 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -977,8 +977,11 @@ copy_loop_before (class loop *loop, bool 
redirect_lc_phi_defs)
  if (virtual_operand_p (gimple_phi_result (phi)))
continue;
  use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit);
- tree new_def = get_current_def (USE_FROM_PTR (use_p));
- SET_USE (use_p, new_def);
+ if (TREE_CODE (USE_FROM_PTR (use_p)) == SSA_NAME)
+   {
+ tree new_def = get_current_def (USE_FROM_PTR (use_p));
+ SET_USE (use_p, new_def);
+   }
}
 }


[gcc r14-10257] tree-optimization/115149 - VOP live and missing PHIs

2024-05-29 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:90a447677a2abb934b683a012b477e6c52088e35

commit r14-10257-g90a447677a2abb934b683a012b477e6c52088e35
Author: Richard Biener 
Date:   Tue May 21 09:48:04 2024 +0200

tree-optimization/115149 - VOP live and missing PHIs

The following fixes a bug in vop-live get_live_in which was using
NULL to indicate the first processed edge but at the same time
using it for the case the live-in virtual operand cannot be computed.
The following fixes this, avoiding sinking a load to a place where
we'd have to insert virtual PHIs to make the virtual operand SSA
web OK.

PR tree-optimization/115149
* tree-ssa-live.cc (virtual_operand_live::get_live_in):
Explicitly track the first processed edge.

* gcc.dg/pr115149.c: New testcase.

(cherry picked from commit ec9b8bafe20755d13ab9a1b834b5da79ae972c0e)

Diff:
---
 gcc/testsuite/gcc.dg/pr115149.c | 16 
 gcc/tree-ssa-live.cc|  8 ++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr115149.c b/gcc/testsuite/gcc.dg/pr115149.c
new file mode 100644
index 000..9f6bc97dbe6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115149.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -fno-tree-vrp -fno-ipa-sra -fno-tree-dce 
-fno-tree-ch" } */
+
+int a, c, e, f, g, h[1], i;
+static int j(int b) { return 0; }
+static void k(int d) {}
+int main()
+{
+  if (h[0])
+while (1) {
+   k(f && j(i && (h[g] = e)));
+   while (a)
+ c ^= 1;
+}
+  return 0;
+}
diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc
index d94e94eb3bc..122d8e245dd 100644
--- a/gcc/tree-ssa-live.cc
+++ b/gcc/tree-ssa-live.cc
@@ -1684,14 +1684,18 @@ virtual_operand_live::get_live_in (basic_block bb)
   edge_iterator ei;
   edge e;
   tree livein = NULL_TREE;
+  bool first = true;
   FOR_EACH_EDGE (e, ei, bb->preds)
 if (e->flags & EDGE_DFS_BACK)
   /* We can ignore backedges since if there's a def there it would
 have forced a PHI in the source because it also acts as use
 downstream.  */
   continue;
-else if (!livein)
-  livein = get_live_out (e->src);
+else if (first)
+  {
+   livein = get_live_out (e->src);
+   first = false;
+  }
 else if (get_live_out (e->src) != livein)
   /* When there's no virtual use downstream this indicates a point
 where we'd insert a PHI merging the different live virtual


[gcc r14-10255] tree-optimization/114921 - _Float16 -> __bf16 isn't noop fixup

2024-05-29 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:9e971c671ded9647beb0a1c5b9430b4e64060862

commit r14-10255-g9e971c671ded9647beb0a1c5b9430b4e64060862
Author: Richard Biener 
Date:   Mon May 6 12:03:09 2024 +0200

tree-optimization/114921 - _Float16 -> __bf16 isn't noop fixup

The following further strengthens the check which convert expressions
we allow to vectorize as simple copy by resorting to
tree_nop_conversion_p on the vector components.

PR tree-optimization/114921
* tree-vect-stmts.cc (vectorizable_assignment): Use
tree_nop_conversion_p to identify converts we can vectorize
with a simple assignment.

(cherry picked from commit d0d6dcc019cd32eebf85d625f56e0f7573938319)

Diff:
---
 gcc/tree-vect-stmts.cc | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f8d8636b139..21e8fe98e44 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5955,14 +5955,17 @@ vectorizable_assignment (vec_info *vinfo,
   if (!vectype_in)
 vectype_in = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), slp_node);
 
-  /* We can handle NOP_EXPR conversions that do not change the number
- of elements or the vector size.  */
-  if ((CONVERT_EXPR_CODE_P (code)
-   || code == VIEW_CONVERT_EXPR)
-  && (!vectype_in
- || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits)
- || maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
-  GET_MODE_SIZE (TYPE_MODE (vectype_in)
+  /* We can handle VIEW_CONVERT conversions that do not change the number
+ of elements or the vector size or other conversions when the component
+ types are nop-convertible.  */
+  if (!vectype_in
+  || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits)
+  || (code == VIEW_CONVERT_EXPR
+ && maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
+  GET_MODE_SIZE (TYPE_MODE (vectype_in
+  || (CONVERT_EXPR_CODE_P (code)
+ && !tree_nop_conversion_p (TREE_TYPE (vectype),
+TREE_TYPE (vectype_in
 return false;
 
   if (VECTOR_BOOLEAN_TYPE_P (vectype) != VECTOR_BOOLEAN_TYPE_P (vectype_in))


[gcc r15-862] target/115254 - fix gcc.dg/vect/vect-gather-4.c dump scanning

2024-05-28 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:d8d70b783765361a8acef70fc9b54db526cd6ff5

commit r15-862-gd8d70b783765361a8acef70fc9b54db526cd6ff5
Author: Richard Biener 
Date:   Tue May 28 15:55:59 2024 +0200

target/115254 - fix gcc.dg/vect/vect-gather-4.c dump scanning

The dump scanning is supposed to check that we do not merge two
slightly different gathers into one SLP node but since we now
SLP the store scanning for "vectorizing stmts using SLP" is no
longer good.  Instead the following makes us look for
"stmt 1 .* = .MASK" which would be how the second lane of an SLP
node looks like.  We have to handle both .MASK_GATHER_LOAD (for
targets with ifun mask gathers) and .MASK_LOAD (for ones without).

Tested on x86_64-linux with and without native gather and on GCN
where this now avoids a FAIL.

PR target/115254
* gcc.dg/vect/vect-gather-4.c: Adjust dump scan.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c 
b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
index d18094d6982..edd9a6783c2 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
@@ -45,4 +45,7 @@ f3 (int *restrict y, int *restrict x, int *restrict indices)
 }
 }
 
-/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" vect } } */
+/* We do not want to see a two-lane .MASK_LOAD or .MASK_GATHER_LOAD since
+   the gathers are different on each lane.  This is a bit fragile and
+   should possibly be turned into a runtime test.  */
+/* { dg-final { scan-tree-dump-not "stmt 1 \[^\r\n\]* = .MASK" vect } } */


[gcc r15-861] tree-optimization/115236 - more points-to *ANYTHING = x fixes

2024-05-28 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c08b0d3f7b3539b26031de31d88dea6b94474577

commit r15-861-gc08b0d3f7b3539b26031de31d88dea6b94474577
Author: Richard Biener 
Date:   Mon May 27 10:41:02 2024 +0200

tree-optimization/115236 - more points-to *ANYTHING = x fixes

The stored-to ANYTHING handling has more holes, uncovered by treating
volatile accesses as ANYTHING.  We fail to properly build the
pred and succ graphs, in particular we may not elide direct nodes
from receiving from STOREDANYTHING.

PR tree-optimization/115236
* tree-ssa-structalias.cc (build_pred_graph): Properly
handle *ANYTHING = X.
(build_succ_graph): Likewise.  Do not elide direct nodes
from receiving from STOREDANYTHING.

* gcc.dg/pr115236.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/pr115236.c | 12 
 gcc/tree-ssa-structalias.cc | 20 ++--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr115236.c b/gcc/testsuite/gcc.dg/pr115236.c
new file mode 100644
index 000..91edfab957a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115236.c
@@ -0,0 +1,12 @@
+/* { dg-do run } */
+/* { dg-options "-O -fno-tree-fre" } */
+
+int a, *b = &a;
+int main()
+{
+  int *c, *volatile *d = &c;
+  *d = b;
+  if (c != &a)
+__builtin_abort();
+  return 0;
+}
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index 9cec2c6cfd9..330e64e65da 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -1312,7 +1312,12 @@ build_pred_graph (void)
{
  /* *x = y.  */
  if (rhs.offset == 0 && lhs.offset == 0 && rhs.type == SCALAR)
-   add_pred_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar);
+   {
+ if (lhs.var == anything_id)
+   add_pred_graph_edge (graph, storedanything_id, rhsvar);
+ else
+   add_pred_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar);
+   }
}
   else if (rhs.type == DEREF)
{
@@ -1398,7 +1403,12 @@ build_succ_graph (void)
   if (lhs.type == DEREF)
{
  if (rhs.offset == 0 && lhs.offset == 0 && rhs.type == SCALAR)
-   add_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar);
+   {
+ if (lhs.var == anything_id)
+   add_graph_edge (graph, storedanything_id, rhsvar);
+ else
+   add_graph_edge (graph, FIRST_REF_NODE + lhsvar, rhsvar);
+   }
}
   else if (rhs.type == DEREF)
{
@@ -1418,13 +1428,11 @@ build_succ_graph (void)
}
 }
 
-  /* Add edges from STOREDANYTHING to all non-direct nodes that can
- receive pointers.  */
+  /* Add edges from STOREDANYTHING to all nodes that can receive pointers.  */
   t = find (storedanything_id);
   for (i = integer_id + 1; i < FIRST_REF_NODE; ++i)
 {
-  if (!bitmap_bit_p (graph->direct_nodes, i)
- && get_varinfo (i)->may_have_pointers)
+  if (get_varinfo (i)->may_have_pointers)
add_graph_edge (graph, find (i), t);
 }


[gcc r15-860] Avoid pessimistic constraints for asm memory constraints

2024-05-28 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:19cc6120087619b496732b249b48b1fbe27e

commit r15-860-g19cc6120087619b496732b249b48b1fbe27e
Author: Richard Biener 
Date:   Tue May 28 13:29:30 2024 +0200

Avoid pessimistic constraints for asm memory constraints

We process asm memory input/outputs with constraints to ESCAPED
but for this temporarily build an ADDR_EXPR.  The issue is that
the used build_fold_addr_expr ends up wrapping the ADDR_EXPR in
a conversion which ends up producing  constraints which
is quite bad.  The following uses get_constraint_for_address_of
instead, avoiding the temporary tree and the unhandled conversion.

This avoids a gcc.dg/tree-ssa/restrict-9.c FAIL with the fix
for PR115236.

* tree-ssa-structalias.cc (find_func_aliases): Use
get_constraint_for_address_of to build escape constraints
for asm inputs and outputs.

Diff:
---
 gcc/tree-ssa-structalias.cc | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index f93c5df0767..9cec2c6cfd9 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -5269,7 +5269,11 @@ find_func_aliases (struct function *fn, gimple *origt)
 
  /* A memory constraint makes the address of the operand escape.  */
  if (!allows_reg && allows_mem)
-   make_escape_constraint (build_fold_addr_expr (op));
+   {
+ auto_vec<ce_s> tmpc;
+ get_constraint_for_address_of (op, &tmpc);
+ make_constraints_to (escaped_id, tmpc);
+   }
 
  /* The asm may read global memory, so outputs may point to
 any global memory.  */
@@ -5298,7 +5302,11 @@ find_func_aliases (struct function *fn, gimple *origt)
 
  /* A memory constraint makes the address of the operand escape.  */
  if (!allows_reg && allows_mem)
-   make_escape_constraint (build_fold_addr_expr (op));
+   {
+ auto_vec<ce_s> tmpc;
+ get_constraint_for_address_of (op, &tmpc);
+ make_constraints_to (escaped_id, tmpc);
+   }
  /* Strictly we'd only need the constraint to ESCAPED if
 the asm clobbers memory, otherwise using something
 along the lines of per-call clobbers/uses would be enough.  */


[gcc r15-859] tree-optimization/115254 - don't account single-lane SLP against discovery limit

2024-05-28 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:eaaa4b88038d4d6eda1b20ab662f1568fd9be31f

commit r15-859-geaaa4b88038d4d6eda1b20ab662f1568fd9be31f
Author: Richard Biener 
Date:   Fri Sep 29 15:12:54 2023 +0200

tree-optimization/115254 - don't account single-lane SLP against discovery 
limit

The following avoids accounting single-lane SLP to the discovery
limit.  As the two testcases show this makes discovery fail,
unfortunately even not the same across targets.  The following
should fix two FAILs for GCN as a side-effect.

PR tree-optimization/115254
* tree-vect-slp.cc (vect_build_slp_tree): Only account
multi-lane SLP to limit.

* gcc.dg/vect/slp-cond-2-big-array.c: Expect 4 times SLP.
* gcc.dg/vect/slp-cond-2.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-cond-2.c   |  2 +-
 gcc/tree-vect-slp.cc | 31 ++--
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c 
b/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c
index cb7eb94b3a3..9a9f63c0b8d 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c
@@ -128,4 +128,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-cond-2.c 
b/gcc/testsuite/gcc.dg/vect/slp-cond-2.c
index 1dcee46cd95..08bbb3dbec6 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-cond-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-cond-2.c
@@ -128,4 +128,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } 
} */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c7ed520b629..7a963e28063 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1725,21 +1725,26 @@ vect_build_slp_tree (vec_info *vinfo,
   SLP_TREE_SCALAR_STMTS (res) = stmts;
   bst_map->put (stmts.copy (), res);
 
-  if (*limit == 0)
+  /* Single-lane SLP doesn't have the chance of run-away, do not account
+ it to the limit.  */
+  if (stmts.length () > 1)
 {
-  if (dump_enabled_p ())
-   dump_printf_loc (MSG_NOTE, vect_location,
-"SLP discovery limit exceeded\n");
-  /* Mark the node invalid so we can detect those when still in use
-as backedge destinations.  */
-  SLP_TREE_SCALAR_STMTS (res) = vNULL;
-  SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
-  res->failed = XNEWVEC (bool, group_size);
-  memset (res->failed, 0, sizeof (bool) * group_size);
-  memset (matches, 0, sizeof (bool) * group_size);
-  return NULL;
+  if (*limit == 0)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"SLP discovery limit exceeded\n");
+ /* Mark the node invalid so we can detect those when still in use
+as backedge destinations.  */
+ SLP_TREE_SCALAR_STMTS (res) = vNULL;
+ SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
+ res->failed = XNEWVEC (bool, group_size);
+ memset (res->failed, 0, sizeof (bool) * group_size);
+ memset (matches, 0, sizeof (bool) * group_size);
+ return NULL;
+   }
+  --*limit;
 }
-  --*limit;
 
   if (dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location,


[gcc r15-858] Fix SLP reduction neutral op value for pointer reductions

2024-05-28 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:65aa46ffc3b06bba3d49b9b8315610c706a1215b

commit r15-858-g65aa46ffc3b06bba3d49b9b8315610c706a1215b
Author: Richard Biener 
Date:   Mon May 27 11:38:11 2024 +0200

Fix SLP reduction neutral op value for pointer reductions

When the neutral op is the initial value we might need to convert
it from pointer to integer.

* tree-vect-loop.cc (get_initial_defs_for_reduction): Convert
neutral op to the vector component type.

Diff:
---
 gcc/tree-vect-loop.cc | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 83c0544b6aa..3b94bb13a8b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5616,7 +5616,14 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
   /* Get the def before the loop.  In reduction chain we have only
 one initial value.  Else we have as many as PHIs in the group.  */
   if (i >= initial_values.length () || (j > i && neutral_op))
-   op = neutral_op;
+   {
+ if (!useless_type_conversion_p (TREE_TYPE (vector_type),
+ TREE_TYPE (neutral_op)))
+   neutral_op = gimple_convert (_seq,
+TREE_TYPE (vector_type),
+neutral_op);
+ op = neutral_op;
+   }
   else
{
  if (!useless_type_conversion_p (TREE_TYPE (vector_type),


[gcc r15-851] Fix points-to SCC collapsing bug

2024-05-27 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:07cdba6294756af350198fbb01ea8c8efeac54dd

commit r15-851-g07cdba6294756af350198fbb01ea8c8efeac54dd
Author: Richard Biener 
Date:   Mon May 27 13:50:14 2024 +0200

Fix points-to SCC collapsing bug

When points-to analysis finds SCCs it marks the wrong node as being
part of a found cycle.  It only wants to mark the node it collapses
to but marked the entry node found rather than the one it collapses
to.  This causes fallout in the patch for PR115236 but generally
weakens the points-to solution by collapsing too many nodes.  Note
that this fix might slow down points-to solving.

* tree-ssa-structalias.cc (scc_visit): Mark the node we
collapse to as being in a component.

Diff:
---
 gcc/tree-ssa-structalias.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index a39b36c146e..f93c5df0767 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -1534,8 +1534,10 @@ scc_visit (constraint_graph_t graph, class scc_info *si, 
unsigned int n)
  graph->indirect_cycles[i - FIRST_REF_NODE] = lowest_node;
}
}
+ bitmap_set_bit (si->deleted, lowest_node);
}
-  bitmap_set_bit (si->deleted, n);
+  else
+   bitmap_set_bit (si->deleted, n);
 }
   else
 si->scc_stack.safe_push (n);


[gcc r15-850] tree-optimization/115220 - fix store sinking virtual operand constraints

2024-05-27 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:f9fbb47987efc8b5261e4cc36613c928a8693493

commit r15-850-gf9fbb47987efc8b5261e4cc36613c928a8693493
Author: Richard Biener 
Date:   Mon May 27 09:40:19 2024 +0200

tree-optimization/115220 - fix store sinking virtual operand constraints

The following makes sure the virtual operand updating when sinking
stores works for the case we ignore paths to kills.  The final
sink location might not post-dominate the original stmt location
which would require inserting of a virtual PHI which we do not support.

PR tree-optimization/115220
PR tree-optimization/115226
* tree-ssa-sink.cc (statement_sink_location): When ignoring
paths to kills when sinking stores make sure the final
sink location is still post-dominated by the original one.
Otherwise we'd need to insert a PHI node to merge virtual operands.

* gcc.dg/torture/pr115220.c: New testcase.
* gcc.dg/torture/pr115226.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr115220.c | 14 ++
 gcc/testsuite/gcc.dg/torture/pr115226.c | 15 +++
 gcc/tree-ssa-sink.cc| 12 +---
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr115220.c 
b/gcc/testsuite/gcc.dg/torture/pr115220.c
new file mode 100644
index 000..e7b5da6ba42
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115220.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--param logical-op-non-short-circuit=0" } */
+
+extern char **environ;
+static char ***p_environ = &environ;
+int
+_setenv_r (const char *name, const char *value)
+{
+  register char *C;
+  int offset;
+  for (C = (*p_environ)[offset]; (*C = *name++) && *C != '='; ++C);
+  for (*C++ = '='; (*C++ = *value++) != 0;);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/torture/pr115226.c 
b/gcc/testsuite/gcc.dg/torture/pr115226.c
new file mode 100644
index 000..9a0bc7c9b6a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115226.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+extern void c();
+int a, b;
+int main() {
+  while (b) {
+int d, e = 0, *f = &d;
+*f = 1;
+e = 1 >> d ? : 1 << d;
+if (e)
+  a = 0;
+c();
+  }
+  return 0;
+}
diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
index b0fe871cf1e..8c551e42a4d 100644
--- a/gcc/tree-ssa-sink.cc
+++ b/gcc/tree-ssa-sink.cc
@@ -467,11 +467,17 @@ statement_sink_location (gimple *stmt, basic_block frombb,
   if (!sinkbb)
 return false;
   
-  sinkbb = select_best_block (frombb, sinkbb, stmt);
-  if (sinkbb == frombb)
+  basic_block bestbb = select_best_block (frombb, sinkbb, stmt);
+  if (bestbb == frombb
+  /* When we sink a store make sure there's not a path to any of
+the possibly skipped killing defs as that wrecks the virtual
+operand update, requiring inserting of a PHI node.  */
+  || (gimple_vdef (stmt)
+ && bestbb != sinkbb
+ && !dominated_by_p (CDI_POST_DOMINATORS, bestbb, sinkbb)))
 return false;
 
-  *togsi = gsi_after_labels (sinkbb);
+  *togsi = gsi_after_labels (bestbb);
 
   return true;
 }


[gcc r14-10247] tree-optimization/115232 - demangle failure during -Waccess

2024-05-27 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4790076933ef9337553c3fbbc52a93cce78c584f

commit r14-10247-g4790076933ef9337553c3fbbc52a93cce78c584f
Author: Richard Biener 
Date:   Mon May 27 09:13:11 2024 +0200

tree-optimization/115232 - demangle failure during -Waccess

For the following testcase we fail to demangle
_ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnernwEm and
_ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnerdlEPv and in turn end
up building NULL references.  The following puts in a safeguard for
failed demangling into -Waccess.

PR tree-optimization/115232
* gimple-ssa-warn-access.cc (new_delete_mismatch_p): Handle
failure to demangle gracefully.

* g++.dg/pr115232.C: New testcase.

(cherry picked from commit 311d7f5c17b8969c7ed8e4f23178d6ec4752e33f)

Diff:
---
 gcc/gimple-ssa-warn-access.cc   |  2 +-
 gcc/testsuite/g++.dg/pr115232.C | 25 +
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/gcc/gimple-ssa-warn-access.cc b/gcc/gimple-ssa-warn-access.cc
index dedaae27b31..194d1a2c02a 100644
--- a/gcc/gimple-ssa-warn-access.cc
+++ b/gcc/gimple-ssa-warn-access.cc
@@ -1762,7 +1762,7 @@ new_delete_mismatch_p (tree new_decl, tree delete_decl)
   void *np = NULL, *dp = NULL;
   demangle_component *ndc = cplus_demangle_v3_components (new_str, 0, );
   demangle_component *ddc = cplus_demangle_v3_components (del_str, 0, );
-  bool mismatch = new_delete_mismatch_p (*ndc, *ddc);
+  bool mismatch = ndc && ddc && new_delete_mismatch_p (*ndc, *ddc);
   free (np);
   free (dp);
   return mismatch;
diff --git a/gcc/testsuite/g++.dg/pr115232.C b/gcc/testsuite/g++.dg/pr115232.C
new file mode 100644
index 000..e1d96d8f899
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr115232.C
@@ -0,0 +1,25 @@
+// { dg-do compile }
+// { dg-require-effective-target c++20 }
+
+using size_t = decltype(sizeof(0));
+template <typename T>
+static constexpr bool cst = true;
+template <typename T>
+struct Outer
+{
+Outer();
+template <typename T> void method() requires cst<T>
+{
+struct Inner
+{
+static void* operator new(size_t){return new char;}
+static void operator delete(void*){}
+Outer t;
+};
+new Inner;
+}
+};
+void f()
+{
+Outer<void>{}.method<void>();
+}


[gcc r15-848] tree-optimization/115232 - demangle failure during -Waccess

2024-05-27 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:311d7f5c17b8969c7ed8e4f23178d6ec4752e33f

commit r15-848-g311d7f5c17b8969c7ed8e4f23178d6ec4752e33f
Author: Richard Biener 
Date:   Mon May 27 09:13:11 2024 +0200

tree-optimization/115232 - demangle failure during -Waccess

For the following testcase we fail to demangle
_ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnernwEm and
_ZZN5OuterIvE6methodIvEEvvQ3cstITL0__EEN5InnerdlEPv and in turn end
up building NULL references.  The following puts in a safeguard for
failed demangling into -Waccess.

PR tree-optimization/115232
* gimple-ssa-warn-access.cc (new_delete_mismatch_p): Handle
failure to demangle gracefully.

* g++.dg/pr115232.C: New testcase.

Diff:
---
 gcc/gimple-ssa-warn-access.cc   |  2 +-
 gcc/testsuite/g++.dg/pr115232.C | 25 +
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/gcc/gimple-ssa-warn-access.cc b/gcc/gimple-ssa-warn-access.cc
index 0cd5b6d6ef4..61f9f0f3d31 100644
--- a/gcc/gimple-ssa-warn-access.cc
+++ b/gcc/gimple-ssa-warn-access.cc
@@ -1762,7 +1762,7 @@ new_delete_mismatch_p (tree new_decl, tree delete_decl)
   void *np = NULL, *dp = NULL;
   demangle_component *ndc = cplus_demangle_v3_components (new_str, 0, );
   demangle_component *ddc = cplus_demangle_v3_components (del_str, 0, );
-  bool mismatch = new_delete_mismatch_p (*ndc, *ddc);
+  bool mismatch = ndc && ddc && new_delete_mismatch_p (*ndc, *ddc);
   free (np);
   free (dp);
   return mismatch;
diff --git a/gcc/testsuite/g++.dg/pr115232.C b/gcc/testsuite/g++.dg/pr115232.C
new file mode 100644
index 000..e1d96d8f899
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr115232.C
@@ -0,0 +1,25 @@
+// { dg-do compile }
+// { dg-require-effective-target c++20 }
+
+using size_t = decltype(sizeof(0));
+template <typename T>
+static constexpr bool cst = true;
+template <typename T>
+struct Outer
+{
+Outer();
+template <typename T> void method() requires cst<T>
+{
+struct Inner
+{
+static void* operator new(size_t){return new char;}
+static void operator delete(void*){}
+Outer t;
+};
+new Inner;
+}
+};
+void f()
+{
+Outer<void>{}.method<void>();
+}


[gcc r15-816] Fix gcc.dg/vect/vect-gather-4.c for cascadelake

2024-05-24 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:85e2ce10f76aee93e43aab6558cf8e39cec911e4

commit r15-816-g85e2ce10f76aee93e43aab6558cf8e39cec911e4
Author: Richard Biener 
Date:   Fri May 24 13:15:38 2024 +0200

Fix gcc.dg/vect/vect-gather-4.c for cascadelake

There's not really a good way to test what the testcase wants to
test, the following exchanges one dump scan for another (imperfect)
one.

* gcc.dg/vect/vect-gather-4.c: Scan for not vectorizing using
SLP.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c 
b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
index 1ce63e69199..d18094d6982 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c
@@ -45,4 +45,4 @@ f3 (int *restrict y, int *restrict x, int *restrict indices)
 }
 }
 
-/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */
+/* { dg-final { scan-tree-dump-not "vectorizing stmts using SLP" vect } } */


[gcc r15-815] tree-optimization/115144 - improve sinking destination choice

2024-05-24 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:5b9b3bae33cae7fca2e3c3e3028be6b8bee9b698

commit r15-815-g5b9b3bae33cae7fca2e3c3e3028be6b8bee9b698
Author: Richard Biener 
Date:   Wed May 22 09:16:51 2024 +0200

tree-optimization/115144 - improve sinking destination choice

When sinking code closer to its uses we already try to minimize the
distance we move by inserting at the start of the basic-block.  The
following makes sure to sink closest to the control dependence
check of the region we want to sink to as well as make sure to
ignore control dependences that are only guarding exceptional code.
This restores somewhat the old profile check but without requiring
nearly even probabilities.  The patch also makes sure to not give
up completely when the best sink location is one we do not want to
sink to but possibly then choose the next best one.

PR tree-optimization/115144
* tree-ssa-sink.cc (do_not_sink): New function, split out
from ...
(select_best_block): Here.  First pick valid block to
sink to.  From that search for the best valid block,
avoiding sinking across conditions to exceptional code.
(sink_code_in_bb): When updating vuses of stores in
paths we do not sink a store to make sure we didn't
pick a dominating sink location.

* gcc.dg/tree-ssa/ssa-sink-22.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c |  14 
 gcc/tree-ssa-sink.cc| 106 +++-
 2 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
new file mode 100644
index 000..e35626d4070
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-sink-22.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-sink1-details" } */
+
+extern void abort (void);
+
+int foo (int x, int y, int f)
+{
+  int tem = x / y;
+  if (f)
+abort ();
+  return tem;
+}
+
+/* { dg-final { scan-tree-dump-not "Sinking" "sink1" } } */
diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
index 2188b7523c7..b0fe871cf1e 100644
--- a/gcc/tree-ssa-sink.cc
+++ b/gcc/tree-ssa-sink.cc
@@ -172,6 +172,39 @@ nearest_common_dominator_of_uses (def_operand_p def_p, 
bool *debug_stmts)
   return commondom;
 }
 
+/* Return whether sinking STMT from EARLY_BB to BEST_BB should be avoided.  */
+
+static bool
+do_not_sink (gimple *stmt, basic_block early_bb, basic_block best_bb)
+{
+  /* Placing a statement before a setjmp-like function would be invalid
+ (it cannot be reevaluated when execution follows an abnormal edge).
+ If we selected a block with abnormal predecessors, just punt.  */
+  if (bb_has_abnormal_pred (best_bb))
+return true;
+
+  /* If the latch block is empty, don't make it non-empty by sinking
+ something into it.  */
+  if (best_bb == early_bb->loop_father->latch
+  && empty_block_p (best_bb))
+return true;
+
+  /* Avoid turning an unconditional read into a conditional one when we
+ still might want to perform vectorization.  */
+  if (best_bb->loop_father == early_bb->loop_father
+  && loop_outer (best_bb->loop_father)
+  && !best_bb->loop_father->inner
+  && gimple_vuse (stmt)
+  && !gimple_vdef (stmt)
+  && flag_tree_loop_vectorize
+  && !(cfun->curr_properties & PROP_loop_opts_done)
+  && dominated_by_p (CDI_DOMINATORS, best_bb->loop_father->latch, early_bb)
+  && !dominated_by_p (CDI_DOMINATORS, best_bb->loop_father->latch, 
best_bb))
+return true;
+
+  return false;
+}
+
 /* Given EARLY_BB and LATE_BB, two blocks in a path through the dominator
tree, return the best basic block between them (inclusive) to place
statements.
@@ -185,54 +218,57 @@ select_best_block (basic_block early_bb,
   basic_block late_bb,
   gimple *stmt)
 {
+  /* First pick a block we do not disqualify.  */
+  while (late_bb != early_bb
+&& do_not_sink (stmt, early_bb, late_bb))
+late_bb = get_immediate_dominator (CDI_DOMINATORS, late_bb);
+
   basic_block best_bb = late_bb;
   basic_block temp_bb = late_bb;
-
   while (temp_bb != early_bb)
 {
   /* Walk up the dominator tree, hopefully we'll find a shallower
 loop nest.  */
   temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
 
+  /* Do not consider blocks we do not want to sink to.  */
+  if (temp_bb != early_bb && do_not_sink (stmt, early_bb, temp_bb))
+   ;
+
   /* If we've moved into a lower loop nest, then that becomes
 our best block.  */
-  if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb))
+  else if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb))
best_bb = temp_bb;
-}
 
-  /* Placing a statement before a setjmp-like function would be invalid
- (it cannot be reevaluated when execution 

[gcc r15-812] Avoid splitting store dataref groups during SLP discovery

2024-05-23 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c71886f2ca2e46ce1449c7064d6f1b447d02fcba

commit r15-812-gc71886f2ca2e46ce1449c7064d6f1b447d02fcba
Author: Richard Biener 
Date:   Fri Sep 29 13:13:16 2023 +0200

Avoid splitting store dataref groups during SLP discovery

The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating VEC_PERM
SLP nodes merging them so the store will always cover the whole group.

With this for example

int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
  for (int i = 0; i < 256; i++)
{
  x[4*i+0] = y[2*i+0];
  x[4*i+1] = y[2*i+1];
  x[4*i+2] = z[i];
  x[4*i+3] = w[i];
}
}

which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark).  We of course need a VF of four here.

.L2:
movdqa  z(%rax), %xmm0
movdqa  w(%rax), %xmm4
movdqa  y(%rax,%rax), %xmm2
movdqa  y+16(%rax,%rax), %xmm1
movdqa  %xmm0, %xmm3
punpckhdq   %xmm4, %xmm0
punpckldq   %xmm4, %xmm3
movdqa  %xmm2, %xmm4
shufps  $238, %xmm3, %xmm2
movaps  %xmm2, x+16(,%rax,4)
movdqa  %xmm1, %xmm2
shufps  $68, %xmm3, %xmm4
shufps  $68, %xmm0, %xmm2
movaps  %xmm4, x(,%rax,4)
shufps  $238, %xmm0, %xmm1
movaps  %xmm2, x+32(,%rax,4)
movaps  %xmm1, x+48(,%rax,4)
addq$16, %rax
cmpq$1024, %rax
jne .L2

The extra permute nodes merging distinct branches of the SLP
tree might be unexpected for some code, esp. since
SLP_TREE_REPRESENTATIVE cannot be meaningfully set and we
cannot populate SLP_TREE_SCALAR_STMTS or SLP_TREE_SCALAR_OPS
consistently as we can have a mix of both.

The patch keeps the sub-trees form consecutive lanes but that's
in principle not necessary if we for example have an even/odd
split which now would result in N single-lane sub-trees.  That's
left for future improvements.

The interesting part is how VLA vector ISAs handle merging of
two vectors that's not trivial even/odd merging.  The strategy
of how to build the permute tree might need adjustments for that
(in the end splitting each branch to single lanes and then doing
even/odd merging would be the brute-force fallback).  Not sure
how much we can or should rely on the SLP optimize pass to handle
this.

The gcc.dg/vect/slp-12a.c case is interesting as we currently split
the 8 store group into lanes 0-5 which we SLP with an unroll factor
of two (on x86-64 with SSE) and the remaining two lanes are using
interleaving vectorization with a final unroll factor of four.  Thus
we're using hybrid SLP within a single store group.  After the change
we discover the same 0-5 lane SLP part as well as two single-lane
parts feeding the full store group.  But that results in a load
permutation that isn't supported (I have WIP patchs to rectify that).
So we end up cancelling SLP and vectorizing the whole loop with
interleaving which is IMO good and results in better code.

This is similar for gcc.target/i386/pr52252-atom.c where interleaving
generates much better code than hybrid SLP.  I'm unsure how to update
the testcase though.

gcc.dg/vect/slp-21.c runs into similar situations.  Note that when
when analyzing SLP operations we discard an instance we currently
force the full loop to have no SLP because hybrid detection is
broken.  It's probably not worth fixing this at this moment.

For gcc.dg/vect/pr97428.c we are not splitting the 16 store group
into two but merge the two 8 lane loads into one before doing the
store and thus have only a single SLP instance.  A similar situation
happens in gcc.dg/vect/slp-11c.c but the branches feeding the
single SLP store only have a single lane.  Likewise for
gcc.dg/vect/vect-complex-5.c and gcc.dg/vect/vect-gather-2.c.

gcc.dg/vect/slp-cond-1.c has an additional SLP vectorization
with a SLP store group of size two but two single-lane branches.

* tree-vect-slp.cc (vect_build_slp_instance): Do not split
store dataref groups on loop SLP discovery failure but create
a single SLP instance for the stores but branch to SLP sub-trees
and merge with a series of VEC_PERM nodes.

* gcc.dg/vect/pr97428.c: Expect a single store SLP group.
* gcc.dg/vect/slp-11c.c: Likewise, if !vect_load_lanes.
* gcc.dg/vect/vect-complex-5.c: Likewise.
* gcc.dg/vect/slp-12a.c: Do not expect SLP.
  

[gcc r15-793] tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop distribution

2024-05-23 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:2b2476d4d18c92b8aba3567ebccd2100c2f7c258

commit r15-793-g2b2476d4d18c92b8aba3567ebccd2100c2f7c258
Author: Richard Biener 
Date:   Thu May 23 14:36:39 2024 +0200

tree-optimization/115197 - fix ICE w/ constant in LC PHI and loop 
distribution

Forgot a check for an SSA name before trying to replace a PHI arg with
its current definition.

PR tree-optimization/115197
* tree-loop-distribution.cc (copy_loop_before): Constant PHI
args remain the same.

* gcc.dg/pr115197.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/pr115197.c | 14 ++
 gcc/tree-loop-distribution.cc   |  7 +--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr115197.c b/gcc/testsuite/gcc.dg/pr115197.c
new file mode 100644
index 000..00d674b3bd9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115197.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fno-tree-scev-cprop -ftree-pre 
-ftree-loop-distribute-patterns" } */
+
+int a, b[2], c, d, e, f[2];
+int main() {
+  while (a)
+if (d) {
+  if (e)
+return 0;
+  for (; c; c++)
+f[c] = 0 < (b[c] = ~(f[c + 1] < a));
+}
+  return 0;
+}
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 668dc420449..4d1ed234fcb 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -977,8 +977,11 @@ copy_loop_before (class loop *loop, bool 
redirect_lc_phi_defs)
  if (virtual_operand_p (gimple_phi_result (phi)))
continue;
  use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit);
- tree new_def = get_current_def (USE_FROM_PTR (use_p));
- SET_USE (use_p, new_def);
+ if (TREE_CODE (USE_FROM_PTR (use_p)) == SSA_NAME)
+   {
+ tree new_def = get_current_def (USE_FROM_PTR (use_p));
+ SET_USE (use_p, new_def);
+   }
}
 }


[gcc r15-792] tree-optimization/115199 - fix PTA constraint processing for LHS

2024-05-23 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:f0a02467bbc35a478eb82f5a8a7e8870827b51fc

commit r15-792-gf0a02467bbc35a478eb82f5a8a7e8870827b51fc
Author: Richard Biener 
Date:   Thu May 23 13:33:15 2024 +0200

tree-optimization/115199 - fix PTA constraint processing for &ANYTHING LHS

When processing a &ANYTHING = X constraint we treat it as *ANYTHING = X
during constraint processing but then end up recording it as
&ANYTHING = X anyway, breaking constraint graph building.  This is
because we only update the local copy of the LHS and not the constraint
itself.

PR tree-optimization/115199
* tree-ssa-structalias.cc (process_constraint): Also
record &ANYTHING = X as *ANYTHING = X in the end.

* gcc.dg/torture/pr115199.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr115199.c | 24 
 gcc/tree-ssa-structalias.cc |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr115199.c 
b/gcc/testsuite/gcc.dg/torture/pr115199.c
new file mode 100644
index 000..981a7330b32
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115199.c
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+
+struct b {
+  char *volatile c;
+};
+struct b * __attribute__((noipa))
+d()
+{
+  char *e;
+  struct b *b = __builtin_malloc(sizeof(b));
+  void *f = __builtin_malloc(1);
+
+  e = __builtin_memcpy(f, "z", 1);
+  b->c = e;
+  return b;
+}
+
+int main()
+{
+  struct b b = *d();
+  if (b.c[0] != 'z')
+__builtin_abort();
+  return 0;
+}
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index 0e9423a78ec..a39b36c146e 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -3104,7 +3104,7 @@ process_constraint (constraint_t t)
  it here by turning it into *ANYTHING.  */
   if (lhs.type == ADDRESSOF
   && lhs.var == anything_id)
-lhs.type = DEREF;
+t->lhs.type = lhs.type = DEREF;
 
   /* ADDRESSOF on the lhs is invalid.  */
   gcc_assert (lhs.type != ADDRESSOF);


[gcc r15-791] tree-optimization/115138 - ptr-vs-ptr and FUNCTION_DECLs

2024-05-23 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:61f5b3c59ed20438d7d9918d7a83d29a21097d4e

commit r15-791-g61f5b3c59ed20438d7d9918d7a83d29a21097d4e
Author: Richard Biener 
Date:   Thu May 23 11:26:14 2024 +0200

tree-optimization/115138 - ptr-vs-ptr and FUNCTION_DECLs

I failed to realize we do not represent FUNCTION_DECLs or LABEL_DECLs
in vars explicitly and thus have to compare pt.vars_contains_nonlocal.

PR tree-optimization/115138
* tree-ssa-alias.cc (ptrs_compare_unequal): Make sure
pt.vars_contains_nonlocal differs since we do not represent
FUNCTION_DECLs or LABEL_DECLs in vars explicitly.

* gcc.dg/torture/pr115138.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr115138.c | 28 
 gcc/tree-ssa-alias.cc   |  6 ++
 2 files changed, 34 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/torture/pr115138.c 
b/gcc/testsuite/gcc.dg/torture/pr115138.c
new file mode 100644
index 000..6becaecbaff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115138.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+
+int foo (int) {}
+int bar (int) {}
+
+typedef int (*pred)(int);
+
+int x, y;
+pred A () { if (x) return foo; else return bar; }
+pred B () { if (y) return foo; else return bar; }
+int __attribute__((noipa)) baz()
+{
+  pred a = A();
+  pred b = B();
+  if (a != b)
+return 42;
+  return 0;
+}
+
+int main()
+{
+  if (baz () != 0)
+__builtin_abort ();
+  y = 1;
+  if (baz () != 42)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index d64d6d02f4a..1a91d63a31e 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -501,6 +501,12 @@ ptrs_compare_unequal (tree ptr1, tree ptr2)
  || pi2->pt.vars_contains_interposable)
return false;
  if ((!pi1->pt.null || !pi2->pt.null)
+ /* ???  We do not represent FUNCTION_DECL and LABEL_DECL
+in pt.vars but only set pt.vars_contains_nonlocal.  This
+makes compares involving those and other nonlocals
+imprecise.  */
+ && (!pi1->pt.vars_contains_nonlocal
+ || !pi2->pt.vars_contains_nonlocal)
  && (!pt_solution_includes_const_pool (>pt)
  || !pt_solution_includes_const_pool (>pt)))
return !pt_solutions_intersect (>pt, >pt);


[gcc r15-773] Fix mixed input kind permute optimization

2024-05-22 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:3507ab1b018a68500e49fa9f1de7caa0f1b53dda

commit r15-773-g3507ab1b018a68500e49fa9f1de7caa0f1b53dda
Author: Richard Biener 
Date:   Tue May 21 19:15:33 2024 +0200

Fix mixed input kind permute optimization

When change_vec_perm_layout runs into a permute combining two
nodes where one is invariant and one internal the partition of
one input can be -1 but the other might not be.  The following
supports this case by simply ignoring inputs with input partition -1.

I'm not sure this is correct but it avoids ICEing when accessing
that partitions layout for gcc.target/i386/pr98928.c with the
change to avoid splitting store dataref groups during SLP discovery.

* tree-vect-slp.cc (change_vec_perm_layout): Ignore an
input partition of -1.

Diff:
---
 gcc/tree-vect-slp.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 43f2c153bf0..3f8209b43a7 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4640,6 +4640,8 @@ change_vec_perm_layout (slp_tree node, lane_permutation_t 
,
{
  slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
  unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
+ if (in_partition_i == -1u)
+   continue;
  this_in_layout_i = m_partitions[in_partition_i].layout;
}
   if (this_in_layout_i > 0)


[gcc r15-772] Avoid SLP_REPRESENTATIVE access for VEC_PERM in SLP scheduling

2024-05-22 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:31e9bae0ea5e5413abfa3ca9050e66cc6760553e

commit r15-772-g31e9bae0ea5e5413abfa3ca9050e66cc6760553e
Author: Richard Biener 
Date:   Fri May 17 15:23:38 2024 +0200

Avoid SLP_REPRESENTATIVE access for VEC_PERM in SLP scheduling

SLP permute nodes can end up without a SLP_REPRESENTATIVE now,
the following avoids touching it in this case in vect_schedule_slp_node.

* tree-vect-slp.cc (vect_schedule_slp_node): Avoid looking
at SLP_REPRESENTATIVE for VEC_PERM nodes.

Diff:
---
 gcc/tree-vect-slp.cc | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index f34ed54a70b..43f2c153bf0 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -9301,13 +9301,8 @@ vect_schedule_slp_node (vec_info *vinfo,
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
   SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
 
-  if (dump_enabled_p ())
-dump_printf_loc (MSG_NOTE, vect_location,
-"-->vectorizing SLP node starting from: %G",
-stmt_info->stmt);
-
-  if (STMT_VINFO_DATA_REF (stmt_info)
-  && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+  && STMT_VINFO_DATA_REF (stmt_info))
 {
   /* Vectorized loads go before the first scalar load to make it
 ready early, vectorized stores go before the last scalar
@@ -9319,10 +9314,10 @@ vect_schedule_slp_node (vec_info *vinfo,
last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   si = gsi_for_stmt (last_stmt_info->stmt);
 }
-  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
-   || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
-   || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
-  && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
+  else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+  && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
+  || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
+  || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
 {
   /* For PHI node vectorization we do not use the insertion iterator.  */
   si = gsi_none ();
@@ -9456,6 +9451,9 @@ vect_schedule_slp_node (vec_info *vinfo,
   /* Handle purely internal nodes.  */
   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
 {
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"-->vectorizing SLP permutation node\n");
   /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
 be shared with different SLP nodes (but usually it's the same
 operation apart from the case the stmt is only there for denoting
@@ -9474,7 +9472,13 @@ vect_schedule_slp_node (vec_info *vinfo,
  }
 }
   else
-vect_transform_stmt (vinfo, stmt_info, , node, instance);
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"-->vectorizing SLP node starting from: %G",
+stmt_info->stmt);
+  vect_transform_stmt (vinfo, stmt_info, , node, instance);
+}
 }
 
 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.


[gcc r15-771] Avoid requiring VEC_PERM representatives

2024-05-22 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:0c7792f707368d0225a9a457895b847ef660c270

commit r15-771-g0c7792f707368d0225a9a457895b847ef660c270
Author: Richard Biener 
Date:   Fri May 17 14:26:38 2024 +0200

Avoid requiring VEC_PERM representatives

The following plugs one hole where we require a VEC_PERM node
representative unnecessarily.  This is for vect_check_store_rhs
which looks at the RHS and checks whether a constant can be
native encoded.  The fix is to guard that with vect_constant_def
additionally and making vect_is_simple_use forgiving for a missing
SLP_TREE_REPRESENTATIVE when the child is a VEC_PERM node,
initializing the scalar def to error_mark_node.

* tree-vect-stmts.cc (vect_check_store_rhs): Look at *rhs
only when it's a vec_constant_def.
(vect_is_simple_use): When we have no representative for
an internal node, fill in *op with error_mark_node.

Diff:
---
 gcc/tree-vect-stmts.cc | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 672959501bb..4219ad832db 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2553,7 +2553,8 @@ vect_check_store_rhs (vec_info *vinfo, stmt_vec_info 
stmt_info,
 
   /* In the case this is a store from a constant make sure
  native_encode_expr can handle it.  */
-  if (CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0)
+  if (rhs_dt == vect_constant_def
+  && CONSTANT_CLASS_P (*rhs) && native_encode_expr (*rhs, NULL, 64) == 0)
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -14002,8 +14003,26 @@ vect_is_simple_use (vec_info *vinfo, stmt_vec_info 
stmt, slp_tree slp_node,
   *vectype = SLP_TREE_VECTYPE (child);
   if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
{
- *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
- return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
+ /* ???  VEC_PERM nodes might be intermediate and their lane value
+have no representative (nor do we build a VEC_PERM stmt for
+the actual operation).  Note for two-operator nodes we set
+a representative but leave scalar stmts empty as we'd only
+have one for a subset of lanes.  Ideally no caller would
+require *op for internal defs.  */
+ if (SLP_TREE_REPRESENTATIVE (child))
+   {
+ *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
+ return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
+   }
+ else
+   {
+ gcc_assert (SLP_TREE_CODE (child) == VEC_PERM_EXPR);
+ *op = error_mark_node;
+ *dt = vect_internal_def;
+ if (def_stmt_info_out)
+   *def_stmt_info_out = NULL;
+ return true;
+   }
}
   else
{


[gcc r15-768] web/115183 - fix typo in C++ docs

2024-05-22 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:424f8a01df9b311250e416759ad61c00bba4af48

commit r15-768-g424f8a01df9b311250e416759ad61c00bba4af48
Author: Richard Biener 
Date:   Wed May 22 10:19:08 2024 +0200

web/115183 - fix typo in C++ docs

The following fixes a reported typo.

* doc/invoke.texi (C++ Modules): Fix typo.

Diff:
---
 gcc/doc/invoke.texi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 218901c0b20..0625a5ede6f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -37646,7 +37646,7 @@ not get debugging information for routines in the 
precompiled header.
 @cindex speed of compilation
 
 Modules are a C++20 language feature.  As the name suggests, they
-provides a modular compilation system, intending to provide both
+provide a modular compilation system, intending to provide both
 faster builds and better library isolation.  The ``Merging Modules''
 paper @uref{https://wg21.link/p1103}, provides the easiest to read set
 of changes to the standard, although it does not capture later


gcc-wwwdocs branch master updated. 9d10f6fccee3a68102173f28cf312ed266b7d95d

2024-05-22 Thread Richard Biener via Gcc-cvs-wwwdocs
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gcc-wwwdocs".

The branch, master has been updated
   via  9d10f6fccee3a68102173f28cf312ed266b7d95d (commit)
  from  9e79c5e411eb3236b481c6093fad4dc5ae5141c5 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -
commit 9d10f6fccee3a68102173f28cf312ed266b7d95d
Author: Richard Biener 
Date:   Wed May 22 10:04:32 2024 +0200

web/115183 - Remove duplicate links to GCCGO online docs.

diff --git a/htdocs/onlinedocs/13.1.0/index.html 
b/htdocs/onlinedocs/13.1.0/index.html
index 2abc06ac..08d312ba 100644
--- a/htdocs/onlinedocs/13.1.0/index.html
+++ b/htdocs/onlinedocs/13.1.0/index.html
@@ -62,12 +62,6 @@
  
href="https://gcc.gnu.org/onlinedocs/gcc-13.1.0/libstdc++-api-gfdl.xml.gz;>XML 
GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.1.0/libstdc++-api-html.tar.gz;>an
  HTML tarball)
-   https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo/;>GCCGO 13.1 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.pdf;>also in
-   PDF or https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.ps.gz;>PostScript or 
https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo-html.tar.gz;>an
-   HTML tarball)
 https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo/;>GCCGO 13.1 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.1.0/gccgo.pdf;>also in
PDF or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/libstdc++-api-gfdl.xml.gz;>XML 
GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/libstdc++-api-html.tar.gz;>an
  HTML tarball)
-   https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo/;>GCCGO 13.2 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.pdf;>also in
-   PDF or https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.ps.gz;>PostScript or 
https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo-html.tar.gz;>an
-   HTML tarball)
 https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo/;>GCCGO 13.2 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.2.0/gccgo.pdf;>also in
PDF or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/libstdc++-api-gfdl.xml.gz;>XML 
GFDL or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/libstdc++-api-html.tar.gz;>an
  HTML tarball)
-   https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo/;>GCCGO 13.3 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.pdf;>also in
-   PDF or https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.ps.gz;>PostScript or 
https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo-html.tar.gz;>an
-   HTML tarball)
 https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo/;>GCCGO 13.3 
Manual (https://gcc.gnu.org/onlinedocs/gcc-13.3.0/gccgo.pdf;>also in
PDF or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/libstdc++-api-gfdl.xml.gz;>XML 
GFDL or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/libstdc++-api-html.tar.gz;>an
  HTML tarball)
-   https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo/;>GCCGO 14.1 
Manual (https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.pdf;>also in
-   PDF or https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.ps.gz;>PostScript or 
https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo-html.tar.gz;>an
-   HTML tarball)
 https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo/;>GCCGO 14.1 
Manual (https://gcc.gnu.org/onlinedocs/gcc-14.1.0/gccgo.pdf;>also in
PDF or 

[gcc r15-753] tree-optimization/115137 - more ptr-vs-ptr compare fixes

2024-05-21 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:85f7828679edc3ae7488594145756cd53787650e

commit r15-753-g85f7828679edc3ae7488594145756cd53787650e
Author: Richard Biener 
Date:   Tue May 21 10:12:40 2024 +0200

tree-optimization/115137 - more ptr-vs-ptr compare fixes

The following fixes the omission of const-pool included in NONLOCAL.

PR tree-optimization/115137
* tree-ssa-structalias.cc (pt_solution_includes_const_pool): 
NONLOCAL
also includes constant pool entries.

* gcc.dg/torture/pr115137.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr115137.c | 34 +
 gcc/tree-ssa-structalias.cc |  1 +
 2 files changed, 35 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/torture/pr115137.c 
b/gcc/testsuite/gcc.dg/torture/pr115137.c
new file mode 100644
index 000..9cd8ff93633
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr115137.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+
+struct a {
+  int b;
+} c;
+
+int d;
+long e;
+
+static void f(char *g, char *h, struct a *l) {
+  char a[1024];
+  int j = 0;
+
+  if (d)
+h = a;
+
+  for (; g < h; g++)
+if (__builtin_iscntrl(*g))
+  ++j;
+
+  while (l->b < j)
+;
+}
+
+int main() {
+  static const struct {
+char *input;
+  } k[] = {{"somepage.html"}, {""}};
+
+  for (unsigned int i = 0; i < 1; ++i) {
+e = __builtin_strlen(k[i].input);
+f(k[i].input, k[i].input + e, );
+  }
+}
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index 61fb3610a17..0e9423a78ec 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -7087,6 +7087,7 @@ bool
 pt_solution_includes_const_pool (struct pt_solution *pt)
 {
   return (pt->const_pool
+ || pt->nonlocal
  || (pt->escaped && (!cfun || cfun->gimple_df->escaped.const_pool))
  || (pt->ipa_escaped && ipa_escaped_pt.const_pool));
 }


[gcc r15-750] tree-optimization/115149 - VOP live and missing PHIs

2024-05-21 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:ec9b8bafe20755d13ab9a1b834b5da79ae972c0e

commit r15-750-gec9b8bafe20755d13ab9a1b834b5da79ae972c0e
Author: Richard Biener 
Date:   Tue May 21 09:48:04 2024 +0200

tree-optimization/115149 - VOP live and missing PHIs

The following fixes a bug in vop-live get_live_in which was using
NULL to indicate the first processed edge but at the same time
using it for the case the live-in virtual operand cannot be computed.
The following fixes this, avoiding sinking a load to a place where
we'd have to insert virtual PHIs to make the virtual operand SSA
web OK.

PR tree-optimization/115149
* tree-ssa-live.cc (virtual_operand_live::get_live_in):
Explicitly track the first processed edge.

* gcc.dg/pr115149.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/pr115149.c | 16 
 gcc/tree-ssa-live.cc|  8 ++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr115149.c b/gcc/testsuite/gcc.dg/pr115149.c
new file mode 100644
index 000..9f6bc97dbe6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr115149.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline -fno-tree-vrp -fno-ipa-sra -fno-tree-dce 
-fno-tree-ch" } */
+
+int a, c, e, f, g, h[1], i;
+static int j(int b) { return 0; }
+static void k(int d) {}
+int main()
+{
+  if (h[0])
+while (1) {
+   k(f && j(i && (h[g] = e)));
+   while (a)
+ c ^= 1;
+}
+  return 0;
+}
diff --git a/gcc/tree-ssa-live.cc b/gcc/tree-ssa-live.cc
index e6ae551a457..60dfc05dcd9 100644
--- a/gcc/tree-ssa-live.cc
+++ b/gcc/tree-ssa-live.cc
@@ -1675,14 +1675,18 @@ virtual_operand_live::get_live_in (basic_block bb)
   edge_iterator ei;
   edge e;
   tree livein = NULL_TREE;
+  bool first = true;
   FOR_EACH_EDGE (e, ei, bb->preds)
 if (e->flags & EDGE_DFS_BACK)
   /* We can ignore backedges since if there's a def there it would
 have forced a PHI in the source because it also acts as use
 downstream.  */
   continue;
-else if (!livein)
-  livein = get_live_out (e->src);
+else if (first)
+  {
+   livein = get_live_out (e->src);
+   first = false;
+  }
 else if (get_live_out (e->src) != livein)
   /* When there's no virtual use downstream this indicates a point
 where we'd insert a PHI merging the different live virtual


gcc-wwwdocs branch master updated. 48be8366fcbf9246b2b5b1625febb5e9202842b8

2024-05-17 Thread Richard Biener via Gcc-cvs-wwwdocs
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gcc-wwwdocs".

The branch, master has been updated
   via  48be8366fcbf9246b2b5b1625febb5e9202842b8 (commit)
  from  ed9ceba9b8b038f0e0f333798da7abe046679d0c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -
commit 48be8366fcbf9246b2b5b1625febb5e9202842b8
Author: Richard Biener 
Date:   Fri May 17 13:48:56 2024 +0200

Fix trunk status

diff --git a/htdocs/index.html b/htdocs/index.html
index 63fbcdcd..c52bb30b 100644
--- a/htdocs/index.html
+++ b/htdocs/index.html
@@ -222,7 +222,7 @@ More news? Let ger...@pfeifer.com know!
   
   https://gcc.gnu.org/pipermail/gcc/2024-April/243824.html;>2024-04-26
   
-  (regression fixes  docs only).
+  (general development).
   
   https://gcc.gnu.org/bugzilla/buglist.cgi?query_format=advancedshort_desc_type=regexpshort_desc=%5C%5B(%5B%200-9.%2F%5D*%5B%20%2F%5D)*15%5B%20%2F%5D%5B%200-9.%2F%5D*%5BRr%5Degression%20*%5C%5Dtarget_milestone=11.5target_milestone=12.4target_milestone=13.3target_milestone=14.2target_milestone=15.0known_to_fail_type=allwordssubstrknown_to_work_type=allwordssubstrlong_desc_type=allwordssubstrlong_desc=bug_file_loc_type=allwordssubstrbug_file_loc=gcchost_type=allwordssubstrgcchost=gcctarget_type=allwordssubstrgcctarget=gccbuild_type=allwordssubstrgccbuild=keywords_type=allwordskeywords=bug_status=UNCONFIRMEDbug_status=NEWbug_status=ASSIGNEDbug_status=SUSPENDEDbug_status=WAITINGbug_status=REOPENEDpriority=P1priority=P2priority=P3emailtype1=substringemail1=emailtype2=substringemail2=bugidtype=includebug_i
 
d=votes=chfieldfrom=chfieldto=Nowchfieldvalue=cmdtype=doitorder=Reuse+same+sort+as+last+timefield0-0-0=nooptype0-0-0=noopvalue0-0-0=">Serious

---

Summary of changes:
 htdocs/index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


hooks/post-receive
-- 
gcc-wwwdocs


[gcc r12-10456] tree-optimization/112281 - loop distribution and zero dependence distances

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:5db4b5449df8f59a61438f8db1836dbc9b53f02e

commit r12-10456-g5db4b5449df8f59a61438f8db1836dbc9b53f02e
Author: Richard Biener 
Date:   Mon Nov 20 13:39:52 2023 +0100

tree-optimization/112281 - loop distribution and zero dependence distances

The following fixes an omission in dependence testing for loop
distribution.  When the overall dependence distance is not zero but
the dependence direction in the innermost common loop is = there is
a conflict between the partitions and we have to merge them.

PR tree-optimization/112281
* tree-loop-distribution.cc
(loop_distribution::pg_add_dependence_edges): For = in the
innermost common loop record a partition conflict.

* gcc.dg/torture/pr112281-1.c: New testcase.
* gcc.dg/torture/pr112281-2.c: Likewise.

(cherry picked from commit 3b34902417259031823bff7f853f615a60464bbd)

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr112281-1.c | 18 ++
 gcc/testsuite/gcc.dg/torture/pr112281-2.c | 18 ++
 gcc/tree-loop-distribution.cc | 18 ++
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr112281-1.c 
b/gcc/testsuite/gcc.dg/torture/pr112281-1.c
new file mode 100644
index ..711f5663195c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112281-1.c
@@ -0,0 +1,18 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-loop-distribution" } */
+
+struct {
+  int : 8;
+  int a;
+} b, d[4] = {{0}, {0}, {0}, {5}};
+int c, e;
+int main() {
+  for (c = 2; c; c--)
+for (e = 0; e < 2; e++) {
+  d[c] = b = d[c + 1];
+  d[c + 1].a = 0;
+}
+  if (b.a != 0)
+__builtin_abort();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/torture/pr112281-2.c 
b/gcc/testsuite/gcc.dg/torture/pr112281-2.c
new file mode 100644
index ..d7671e3322b4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112281-2.c
@@ -0,0 +1,18 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-loop-distribution" } */
+
+struct {
+  int : 8;
+  int a;
+} b, d[4] = {{5}, {0}, {0}, {0}};
+int c, e;
+int main() {
+  for (c = 0; c < 2; c++)
+for (e = 0; e < 2; e++) {
+  d[c + 1] = b = d[c];
+  d[c].a = 0;
+}
+  if (b.a != 0)
+__builtin_abort();
+  return 0;
+}
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 606eb05e64a5..1b7d2a1ea7d2 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -2117,9 +2117,6 @@ loop_distribution::pg_add_dependence_edges (struct graph 
*rdg, int dir,
}
  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
{
- if (DDR_REVERSED_P (ddr))
-   this_dir = -this_dir;
-
  /* Known dependences can still be unordered througout the
 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
 gcc.dg/tree-ssa/pr94969.c.  */
@@ -2132,7 +2129,20 @@ loop_distribution::pg_add_dependence_edges (struct graph 
*rdg, int dir,
  /* Else as the distance vector is lexicographic positive swap
 the dependence direction.  */
  else
-   this_dir = -this_dir;
+   {
+ if (DDR_REVERSED_P (ddr))
+   this_dir = -this_dir;
+ this_dir = -this_dir;
+
+ /* When then dependence distance of the innermost common
+loop of the DRs is zero we have a conflict.  */
+ auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
+ auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
+ int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
+   DDR_LOOP_NEST (ddr));
+ if (DDR_DIST_VECT (ddr, 0)[idx] == 0)
+   this_dir = 2;
+   }
}
  else
this_dir = 0;


[gcc r12-10458] middle-end/110176 - wrong zext (bool) <= (int) 4294967295u folding

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:65e5547e5468ce404d0f9ebd646a1d63abf3a772

commit r12-10458-g65e5547e5468ce404d0f9ebd646a1d63abf3a772
Author: Richard Biener 
Date:   Wed Jan 31 14:40:24 2024 +0100

middle-end/110176 - wrong zext (bool) <= (int) 4294967295u folding

The following fixes a wrong pattern that didn't match the behavior
of the original fold_widened_comparison in that get_unwidened
returned a constant always in the wider type.  But here we're
using (int) 4294967295u without the conversion applied.  Fixed
by doing as earlier in the pattern - matching constants only
if the conversion was actually applied.

PR middle-end/110176
* match.pd (zext (bool) <= (int) 4294967295u): Make sure
to match INTEGER_CST only without outstanding conversion.

* gcc.dg/torture/pr110176.c: New testcase.

(cherry picked from commit 22dbfbe8767ff4c1d93e39f68ec7c2d5b1358beb)

Diff:
---
 gcc/match.pd| 12 -
 gcc/testsuite/gcc.dg/torture/pr110176.c | 46 +
 2 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 0938d56fa45f..45ed34205106 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5379,19 +5379,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   >= TYPE_PRECISION (TREE_TYPE (@10)))
  && (TYPE_UNSIGNED (TREE_TYPE (@00))
  == TYPE_UNSIGNED (TREE_TYPE (@10
- || (TREE_CODE (@10) == INTEGER_CST
+ || (TREE_CODE (@1) == INTEGER_CST
  && INTEGRAL_TYPE_P (TREE_TYPE (@00))
- && int_fits_type_p (@10, TREE_TYPE (@00)
+ && int_fits_type_p (@1, TREE_TYPE (@00)
   (cmp @00 (convert @10))
-  (if (TREE_CODE (@10) == INTEGER_CST
+  (if (TREE_CODE (@1) == INTEGER_CST
   && INTEGRAL_TYPE_P (TREE_TYPE (@00))
-  && !int_fits_type_p (@10, TREE_TYPE (@00)))
+  && !int_fits_type_p (@1, TREE_TYPE (@00)))
(with
{
  tree min = lower_bound_in_type (TREE_TYPE (@10), TREE_TYPE (@00));
  tree max = upper_bound_in_type (TREE_TYPE (@10), TREE_TYPE (@00));
- bool above = integer_nonzerop (const_binop (LT_EXPR, type, max, @10));
- bool below = integer_nonzerop (const_binop (LT_EXPR, type, @10, min));
+ bool above = integer_nonzerop (const_binop (LT_EXPR, type, max, @1));
+ bool below = integer_nonzerop (const_binop (LT_EXPR, type, @1, min));
}
(if (above || below)
 (if (cmp == EQ_EXPR || cmp == NE_EXPR)
diff --git a/gcc/testsuite/gcc.dg/torture/pr110176.c 
b/gcc/testsuite/gcc.dg/torture/pr110176.c
new file mode 100644
index ..e41e3a0c3a7e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr110176.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+
+int f(_Bool t)
+{
+int tt = t;
+unsigned x = -1;
+int xx = x;
+return xx <= tt;
+}
+
+int a, b;
+void c() {}
+__attribute__((noipa))
+void h() {__builtin_abort();}
+int d() {
+  unsigned f[1];
+  int i;
+  if (a)
+goto h;
+  f[0] = -1;
+  while (1) {
+c();
+for (; a < 1; a++) {
+  if (0) {
+  j:
+continue;
+  }
+  i = f[0];
+  if (a)
+break;
+  b = i >= (b == 0);
+}
+if (!b) {
+  if (0) {
+  h:
+goto j;
+  }
+  return 0;
+}
+h();
+  }
+}
+int main() {
+  d();
+  return 0;
+}


[gcc r12-10454] tree-optimization/112505 - bit-precision induction vectorization

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4a71557fbebe3fb4031d1c2adc4f89c89a8c6c62

commit r12-10454-g4a71557fbebe3fb4031d1c2adc4f89c89a8c6c62
Author: Richard Biener 
Date:   Thu Jan 11 14:00:33 2024 +0100

tree-optimization/112505 - bit-precision induction vectorization

Vectorization of bit-precision inductions isn't implemented but we
don't check this, instead we ICE during transform.

PR tree-optimization/112505
* tree-vect-loop.cc (vectorizable_induction): Reject
bit-precision induction.

* gcc.dg/vect/pr112505.c: New testcase.

(cherry picked from commit ec345df53556ec581590347f71c3d9ff3cdbca76)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr112505.c | 14 ++
 gcc/tree-vect-loop.cc|  9 +
 2 files changed, 23 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112505.c 
b/gcc/testsuite/gcc.dg/vect/pr112505.c
new file mode 100644
index ..56546c1095aa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112505.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+short int w9;
+struct T {
+  short a : 14;
+  int b;
+};
+struct T v;
+void zc()
+{
+  for(int i = 0; i < 4; i ++)
+w9 *= v.b ? v.a-- < 0 : 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 86ee9e449e19..fd0e5a70a962 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8211,6 +8211,15 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
   gcc_assert (step_expr != NULL_TREE);
+  if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
+  && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"bit-precision induction vectorization not "
+"supported.\n");
+  return false;
+}
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   /* Check for backend support of PLUS/MINUS_EXPR. */


[gcc r12-10457] tree-optimization/111039 - abnormals and bit test merging

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:47e6bff94d980e2fcb6bcb42df04d3b73bd67da7

commit r12-10457-g47e6bff94d980e2fcb6bcb42df04d3b73bd67da7
Author: Richard Biener 
Date:   Thu Aug 17 13:10:14 2023 +0200

tree-optimization/111039 - abnormals and bit test merging

The following guards the bit test merging code in if-combine against
the appearance of SSA names used in abnormal PHIs.

PR tree-optimization/111039
* tree-ssa-ifcombine.cc (ifcombine_ifandif): Check for
SSA_NAME_OCCURS_IN_ABNORMAL_PHI.

* gcc.dg/pr111039.c: New testcase.

(cherry picked from commit 482551a79a3d3f107f6239679ee74655cfe8707e)

Diff:
---
 gcc/testsuite/gcc.dg/pr111039.c | 15 +++
 gcc/tree-ssa-ifcombine.cc   |  7 +++
 2 files changed, 22 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/pr111039.c b/gcc/testsuite/gcc.dg/pr111039.c
new file mode 100644
index ..bec9983b35f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr111039.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+int _setjmp ();
+void abcd ();
+void abcde ();
+void compiler_corruption_function(int flags)
+{
+  int nowait = flags & 1048576, isexpand = flags & 8388608;
+  abcd();
+  _setjmp(flags);
+  if (nowait && isexpand)
+flags &= 0;
+  abcde();
+}
diff --git a/gcc/tree-ssa-ifcombine.cc b/gcc/tree-ssa-ifcombine.cc
index ce9bbebf9480..b139328af224 100644
--- a/gcc/tree-ssa-ifcombine.cc
+++ b/gcc/tree-ssa-ifcombine.cc
@@ -415,6 +415,9 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool 
inner_inv,
 {
   tree t, t2;
 
+  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1))
+   return false;
+
   /* Do it.  */
   gsi = gsi_for_stmt (inner_cond);
   t = fold_build2 (LSHIFT_EXPR, TREE_TYPE (name1),
@@ -465,6 +468,10 @@ ifcombine_ifandif (basic_block inner_cond_bb, bool 
inner_inv,
   gimple_stmt_iterator gsi;
   tree t;
 
+  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name1)
+ || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (name2))
+   return false;
+
   /* Find the common name which is bit-tested.  */
   if (name1 == name2)
;


[gcc r12-10455] tree-optimization/112495 - alias versioning and address spaces

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:dbb5273996259b04350a1e3d35e633c51fc9310f

commit r12-10455-gdbb5273996259b04350a1e3d35e633c51fc9310f
Author: Richard Biener 
Date:   Mon Nov 13 10:20:37 2023 +0100

tree-optimization/112495 - alias versioning and address spaces

We are not correctly handling differing address spaces in dependence
analysis runtime alias check generation so refuse to do that.

PR tree-optimization/112495
* tree-data-ref.cc (runtime_alias_check_p): Reject checks
between different address spaces.

* gcc.target/i386/pr112495.c: New testcase.

(cherry picked from commit 0f593c0521caab8cfac53514b1a5e7d0d0dd1932)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr112495.c | 12 
 gcc/tree-data-ref.cc |  7 +++
 2 files changed, 19 insertions(+)

diff --git a/gcc/testsuite/gcc.target/i386/pr112495.c 
b/gcc/testsuite/gcc.target/i386/pr112495.c
new file mode 100644
index ..21afbaa6945d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr112495.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+typedef struct { int v; } T1;
+typedef struct { T1 v[32]; } T2;
+
+T1 s;
+T1 f1() { return s; }
+
+void f2(__seg_gs T2 *p, int n) {
+  for (int i = 0; i < n; ++i) p->v[i] = f1();
+}
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c3584c..0df4a3525f4c 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -1632,6 +1632,13 @@ runtime_alias_check_p (ddr_p ddr, class loop *loop, bool 
speed_p)
   "runtime alias check not supported for"
   " outer loop.\n");
 
+  /* FORNOW: We don't support handling different address spaces.  */
+  if (TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (DR_BASE_ADDRESS (DDR_A (ddr)
+  != TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (DR_BASE_ADDRESS (DDR_B 
(ddr))
+return opt_result::failure_at (DR_STMT (DDR_A (ddr)),
+  "runtime alias check between different "
+  "address spaces not supported.\n");
+
   return opt_result::success ();
 }


[gcc r12-10453] debug/112718 - reset all type units with -ffat-lto-objects

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1f41e8eef3da1d76c18fe1a93846054c53dc5a47

commit r12-10453-g1f41e8eef3da1d76c18fe1a93846054c53dc5a47
Author: Richard Biener 
Date:   Mon Jan 22 15:42:59 2024 +0100

debug/112718 - reset all type units with -ffat-lto-objects

When mixing -flto, -ffat-lto-objects and -fdebug-type-section we
fail to reset all type units after early output resulting in an
ICE when attempting to add then duplicate sibling attributes.

PR debug/112718
* dwarf2out.cc (dwarf2out_finish): Reset all type units
for the fat part of an LTO compile.

* gcc.dg/debug/pr112718.c: New testcase.

(cherry picked from commit 7218f5050cb7163edae331f54ca163248ab48bfa)

Diff:
---
 gcc/dwarf2out.cc  | 12 
 gcc/testsuite/gcc.dg/debug/pr112718.c | 12 
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc
index d14ec0261b6b..cfe87cba4c4c 100644
--- a/gcc/dwarf2out.cc
+++ b/gcc/dwarf2out.cc
@@ -32162,24 +32162,12 @@ dwarf2out_finish (const char *filename)
   reset_dies (comp_unit_die ());
   for (limbo_die_node *node = cu_die_list; node; node = node->next)
reset_dies (node->die);
-
-  hash_table comdat_type_table (100);
   for (ctnode = comdat_type_list; ctnode != NULL; ctnode = ctnode->next)
{
- comdat_type_node **slot
- = comdat_type_table.find_slot (ctnode, INSERT);
-
- /* Don't reset types twice.  */
- if (*slot != HTAB_EMPTY_ENTRY)
-   continue;
-
  /* Remove the pointer to the line table.  */
  remove_AT (ctnode->root_die, DW_AT_stmt_list);
-
  if (debug_info_level >= DINFO_LEVEL_TERSE)
reset_dies (ctnode->root_die);
-
- *slot = ctnode;
}
 
   /* Reset die CU symbol so we don't output it twice.  */
diff --git a/gcc/testsuite/gcc.dg/debug/pr112718.c 
b/gcc/testsuite/gcc.dg/debug/pr112718.c
new file mode 100644
index ..ff80ca5a2981
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/debug/pr112718.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lto } */
+/* { dg-options "-g -fdebug-types-section -flto -ffat-lto-objects" } */
+
+struct {
+  int h;
+  unsigned char data[20 + 24 * 6];
+} _EC_X9_62_PRIME_192V2;
+struct {
+  int h;
+  unsigned char data[20 + 24 * 6];
+} _EC_X9_62_PRIME_192V3;


[gcc r12-10452] tree-optimization/112793 - SLP of constant/external code-generated twice

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:9bad5cf9ae446b367f666176537eb76e94cc4448

commit r12-10452-g9bad5cf9ae446b367f666176537eb76e94cc4448
Author: Richard Biener 
Date:   Wed Dec 13 14:23:31 2023 +0100

tree-optimization/112793 - SLP of constant/external code-generated twice

The following makes the attempt at code-generating a constant/external
SLP node twice well-formed as that can happen when partitioning BB
vectorization attempts where we keep constants/externals unpartitioned.

PR tree-optimization/112793
* tree-vect-slp.cc (vect_schedule_slp_node): Already
code-generated constant/external nodes are OK.

* g++.dg/vect/pr112793.cc: New testcase.

(cherry picked from commit d782ec8362eadc3169286eb1e39c631effd02323)

Diff:
---
 gcc/testsuite/g++.dg/vect/pr112793.cc | 32 
 gcc/tree-vect-slp.cc  | 16 +---
 2 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/g++.dg/vect/pr112793.cc 
b/gcc/testsuite/g++.dg/vect/pr112793.cc
new file mode 100644
index ..258d7c1b1119
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr112793.cc
@@ -0,0 +1,32 @@
+// { dg-do compile }
+// { dg-require-effective-target c++11 }
+// { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } }
+
+typedef double T;
+T c, s;
+T a[16];
+struct Matrix4 {
+  Matrix4(){}
+  Matrix4(T e, T f, T i, T j) {
+r[1] = r[4] = e;
+r[5] = f;
+r[8] = i;
+r[9] = j;
+  }
+  Matrix4 operator*(Matrix4 a) {
+return Matrix4(
+   r[0] * a.r[4] + r[4] + r[15] + r[6],
+   r[1] * a.r[4] + 1 + 2 + 3,  r[0] * r[8] + 1 + 2 + 3,
+   r[1] * r[8] + r[1] + r[14] + r[2] * r[3]);
+  }
+  T r[16] = {};
+};
+Matrix4 t1, t2;
+Matrix4 tt;
+Matrix4 getRotAltAzToEquatorial()
+{
+  t2.r[4] =  0;
+  t1.r[1] =  -s;
+  t1.r[8] = 0;
+  return t1 * t2;
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 26c989cbff9a..54e6a9e4224f 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -7240,12 +7240,6 @@ vect_schedule_slp_node (vec_info *vinfo,
   int i;
   slp_tree child;
 
-  /* For existing vectors there's nothing to do.  */
-  if (SLP_TREE_VEC_DEFS (node).exists ())
-return;
-
-  gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
-
   /* Vectorize externals and constants.  */
   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   || SLP_TREE_DEF_TYPE (node) == vect_external_def)
@@ -7256,10 +7250,18 @@ vect_schedule_slp_node (vec_info *vinfo,
   if (!SLP_TREE_VECTYPE (node))
return;
 
-  vect_create_constant_vectors (vinfo, node);
+  /* There are two reasons vector defs might already exist.  The first
+is that we are vectorizing an existing vector def.  The second is
+when performing BB vectorization shared constant/external nodes
+are not split apart during partitioning so during the code-gen
+DFS walk we can end up visiting them twice.  */
+  if (! SLP_TREE_VEC_DEFS (node).exists ())
+   vect_create_constant_vectors (vinfo, node);
   return;
 }
 
+  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
+
   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
 
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);


[gcc r14-10214] tree-optimization/114998 - use-after-free with loop distribution

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1e9ae50d4d160f6d557fc4cbbe95c4a36897c09f

commit r14-10214-g1e9ae50d4d160f6d557fc4cbbe95c4a36897c09f
Author: Richard Biener 
Date:   Fri May 10 14:19:49 2024 +0200

tree-optimization/114998 - use-after-free with loop distribution

When loop distribution releases a PHI node of the original IL it
can end up clobbering memory that's re-used when it upon releasing
its RDG resets all stmt UIDs back to -1, even those that got released.

The fix is to avoid resetting UIDs based on stmts in the RDG but
instead reset only those still present in the loop.

PR tree-optimization/114998
* tree-loop-distribution.cc (free_rdg): Take loop argument.
Reset UIDs of stmts still in the IL rather than all stmts
referenced from the RDG.
(loop_distribution::build_rdg): Pass loop to free_rdg.
(loop_distribution::distribute_loop): Likewise.
(loop_distribution::transform_reduction_loop): Likewise.

* gcc.dg/torture/pr114998.c: New testcase.

(cherry picked from commit 34d15a4d630a0d54eddb99bdab086c506e10dac5)

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr114998.c | 35 +
 gcc/tree-loop-distribution.cc   | 24 --
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr114998.c 
b/gcc/testsuite/gcc.dg/torture/pr114998.c
new file mode 100644
index ..81fc1e077cb9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr114998.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-dce -ftree-loop-distribution" } */
+
+short a, d;
+int b, c, f, g, h, i, j[2], o;
+__attribute__((const)) int s(char r);
+int main() {
+  int l, m, k, n;
+  if (b) {
+char p;
+for (; p >= 0; p--) {
+  int e[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
+ 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
+ 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  if (j[p]) {
+int q[1];
+i = o;
+o = q[h];
+if (g)
+  n = d;
+m = 4;
+for (; m; m--) {
+  if (l)
+k |= c;
+  if (a)
+break;
+}
+  }
+  s(n);
+  f |= b;
+}
+  }
+  return 0;
+}
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 95203fefa188..45932bae5e7f 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -778,7 +778,7 @@ loop_distribution::stmts_from_loop (class loop *loop, 
vec *stmts)
 /* Free the reduced dependence graph RDG.  */
 
 static void
-free_rdg (struct graph *rdg)
+free_rdg (struct graph *rdg, loop_p loop)
 {
   int i;
 
@@ -792,13 +792,25 @@ free_rdg (struct graph *rdg)
 
   if (v->data)
{
- gimple_set_uid (RDGV_STMT (v), -1);
  (RDGV_DATAREFS (v)).release ();
  free (v->data);
}
 }
 
   free_graph (rdg);
+
+  /* Reset UIDs of stmts still in the loop.  */
+  basic_block *bbs = get_loop_body (loop);
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+{
+  basic_block bb = bbs[i];
+  gimple_stmt_iterator gsi;
+  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+   gimple_set_uid (gsi_stmt (gsi), -1);
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+   gimple_set_uid (gsi_stmt (gsi), -1);
+}
+  free (bbs);
 }
 
 struct graph *
@@ -812,7 +824,7 @@ loop_distribution::build_rdg (class loop *loop, 
control_dependences *cd)
   rdg = new_graph (stmts.length ());
   if (!create_rdg_vertices (rdg, stmts, loop))
 {
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   return NULL;
 }
   stmts.release ();
@@ -3062,7 +3074,7 @@ loop_distribution::distribute_loop (class loop *loop,
 "Loop %d not distributed: too many memory references.\n",
 loop->num);
 
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   loop_nest.release ();
   free_data_refs (datarefs_vec);
   delete ddrs_table;
@@ -3259,7 +3271,7 @@ loop_distribution::distribute_loop (class loop *loop,
   FOR_EACH_VEC_ELT (partitions, i, partition)
 partition_free (partition);
 
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   return nbp - *nb_calls;
 }
 
@@ -3665,7 +3677,7 @@ loop_distribution::transform_reduction_loop (loop_p loop)
   auto_bitmap partition_stmts;
   bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
   find_single_drs (loop, rdg, partition_stmts, &load_dr, &store_dr);
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
 
   /* Bail out if there is no single load.  */
   if (load_dr == NULL)


[gcc r15-626] middle-end/115110 - Fix view_converted_memref_p

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:a5b3721c06646bf5b9b50a22964e8e2bd4d03f5f

commit r15-626-ga5b3721c06646bf5b9b50a22964e8e2bd4d03f5f
Author: Richard Biener 
Date:   Fri May 17 11:02:29 2024 +0200

middle-end/115110 - Fix view_converted_memref_p

view_converted_memref_p was checking the reference type against the
pointer type of the offset operand rather than its pointed-to type
which leads to all refs being subject to view-convert treatment
in get_alias_set causing numerous testsuite fails but with its
new uses from r15-512-g9b7cad5884f21c is also a wrong-code issue.

PR middle-end/115110
* tree-ssa-alias.cc (view_converted_memref_p): Fix.

Diff:
---
 gcc/tree-ssa-alias.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 9f5f69bcfad2..d64d6d02f4a8 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -2077,8 +2077,9 @@ view_converted_memref_p (tree base)
 {
   if (TREE_CODE (base) != MEM_REF && TREE_CODE (base) != TARGET_MEM_REF)
 return false;
-  return same_type_for_tbaa (TREE_TYPE (base),
-TREE_TYPE (TREE_OPERAND (base, 1))) != 1;
+  return (same_type_for_tbaa (TREE_TYPE (base),
+ TREE_TYPE (TREE_TYPE (TREE_OPERAND (base, 1
+ != 1);
 }
 
 /* Return true if an indirect reference based on *PTR1 constrained


[gcc r15-622] Add missing check for const_pool in the escaped solutions

2024-05-17 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:b420e0b920613c42f63252aa2478a8315dc37a13

commit r15-622-gb420e0b920613c42f63252aa2478a8315dc37a13
Author: Richard Biener 
Date:   Fri May 17 09:31:52 2024 +0200

Add missing check for const_pool in the escaped solutions

The ptr-vs-ptr compare folding using points-to info was missing a
check for const_pool being included in the escaped solution.  The
following fixes that, fixing the observed execute FAIL of
experimental/functional/searchers.cc

* tree-ssa-alias.h (pt_solution_includes_const_pool): Declare.
* tree-ssa-alias.cc (ptrs_compare_unequal): Use
pt_solution_includes_const_pool.
* tree-ssa-structalias.cc (pt_solution_includes_const_pool): New.

* gcc.dg/torture/20240517-1.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/20240517-1.c | 26 ++
 gcc/tree-ssa-alias.cc |  3 ++-
 gcc/tree-ssa-alias.h  |  1 +
 gcc/tree-ssa-structalias.cc   | 11 +++
 4 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/torture/20240517-1.c 
b/gcc/testsuite/gcc.dg/torture/20240517-1.c
new file mode 100644
index ..ab83d3ca6fba
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/20240517-1.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-additional-options "-fmerge-all-constants" } */
+
+char *p;
+
+char * __attribute__((noipa))
+foo () { return p+1; }
+
+volatile int z;
+
+int main()
+{
+  /* ESCAPED = CONST_POOL */
+  p = "Hello";
+  /* PT = ESCAPED */
+  char *x = foo ();
+  char *y;
+  /* y PT = CONST_POOL */
+  if (z)
+y = "Baz";
+  else
+y = "Hello" + 1;
+  if (y != x)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 6d31fc836917..9f5f69bcfad2 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -501,7 +501,8 @@ ptrs_compare_unequal (tree ptr1, tree ptr2)
  || pi2->pt.vars_contains_interposable)
return false;
  if ((!pi1->pt.null || !pi2->pt.null)
- && (!pi1->pt.const_pool || !pi2->pt.const_pool))
+ && (!pt_solution_includes_const_pool (&pi1->pt)
+ || !pt_solution_includes_const_pool (&pi2->pt)))
	return !pt_solutions_intersect (&pi1->pt, &pi2->pt);
}
 }
diff --git a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h
index e29dff583750..5cd64e722955 100644
--- a/gcc/tree-ssa-alias.h
+++ b/gcc/tree-ssa-alias.h
@@ -178,6 +178,7 @@ extern bool pt_solution_empty_p (const pt_solution *);
 extern bool pt_solution_singleton_or_null_p (struct pt_solution *, unsigned *);
 extern bool pt_solution_includes_global (struct pt_solution *, bool);
 extern bool pt_solution_includes (struct pt_solution *, const_tree);
+extern bool pt_solution_includes_const_pool (struct pt_solution *);
 extern bool pt_solutions_intersect (struct pt_solution *, struct pt_solution 
*);
 extern void pt_solution_reset (struct pt_solution *);
 extern void pt_solution_set (struct pt_solution *, bitmap, bool);
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index 0c6085b17662..61fb3610a172 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -7080,6 +7080,17 @@ pt_solution_includes (struct pt_solution *pt, const_tree 
decl)
   return res;
 }
 
+/* Return true if the points-to solution *PT contains a reference to a
+   constant pool entry.  */
+
+bool
+pt_solution_includes_const_pool (struct pt_solution *pt)
+{
+  return (pt->const_pool
+ || (pt->escaped && (!cfun || cfun->gimple_df->escaped.const_pool))
+ || (pt->ipa_escaped && ipa_escaped_pt.const_pool));
+}
+
 /* Return true if both points-to solutions PT1 and PT2 have a non-empty
intersection.  */


[gcc r15-580] tree-optimization/13962 - handle ptr-ptr compares in ptrs_compare_unequal

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:f3e5f4c58591f5dacdd14a65ec47bbe310df02a0

commit r15-580-gf3e5f4c58591f5dacdd14a65ec47bbe310df02a0
Author: Richard Biener 
Date:   Mon Mar 11 11:17:32 2024 +0100

tree-optimization/13962 - handle ptr-ptr compares in ptrs_compare_unequal

Now that we handle pt.null conservatively we can implement the missing
tracking of constant pool entries (aka STRING_CST) and handle
ptr-ptr compares using points-to info in ptrs_compare_unequal.

PR tree-optimization/13962
PR tree-optimization/96564
* tree-ssa-alias.h (pt_solution::const_pool): New flag.
* tree-ssa-alias.cc (ptrs_compare_unequal): Handle pointer-pointer
compares.
(dump_points_to_solution): Dump the const_pool flag, fix guard
of flag dumping.
* gimple-pretty-print.cc (pp_points_to_solution): Likewise.
* tree-ssa-structalias.cc (find_what_var_points_to): Set
the const_pool flag for STRING.
(pt_solution_ior_into): Handle the const_pool flag.
(ipa_escaped_pt): Initialize it.

* gcc.dg/tree-ssa/alias-39.c: New testcase.
* g++.dg/vect/pr68145.cc: Use -fno-tree-pta to avoid UB
to manifest in transforms no longer vectorizing this testcase
for an ICE.

Diff:
---
 gcc/gimple-pretty-print.cc   |  5 -
 gcc/testsuite/g++.dg/vect/pr68145.cc |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/alias-39.c | 12 
 gcc/tree-ssa-alias.cc| 30 ++
 gcc/tree-ssa-alias.h |  5 +
 gcc/tree-ssa-structalias.cc  |  6 +++---
 6 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/gcc/gimple-pretty-print.cc b/gcc/gimple-pretty-print.cc
index abda8871f97f..a71e1e0efc77 100644
--- a/gcc/gimple-pretty-print.cc
+++ b/gcc/gimple-pretty-print.cc
@@ -822,6 +822,8 @@ pp_points_to_solution (pretty_printer *buffer, const 
pt_solution *pt)
 pp_string (buffer, "unit-escaped ");
   if (pt->null)
 pp_string (buffer, "null ");
+  if (pt->const_pool)
+pp_string (buffer, "const-pool ");
   if (pt->vars
   && !bitmap_empty_p (pt->vars))
 {
@@ -838,7 +840,8 @@ pp_points_to_solution (pretty_printer *buffer, const 
pt_solution *pt)
   if (pt->vars_contains_nonlocal
  || pt->vars_contains_escaped
  || pt->vars_contains_escaped_heap
- || pt->vars_contains_restrict)
+ || pt->vars_contains_restrict
+ || pt->vars_contains_interposable)
{
  const char *comma = "";
  pp_string (buffer, " (");
diff --git a/gcc/testsuite/g++.dg/vect/pr68145.cc 
b/gcc/testsuite/g++.dg/vect/pr68145.cc
index 8a1e10ee7833..8d3502b0bf4e 100644
--- a/gcc/testsuite/g++.dg/vect/pr68145.cc
+++ b/gcc/testsuite/g++.dg/vect/pr68145.cc
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-Wno-return-type" } */
+/* { dg-additional-options "-fno-tree-pta -Wno-return-type" } */
 
 struct A {
   bool operator()(int p1, int p2) { return p1 && p2; }
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c 
b/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c
new file mode 100644
index ..3b452893f6b1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-39.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-forwprop3" } */
+
+static int a, b;
+int foo (int n, int which)
+{
+  void *p = __builtin_malloc (n);
+  void *q = which ? &a : &b;
+  return p == q;
+}
+
+/* { dg-final { scan-tree-dump "return 0;" "forwprop3" } } */
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 96301bbde7fa..6d31fc836917 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -484,9 +484,27 @@ ptrs_compare_unequal (tree ptr1, tree ptr2)
}
   return !pt_solution_includes (&pi->pt, obj1);
 }
-
-  /* ???  We'd like to handle ptr1 != NULL and ptr1 != ptr2
- but those require pt.null to be conservatively correct.  */
+  else if (TREE_CODE (ptr1) == SSA_NAME)
+{
+  struct ptr_info_def *pi1 = SSA_NAME_PTR_INFO (ptr1);
+  if (!pi1
+ || pi1->pt.vars_contains_restrict
+ || pi1->pt.vars_contains_interposable)
+   return false;
+  if (integer_zerop (ptr2) && !pi1->pt.null)
+   return true;
+  if (TREE_CODE (ptr2) == SSA_NAME)
+   {
+ struct ptr_info_def *pi2 = SSA_NAME_PTR_INFO (ptr2);
+ if (!pi2
+ || pi2->pt.vars_contains_restrict
+ || pi2->pt.vars_contains_interposable)
+   return false;
+ if ((!pi1->pt.null || !pi2->pt.null)
+ && (!pi1->pt.const_pool || !pi2->pt.const_pool))
+   return !pt_solutions_intersect (&pi1->pt, &pi2->pt);
+   }
+}
 
   return false;
 }
@@ -636,6 +654,9 @@ dump_points_to_solution (FILE *file, struct pt_solution *pt)
   if (pt->null)
 fprintf (file, ", points-to NULL");
 
+  if (pt->const_pool)
+fprintf (file, ", points-to const-pool");

[gcc r15-579] wrong code with points-to and volatile

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:a9251ab3c91c8c559d0306838575a666ae62dff4

commit r15-579-ga9251ab3c91c8c559d0306838575a666ae62dff4
Author: Richard Biener 
Date:   Thu May 16 12:35:28 2024 +0200

wrong code with points-to and volatile

The following fixes points-to analysis which ignores the fact that
volatile qualified refs can result in any pointer.

* tree-ssa-structalias.cc (get_constraint_for_1): For
volatile referenced or decls use ANYTHING.

* gcc.dg/tree-ssa/alias-38.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/alias-38.c | 14 ++
 gcc/tree-ssa-structalias.cc  |  7 +++
 2 files changed, 21 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c 
b/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c
new file mode 100644
index ..a5c414934735
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/alias-38.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int x;
+int y;
+
+int main ()
+{
+  int *volatile p = &x;
+  return (p != &y);
+}
+
+/* { dg-final { scan-tree-dump " != " "optimized" } } */
+/* { dg-final { scan-tree-dump-not "return 1;" "optimized" } } */
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index bb59c6a7c023..0bac1a1f045a 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -3575,6 +3575,10 @@ get_constraint_for_1 (tree t, vec<ce_s> *results, bool 
address_p,
   }
 case tcc_reference:
   {
+   if (TREE_THIS_VOLATILE (t))
+ /* Fall back to anything.  */
+ break;
+
switch (TREE_CODE (t))
  {
  case MEM_REF:
@@ -3676,6 +3680,9 @@ get_constraint_for_1 (tree t, vec<ce_s> *results, bool 
address_p,
   }
 case tcc_declaration:
   {
+   if (VAR_P (t) && TREE_THIS_VOLATILE (t))
+ /* Fall back to anything.  */
+ break;
get_constraint_for_ssa_var (t, results, address_p);
return;
   }


[gcc r12-10450] tree-optimization/114027 - conditional reduction chain

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:6661a7c098a46eff4afc98b55c89e3a71d63d674

commit r12-10450-g6661a7c098a46eff4afc98b55c89e3a71d63d674
Author: Richard Biener 
Date:   Thu Feb 22 10:50:12 2024 +0100

tree-optimization/114027 - conditional reduction chain

When we classify a conditional reduction chain as CONST_COND_REDUCTION
we fail to verify all involved conditionals have the same constant.
That's a quite unlikely situation so the following simply disables
such classification when there's more than one reduction statement.

PR tree-optimization/114027
* tree-vect-loop.cc (vectorizable_reduction): Use optimized
condition reduction classification only for single-element
chains.

* gcc.dg/vect/pr114027.c: New testcase.

(cherry picked from commit 549f251f055e3a0b0084189a3012c4f15d635e75)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr114027.c | 26 ++
 gcc/tree-vect-loop.cc| 11 ++-
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr114027.c 
b/gcc/testsuite/gcc.dg/vect/pr114027.c
new file mode 100644
index ..ead9cdd982d7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr114027.c
@@ -0,0 +1,26 @@
+#include "tree-vect.h"
+
+int __attribute__((noipa))
+foo (int *f, int n)
+{
+  int res = 0;
+  for (int i = 0; i < n; ++i)
+{
+  if (f[2*i])
+res = 2;
+  if (f[2*i+1])
+res = -2;
+}
+  return res;
+}
+
+int f[] = { 1, 1, 1, 1, 1, 1, 1, 1,
+1, 1, 1, 1, 1, 1, 1, 0 };
+
+int
+main ()
+{
+  if (foo (f, 16) != 2)
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index e1681047d9da..86ee9e449e19 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6850,17 +6850,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE 
(vectype_op[i]))
vectype_in = vectype_op[i];
 
-  if (op.code == COND_EXPR)
+  /* Record how the non-reduction-def value of COND_EXPR is defined.
+???  For a chain of multiple CONDs we'd have to match them up all.  */
+  if (op.code == COND_EXPR && reduc_chain_length == 1)
{
- /* Record how the non-reduction-def value of COND_EXPR is defined.  */
  if (dt == vect_constant_def)
{
  cond_reduc_dt = dt;
  cond_reduc_val = op.ops[i];
}
- if (dt == vect_induction_def
- && def_stmt_info
- && is_nonwrapping_integer_induction (def_stmt_info, loop))
+ else if (dt == vect_induction_def
+  && def_stmt_info
+  && is_nonwrapping_integer_induction (def_stmt_info, loop))
{
  cond_reduc_dt = dt;
  cond_stmt_vinfo = def_stmt_info;


[gcc r12-10451] tree-optimization/114027 - fix testcase

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:2d650c041d37a3df2bec46a67e42f9976d7fd2bf

commit r12-10451-g2d650c041d37a3df2bec46a67e42f9976d7fd2bf
Author: Richard Biener 
Date:   Tue Mar 26 09:46:06 2024 +0100

tree-optimization/114027 - fix testcase

The following fixes out-of-bounds read in the testcase.

PR tree-optimization/114027
* gcc.dg/vect/pr114027.c: Fix iteration count.

(cherry picked from commit 4470611e20f3217ee81647b01fda65b6a62229aa)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr114027.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr114027.c 
b/gcc/testsuite/gcc.dg/vect/pr114027.c
index ead9cdd982d7..b3f3e30e15fc 100644
--- a/gcc/testsuite/gcc.dg/vect/pr114027.c
+++ b/gcc/testsuite/gcc.dg/vect/pr114027.c
@@ -20,7 +20,7 @@ int f[] = { 1, 1, 1, 1, 1, 1, 1, 1,
 int
 main ()
 {
-  if (foo (f, 16) != 2)
+  if (foo (f, 8) != 2)
 __builtin_abort ();
   return 0;
 }


[gcc r12-10448] tree-optimization/114231 - use patterns for BB SLP discovery root stmts

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:a7b1d814da0aa2e7331c56180264a8b786012971

commit r12-10448-ga7b1d814da0aa2e7331c56180264a8b786012971
Author: Richard Biener 
Date:   Tue Mar 5 10:55:56 2024 +0100

tree-optimization/114231 - use patterns for BB SLP discovery root stmts

The following makes sure to use recognized patterns when vectorizing
roots during BB SLP discovery.  We need to apply those late since
during root discovery we've not yet done pattern recognition.
All parts of the vectorizer assume patterns get used, for the testcase
we mix this up when doing live lane computation.

PR tree-optimization/114231
* tree-vect-slp.cc (vect_analyze_slp): Lookup patterns when
processing a BB SLP root.

* gcc.dg/vect/pr114231.c: New testcase.

(cherry picked from commit 04fffbaa87997ac893a9aa68b674c938ba3ecddb)

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr114231.c | 12 
 gcc/tree-vect-slp.cc |  4 
 2 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/pr114231.c 
b/gcc/testsuite/gcc.dg/vect/pr114231.c
new file mode 100644
index ..5e3a81039188
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr114231.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+
+void f(long*);
+int ff[2];
+void f2(long, long, unsigned long);
+void k(unsigned long x, unsigned long y)
+{
+  long t = x >> ff[0];
+  long t1 = ff[1];
+  unsigned long t2 = y >> ff[0];
+  f2(t1, t+t2, t2);
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7f9fbb9f3d01..f33e85337abd 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3410,6 +3410,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
   for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
{
  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
+ /* Apply patterns.  */
+ for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
+   bb_vinfo->roots[i].stmts[j]
+ = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
   bb_vinfo->roots[i].stmts,
   bb_vinfo->roots[i].roots,


[gcc r12-10449] tree-optimization/114375 - disallow SLP discovery of permuted mask loads

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c1b21855e131bb818aedc953f403812b494917fc

commit r12-10449-gc1b21855e131bb818aedc953f403812b494917fc
Author: Richard Biener 
Date:   Mon Mar 18 12:39:03 2024 +0100

tree-optimization/114375 - disallow SLP discovery of permuted mask loads

We cannot currently handle permutations of mask loads in code generation
or permute optimization.  But we simply drop any permutation on the
floor, so the following instead rejects the SLP build rather than
producing wrong-code.  I've also made sure to reject them in
vectorizable_load for completeness.

PR tree-optimization/114375
* tree-vect-slp.cc (vect_build_slp_tree_2): Compute the
load permutation for masked loads but reject it when any
such is necessary.
* tree-vect-stmts.cc (vectorizable_load): Reject masked
VMAT_ELEMENTWISE and VMAT_STRIDED_SLP as those are not
supported.

* gcc.dg/vect/vect-pr114375.c: New testcase.

(cherry picked from commit 4f2a35a76cca503749c696e7772d2e8eadc77ba5)

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-pr114375.c | 44 +++
 gcc/tree-vect-slp.cc  | 34 +++-
 gcc/tree-vect-stmts.cc|  8 ++
 3 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c 
b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
new file mode 100644
index ..1e1cb0123d07
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
@@ -0,0 +1,44 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+int a[512];
+int b[512];
+int c[512];
+
+void __attribute__((noipa))
+foo(int * __restrict p)
+{
+  for (int i = 0; i < 64; ++i)
+{
+  int tem = 2, tem2 = 2;
+  if (a[4*i + 1])
+tem = p[4*i];
+  if (a[4*i])
+tem2 = p[4*i + 2];
+  b[2*i] = tem2;
+  b[2*i+1] = tem;
+  if (a[4*i + 2])
+tem = p[4*i + 1];
+  if (a[4*i + 3])
+tem2 = p[4*i + 3];
+  c[2*i] = tem2;
+  c[2*i+1] = tem;
+}
+}
+int main()
+{
+  check_vect ();
+
+  for (int i = 0; i < 512; ++i)
+a[i] = (i >> 1) & 1;
+
+  foo (a);
+
+  if (c[0] != 1 || c[1] != 0 || c[2] != 1 || c[3] != 0
+  || b[0] != 2 || b[1] != 2 || b[2] != 2 || b[3] != 2)
+abort ();
+
+  return 0;
+}
+
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index f33e85337abd..26c989cbff9a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1722,10 +1722,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
   && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
 {
-  if (gcall *stmt = dyn_cast  (stmt_info->stmt))
-   gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
-   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
-   || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
   else
{
  *max_nunits = this_max_nunits;
@@ -1741,15 +1739,37 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
  load_permutation.create (group_size);
  stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+ bool any_permute = false;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
{
  int load_place = vect_get_place_in_interleaving_chain
  (load_info, first_stmt_info);
  gcc_assert (load_place != -1);
- load_permutation.safe_push (load_place);
+ any_permute |= load_place != j;
+ load_permutation.quick_push (load_place);
+   }
+
+ if (gcall *stmt = dyn_cast  (stmt_info->stmt))
+   {
+ gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
+ || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
+ || gimple_call_internal_p (stmt, 
IFN_MASK_GATHER_LOAD));
+ load_permutation.release ();
+ /* We cannot handle permuted masked loads, see PR114375.  */
+ if (any_permute
+ || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SIZE (first_stmt_info) != group_size)
+ || STMT_VINFO_STRIDED_P (stmt_info))
+   {
+ matches[0] = false;
+ return NULL;
+   }
+   }
+ else
+   {
+ SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+ return node;
}
- SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
- return node;
}
 }
   else if (gimple_assign_single_p (stmt_info->stmt)
diff --git a/gcc/tree-vect-stmts.cc 

[gcc r12-10447] middle-end/114734 - wrong code with expand_call_mem_ref

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:46b2e98983ebc62440c43217f2b3bbe72f8b0191

commit r12-10447-g46b2e98983ebc62440c43217f2b3bbe72f8b0191
Author: Richard Biener 
Date:   Fri Apr 26 15:47:13 2024 +0200

middle-end/114734 - wrong code with expand_call_mem_ref

When expand_call_mem_ref looks at the definition of the address
argument to eventually expand a _MEM_REF argument together
with a masked load it fails to honor constraints imposed by SSA
coalescing decisions.  The following fixes this.

PR middle-end/114734
* internal-fn.cc (expand_call_mem_ref): Use
get_gimple_for_ssa_name to get at the def stmt of the address
argument to honor SSA coalescing constraints.

(cherry picked from commit 20ebcaf826c91ddaf2aac35417ec1e5e6d31ad50)

Diff:
---
 gcc/internal-fn.cc | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 8b1733e20c44..db23f66b0219 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -52,6 +52,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "explow.h"
 #include "rtl-iter.h"
 #include "gimple-range.h"
+#include "tree-ssa-live.h"
+#include "tree-outof-ssa.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -2663,8 +2665,8 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
   tree tmp = addr;
   if (TREE_CODE (tmp) == SSA_NAME)
 {
-  gimple *def = SSA_NAME_DEF_STMT (tmp);
-  if (gimple_assign_single_p (def))
+  gimple *def = get_gimple_for_ssa_name (tmp);
+  if (def && gimple_assign_single_p (def))
tmp = gimple_assign_rhs1 (def);
 }


[gcc r12-10446] lto/114655 - -flto=4 at link time doesn't override -flto=auto at compile time

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:42a0393655d8e4662ba001c3759cf00b639eddb9

commit r12-10446-g42a0393655d8e4662ba001c3759cf00b639eddb9
Author: Richard Biener 
Date:   Tue Apr 9 14:25:57 2024 +0200

lto/114655 - -flto=4 at link time doesn't override -flto=auto at compile 
time

The following adjusts -flto option processing in lto-wrapper to have
link-time -flto override any compile time setting.

PR lto/114655
* lto-wrapper.cc (merge_flto_options): Add force argument.
(merge_and_complain): Do not force here.
(run_gcc): But here to make the link-time -flto option override
any compile-time one.

(cherry picked from commit 32fb04adae90a0ea68e64e8fc3cb04b613b2e9f3)

Diff:
---
 gcc/lto-wrapper.cc | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/gcc/lto-wrapper.cc b/gcc/lto-wrapper.cc
index 155ccce57ae3..456636bd5dfe 100644
--- a/gcc/lto-wrapper.cc
+++ b/gcc/lto-wrapper.cc
@@ -193,15 +193,18 @@ find_option (vec , 
cl_decoded_option *option)
   return find_option (options, option->opt_index);
 }
 
-/* Merge -flto FOPTION into vector of DECODED_OPTIONS.  */
+/* Merge -flto FOPTION into vector of DECODED_OPTIONS.  If FORCE is true
+   then FOPTION overrides previous settings.  */
 
 static void
 merge_flto_options (vec _options,
-   cl_decoded_option *foption)
+   cl_decoded_option *foption, bool force)
 {
   int existing_opt = find_option (decoded_options, foption);
   if (existing_opt == -1)
 decoded_options.safe_push (*foption);
+  else if (force)
+decoded_options[existing_opt].arg = foption->arg;
   else
 {
   if (strcmp (foption->arg, decoded_options[existing_opt].arg) != 0)
@@ -466,7 +469,7 @@ merge_and_complain (vec _options,
  break;
 
case OPT_flto_:
- merge_flto_options (decoded_options, foption);
+ merge_flto_options (decoded_options, foption, false);
  break;
}
 }
@@ -1540,8 +1543,8 @@ run_gcc (unsigned argc, char *argv[])
  break;
 
case OPT_flto_:
- /* Merge linker -flto= option with what we have in IL files.  */
- merge_flto_options (fdecoded_options, option);
+ /* Override IL file settings with a linker -flto= option.  */
+ merge_flto_options (fdecoded_options, option, true);
  if (strcmp (option->arg, "jobserver") == 0)
jobserver_requested = true;
  break;


[gcc r12-10445] gcov-profile/114715 - missing coverage for switch

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:56415e39029012aa3675d3b4b71fb248cf43049e

commit r12-10445-g56415e39029012aa3675d3b4b71fb248cf43049e
Author: Richard Biener 
Date:   Mon Apr 15 11:09:17 2024 +0200

gcov-profile/114715 - missing coverage for switch

The following avoids missing coverage for the line of a switch statement
which happens when gimplification emits a BIND_EXPR wrapping the switch
as that prevents us from setting locations on the containing statements
via annotate_all_with_location.  Instead set the location of the GIMPLE
switch directly.

PR gcov-profile/114715
* gimplify.cc (gimplify_switch_expr): Set the location of the
GIMPLE switch.

* gcc.misc-tests/gcov-24.c: New testcase.

(cherry picked from commit 5a3cc62dbb45185dd1ca32caec80d57a320ec5a0)

Diff:
---
 gcc/gimplify.cc|  1 +
 gcc/testsuite/gcc.misc-tests/gcov-24.c | 30 ++
 2 files changed, 31 insertions(+)

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 947fe570e1e7..9e7869770183 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -2764,6 +2764,7 @@ gimplify_switch_expr (tree *expr_p, gimple_seq *pre_p)
 
   switch_stmt = gimple_build_switch (SWITCH_COND (switch_expr),
 default_case, labels);
+  gimple_set_location (switch_stmt, EXPR_LOCATION (switch_expr));
   /* For the benefit of -Wimplicit-fallthrough, if switch_body_seq
 ends with a GIMPLE_LABEL holding SWITCH_BREAK_LABEL_P LABEL_DECL,
 wrap the GIMPLE_SWITCH up to that GIMPLE_LABEL into a GIMPLE_BIND,
diff --git a/gcc/testsuite/gcc.misc-tests/gcov-24.c 
b/gcc/testsuite/gcc.misc-tests/gcov-24.c
new file mode 100644
index ..395099bd7ae3
--- /dev/null
+++ b/gcc/testsuite/gcc.misc-tests/gcov-24.c
@@ -0,0 +1,30 @@
+/* { dg-options "-fprofile-arcs -ftest-coverage" } */
+/* { dg-do run { target native } } */
+
+int main()
+{
+  int a = 1;
+  int b = 2;
+  int c = -3;
+  switch(a) /* count(1) */
+{
+case 1: /* count(1) */
+c = 3;
+switch(b) { /* count(1) */
+  case 1: /* count(#) */
+  c = 4;
+  break;
+  case 2: /* count(1) */
+  c = 5;
+  break;
+}
+break;
+case 2: /* count(#) */
+c = 6;
+break;
+default: /* count(#) */
+break;
+}
+}
+
+/* { dg-final { run-gcov gcov-24.c } } */


[gcc r15-571] tree-optimization/79958 - make DSE track multiple paths

2024-05-16 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1e0ae1f52741f7e0133661659ed2d210f939a398

commit r15-571-g1e0ae1f52741f7e0133661659ed2d210f939a398
Author: Richard Biener 
Date:   Wed May 15 18:32:37 2024 +0200

tree-optimization/79958 - make DSE track multiple paths

DSE currently gives up when the path we analyze forks.  This leads
to multiple missed dead store elimination PRs.  The following fixes
this by recursing for each path and maintaining the visited bitmap
to avoid visiting CFG re-merges multiple times.  The overall cost
is still limited by the same bound, it's just more likely we'll hit
the limit now.  The patch doesn't try to deal with byte tracking
once a path forks but drops info on the floor and only handling
fully dead stores in that case.

PR tree-optimization/79958
PR tree-optimization/109087
PR tree-optimization/100314
PR tree-optimization/114774
* tree-ssa-dse.cc (dse_classify_store): New forwarder.
(dse_classify_store): Add arguments cnt and visited, recurse
to track multiple paths when we end up with multiple defs.

* gcc.dg/tree-ssa/ssa-dse-48.c: New testcase.
* gcc.dg/tree-ssa/ssa-dse-49.c: Likewise.
* gcc.dg/tree-ssa/ssa-dse-50.c: Likewise.
* gcc.dg/tree-ssa/ssa-dse-51.c: Likewise.
* gcc.dg/graphite/pr80906.c: Avoid DSE of last data reference
in loop.
* g++.dg/ipa/devirt-24.C: Adjust for extra DSE.
* g++.dg/warn/Wuninitialized-pr107919-1.C: Use more important
-O2 optimization level, -O1 regresses.

Diff:
---
 gcc/testsuite/g++.dg/ipa/devirt-24.C   |  4 ++-
 .../g++.dg/warn/Wuninitialized-pr107919-1.C|  2 +-
 gcc/testsuite/gcc.dg/graphite/pr80906.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c | 17 
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c | 18 +
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-50.c | 25 +
 gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-51.c | 24 +
 gcc/tree-ssa-dse.cc| 31 +++---
 8 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/g++.dg/ipa/devirt-24.C 
b/gcc/testsuite/g++.dg/ipa/devirt-24.C
index 7b5b806dd05f..333c03cd8dd7 100644
--- a/gcc/testsuite/g++.dg/ipa/devirt-24.C
+++ b/gcc/testsuite/g++.dg/ipa/devirt-24.C
@@ -37,4 +37,6 @@ C *b = new (C);
   }
 }
 /* { dg-final { scan-ipa-dump-times "Discovered a virtual call to a known 
target" 1 "inline" { xfail *-*-* } } } */
-/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 2 "cp"  } 
} */
+/* We used to have IPA CP see two aggregates passed to sort() but as the
+   first argument is unused DSE now elides the vptr initialization.  */
+/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 1 "cp"  } 
} */
diff --git a/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C 
b/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C
index dd631dc8bfe7..067a44a462e1 100644
--- a/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C
+++ b/gcc/testsuite/g++.dg/warn/Wuninitialized-pr107919-1.C
@@ -1,6 +1,6 @@
 // { dg-do compile }
 // { dg-require-effective-target c++17 }
-// { dg-options "-O -Wuninitialized" }
+// { dg-options "-O2 -Wuninitialized" }
 
 #include 
 #include 
diff --git a/gcc/testsuite/gcc.dg/graphite/pr80906.c 
b/gcc/testsuite/gcc.dg/graphite/pr80906.c
index 59c7f59cadff..ec3840834fc4 100644
--- a/gcc/testsuite/gcc.dg/graphite/pr80906.c
+++ b/gcc/testsuite/gcc.dg/graphite/pr80906.c
@@ -18,7 +18,7 @@ ec (int lh[][2])
  --bm;
if (bm != 0)
  --c5;
-   lh[0][0] = 0;
+   lh[hp][0] = 0;
m3 *= jv;
   }
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c
new file mode 100644
index ..edfc62c7e4ab
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-48.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-dse1-details" } */
+
+int a;
+int foo (void);
+int bar (void);
+
+void
+baz (void)
+{
+  int *b[6];
+  b[0] = 
+  if (foo ())
+a |= bar ();
+}
+
+/* { dg-final { scan-tree-dump "Deleted dead store: b\\\[0\\\] = " "dse1" } 
} */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c
new file mode 100644
index ..1eec284a4159
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-49.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fno-tree-dce -fdump-tree-dse1-details" } */
+
+struct X { int i; };
+void bar ();
+void foo (int b)
+{
+  struct X x;
+  x.i = 1;
+  if (b)
+{
+  bar ();
+  __builtin_abort ();
+}
+  bar ();
+}
+
+/* { dg-final { scan-tree-dump "Deleted dead store: x.i = 1;" "dse1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-50.c 

[gcc r15-518] tree-optimization/114589 - remove profile based sink heuristics

2024-05-15 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:99b1daae18c095d6c94d32efb77442838e11cbfb

commit r15-518-g99b1daae18c095d6c94d32efb77442838e11cbfb
Author: Richard Biener 
Date:   Fri May 3 14:04:41 2024 +0200

tree-optimization/114589 - remove profile based sink heuristics

The following removes the profile based heuristic limiting sinking
and instead uses post-dominators to avoid sinking to places that
are executed under the same conditions as the earlier location which
the profile based heuristic should have guaranteed as well.

To avoid regressing this moves the empty-latch check to cover all
sink cases.

It also stream-lines the resulting select_best_block a bit but avoids
adjusting heuristics more with this change.  gfortran.dg/streamio_9.f90
starts execute failing with this on x86_64 with -m32 because the
(float)i * 9....e-7 compute is sunk across a STOP causing it
to be no longer spilled and thus the compare failing due to excess
precision.  The patch adds -ffloat-store to avoid this, following
other similar testcases.

This change fixes the testcase in the PR only when using -fno-ivopts
as otherwise VRP is confused.

PR tree-optimization/114589
* tree-ssa-sink.cc (select_best_block): Remove profile-based
heuristics.  Instead reject sink locations that sink
to post-dominators.  Move empty latch check here from
statement_sink_location.  Also consider early_bb for the
loop depth check.
(statement_sink_location): Remove superfluous check.  Remove
empty latch check.
(pass_sink_code::execute): Compute/release post-dominators.

* gfortran.dg/streamio_9.f90: Use -ffloat-store to avoid
excess precision when not spilling.
* g++.dg/tree-ssa/pr114589.C: New testcase.

Diff:
---
 gcc/testsuite/g++.dg/tree-ssa/pr114589.C | 22 
 gcc/testsuite/gfortran.dg/streamio_9.f90 |  1 +
 gcc/tree-ssa-sink.cc | 62 ++--
 3 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr114589.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr114589.C
new file mode 100644
index ..85bb6d03015b
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr114589.C
@@ -0,0 +1,22 @@
+// { dg-do compile { target c++11 } }
+// { dg-options "-O2 -fno-ivopts -fdump-tree-optimized" }
+
+template 
+struct simple_optional {
+bool has_val;
+T val;
+
+auto begin() const -> T const* { return  }
+auto end() const -> T const* { return  + (has_val ? 1 : 0); }
+};
+
+void f(int);
+
+void call_f(simple_optional const& o) {
+for (int i : o) {
+f(i);
+}
+}
+
+// Only a conditional execution of 'f' should prevail, no loop
+// { dg-final { scan-tree-dump-times ".
 ! Test case derived from that given in PR by Steve Kargl.
diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
index 2f90acb7ef48..2188b7523c7b 100644
--- a/gcc/tree-ssa-sink.cc
+++ b/gcc/tree-ssa-sink.cc
@@ -178,15 +178,7 @@ nearest_common_dominator_of_uses (def_operand_p def_p, 
bool *debug_stmts)
 
We want the most control dependent block in the shallowest loop nest.
 
-   If the resulting block is in a shallower loop nest, then use it.  Else
-   only use the resulting block if it has significantly lower execution
-   frequency than EARLY_BB to avoid gratuitous statement movement.  We
-   consider statements with VOPS more desirable to move.
-
-   This pass would obviously benefit from PDO as it utilizes block
-   frequencies.  It would also benefit from recomputing frequencies
-   if profile data is not available since frequencies often get out
-   of sync with reality.  */
+   If the resulting block is in a shallower loop nest, then use it.  */
 
 static basic_block
 select_best_block (basic_block early_bb,
@@ -195,18 +187,17 @@ select_best_block (basic_block early_bb,
 {
   basic_block best_bb = late_bb;
   basic_block temp_bb = late_bb;
-  int threshold;
 
   while (temp_bb != early_bb)
 {
+  /* Walk up the dominator tree, hopefully we'll find a shallower
+loop nest.  */
+  temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
+
   /* If we've moved into a lower loop nest, then that becomes
 our best block.  */
   if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb))
best_bb = temp_bb;
-
-  /* Walk up the dominator tree, hopefully we'll find a shallower
-loop nest.  */
-  temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
 }
 
   /* Placing a statement before a setjmp-like function would be invalid
@@ -221,6 +212,16 @@ select_best_block (basic_block early_bb,
   if (bb_loop_depth (best_bb) < bb_loop_depth (early_bb))
 return best_bb;
 
+  /* Do not move stmts to post-dominating places on the same loop depth.  */
+  if (dominated_by_p (CDI_POST_DOMINATORS, early_bb, 

[gcc r15-517] middle-end/111422 - wrong stack var coalescing, handle PHIs

2024-05-15 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:ab25eef36400e8c1d28e3ed059c5f95a38b45f17

commit r15-517-gab25eef36400e8c1d28e3ed059c5f95a38b45f17
Author: Richard Biener 
Date:   Wed May 15 13:06:30 2024 +0200

middle-end/111422 - wrong stack var coalescing, handle PHIs

The gcc.c-torture/execute/pr111422.c testcase after installing the
sink pass improvement reveals that we also need to handle

 _65 =  + _58;  _44 =  + _43;
 # _59 = PHI <_65, _44>
 *_59 = 8;
 g = {v} {CLOBBER(eos)};
 ...
 n[0] = 
 *_59 = 8;
 g = {v} {CLOBBER(eos)};

where we fail to see the conflict between n and g after the first
clobber of g.  Before the sinking improvement there was a conflict
recorded on a path where _65/_44 are unused, so the real conflict
was missed but the fake one avoided the miscompile.

The following handles PHI defs in add_scope_conflicts_2 which
fixes the issue.

PR middle-end/111422
* cfgexpand.cc (add_scope_conflicts_2): Handle PHIs
by recursing to their arguments.

Diff:
---
 gcc/cfgexpand.cc | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 557cb28733bd..8de5f2ba58b7 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -584,10 +584,21 @@ add_scope_conflicts_2 (tree use, bitmap work,
  || INTEGRAL_TYPE_P (TREE_TYPE (use
 {
   gimple *g = SSA_NAME_DEF_STMT (use);
-  if (is_gimple_assign (g))
-   if (tree op = gimple_assign_rhs1 (g))
- if (TREE_CODE (op) == ADDR_EXPR)
-   visit (g, TREE_OPERAND (op, 0), op, work);
+  if (gassign *a = dyn_cast  (g))
+   {
+ if (tree op = gimple_assign_rhs1 (a))
+   if (TREE_CODE (op) == ADDR_EXPR)
+ visit (a, TREE_OPERAND (op, 0), op, work);
+   }
+  else if (gphi *p = dyn_cast  (g))
+   for (unsigned i = 0; i < gimple_phi_num_args (p); ++i)
+ if (TREE_CODE (use = gimple_phi_arg_def (p, i)) == SSA_NAME)
+   if (gassign *a = dyn_cast  (SSA_NAME_DEF_STMT (use)))
+ {
+   if (tree op = gimple_assign_rhs1 (a))
+ if (TREE_CODE (op) == ADDR_EXPR)
+   visit (a, TREE_OPERAND (op, 0), op, work);
+ }
 }
 }


[gcc r14-10211] middle-end/114931 - type_hash_canon and structual equality types

2024-05-15 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1d89cb43943e77d0bbb48fd5a58a352bdd3d82c7

commit r14-10211-g1d89cb43943e77d0bbb48fd5a58a352bdd3d82c7
Author: Richard Biener 
Date:   Fri May 3 10:44:50 2024 +0200

middle-end/114931 - type_hash_canon and structual equality types

TYPE_STRUCTURAL_EQUALITY_P is part of our type system so we have
to make sure to include that into the type unification done via
type_hash_canon.  This requires the flag to be set before querying
the hash which is the biggest part of the patch.

PR middle-end/114931
gcc/
* tree.cc (type_hash_canon_hash): Hash TYPE_STRUCTURAL_EQUALITY_P.
(type_cache_hasher::equal): Compare TYPE_STRUCTURAL_EQUALITY_P.
(build_array_type_1): Set TYPE_STRUCTURAL_EQUALITY_P before
probing with type_hash_canon.
(build_function_type): Likewise.
(build_method_type_directly): Likewise.
(build_offset_type): Likewise.
(build_complex_type): Likewise.
* attribs.cc (build_type_attribute_qual_variant): Likewise.

gcc/c-family/
* c-common.cc (complete_array_type): Set TYPE_STRUCTURAL_EQUALITY_P
before probing with type_hash_canon.

gcc/testsuite/
* gcc.dg/pr114931.c: New testcase.

(cherry picked from commit b09c2e9560648b0cf993c2ca9ad972c34e6bddfa)

Diff:
---
 gcc/attribs.cc  | 20 ++---
 gcc/c-family/c-common.cc| 11 +--
 gcc/testsuite/gcc.dg/pr114931.c | 10 +++
 gcc/tree.cc | 65 +
 4 files changed, 74 insertions(+), 32 deletions(-)

diff --git a/gcc/attribs.cc b/gcc/attribs.cc
index 12ffc5f170a1..3ab0b0fd87a4 100644
--- a/gcc/attribs.cc
+++ b/gcc/attribs.cc
@@ -1336,6 +1336,16 @@ build_type_attribute_qual_variant (tree otype, tree 
attribute, int quals)
   tree dtype = ntype = build_distinct_type_copy (ttype);
 
   TYPE_ATTRIBUTES (ntype) = attribute;
+  /* If the target-dependent attributes make NTYPE different from
+its canonical type, we will need to use structural equality
+checks for this type.
+
+We shouldn't get here for stripping attributes from a type;
+the no-attribute type might not need structural comparison.  But
+we can if was discarded from type_hash_table.  */
+  if (TYPE_STRUCTURAL_EQUALITY_P (ttype)
+ || !comp_type_attributes (ntype, ttype))
+   SET_TYPE_STRUCTURAL_EQUALITY (ntype);
 
   hashval_t hash = type_hash_canon_hash (ntype);
   ntype = type_hash_canon (hash, ntype);
@@ -1343,16 +1353,6 @@ build_type_attribute_qual_variant (tree otype, tree 
attribute, int quals)
   if (ntype != dtype)
/* This variant was already in the hash table, don't mess with
   TYPE_CANONICAL.  */;
-  else if (TYPE_STRUCTURAL_EQUALITY_P (ttype)
-  || !comp_type_attributes (ntype, ttype))
-   /* If the target-dependent attributes make NTYPE different from
-  its canonical type, we will need to use structural equality
-  checks for this type.
-
-  We shouldn't get here for stripping attributes from a type;
-  the no-attribute type might not need structural comparison.  But
-  we can if was discarded from type_hash_table.  */
-   SET_TYPE_STRUCTURAL_EQUALITY (ntype);
   else if (TYPE_CANONICAL (ntype) == ntype)
TYPE_CANONICAL (ntype) = TYPE_CANONICAL (ttype);
 
diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index d14591c7bd3b..aae998d0f738 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -7115,6 +7115,13 @@ complete_array_type (tree *ptype, tree initial_value, 
bool do_default)
   TYPE_TYPELESS_STORAGE (main_type) = TYPE_TYPELESS_STORAGE (type);
   layout_type (main_type);
 
+  /* Set TYPE_STRUCTURAL_EQUALITY_P early.  */
+  if (TYPE_STRUCTURAL_EQUALITY_P (TREE_TYPE (main_type))
+  || TYPE_STRUCTURAL_EQUALITY_P (TYPE_DOMAIN (main_type)))
+SET_TYPE_STRUCTURAL_EQUALITY (main_type);
+  else
+TYPE_CANONICAL (main_type) = main_type;
+
   /* Make sure we have the canonical MAIN_TYPE. */
   hashval_t hashcode = type_hash_canon_hash (main_type);
   main_type = type_hash_canon (hashcode, main_type);
@@ -7122,7 +7129,7 @@ complete_array_type (tree *ptype, tree initial_value, 
bool do_default)
   /* Fix the canonical type.  */
   if (TYPE_STRUCTURAL_EQUALITY_P (TREE_TYPE (main_type))
   || TYPE_STRUCTURAL_EQUALITY_P (TYPE_DOMAIN (main_type)))
-SET_TYPE_STRUCTURAL_EQUALITY (main_type);
+gcc_assert (TYPE_STRUCTURAL_EQUALITY_P (main_type));
   else if (TYPE_CANONICAL (TREE_TYPE (main_type)) != TREE_TYPE (main_type)
   || (TYPE_CANONICAL (TYPE_DOMAIN (main_type))
   != TYPE_DOMAIN (main_type)))
@@ -7130,8 +7137,6 @@ complete_array_type (tree *ptype, tree initial_value, 
bool do_default)
   = build_array_type (TYPE_CANONICAL (TREE_TYPE (main_type)),

[gcc r14-10210] Avoid changing type in the type_hash_canon hash

2024-05-15 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:573e1df0ec8428e564c97af7c237a5e0c98c59bd

commit r14-10210-g573e1df0ec8428e564c97af7c237a5e0c98c59bd
Author: Richard Biener 
Date:   Fri May 3 11:48:07 2024 +0200

Avoid changing type in the type_hash_canon hash

When building a type and type_hash_canon returns an existing type
avoid changing it, in particular its TYPE_CANONICAL.

PR middle-end/114931
* tree.cc (build_array_type_1): Return early when type_hash_canon
returned an older existing type.
(build_function_type): Likewise.
(build_method_type_directly): Likewise.
(build_offset_type): Likewise.

(cherry picked from commit 7a212ac678e13e0df5da2d090144b246a1262b64)

Diff:
---
 gcc/tree.cc | 12 
 1 file changed, 12 insertions(+)

diff --git a/gcc/tree.cc b/gcc/tree.cc
index 83f3bf306afa..780662549fea 100644
--- a/gcc/tree.cc
+++ b/gcc/tree.cc
@@ -7352,7 +7352,10 @@ build_array_type_1 (tree elt_type, tree index_type, bool 
typeless_storage,
   if (shared)
 {
   hashval_t hash = type_hash_canon_hash (t);
+  tree probe_type = t;
   t = type_hash_canon (hash, t);
+  if (t != probe_type)
+   return t;
 }
 
   if (TYPE_CANONICAL (t) == t && set_canonical)
@@ -7509,7 +7512,10 @@ build_function_type (tree value_type, tree arg_types,
 
   /* If we already have such a type, use the old one.  */
   hashval_t hash = type_hash_canon_hash (t);
+  tree probe_type = t;
   t = type_hash_canon (hash, t);
+  if (t != probe_type)
+return t;
 
   /* Set up the canonical type. */
   any_structural_p   = TYPE_STRUCTURAL_EQUALITY_P (value_type);
@@ -7663,7 +7669,10 @@ build_method_type_directly (tree basetype,
 
   /* If we already have such a type, use the old one.  */
   hashval_t hash = type_hash_canon_hash (t);
+  tree probe_type = t;
   t = type_hash_canon (hash, t);
+  if (t != probe_type)
+return t;
 
   /* Set up the canonical type. */
   any_structural_p
@@ -7720,7 +7729,10 @@ build_offset_type (tree basetype, tree type)
 
   /* If we already have such a type, use the old one.  */
   hashval_t hash = type_hash_canon_hash (t);
+  tree probe_type = t;
   t = type_hash_canon (hash, t);
+  if (t != probe_type)
+return t;
 
   if (!COMPLETE_TYPE_P (t))
 layout_type (t);


[gcc r15-491] tree-optimization/99954 - redo loop distribution memcpy recognition fix

2024-05-14 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c290e6a0b7a9de5692963affc6627a4af7dc2411

commit r15-491-gc290e6a0b7a9de5692963affc6627a4af7dc2411
Author: Richard Biener 
Date:   Tue May 14 11:13:51 2024 +0200

tree-optimization/99954 - redo loop distribution memcpy recognition fix

The following revisits the fix for PR99954 which was observed as
causing missed memcpy recognition and instead using memmove for
non-aliasing copies.  While the original fix mitigated bogus
recognition of memcpy the root cause was not properly identified.
The root cause is dr_analyze_indices "failing" to handle union
references and leaving the DRs indices in a state that's not correctly
handled by dr_may_alias.  The following mitigates this there
appropriately, restoring memcpy recognition for non-aliasing copies.

This makes us run into a latent issue in ptr_deref_may_alias_decl_p
when the pointer is something like [0].a in which case we fail
to handle non-SSA name pointers.  Add code similar to what we have
in ptr_derefs_may_alias_p.

PR tree-optimization/99954
* tree-data-ref.cc (dr_may_alias_p): For bases that are
not completely analyzed fall back to TBAA and points-to.
* tree-loop-distribution.cc
(loop_distribution::classify_builtin_ldst): When there
is no dependence again classify as memcpy.
* tree-ssa-alias.cc (ptr_deref_may_alias_decl_p): Verify
the pointer is an SSA name.

* gcc.dg/tree-ssa/ldist-40.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c | 10 ++
 gcc/tree-data-ref.cc | 22 ++
 gcc/tree-loop-distribution.cc|  4 ++--
 gcc/tree-ssa-alias.cc|  5 +
 4 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c
new file mode 100644
index ..238a0098352a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-40.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ldist-details" } */
+
+void copy_a_to_b (char * __restrict b, char * a, int n)
+{
+  for (int i = 0; i < n; ++i)
+b[i] = a[i];
+}
+
+/* { dg-final { scan-tree-dump "generated memcpy" "ldist" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index f37734b53409..db15ddb43ded 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -3066,6 +3066,28 @@ dr_may_alias_p (const struct data_reference *a, const 
struct data_reference *b,
return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a),
   TREE_OPERAND (addr_b, 0));
 }
+  /* If dr_analyze_innermost failed to handle a component we are
+ possibly left with a non-base in which case we didn't analyze
+ a possible evolution of the base when analyzing a loop.  */
+  else if (loop_nest
+  && (handled_component_p (addr_a) || handled_component_p (addr_b)))
+{
+  /* For true dependences we can apply TBAA.  */
+  if (flag_strict_aliasing
+ && DR_IS_WRITE (a) && DR_IS_READ (b)
+ && !alias_sets_conflict_p (get_alias_set (DR_REF (a)),
+get_alias_set (DR_REF (b
+   return false;
+  if (TREE_CODE (addr_a) == MEM_REF)
+   return ptr_derefs_may_alias_p (TREE_OPERAND (addr_a, 0),
+  build_fold_addr_expr (addr_b));
+  else if (TREE_CODE (addr_b) == MEM_REF)
+   return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a),
+  TREE_OPERAND (addr_b, 0));
+  else
+   return ptr_derefs_may_alias_p (build_fold_addr_expr (addr_a),
+  build_fold_addr_expr (addr_b));
+}
 
   /* Otherwise DR_BASE_OBJECT is an access that covers the whole object
  that is being subsetted in the loop nest.  */
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 45932bae5e7f..668dc4204490 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -1840,11 +1840,11 @@ loop_distribution::classify_builtin_ldst (loop_p loop, 
struct graph *rdg,
   /* Now check that if there is a dependence.  */
   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
 
-  /* Classify as memmove if no dependence between load and store.  */
+  /* Classify as memcpy if no dependence between load and store.  */
   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 {
   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, 
size);
-  partition->kind = PKIND_MEMMOVE;
+  partition->kind = PKIND_MEMCPY;
   return;
 }
 
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index e7c1c1aa6243..374ba04e6fd0 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -294,6 +294,11 @@ ptr_deref_may_alias_decl_p (tree ptr, 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Improve combined store node splitting

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4336060fe2db8ec41c0f108034a4ae8de89e5fa1

commit 4336060fe2db8ec41c0f108034a4ae8de89e5fa1
Author: Richard Biener 
Date:   Wed Mar 20 14:55:08 2024 +0100

Improve combined store node splitting

The following improves on the initial "Avoid splitting store dataref
groups during SLP discovery" change, in particular on how we deal
with the multi-input VEC_PERM node combining back the SLP instances
into the single node for the whole group store.  Instead of
combining the last two inputs recursively this more carefully
selects nodes to combine (but still recursively), combining the
first two nodes with the least number of inputs.  That should avoid
the need for three-input permutes consistently.

* tree-vect-slp.cc (vect_build_slp_instance): Split merge
permute node in a better manner.

Diff:
---
 gcc/tree-vect-slp.cc | 66 +---
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index f3743997e9cd..7e6ff07db0ff 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3654,18 +3654,45 @@ vect_build_slp_instance (vec_info *vinfo,
}
 
  /* ???  Now we have a single permute node but when that's
-fed more than two inputs it's prone to hit the limitation
+fed more than two inputs it's prone to hit the limitation
 on at most two sources for a VEC_PERM_EXPR.  Ideally
 we'd defer the following to the optimize-slp pass but
 for now split it here.
-???  Optimally we'd produce permute nodes feeding in
-the same number of lanes from each input and also have
-the same vector type (only the width will eventually
-differ here), for now just do "something".  */
+For now perform pairwise reduction, reducing the two inputs
+with the least number of lanes to one and then repeat until
+we end up with two inputs.  */
  while (SLP_TREE_CHILDREN (perm).length () > 2)
{
- slp_tree b = SLP_TREE_CHILDREN (perm).pop ();
- slp_tree a = SLP_TREE_CHILDREN (perm).pop ();
+ /* Pick the two nodes with the least number of lanes,
+prefer the earliest candidate and maintain ai < bi.  */
+ int ai = -1;
+ int bi = -1;
+ for (unsigned ci = 0;
+  ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
+   {
+ if (ai == -1)
+   ai = ci;
+ else if (bi == -1)
+   bi = ci;
+ else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+   < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
+  || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
+  < SLP_TREE_LANES (SLP_TREE_CHILDREN 
(perm)[bi])))
+   {
+ if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
+ <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
+   bi = ci;
+ else
+   {
+ ai = bi;
+ bi = ci;
+   }
+   }
+   }
+
+ /* Produce a merge of nodes ai and bi.  */
+ slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
+ slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
  SLP_TREE_LANES (permab) = n;
@@ -3682,12 +3709,25 @@ vect_build_slp_instance (vec_info *vinfo,
  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
SLP_TREE_LANE_PERMUTATION (permab)
  .quick_push (std::make_pair (1, k));
- /* ???  Popluate SLP_TREE_SCALAR_STMTS/OPS of permab.  */
- SLP_TREE_CHILDREN (perm).quick_push (permab);
- for (unsigned k = group_size - n; k < group_size; ++k)
-   SLP_TREE_LANE_PERMUTATION (perm)[k]
- = std::make_pair (SLP_TREE_CHILDREN (perm).length () - 1,
-   k - (group_size - n));
+
+ /* Put the merged node into 'perm', in place of a  */
+ SLP_TREE_CHILDREN (perm)[ai] = permab;
+ /* Adjust the references to b in the permutation
+of perm and to the later children which we'll
+remove.  */
+ for (unsigned k = 0; k < SLP_TREE_LANES 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Add single-lane SLP support to .GOMP_SIMD_LANE vectorization

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:ccbe801c08914ba136bba9cdbbcf0146dd261204

commit ccbe801c08914ba136bba9cdbbcf0146dd261204
Author: Richard Biener 
Date:   Wed Mar 13 14:13:00 2024 +0100

Add single-lane SLP support to .GOMP_SIMD_LANE vectorization

The following adds support for single-lane SLP .GOMP_SIMD_LANE
vectorization.

* tree-vect-slp.cc (no_arg_map): New.
(vect_get_operand_map): Handle IFN_GOMP_SIMD_LANE.
(vect_build_slp_tree_1): Likewise.
* tree-vect-stmts.cc (vectorizable_call): Handle single-lane SLP
for .GOMP_SIMD_LANE calls.

Diff:
---
 gcc/tree-vect-slp.cc   | 11 +++
 gcc/tree-vect-stmts.cc | 27 +++
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 3138a815da7a..f3743997e9cd 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -505,6 +505,7 @@ static const int cond_expr_maps[3][5] = {
   { 4, -2, -1, 1, 2 },
   { 4, -1, -2, 2, 1 }
 };
+static const int no_arg_map[] = { 0 };
 static const int arg0_map[] = { 1, 0 };
 static const int arg1_map[] = { 1, 1 };
 static const int arg2_map[] = { 1, 2 };
@@ -585,6 +586,9 @@ vect_get_operand_map (const gimple *stmt, bool 
gather_scatter_p = false,
  case IFN_CTZ:
return arg0_map;
 
+ case IFN_GOMP_SIMD_LANE:
+   return no_arg_map;
+
  default:
break;
  }
@@ -1168,6 +1172,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
*swap,
  ldst_p = true;
  rhs_code = CFN_MASK_STORE;
}
+ else if (cfn == CFN_GOMP_SIMD_LANE)
+   ;
  else if ((cfn != CFN_LAST
&& cfn != CFN_MASK_CALL
&& internal_fn_p (cfn)
@@ -1271,6 +1277,11 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
*swap,
  need_same_oprnds = true;
  first_op1 = gimple_call_arg (call_stmt, 1);
}
+ else if (rhs_code == CFN_GOMP_SIMD_LANE)
+   {
+ need_same_oprnds = true;
+ first_op1 = gimple_call_arg (call_stmt, 1);
+   }
}
   else
{
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 840ff8a3406a..270c5a5dd347 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3341,7 +3341,7 @@ vectorizable_call (vec_info *vinfo,
   if (ifn == IFN_LAST && !fndecl)
 {
   if (cfn == CFN_GOMP_SIMD_LANE
- && !slp_node
+ && (!slp_node || SLP_TREE_LANES (slp_node) == 1)
  && loop_vinfo
  && LOOP_VINFO_LOOP (loop_vinfo)->simduid
  && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
@@ -3487,18 +3487,15 @@ vectorizable_call (vec_info *vinfo,
  /* Build argument list for the vectorized call.  */
  if (slp_node)
{
- vec vec_oprnds0;
-
+ unsigned int vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  vect_get_slp_defs (vinfo, slp_node, _defs);
- vec_oprnds0 = vec_defs[0];
 
  /* Arguments are ready.  Create the new vector stmt.  */
- FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
+ for (i = 0; i < vec_num; ++i)
{
  int varg = 0;
  if (masked_loop_p && reduc_idx >= 0)
{
- unsigned int vec_num = vec_oprnds0.length ();
  /* Always true for SLP.  */
  gcc_assert (ncopies == 1);
  vargs[varg++] = vect_get_loop_mask (loop_vinfo,
@@ -3539,11 +3536,26 @@ vectorizable_call (vec_info *vinfo,
  vect_finish_stmt_generation (vinfo, stmt_info,
   new_stmt, gsi);
}
+ else if (cfn == CFN_GOMP_SIMD_LANE)
+   {
+ /* ???  For multi-lane SLP we'd need to build
+{ 0, 0, .., 1, 1, ... }.  */
+ tree cst = build_index_vector (vectype_out,
+i * nunits_out, 1);
+ tree new_var
+   = vect_get_new_ssa_name (vectype_out, vect_simple_var,
+"cst_");
+ gimple *init_stmt = gimple_build_assign (new_var, cst);
+ vect_init_vector_1 (vinfo, stmt_info, init_stmt, NULL);
+ new_temp = make_ssa_name (vec_dest);
+ new_stmt = gimple_build_assign (new_temp, new_var);
+ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+  gsi);
+   }
  else
{
  if (len_opno >= 0 && len_loop_p)
{
- unsigned int vec_num = 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Handle unused-only-live stmts in SLP discovery

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:d0b1eaaf0ba4b9e6fd2b18ef597bde3eb7bd018b

commit d0b1eaaf0ba4b9e6fd2b18ef597bde3eb7bd018b
Author: Richard Biener 
Date:   Thu Mar 7 15:13:33 2024 +0100

Handle unused-only-live stmts in SLP discovery

The following adds SLP discovery for roots that are only live but
otherwise unused.

* tree-vect-slp.cc (vect_analyze_slp): Analyze SLP for live
but otherwise unused defs.

Diff:
---
 gcc/tree-vect-slp.cc | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 6cc544057115..3138a815da7a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3985,6 +3985,40 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
  saved_stmts.release ();
}
}
+
+  if (param_vect_single_lane_slp != 0)
+   {
+ /* Make sure to vectorize only-live stmts, usually inductions.  */
+ for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
+   for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
+gsi_next ())
+ {
+   gphi *lc_phi = *gsi;
+   tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
+   stmt_vec_info stmt_info;
+   if (TREE_CODE (def) == SSA_NAME
+   && !virtual_operand_p (def)
+   && (stmt_info = loop_vinfo->lookup_def (def))
+   && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
+   && STMT_VINFO_LIVE_P (stmt_info)
+   && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
+   || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
+   && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
+ {
+   vec stmts;
+   vec roots = vNULL;
+   vec remain = vNULL;
+   stmts.create (1);
+   stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+   bool res = vect_build_slp_instance (vinfo,
+   
slp_inst_kind_reduc_group,
+   stmts, roots, remain,
+   max_tree_size, ,
+   bst_map, NULL);
+   gcc_assert (res);
+ }
+ }
+   }
 }
 
   hash_set visited_patterns;


[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid bogus SLP outer loop vectorization

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4652b8bdf79f6ba3a86e085b7ce13d23057c57f6

commit 4652b8bdf79f6ba3a86e085b7ce13d23057c57f6
Author: Richard Biener 
Date:   Wed Mar 6 15:13:05 2024 +0100

Avoid bogus SLP outer loop vectorization

This fixes the check for multiple types which go wrong I think
because of bogus pointer IV increments when there are multiple
copies of vector stmts in the inner loop.

* tree-vect-stmts.cc (vectorizable_load): Avoid outer loop
SLP vectorization with multi-copy vector stmts in the inner
loop.
(vectorizable_store): Likewise.

Diff:
---
 gcc/tree-vect-stmts.cc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 414c1fce38db..840ff8a3406a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8195,7 +8195,9 @@ vectorizable_store (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW.  This restriction should be relaxed.  */
-  if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+  if (loop
+  && nested_in_vect_loop_p (loop, stmt_info)
+  && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9941,7 +9943,8 @@ vectorizable_load (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW. This restriction should be relaxed.  */
-  if (nested_in_vect_loop && ncopies > 1)
+  if (nested_in_vect_loop
+  && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
 {
   if (dump_enabled_p ())
 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,


[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow single-lane SLP in-order reductions

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:0faad9e4dfa5015c9535e4f2a40400914c5b4674

commit 0faad9e4dfa5015c9535e4f2a40400914c5b4674
Author: Richard Biener 
Date:   Tue Mar 5 15:46:24 2024 +0100

Allow single-lane SLP in-order reductions

The single-lane case isn't different from non-SLP, no re-association
implied.

* tree-vect-loop.cc (vectorizable_reduction): Allow
single-lane SLP in-order reductions.

Diff:
---
 gcc/tree-vect-loop.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 8fb8800e6a7e..a5597ec1287b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8134,7 +8134,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 }
 
   if (reduction_type == FOLD_LEFT_REDUCTION
-  && slp_node
+  && (slp_node && SLP_TREE_LANES (slp_node) > 1)
   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 {
   /* We cannot use in-order reductions in this case because there is


[gcc(refs/users/rguenth/heads/vect-force-slp)] Add double reduction support for SLP vectorization

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:5bdcb5f182a6472a66cc5d7842a64ec7ad0fd7e5

commit 5bdcb5f182a6472a66cc5d7842a64ec7ad0fd7e5
Author: Richard Biener 
Date:   Tue Mar 5 15:28:58 2024 +0100

Add double reduction support for SLP vectorization

The following makes double reduction vectorization work when
using (single-lane) SLP vectorization.

* tree-vect-loop.cc (vect_analyze_scalar_cycles_1): Queue
double reductions in LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Remove asserts disabling
SLP for double reductions.
(vectorizable_reduction): Analyze SLP double reductions
only once and start off the correct places.
* tree-vect-slp.cc (vect_get_and_check_slp_defs): Allow
vect_double_reduction_def.
(vect_build_slp_tree_2): Fix condition for the ignored
reduction initial values.
* tree-vect-stmts.cc (vect_analyze_stmt): Allow
vect_double_reduction_def.

Diff:
---
 gcc/tree-vect-loop.cc  | 35 +--
 gcc/tree-vect-slp.cc   |  3 ++-
 gcc/tree-vect-stmts.cc |  4 
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 195db5b1089e..8fb8800e6a7e 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -685,6 +685,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, 
class loop *loop,
 
   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
+ /* Make it accessible for SLP vectorization.  */
+ LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
 }
   else
 {
@@ -5973,7 +5975,6 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   stmt_vec_info rdef_info = stmt_info;
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
-  gcc_assert (!slp_node);
   double_reduc = true;
   stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
(stmt_info->stmt, 0));
@@ -6018,7 +6019,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 {
   outer_loop = loop;
   loop = loop->inner;
-  gcc_assert (!slp_node && double_reduc);
+  gcc_assert (double_reduc);
 }
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
@@ -6033,7 +6034,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 for induc_val, use initial_def.  */
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
-  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+  /* ???  Coverage for 'else' isn't clear.  */
 }
   else
 {
@@ -7532,15 +7533,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
   return true;
 }
-  if (slp_node)
-{
-  slp_node_instance->reduc_phis = slp_node;
-  /* ???  We're leaving slp_node to point to the PHIs, we only
-need it to get at the number of vector stmts which wasn't
-yet initialized for the instance root.  */
-}
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
+  if (gimple_bb (stmt_info->stmt) != loop->header)
+   {
+ /* For SLP we arrive here for both the inner loop LC PHI and
+the outer loop PHI.  The latter is what we want to analyze
+the reduction with.  */
+ gcc_assert (slp_node);
+ return true;
+   }
   use_operand_p use_p;
   gimple *use_stmt;
   bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
@@ -7549,6 +7551,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   phi_info = loop_vinfo->lookup_stmt (use_stmt);
 }
 
+  if (slp_node)
+{
+  slp_node_instance->reduc_phis = slp_node;
+  /* ???  We're leaving slp_node to point to the PHIs, we only
+need it to get at the number of vector stmts which wasn't
+yet initialized for the instance root.  */
+}
+
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
   gphi *reduc_def_phi = as_a  (phi_info->stmt);
@@ -7564,6 +7574,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
+  /* For double-reductions we start SLP analysis at the inner loop LC PHI
+ which is the def of the outer loop live stmt.  */
+  if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
+  && slp_node)
+slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
   while (reduc_def != PHI_RESULT (reduc_def_phi))
 {
   stmt_vec_info def = 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow single-lane COND_REDUCTION vectorization

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:27affc29b9de2cc35ec53c7263b23027d9932191

commit 27affc29b9de2cc35ec53c7263b23027d9932191
Author: Richard Biener 
Date:   Fri Mar 1 14:39:08 2024 +0100

Allow single-lane COND_REDUCTION vectorization

The following enables single-lane COND_REDUCTION vectorization.

* tree-vect-loop.cc (vect_create_epilog_for_reduction):
Adjust for single-lane COND_REDUCTION SLP vectorization.
(vectorizable_reduction): Likewise.
(vect_transform_cycle_phi): Likewise.

Diff:
---
 gcc/tree-vect-loop.cc | 100 +-
 1 file changed, 83 insertions(+), 17 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 7eeae908d367..195db5b1089e 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6028,7 +6028,13 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
-;
+{
+  /* Optimize: for induction condition reduction, if we can't use zero
+for induc_val, use initial_def.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
+   induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
+  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+}
   else
 {
   /* Optimize: for induction condition reduction, if we can't use zero
@@ -6073,23 +6079,46 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
 {
   auto_vec, 2> ccompares;
-  stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
-  cond_info = vect_stmt_to_vectorize (cond_info);
-  while (cond_info != reduc_info)
+  if (slp_node)
{
- if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+ slp_tree cond_node = slp_node_instance->root;
+ while (cond_node != slp_node_instance->reduc_phis)
{
- gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
- gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
- ccompares.safe_push
-   (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
-STMT_VINFO_REDUC_IDX (cond_info) == 2));
+ stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt
+   = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ /* ???  We probably want to have REDUC_IDX on the SLP node?  */
+ cond_node = SLP_TREE_CHILDREN
+   (cond_node)[STMT_VINFO_REDUC_IDX (cond_info)];
}
- cond_info
-   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
-1 + STMT_VINFO_REDUC_IDX
-   (cond_info)));
+   }
+  else
+   {
+ stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
  cond_info = vect_stmt_to_vectorize (cond_info);
+ while (cond_info != reduc_info)
+   {
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ cond_info
+   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
+1 + STMT_VINFO_REDUC_IDX
+(cond_info)));
+ cond_info = vect_stmt_to_vectorize (cond_info);
+   }
}
   gcc_assert (ccompares.length () != 0);
 
@@ -6502,7 +6531,8 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   /* 2.3 Create the reduction code, using one of the three schemes described
  above. In SLP we simply need to extract all the elements from the 
  vector (without reducing them), so we use scalar shifts.  */
-  else if (reduc_fn != IFN_LAST && !slp_reduc)
+  else if (reduc_fn != IFN_LAST
+  && (!slp_reduc || SLP_TREE_LANES (slp_node) == 1))
 {
   tree tmp;
   tree vec_elem_type;
@@ -7767,7 +7797,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   /* If 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Place easily identifiable assert instead of SIGSEGV

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:a1126dd1b7b0ba051d7d62de2c12b7affa2ecc34

commit a1126dd1b7b0ba051d7d62de2c12b7affa2ecc34
Author: Richard Biener 
Date:   Fri Mar 1 14:56:01 2024 +0100

Place easily identifiable assert instead of SIGSEGV

Better identification of known ICEs.

* tree-vect-stmts.cc (vect_is_simple_use): Assert instead of
SIGSEGV.

Diff:
---
 gcc/tree-vect-stmts.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 8fef72cb9072..ca81957def06 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -14009,6 +14009,10 @@ vect_is_simple_use (vec_info *vinfo, stmt_vec_info 
stmt, slp_tree slp_node,
   *vectype = SLP_TREE_VECTYPE (child);
   if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
{
+ /* ???  Instead of crashing, easier to identify.  But we
+need to think what to do with internal defs of VEC_PERM
+kind here.  */
+ gcc_assert (SLP_TREE_REPRESENTATIVE (child));
  *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
  return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
}


[gcc(refs/users/rguenth/heads/vect-force-slp)] Relax COND_EXPR reduction vectorization SLP restriction

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:1ba0215280e51f0cbc3c3867d6e8c07fc76694f8

commit 1ba0215280e51f0cbc3c3867d6e8c07fc76694f8
Author: Richard Biener 
Date:   Fri Feb 23 16:16:38 2024 +0100

Relax COND_EXPR reduction vectorization SLP restriction

Allow one-lane SLP but for the case where we need to swap the arms.

* tree-vect-stmts.cc (vectorizable_condition): Allow
single-lane SLP, but not when we need to swap then and
else clause.

Diff:
---
 gcc/tree-vect-stmts.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 72a9c144823c..8fef72cb9072 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12105,7 +12105,7 @@ vectorizable_condition (vec_info *vinfo,
 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
   if (for_reduction)
 {
-  if (slp_node)
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
return false;
   reduc_info = info_for_reduction (vinfo, stmt_info);
   reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -12194,6 +12194,10 @@ vectorizable_condition (vec_info *vinfo,
  cond_expr = NULL_TREE;
}
}
+  /* ???  The vectorized operand query below doesn't allow swapping
+this way for SLP.  */
+  if (slp_node)
+   return false;
   std::swap (then_clause, else_clause);
 }


[gcc(refs/users/rguenth/heads/vect-force-slp)] Amend --param vect-force-slp checking

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:362a1f1bc0c72c618737a634a88898e3f6142995

commit 362a1f1bc0c72c618737a634a88898e3f6142995
Author: Richard Biener 
Date:   Fri Feb 23 12:39:57 2024 +0100

Amend --param vect-force-slp checking

This makes sure no non-SLP code-gen happens.

* tree-vect-stmts.cc (vect_transform_stmt): Assert no
non-SLP code-gen happens with --param vect-force-slp=1.

Diff:
---
 gcc/tree-vect-stmts.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b694cc4a8373..72a9c144823c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13373,6 +13373,8 @@ vect_transform_stmt (vec_info *vinfo,
 
   gcc_assert (slp_node || !PURE_SLP_STMT (stmt_info));
 
+  gcc_assert (!param_vect_force_slp || slp_node);
+
   tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
   if (slp_node)
 STMT_VINFO_VECTYPE (stmt_info) = SLP_TREE_VECTYPE (slp_node);


[gcc(refs/users/rguenth/heads/vect-force-slp)] Do single-lane SLP discovery for reductions

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:b423891ad43d003a565e7b5c6ed648e446bd3c7c

commit b423891ad43d003a565e7b5c6ed648e446bd3c7c
Author: Richard Biener 
Date:   Fri Feb 23 11:45:50 2024 +0100

Do single-lane SLP discovery for reductions

The following performs single-lane SLP discovery for reductions.
This exposes a latent issue with reduction SLP in outer loop
vectorization and makes gcc.dg/vect/vect-outer-4[fgkl].c FAIL
execution.

* tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane
discoveries are reduction chains and need special backedge
treatment.
(vect_analyze_slp): Fall back to single-lane SLP discovery
for reductions. Make sure to try single-lane SLP reduction
for all reductions as fallback.

Diff:
---
 gcc/tree-vect-slp.cc | 58 +++-
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ecc185aae885..f39cde3a8d50 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1918,7 +1918,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
/* Reduction chain backedge defs are filled manually.
   ???  Need a better way to identify a SLP reduction chain PHI.
   Or a better overall way to SLP match those.  */
-   if (all_same && def_type == vect_reduction_def)
+   if (stmts.length () > 1
+   && all_same && def_type == vect_reduction_def)
  skip_args[loop_latch_edge (loop)->dest_idx] = true;
  }
else if (def_type != vect_internal_def)
@@ -3911,7 +3912,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
  }
 
   /* Find SLP sequences starting from groups of reductions.  */
-  if (loop_vinfo->reductions.length () > 1)
+  if (loop_vinfo->reductions.length () > 0)
{
  /* Collect reduction statements.  */
  vec scalar_stmts;
@@ -3934,17 +3935,54 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
  && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
  && gimple_assign_rhs_code (g) != SAD_EXPR)))
scalar_stmts.quick_push (next_info);
+ else if (param_vect_single_lane_slp != 0)
+   {
+ vec stmts;
+ vec roots = vNULL;
+ vec remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (next_info);
+ bool res = vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, ,
+ bst_map, NULL);
+ gcc_assert (res);
+   }
}
- if (scalar_stmts.length () > 1)
+ vec roots = vNULL;
+ vec remain = vNULL;
+ vec saved_stmts = vNULL;
+ if (param_vect_single_lane_slp != 0)
+   /* ???  scalar_stmts ownership and arg passing sucks.  */
+   saved_stmts = scalar_stmts.copy ();
+ if ((scalar_stmts.length () <= 1
+  || !vect_build_slp_instance (loop_vinfo,
+   slp_inst_kind_reduc_group,
+   scalar_stmts, roots, remain,
+   max_tree_size, , bst_map,
+   NULL))
+ && param_vect_single_lane_slp != 0)
{
- vec roots = vNULL;
- vec remain = vNULL;
- vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
-  scalar_stmts, roots, remain,
-  max_tree_size, , bst_map, NULL);
+ if (scalar_stmts.length () <= 1)
+   scalar_stmts.release ();
+ /* Do SLP discovery for single-lane reductions.  */
+ for (auto stmt_info : saved_stmts)
+   {
+ vec stmts;
+ vec roots = vNULL;
+ vec remain = vNULL;
+ stmts.create (1);
+ stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+ bool res = vect_build_slp_instance (vinfo,
+ slp_inst_kind_reduc_group,
+ stmts, roots, remain,
+ max_tree_size, ,
+ bst_map, NULL);
+ gcc_assert (res);
+   }
+ saved_stmts.release ();
}
- else
-   scalar_stmts.release ();
}
 }


[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid SLP build failure for unsupported shifts

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:95f3686d4788d7420422d514996c5b6e7a8facfd

commit 95f3686d4788d7420422d514996c5b6e7a8facfd
Author: Richard Biener 
Date:   Thu Oct 5 14:07:02 2023 +0200

Avoid SLP build failure for unsupported shifts

When asserting that SLP _build_ can succeed we run into the SLP
shift discovery code trying to be clever doing vectorizable_shifts
work and failing discovery already.  That gives a false impression
for now, so disable that when we do single-lane builds.

* tree-vect-slp.cc (vect_build_slp_tree_1): Do not fail
fatally for shifts not supported by the target when discovering
a single lane.

Diff:
---
 gcc/tree-vect-slp.cc | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 6bfc59dc2131..ecc185aae885 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1215,7 +1215,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char 
*swap,
  if (!directly_supported_p (rhs_code, vectype, optab_vector))
{
  /* No vector/vector shift, try for a vector/scalar shift.  */
- if (!directly_supported_p (rhs_code, vectype, optab_scalar))
+ if (!directly_supported_p (rhs_code, vectype, optab_scalar)
+ /* ???  We are using this to guide operand swapping to
+eventually make all shift operands the same but we
+shouldn't fail in the end - that's be business of
+vectorizable_shift.
+Avoid spurious ICEs for single-lane discovery.  */
+ && group_size != 1)
{
  if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,


[gcc(refs/users/rguenth/heads/vect-force-slp)] Reduce single-lane SLP testresult noise

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:e8ba59ef5c6284604f3c0920e246ed4cf889e541

commit e8ba59ef5c6284604f3c0920e246ed4cf889e541
Author: Richard Biener 
Date:   Thu Oct 5 13:31:16 2023 +0200

Reduce single-lane SLP testresult noise

The following avoids dumping 'vectorizing stmts using SLP' for
single-lane instances since that causes extra testsuite fallout.

* tree-vect-slp.cc (vect_schedule_slp): Gate dumping
'vectorizing stmts using SLP' on > 1 lanes.

Diff:
---
 gcc/tree-vect-slp.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ef0199cf3fb2..6bfc59dc2131 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10055,7 +10055,8 @@ vect_schedule_slp (vec_info *vinfo, const 
vec<slp_instance> &slp_instances)
   if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
vectorize_slp_instance_root_stmt (node, instance);
 
-  if (dump_enabled_p ())
+  /* ???  Reduce some testsuite noise because of "more SLP".  */
+  if (SLP_TREE_LANES (node) > 1 && dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
  "vectorizing stmts using SLP.\n");
 }


[gcc(refs/users/rguenth/heads/vect-force-slp)] Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with single-lane SLP

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:307f09a496e4476c006e8b1fe56b396a465c9413

commit 307f09a496e4476c006e8b1fe56b396a465c9413
Author: Richard Biener 
Date:   Wed Oct 4 14:34:18 2023 +0200

Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with single-lane SLP

* tree-vect-stmts.cc (vectorizable_load): Add FIXME to
PR60276 fix.

Diff:
---
 gcc/tree-vect-stmts.cc | 5 +
 1 file changed, 5 insertions(+)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 46e4edb5e36f..b694cc4a8373 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9951,6 +9951,11 @@ vectorizable_load (vec_info *vinfo,
 
   /* Invalidate assumptions made by dependence analysis when vectorization
  on the unrolled body effectively re-orders stmts.  */
+  /* ???  This fails to trigger with single-lane SLP, gcc.dg/vect/pr60276.c,
+ but simply removing the ncopies > 1 conditional here (and below) will
+ cause FAILs of gcc.dg/vect/no-vfa-vect-depend-3.c and
+ gcc.dg/vect/tsvc/vect-tsvc-s3251.c.  The original fix (for PR60276)
+ needs to be re-thought.  */
   if (ncopies > 1
   && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
   && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),


[gcc(refs/users/rguenth/heads/vect-force-slp)] Avoid splitting store dataref groups during SLP discovery

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:9315bfc661432c3ad82a7ade21359d5c078dc41b

commit 9315bfc661432c3ad82a7ade21359d5c078dc41b
Author: Richard Biener 
Date:   Fri Sep 29 13:13:16 2023 +0200

Avoid splitting store dataref groups during SLP discovery

The following avoids splitting store dataref groups during SLP
discovery but instead forces (eventually single-lane) consecutive
lane SLP discovery for all lanes of the group, creating a VEC_PERM
SLP node merging them so the store will always cover the whole group.

I figured the patched function needs some refactoring so this is
in draft state indenting-wise.  With this for example

int x[1024], y[1024], z[1024], w[1024];
void foo (void)
{
  for (int i = 0; i < 256; i++)
{
  x[4*i+0] = y[2*i+0];
  x[4*i+1] = y[2*i+1];
  x[4*i+2] = z[i];
  x[4*i+3] = w[i];
}
}

which was previously using hybrid SLP can now be fully SLPed and
SSE code generated looks better (but of course you never know,
I didn't actually benchmark).  We of course need a VF of four here.

.L2:
movdqa  z(%rax), %xmm0
movdqa  w(%rax), %xmm4
movdqa  y(%rax,%rax), %xmm2
movdqa  y+16(%rax,%rax), %xmm1
movdqa  %xmm0, %xmm3
punpckhdq   %xmm4, %xmm0
punpckldq   %xmm4, %xmm3
movdqa  %xmm2, %xmm4
shufps  $238, %xmm3, %xmm2
movaps  %xmm2, x+16(,%rax,4)
movdqa  %xmm1, %xmm2
shufps  $68, %xmm3, %xmm4
shufps  $68, %xmm0, %xmm2
movaps  %xmm4, x(,%rax,4)
shufps  $238, %xmm0, %xmm1
movaps  %xmm2, x+32(,%rax,4)
movaps  %xmm1, x+48(,%rax,4)
addq$16, %rax
cmpq$1024, %rax
jne .L2

The extra permute nodes unfortunately sometimes do not behave
nicely wrt vect_is_simple_use since when the source is an
invariant or external there's no def stmt we can fake as
representative but vect_is_simple_use eventually gets the
caller the scalar operand and its definition.  One might
argue using SLP_TREE_OPS and getting an external def would
maybe be more to the point, also since permute optimization
could change whether or not that appears.

* tree-vect-slp.cc (vect_build_slp_instance): Do not split
dataref groups on discovery failure.

Diff:
---
 gcc/tree-vect-slp.cc | 171 ++-
 1 file changed, 168 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 3a078b253df5..ef0199cf3fb2 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3476,8 +3476,6 @@ vect_build_slp_instance (vec_info *vinfo,
   else
 {
   /* Failed to SLP.  */
-  /* Free the allocated memory.  */
-  scalar_stmts.release ();
 }
 
   stmt_vec_info stmt_info = stmt_info_;
@@ -3496,6 +3494,8 @@ vect_build_slp_instance (vec_info *vinfo,
   if (is_a <bb_vec_info> (vinfo)
  && (i > 1 && i < group_size))
{
+/* Free the allocated memory.  */
+scalar_stmts.release ();
  tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
@@ -3542,7 +3542,10 @@ vect_build_slp_instance (vec_info *vinfo,
 
   /* For loop vectorization split into arbitrary pieces of size > 1.  */
   if (is_a <loop_vec_info> (vinfo)
- && (i > 1 && i < group_size)
+ && ((i > 1 && i < group_size)
+ /* For single-lane SLP when only the first lane didn't fail
+also split to single-lanes.  */
+ || (i > 0 && i < group_size && param_vect_single_lane_slp != 0))
  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
{
  unsigned group1_size = i;
@@ -3551,6 +3554,164 @@ vect_build_slp_instance (vec_info *vinfo,
dump_printf_loc (MSG_NOTE, vect_location,
 "Splitting SLP group at stmt %u\n", i);
 
+ if (param_vect_single_lane_slp != 0)
+   {
+ /* Analyze the stored values and pinch them together with
+a permute node so we can preserve the whole store group.  */
+ auto_vec<slp_tree> rhs_nodes;
+
+  /* Calculate the unrolling factor based on the smallest type.  */
+  poly_uint64 unrolling_factor = 1;
+
+ unsigned int start = 0, end = i;
+ while (start < group_size)
+   {
+ gcc_assert (end - start >= 1);
+ vec<stmt_vec_info> substmts;
+ substmts.create (end - start);
+ for (unsigned j = start; j < end; ++j)
+   substmts.quick_push (scalar_stmts[j]);
+ max_nunits = 1;
+ node = vect_build_slp_tree (vinfo, substmts, end - start,
+ 

[gcc(refs/users/rguenth/heads/vect-force-slp)] Do not account single-lane SLP graphs against discovery limit

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:fa0f551f3931529d5be72140f5a37ed02d0e0366

commit fa0f551f3931529d5be72140f5a37ed02d0e0366
Author: Richard Biener 
Date:   Fri Sep 29 15:12:54 2023 +0200

Do not account single-lane SLP graphs against discovery limit

The following avoids accounting single-lane SLP to the discovery
limit.  Even when raising it the attempt of forming multi-lane SLP
can exhaust the limit before we fall back to single-lane.

* tree-vect-slp.cc (vect_build_slp_tree): Only account
multi-lane SLP to limit.

Diff:
---
 gcc/tree-vect-slp.cc | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 66c8fa38979f..3a078b253df5 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1725,21 +1725,26 @@ vect_build_slp_tree (vec_info *vinfo,
   SLP_TREE_SCALAR_STMTS (res) = stmts;
   bst_map->put (stmts.copy (), res);
 
-  if (*limit == 0)
+  /* Single-lane SLP doesn't have the chance of run-away, do not account
+ it to the limit.  */
+  if (stmts.length () > 1)
 {
-  if (dump_enabled_p ())
-   dump_printf_loc (MSG_NOTE, vect_location,
-"SLP discovery limit exceeded\n");
-  /* Mark the node invalid so we can detect those when still in use
-as backedge destinations.  */
-  SLP_TREE_SCALAR_STMTS (res) = vNULL;
-  SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
-  res->failed = XNEWVEC (bool, group_size);
-  memset (res->failed, 0, sizeof (bool) * group_size);
-  memset (matches, 0, sizeof (bool) * group_size);
-  return NULL;
+  if (*limit == 0)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"SLP discovery limit exceeded\n");
+ /* Mark the node invalid so we can detect those when still in use
+as backedge destinations.  */
+ SLP_TREE_SCALAR_STMTS (res) = vNULL;
+ SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
+ res->failed = XNEWVEC (bool, group_size);
+ memset (res->failed, 0, sizeof (bool) * group_size);
+ memset (matches, 0, sizeof (bool) * group_size);
+ return NULL;
+   }
+  --*limit;
 }
-  --*limit;
 
   if (dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location,


[gcc(refs/users/rguenth/heads/vect-force-slp)] Allow bigger SLP graphs

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:6b597bdb72177699178e238e3da713dc222a0eeb

commit 6b597bdb72177699178e238e3da713dc222a0eeb
Author: Richard Biener 
Date:   Fri Sep 29 13:05:01 2023 +0200

Allow bigger SLP graphs

When doing single-lane SLP discovery only we're easily running into
the SLP graph size limit when patterns are involved.  The following
ups the limit from the number of scalar stmts to the number of
scalar or pattern stmts by using the number of stmt_vec_infos created.

* tree-vect-loop.cc (vect_analyze_loop_2): Use the number
of stmt_vec_infos created to limit the SLP graph size.

Diff:
---
 gcc/tree-vect-loop.cc | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 361aec064884..7eeae908d367 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2855,8 +2855,10 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal,
   if (slp)
 {
   /* Check the SLP opportunities in the loop, analyze and build
-SLP trees.  */
-  ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
+SLP trees.  Use the number of stmt_vec_infos as graph limit
+since that also includes pattern stmts which LOOP_VINFO_N_STMTS
+does not.  */
+  ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length ());
   if (!ok)
return ok;


[gcc(refs/users/rguenth/heads/vect-force-slp)] Handle non-grouped SLP stores

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:540ffaa0d720ed04bb083857230ecd583662f8cc

commit 540ffaa0d720ed04bb083857230ecd583662f8cc
Author: Richard Biener 
Date:   Wed Oct 4 14:32:39 2023 +0200

Handle non-grouped SLP stores

The following adjusts vectorizable_store to properly handle
non-grouped SLP stores to update vec_num.

* tree-vect-stmts.cc (vectorizable_store): Always set
vec_num for SLP.

Diff:
---
 gcc/tree-vect-stmts.cc | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index f99dce38bf7b..46e4edb5e36f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8361,10 +8361,12 @@ vectorizable_store (vec_info *vinfo,
   return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, 
ncopies);
 }
 
-  if (grouped_store)
+  if (grouped_store || slp)
 {
   /* FORNOW */
-  gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt_info));
+  gcc_assert (!grouped_store
+ || !loop
+ || !nested_in_vect_loop_p (loop, stmt_info));
 
   if (slp)
 {
@@ -8373,8 +8375,9 @@ vectorizable_store (vec_info *vinfo,
  group.  */
   vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
- gcc_assert (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
- == first_stmt_info);
+ gcc_assert (!STMT_VINFO_GROUPED_ACCESS (first_stmt_info)
+ || (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
+ == first_stmt_info));
  first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
  op = vect_get_store_rhs (first_stmt_info);
 }


[gcc(refs/users/rguenth/heads/vect-force-slp)] Add --param vect-single-lane-slp

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:e2d9709cdc50d68ed0e662634d8608c6f8491888

commit e2d9709cdc50d68ed0e662634d8608c6f8491888
Author: Richard Biener 
Date:   Fri Sep 29 12:54:17 2023 +0200

Add --param vect-single-lane-slp

The following adds --param vect-single-lane-slp to guard single-lane
loop SLP discovery.  As first client we look at non-grouped stores
with an assert that SLP discovery works to discover gaps in it.

* params.opt (-param=vect-single-lane-slp=): New.
* tree-vect-slp.cc (vect_analyze_slp): Perform single-lane
loop SLP discovery for non-grouped stores if requested.

Diff:
---
 gcc/params.opt   |  4 
 gcc/tree-vect-slp.cc | 26 ++
 2 files changed, 30 insertions(+)

diff --git a/gcc/params.opt b/gcc/params.opt
index 74ea9c6f8d93..4cde5c3015ae 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1198,6 +1198,10 @@ The maximum factor which the loop vectorizer applies to 
the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 
1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=vect-single-lane-slp=
+Common Joined UInteger Var(param_vect_single_lane_slp) Init(0) IntegerRange(0, 
1) Param Optimization
+Enable single lane SLP discovery.
+
 -param=vect-force-slp=
 Common Joined UInteger Var(param_vect_force_slp) Init(0) IntegerRange(0, 1) 
Param Optimization
 Fail vectorization when falling back to non-SLP.
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index f34ed54a70b0..66c8fa38979f 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3643,6 +3643,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
 opt_result
 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 {
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   unsigned int i;
   stmt_vec_info first_element;
   slp_instance instance;
@@ -3658,6 +3659,31 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
 vect_analyze_slp_instance (vinfo, bst_map, first_element,
   slp_inst_kind_store, max_tree_size, &limit);
+  if (loop_vinfo && param_vect_single_lane_slp != 0)
+{
+  data_reference_p dr;
+  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
+   if (DR_IS_WRITE (dr))
+ {
+   stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
+   /* It works a bit to dissolve the group but that's
+  not really what we want to do.  Instead group analysis
+  above starts discovery for each lane and pieces them together
+  to a single store to the whole group.  */
+   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+ continue;
+   vec<stmt_vec_info> stmts;
+   vec<stmt_vec_info> roots = vNULL;
+   vec<tree> remain = vNULL;
+   stmts.create (1);
+   stmts.quick_push (stmt_info);
+   bool res = vect_build_slp_instance (vinfo, slp_inst_kind_store,
+   stmts, roots, remain,
+   max_tree_size, &limit,
+   bst_map, NULL);
+   gcc_assert (res);
+ }
+}
 
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
 {


[gcc(refs/users/rguenth/heads/vect-force-slp)] Fail vectorization when not SLP with --param vect-force-slp=1

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:51d831bd7cd122511d03efcc3da2de343a16553a

commit 51d831bd7cd122511d03efcc3da2de343a16553a
Author: Richard Biener 
Date:   Wed Aug 23 10:48:32 2023 +0200

Fail vectorization when not SLP with --param vect-force-slp=1

The following adds --param vect-force-slp allowing to indicate failure
when not all stmts participating in loop vectorization are using
SLP vectorization.

This is intended for transitioning and debugging.

Enabling this without further changes results in the following
within vect.exp on x86_64

=== g++ Summary ===

-# of expected passes   619
+# of expected passes   546
+# of unexpected failures   73

=== gcc Summary ===

-# of expected passes   8835
-# of expected failures 256
+# of expected passes   7271
+# of unexpected failures   1564
+# of unexpected successes  12
+# of expected failures 244

=== gfortran Summary ===

-# of expected passes   171
+# of expected passes   144
+# of unexpected failures   27

* params.opt (-param=vect-force-slp=): New, default to 0.
* doc/invoke.texi (--param vect-force-slp): Document.
* tree-vect-stmts.cc (vect_analyze_stmt): With
--param vect-force-slp=1 fail vectorization when not using SLP.

Diff:
---
 gcc/doc/invoke.texi| 4 
 gcc/params.opt | 4 
 gcc/tree-vect-stmts.cc | 6 ++
 3 files changed, 14 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index ddcd5213f06a..3bd02fb13e5e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16747,6 +16747,10 @@ this parameter.  The default value of this parameter 
is 50.
 @item vect-induction-float
 Enable loop vectorization of floating point inductions.
 
+@item vect-force-slp
+Fail vectorization when falling back to non-SLP.  This is intended for
+debugging only.
+
 @item vrp-sparse-threshold
 Maximum number of basic blocks before VRP uses a sparse bitmap cache.
 
diff --git a/gcc/params.opt b/gcc/params.opt
index d34ef545bf03..74ea9c6f8d93 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1198,6 +1198,10 @@ The maximum factor which the loop vectorizer applies to 
the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 
1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=vect-force-slp=
+Common Joined UInteger Var(param_vect_force_slp) Init(0) IntegerRange(0, 1) 
Param Optimization
+Fail vectorization when falling back to non-SLP.
+
 -param=vrp-sparse-threshold=
 Common Joined UInteger Var(param_vrp_sparse_threshold) Init(3000) Optimization 
Param
 Maximum number of basic blocks before VRP uses a sparse bitmap cache.
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b8a71605f1bc..f99dce38bf7b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13257,6 +13257,12 @@ vect_analyze_stmt (vec_info *vinfo,
   return opt_result::success ();
 }
 
+  if (param_vect_force_slp && !node)
+return opt_result::failure_at (stmt_info->stmt,
+  "not vectorized:"
+  " not part of SLP but SLP forced: %G",
+  stmt_info->stmt);
+
   ok = true;
   if (!bb_vinfo
   && (STMT_VINFO_RELEVANT_P (stmt_info)


[gcc/rguenth/heads/vect-force-slp] (1426 commits) PR60276 fix for single-lane SLP

2024-05-13 Thread Richard Biener via Gcc-cvs
The branch 'rguenth/heads/vect-force-slp' was updated to point to:

 8a9b159a8608... PR60276 fix for single-lane SLP

It previously pointed to:

 3a1fe1d6d941... Improve combined store node splitting

Diff:

!!! WARNING: THE FOLLOWING COMMITS ARE NO LONGER ACCESSIBLE (LOST):
---

  3a1fe1d... Improve combined store node splitting
  86287e0... Add single-lane SLP support to .GOMP_SIMD_LANE vectorizatio
  e2bef5c... Fix last commit WRT patterns
  fa45f8f... Handle unused-only-live stmts in SLP discovery
  d1b8915... Avoid bogus SLP outer loop vectorization
  89b9eee... Fix non-grouped SLP load/store accounting in alignment peel
  27d303e... Allow single-lane SLP in-order reductions
  bc49f0d... Add double reduction support for SLP vectorization
  c316aa7... Allow single-lane COND_REDUCTION vectorization
  8cdfb70... Place easily identifyable assert insead of SIGSEV
  e672d56... Refactor SLP reduction group discovery
  d17ef2e... Allow patterns in SLP reductions
  74e7541... Relax COND_EXPR reduction vectorization SLP restriction
  c917156... Amend --param vect-force-slp checking
  b992691... Do single-lane SLP discovery for reductions
  598e22d... Fix SLP reduction initial value for pointer reductions
  98f3724... PR60276 fix for single-lane SLP
  ef49bc6... Avoid SLP build failure for unsupported shifts
  a373df9... Reduce single-lane SLP testresult noise
  69d5454... Add FIXME note regarding gcc.dg/vect/pr60276.c runfail with
  04db3df... Avoid splitting store dataref groups during SLP discovery
  9db0573... Do not account single-lane SLP graphs against discovery lim
  305009b... Allow bigger SLP graphs
  248bd7d... Guard SLP optimize latch edge discovery
  0ba91db... Handle non-grouped SLP stores
  f9c2a5d... Add --param vect-single-lane-slp
  e5d482c... Fail vectorization when not SLP with --param vect-force-slp


Summary of changes (added commits):
---

  8a9b159... PR60276 fix for single-lane SLP (*)
  c4af8eb... testsuite: c++: Allow for std::printf in g++.dg/modules/std (*)
  fb1649f... libstdc++: Use __builtin_shufflevector for simd split and c (*)
  898d714... Refactor SLP reduction group discovery (*)
  b621482... tree-ssa-math-opts: Pattern recognize yet another .ADD_OVER (*)
  f3f02a7... Manually add ChangeLog entry for r15-353-gd7bb8eaade3cd3aa7 (*)
  f2d1189... Daily bump. (*)
  5de0753... ada: Move Init_Proc_Level_Formal from Exp_Ch3 to Exp_Util (*)
  51b84f2... ada: Remove code that expected pre/post being split into co (*)
  a004159... ada: Revert recent change for Put_Image and Object_Size att (*)
  6d13384... ada: Rename finalization scope masters into finalization ma (*)
  a9c07b8... ada: Remove dynamic frame in System.Image_D and document it (*)
  7e348a4... ada: Attributes Put_Image and Object_Size are defined by Ad (*)
  c1ece0c... ada: Remove guards against traversal of empty list of aspec (*)
  b3eef3b... ada: Fix crash on Compile_Time_Warning in dead code (*)
  f7e1dde... ada: Deconstruct flag Split_PPC since splitting now is done (*)
  3aa99be... ada: Move splitting of pre/post aspect expressions to expan (*)
  1de93ed... ada: Fix style in comments (*)
  da88475... ada: Refine type of a local variable (*)
  32fe73e... ada: Recognize pragma Lock_Free as specific to GNAT (*)
  4768f3d... ada: Deconstruct unused flag Is_Expanded_Contract (*)
  7f12896... ada: Refactor repeated code for querying Boolean-valued asp (*)
  65c0029... ada: Complete implementation of Ada 2022 aspect Exclusive_F (*)
  0533acf... ada: Rewrite Append_Entity_Name; skip irrelevant names (*)
  c52bfe6... ada: Couple of comment tweaks to latest change (*)
  c8e01e7... ada: Replace finalization masters with finalization collect (*)
  eff0e26... ada: Remove deprecated VxWorks interrupt connection API (*)
  56e781f... ada: Decouple finalization masters from storage pools (*)
  c1b33f8... ada: Small cleanup in the BIP machinery (*)
  8d6c7fc... ada: Restore fix for controlled dynamic allocation with BIP (*)
  8e76c18... ada: Avoid crash on illegal constrained type declarations (*)
  2fc8ea4... ada: Fix pragma Compile_Time_Error for alignment of array t (*)
  c573c56... ada: Enable casing on composite via -X0 instead of -X (*)
  5270bfc... ada: Fix internal error with Put_Image aspect on access-to- (*)
  a14dc3e... ada: Simplify uses of readdir_gnat with object overlay (*)
  105bba8... ada: Refactor GNAT.Directory_Operations.Read to minimise ru (*)
  0a82463... ada: Compiler crash on nonstatic container aggregates for D (*)
  2d0eeb5... Fortran: Fix wrong code in unlimited polymorphic assignment (*)
  0c6dd4b... Revert "MIPS: Support constraint 'w' for MSA instruction" (*)
  4aeff84... MAINTAINERS: Add myself to write after approval (*)
  13b6ac4... Fortran: fix frontend memleak (*)
  4607799... arm: Use utxb rN, rM, ror #8 to implement zero_extract on a (*)
  83fb5e6... [to-be-committed,RISC-V] Improve usage of slli.uw in 

[gcc r15-431] PR60276 fix for single-lane SLP

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:8a9b159a86081053289be0c44339623ff59717a2

commit r15-431-g8a9b159a86081053289be0c44339623ff59717a2
Author: Richard Biener 
Date:   Thu Nov 9 11:30:22 2023 +0100

PR60276 fix for single-lane SLP

When enabling single-lane SLP and not splitting groups the fix for
PR60276 is no longer effective since it for unknown reason exempted
pure SLP.  The following removes this exemption, making
gcc.dg/vect/pr60276.c PASS even with --param vect-single-lane-slp=1

PR tree-optimization/60276
* tree-vect-stmts.cc (vectorizable_load): Do not exempt
pure_slp grouped loads from the STMT_VINFO_MIN_NEG_DIST
restriction.

Diff:
---
 gcc/tree-vect-stmts.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 21e8fe98e44a..b8a71605f1bc 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9995,8 +9995,7 @@ vectorizable_load (vec_info *vinfo,
 
   /* Invalidate assumptions made by dependence analysis when vectorization
 on the unrolled body effectively re-orders stmts.  */
-  if (!PURE_SLP_STMT (stmt_info)
- && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
+  if (STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
  && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
   STMT_VINFO_MIN_NEG_DIST (stmt_info)))
{


[gcc r13-8763] rtl-optimization/54052 - RTL SSA PHI insertion compile-time hog

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:d629308c699bb8fe90c2afeb7fa1acb12cb5526b

commit r13-8763-gd629308c699bb8fe90c2afeb7fa1acb12cb5526b
Author: Richard Biener 
Date:   Mon Feb 19 11:10:50 2024 +0100

rtl-optimization/54052 - RTL SSA PHI insertion compile-time hog

The following tries to address the PHI insertion compile-time hog in
RTL fwprop observed with the PR54052 testcase where the loop computing
the "unfiltered" set of variables possibly needing PHI nodes for each
block exhibits quadratic compile-time and memory-use.

It does so by pruning the local DEFs with LR_OUT of the block, removing
regs that can never be LR_IN (defined by this block) in the dominance
frontier.

PR rtl-optimization/54052
* rtl-ssa/blocks.cc (function_info::place_phis): Filter
local defs by LR_OUT.

(cherry picked from commit c7151283dc747769d4ac4f216d8f519bda2569b5)

Diff:
---
 gcc/rtl-ssa/blocks.cc | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/gcc/rtl-ssa/blocks.cc b/gcc/rtl-ssa/blocks.cc
index 1f9969d78d88..0ee9fa0d6a2a 100644
--- a/gcc/rtl-ssa/blocks.cc
+++ b/gcc/rtl-ssa/blocks.cc
@@ -639,7 +639,12 @@ function_info::place_phis (build_info &bi)
   if (bitmap_empty_p (&frontiers[b1]))
continue;
 
-  bitmap b1_def = &DF_LR_BB_INFO (BASIC_BLOCK_FOR_FN (m_fn, b1))->def;
+  // Defs in B1 that are possibly in LR_IN in the dominance frontier
+  // blocks.
+  auto_bitmap b1_def;
+  bitmap_and (b1_def, &DF_LR_BB_INFO (BASIC_BLOCK_FOR_FN (m_fn, b1))->def,
+ DF_LR_OUT (BASIC_BLOCK_FOR_FN (m_fn, b1)));
+
   bitmap_iterator bmi;
   unsigned int b2;
   EXECUTE_IF_SET_IN_BITMAP (&frontiers[b1], 0, b2, bmi)


[gcc r15-428] Refactor SLP reduction group discovery

2024-05-13 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:898d7145fb90734c9981555ec099710d87fc05af

commit r15-428-g898d7145fb90734c9981555ec099710d87fc05af
Author: Richard Biener 
Date:   Fri Mar 1 12:08:36 2024 +0100

Refactor SLP reduction group discovery

The following refactors a bit how we perform SLP reduction group
discovery possibly making it easier to have multiple reduction
groups later, esp. with single-lane SLP.

* tree-vect-slp.cc (vect_analyze_slp_instance): Remove
slp_inst_kind_reduc_group handling.
(vect_analyze_slp): Add the meat here.

Diff:
---
 gcc/tree-vect-slp.cc | 67 ++--
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 8c18f5308e2e..f34ed54a70b0 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3586,7 +3586,6 @@ vect_analyze_slp_instance (vec_info *vinfo,
   slp_instance_kind kind,
   unsigned max_tree_size, unsigned *limit)
 {
-  unsigned int i;
   vec<stmt_vec_info> scalar_stmts;
 
   if (is_a <loop_vec_info> (vinfo))
@@ -3620,35 +3619,6 @@ vect_analyze_slp_instance (vec_info *vinfo,
   STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
 }
-  else if (kind == slp_inst_kind_reduc_group)
-{
-  /* Collect reduction statements.  */
-  const vec<stmt_vec_info> &reductions
-   = as_a <loop_vec_info> (vinfo)->reductions;
-  scalar_stmts.create (reductions.length ());
-  for (i = 0; reductions.iterate (i, &next_info); i++)
-   {
- gassign *g;
- next_info = vect_stmt_to_vectorize (next_info);
- if ((STMT_VINFO_RELEVANT_P (next_info)
-  || STMT_VINFO_LIVE_P (next_info))
- /* ???  Make sure we didn't skip a conversion around a reduction
-path.  In that case we'd have to reverse engineer that
-conversion stmt following the chain using reduc_idx and from
-the PHI using reduc_def.  */
- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
- /* Do not discover SLP reductions for lane-reducing ops, that
-will fail later.  */
- && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
- || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR
- && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
- && gimple_assign_rhs_code (g) != SAD_EXPR)))
-   scalar_stmts.quick_push (next_info);
-   }
-  /* If less than two were relevant/live there's nothing to SLP.  */
-  if (scalar_stmts.length () < 2)
-   return false;
-}
   else
 gcc_unreachable ();
 
@@ -3740,9 +3710,40 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
 
   /* Find SLP sequences starting from groups of reductions.  */
   if (loop_vinfo->reductions.length () > 1)
-   vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
-  slp_inst_kind_reduc_group, max_tree_size,
-  &limit);
+   {
+ /* Collect reduction statements.  */
+ vec<stmt_vec_info> scalar_stmts;
+ scalar_stmts.create (loop_vinfo->reductions.length ());
+ for (auto next_info : loop_vinfo->reductions)
+   {
+ gassign *g;
+ next_info = vect_stmt_to_vectorize (next_info);
+ if ((STMT_VINFO_RELEVANT_P (next_info)
+  || STMT_VINFO_LIVE_P (next_info))
+ /* ???  Make sure we didn't skip a conversion around a
+reduction path.  In that case we'd have to reverse
+engineer that conversion stmt following the chain using
+reduc_idx and from the PHI using reduc_def.  */
+ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
+ /* Do not discover SLP reductions for lane-reducing ops, that
+will fail later.  */
+ && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+ || (gimple_assign_rhs_code (g) != DOT_PROD_EXPR
+ && gimple_assign_rhs_code (g) != WIDEN_SUM_EXPR
+ && gimple_assign_rhs_code (g) != SAD_EXPR)))
+   scalar_stmts.quick_push (next_info);
+   }
+ if (scalar_stmts.length () > 1)
+   {
+ vec<stmt_vec_info> roots = vNULL;
+ vec<tree> remain = vNULL;
+ vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
+  scalar_stmts, roots, remain,
+  max_tree_size, &limit, bst_map, NULL);
+   }
+ else
+   scalar_stmts.release ();
+   }
 }
 
   hash_set visited_patterns;


[gcc r15-362] tree-optimization/114998 - use-after-free with loop distribution

2024-05-10 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:34d15a4d630a0d54eddb99bdab086c506e10dac5

commit r15-362-g34d15a4d630a0d54eddb99bdab086c506e10dac5
Author: Richard Biener 
Date:   Fri May 10 14:19:49 2024 +0200

tree-optimization/114998 - use-after-free with loop distribution

When loop distribution releases a PHI node of the original IL it
can end up clobbering memory that's re-used when it upon releasing
its RDG resets all stmt UIDs back to -1, even those that got released.

The fix is to avoid resetting UIDs based on stmts in the RDG but
instead reset only those still present in the loop.

PR tree-optimization/114998
* tree-loop-distribution.cc (free_rdg): Take loop argument.
Reset UIDs of stmts still in the IL rather than all stmts
referenced from the RDG.
(loop_distribution::build_rdg): Pass loop to free_rdg.
(loop_distribution::distribute_loop): Likewise.
(loop_distribution::transform_reduction_loop): Likewise.

* gcc.dg/torture/pr114998.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/torture/pr114998.c | 35 +
 gcc/tree-loop-distribution.cc   | 24 --
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/torture/pr114998.c 
b/gcc/testsuite/gcc.dg/torture/pr114998.c
new file mode 100644
index ..81fc1e077cb9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr114998.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fno-tree-dce -ftree-loop-distribution" } */
+
+short a, d;
+int b, c, f, g, h, i, j[2], o;
+__attribute__((const)) int s(char r);
+int main() {
+  int l, m, k, n;
+  if (b) {
+char p;
+for (; p >= 0; p--) {
+  int e[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
+ 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
+ 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  if (j[p]) {
+int q[1];
+i = o;
+o = q[h];
+if (g)
+  n = d;
+m = 4;
+for (; m; m--) {
+  if (l)
+k |= c;
+  if (a)
+break;
+}
+  }
+  s(n);
+  f |= b;
+}
+  }
+  return 0;
+}
diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
index 95203fefa188..45932bae5e7f 100644
--- a/gcc/tree-loop-distribution.cc
+++ b/gcc/tree-loop-distribution.cc
@@ -778,7 +778,7 @@ loop_distribution::stmts_from_loop (class loop *loop, 
vec<gimple *> *stmts)
 /* Free the reduced dependence graph RDG.  */
 
 static void
-free_rdg (struct graph *rdg)
+free_rdg (struct graph *rdg, loop_p loop)
 {
   int i;
 
@@ -792,13 +792,25 @@ free_rdg (struct graph *rdg)
 
   if (v->data)
{
- gimple_set_uid (RDGV_STMT (v), -1);
  (RDGV_DATAREFS (v)).release ();
  free (v->data);
}
 }
 
   free_graph (rdg);
+
+  /* Reset UIDs of stmts still in the loop.  */
+  basic_block *bbs = get_loop_body (loop);
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+{
+  basic_block bb = bbs[i];
+  gimple_stmt_iterator gsi;
+  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+   gimple_set_uid (gsi_stmt (gsi), -1);
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+   gimple_set_uid (gsi_stmt (gsi), -1);
+}
+  free (bbs);
 }
 
 struct graph *
@@ -812,7 +824,7 @@ loop_distribution::build_rdg (class loop *loop, 
control_dependences *cd)
   rdg = new_graph (stmts.length ());
   if (!create_rdg_vertices (rdg, stmts, loop))
 {
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   return NULL;
 }
   stmts.release ();
@@ -3062,7 +3074,7 @@ loop_distribution::distribute_loop (class loop *loop,
 "Loop %d not distributed: too many memory references.\n",
 loop->num);
 
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   loop_nest.release ();
   free_data_refs (datarefs_vec);
   delete ddrs_table;
@@ -3259,7 +3271,7 @@ loop_distribution::distribute_loop (class loop *loop,
   FOR_EACH_VEC_ELT (partitions, i, partition)
 partition_free (partition);
 
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
   return nbp - *nb_calls;
 }
 
@@ -3665,7 +3677,7 @@ loop_distribution::transform_reduction_loop (loop_p loop)
   auto_bitmap partition_stmts;
   bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
   find_single_drs (loop, rdg, partition_stmts, _dr, _dr);
-  free_rdg (rdg);
+  free_rdg (rdg, loop);
 
   /* Bail out if there is no single load.  */
   if (load_dr == NULL)


[gcc r15-361] Allow patterns in SLP reductions

2024-05-10 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:52d4691294c84793b301ad3cc24e277b8c7efe0b

commit r15-361-g52d4691294c84793b301ad3cc24e277b8c7efe0b
Author: Richard Biener 
Date:   Fri Mar 1 09:29:32 2024 +0100

Allow patterns in SLP reductions

The following removes the over-broad rejection of patterns for SLP
reductions which is done by removing them from LOOP_VINFO_REDUCTIONS
during pattern detection.  That's also insufficient in case the
pattern only appears on the reduction path.  Instead this implements
the proper correctness check in vectorizable_reduction and guides
SLP discovery to heuristically avoid forming later invalid groups.

I also couldn't find any testcase that FAILs when allowing the SLP
reductions to form so I've added one.

I came across this for single-lane SLP reductions with the all-SLP
work where we rely on patterns to properly vectorize COND_EXPR
reductions.

* tree-vect-patterns.cc (vect_pattern_recog_1): Do not
remove reductions involving patterns.
* tree-vect-loop.cc (vectorizable_reduction): Reject SLP
reduction groups with multiple lane-reducing reductions.
* tree-vect-slp.cc (vect_analyze_slp_instance): When discovering
SLP reduction groups avoid including lane-reducing ones.

* gcc.dg/vect/vect-reduc-sad-9.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c | 68 
 gcc/tree-vect-loop.cc| 15 ++
 gcc/tree-vect-patterns.cc| 13 --
 gcc/tree-vect-slp.cc | 26 +++
 4 files changed, 101 insertions(+), 21 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
new file mode 100644
index ..3c6af4510f45
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
@@ -0,0 +1,68 @@
+/* Disabling epilogues until we find a better way to deal with scans.  */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-additional-options "-msse4.2" { target { x86_64-*-* i?86-*-* } } } */
+/* { dg-require-effective-target vect_usad_char } */
+
+#include 
+#include "tree-vect.h"
+
+#define N 64
+
+unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int abs (int);
+
+/* Sum of absolute differences between arrays of unsigned char types.
+   Detected as a sad pattern.
+   Vectorized on targets that support sad for unsigned chars.  */
+
+__attribute__ ((noinline)) int
+foo (int len, int *res2)
+{
+  int i;
+  int result = 0;
+  int result2 = 0;
+
+  for (i = 0; i < len; i++)
+{
+  /* Make sure we are not using an SLP reduction for this.  */
+  result += abs (X[2*i] - Y[2*i]);
+  result2 += abs (X[2*i + 1] - Y[2*i + 1]);
+}
+
+  *res2 = result2;
+  return result;
+}
+
+
+int
+main (void)
+{
+  int i;
+  int sad;
+
+  check_vect ();
+
+  for (i = 0; i < N/2; i++)
+{
+  X[2*i] = i;
+  Y[2*i] = N/2 - i;
+  X[2*i+1] = i;
+  Y[2*i+1] = 0;
+  __asm__ volatile ("");
+}
+
+
+  int sad2;
+  sad = foo (N/2, );
+  if (sad != (N/2)*(N/4))
+abort ();
+  if (sad2 != (N/2-1)*(N/2)/2)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 704df7bdcc73..361aec064884 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7667,6 +7667,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   return false;
 }
 
+  /* Lane-reducing ops also never can be used in a SLP reduction group
+ since we'll mix lanes belonging to different reductions.  But it's
+ OK to use them in a reduction chain or when the reduction group
+ has just one element.  */
+  if (lane_reduc_code_p
+  && slp_node
+  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+  && SLP_TREE_LANES (slp_node) > 1)
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"lane-reducing reduction in reduction group.\n");
+  return false;
+}
+
   /* All uses but the last are expected to be defined in the loop.
  The last use is the reduction variable.  In case of nested cycle this
  assumption is not true: we use reduc_index to record the index of the
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 8e8de5ea3a55..dfb7d8005262 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -7160,7 +7160,6 @@ vect_pattern_recog_1 (vec_info *vinfo,
  vect_recog_func *recog_func, stmt_vec_info stmt_info)
 {
   gimple *pattern_stmt;
-  loop_vec_info loop_vinfo;
   tree pattern_vectype;
 
   /* If this 

[gcc r13-8727] tree-optimization/114375 - disallow SLP discovery of permuted mask loads

2024-05-08 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:4f2a35a76cca503749c696e7772d2e8eadc77ba5

commit r13-8727-g4f2a35a76cca503749c696e7772d2e8eadc77ba5
Author: Richard Biener 
Date:   Mon Mar 18 12:39:03 2024 +0100

tree-optimization/114375 - disallow SLP discovery of permuted mask loads

We cannot currently handle permutations of mask loads in code generation
or permute optimization.  But we simply drop any permutation on the
floor, so the following instead rejects the SLP build rather than
producing wrong-code.  I've also made sure to reject them in
vectorizable_load for completeness.

PR tree-optimization/114375
* tree-vect-slp.cc (vect_build_slp_tree_2): Compute the
load permutation for masked loads but reject it when any
such is necessary.
* tree-vect-stmts.cc (vectorizable_load): Reject masked
VMAT_ELEMENTWISE and VMAT_STRIDED_SLP as those are not
supported.

* gcc.dg/vect/vect-pr114375.c: New testcase.

(cherry picked from commit 94c3508c5a14d1948fe3bffa9e16c6f3d9c2836a)

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-pr114375.c | 44 +++
 gcc/tree-vect-slp.cc  | 34 +++-
 gcc/tree-vect-stmts.cc|  8 ++
 3 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-pr114375.c 
b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
new file mode 100644
index ..1e1cb0123d07
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-pr114375.c
@@ -0,0 +1,44 @@
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+int a[512];
+int b[512];
+int c[512];
+
+void __attribute__((noipa))
+foo(int * __restrict p)
+{
+  for (int i = 0; i < 64; ++i)
+{
+  int tem = 2, tem2 = 2;
+  if (a[4*i + 1])
+tem = p[4*i];
+  if (a[4*i])
+tem2 = p[4*i + 2];
+  b[2*i] = tem2;
+  b[2*i+1] = tem;
+  if (a[4*i + 2])
+tem = p[4*i + 1];
+  if (a[4*i + 3])
+tem2 = p[4*i + 3];
+  c[2*i] = tem2;
+  c[2*i+1] = tem;
+}
+}
+int main()
+{
+  check_vect ();
+
+  for (int i = 0; i < 512; ++i)
+a[i] = (i >> 1) & 1;
+
+  foo (a);
+
+  if (c[0] != 1 || c[1] != 0 || c[2] != 1 || c[3] != 0
+  || b[0] != 2 || b[1] != 2 || b[2] != 2 || b[3] != 2)
+abort ();
+
+  return 0;
+}
+
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bbc05fac65ec..c01dc02afff6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1780,10 +1780,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
   && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
 {
-  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
-   gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
-   || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
-   || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+   gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
   else
{
  *max_nunits = this_max_nunits;
@@ -1799,15 +1797,37 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
  load_permutation.create (group_size);
  stmt_vec_info first_stmt_info
= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+ bool any_permute = false;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
{
  int load_place = vect_get_place_in_interleaving_chain
  (load_info, first_stmt_info);
  gcc_assert (load_place != -1);
- load_permutation.safe_push (load_place);
+ any_permute |= load_place != j;
+ load_permutation.quick_push (load_place);
+   }
+
+ if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
+   {
+ gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
+ || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
+ || gimple_call_internal_p (stmt, 
IFN_MASK_GATHER_LOAD));
+ load_permutation.release ();
+ /* We cannot handle permuted masked loads, see PR114375.  */
+ if (any_permute
+ || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SIZE (first_stmt_info) != group_size)
+ || STMT_VINFO_STRIDED_P (stmt_info))
+   {
+ matches[0] = false;
+ return NULL;
+   }
+   }
+ else
+   {
+ SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+ return node;
}
- SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
- return node;
}
 }
   else if (gimple_assign_single_p (stmt_info->stmt)
diff --git a/gcc/tree-vect-stmts.cc 

[gcc r13-8726] cfgrtl: Fix MEM_EXPR update in duplicate_insn_chain [PR114924]

2024-05-08 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:c63704a2d840436797f54e175a2af0cb029889d2

commit r13-8726-gc63704a2d840436797f54e175a2af0cb029889d2
Author: Alex Coplan 
Date:   Fri May 3 09:23:59 2024 +0100

cfgrtl: Fix MEM_EXPR update in duplicate_insn_chain [PR114924]

The PR shows that when cfgrtl.cc:duplicate_insn_chain attempts to
update the MR_DEPENDENCE_CLIQUE information for a MEM_EXPR we can end up
accidentally dropping (e.g.) an ARRAY_REF from the MEM_EXPR and end up
replacing it with the underlying MEM_REF.  This leads to an
inconsistency in the MEM_EXPR information, and could lead to wrong code.

While the walk down to the MEM_REF is necessary to update
MR_DEPENDENCE_CLIQUE, we should use the outer tree expression for the
MEM_EXPR.  This patch does that.

gcc/ChangeLog:

PR rtl-optimization/114924
* cfgrtl.cc (duplicate_insn_chain): When updating MEM_EXPRs,
don't strip (e.g.) ARRAY_REFs from the final MEM_EXPR.

(cherry picked from commit fe40d525619eee9c2821126390df75068df4773a)

Diff:
---
 gcc/cfgrtl.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/cfgrtl.cc b/gcc/cfgrtl.cc
index 149131c2693f..4cb32e4d9bf3 100644
--- a/gcc/cfgrtl.cc
+++ b/gcc/cfgrtl.cc
@@ -4407,12 +4407,13 @@ duplicate_insn_chain (rtx_insn *from, rtx_insn *to,
   since MEM_EXPR is shared so make a copy and
   walk to the subtree again.  */
tree new_expr = unshare_expr (MEM_EXPR (*iter));
+   tree orig_new_expr = new_expr;
if (TREE_CODE (new_expr) == WITH_SIZE_EXPR)
  new_expr = TREE_OPERAND (new_expr, 0);
while (handled_component_p (new_expr))
  new_expr = TREE_OPERAND (new_expr, 0);
MR_DEPENDENCE_CLIQUE (new_expr) = newc;
-   set_mem_expr (const_cast <rtx> (*iter), new_expr);
+   set_mem_expr (const_cast <rtx> (*iter), orig_new_expr);
  }
  }
}


  1   2   3   >