The following fixes loop masking of .COND_ADD reductions when
we decide to reduce multiple lanes to one and thus go through
vect_transform_reduction.  The first issue is in
vect_reduction_update_partial_vector_usage, which does not handle
an incoming .COND_ADD well and fails to compute 'cond_fn' in this
case, disabling masking.  The second issue is that
vect_transform_reduction does not implement the masked but
not mask-by-COND_EXPR case for any .COND_* operation.  The following
should fix both.

The testcases verify runtime behavior in vect.exp and vectorization
support in the i386 target section for the combinations of -O3 and
-Ofast with masked vs. non-masked epilogues.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

        PR tree-optimization/122723
        * tree-vect-loop.cc (vect_reduction_update_partial_vector_usage):
        Handle incoming .COND_* operation.
        (vect_transform_reduction): Likewise.  Handle .COND_*
        operation when not using COND_EXPR masking in a masked loop.

        * gcc.dg/vect/vect-reduc-cond-add-1.c: New generic functional
        testcase.
        * gcc.target/i386/vect-epilogues-6.c: New testcase.
        * gcc.target/i386/vect-epilogues-7.c: Likewise.
        * gcc.target/i386/vect-epilogues-8.c: Likewise.
        * gcc.target/i386/vect-epilogues-9.c: Likewise.
---
 .../gcc.dg/vect/vect-reduc-cond-add-1.c       | 50 +++++++++++++++++++
 .../gcc.target/i386/vect-epilogues-6.c        | 21 ++++++++
 .../gcc.target/i386/vect-epilogues-7.c        | 21 ++++++++
 .../gcc.target/i386/vect-epilogues-8.c        | 21 ++++++++
 .../gcc.target/i386/vect-epilogues-9.c        | 21 ++++++++
 gcc/tree-vect-loop.cc                         | 35 +++++++++----
 6 files changed, 159 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-epilogues-9.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
new file mode 100644
index 00000000000..1e64df7f6d2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-cond-add-1.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+char mask[128];
+
+double __attribute__((noipa))
+foo (double *a, int n)
+{
+  double sum = 0.0;
+  for (int i = 0; i < n; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum = sum + val;
+    }
+  return sum;
+}
+
+double a[128];
+
+int main()
+{
+  check_vect ();
+
+#pragma GCC novector
+  for (int i = 0; i < 128; ++i)
+    {
+      a[i] = (i * 7) % 15;
+      mask[i] = (i + 1) & 4;
+    }
+
+  double sum = foo (a, 87);
+  double sum2 = 0.0;
+#pragma GCC novector
+  for (int i = 0; i < 87; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum2 = sum2 + val;
+    }
+
+  if (sum != sum2)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
new file mode 100644
index 00000000000..8cd8740c6ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-6.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+  double sum = 0.0;
+  for (int i = 0; i < n; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum = sum + val;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
new file mode 100644
index 00000000000..63c29895f9b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-7.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+  double sum = 0.0;
+  for (int i = 0; i < n; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum = sum + val;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
new file mode 100644
index 00000000000..ab5d4556ecb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-8.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=0 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+  double sum = 0.0;
+  for (int i = 0; i < n; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum = sum + val;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 32 byte vectors" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c
new file mode 100644
index 00000000000..72564a8a882
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-epilogues-9.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512vl -mavx512bw -mprefer-vector-width=512 --param vect-partial-vector-usage=1 -fdump-tree-vect-optimized" } */
+
+double
+foo (double *a, char *mask, int n)
+{
+  double sum = 0.0;
+  for (int i = 0; i < n; ++i)
+    {
+      double val;
+      if (mask[i])
+        val = a[i];
+      else
+        val = -0.0;
+      sum = sum + val;
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using masked 64 byte vectors" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b11b4c168ab..e013d4f9809 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6647,7 +6647,10 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
 {
   enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
   internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
-  internal_fn cond_fn = get_conditional_internal_fn (code, type);
+  internal_fn cond_fn
+    = ((code.is_internal_fn ()
+       && internal_fn_mask_index ((internal_fn)code) != -1)
+       ? (internal_fn)code : get_conditional_internal_fn (code, type));
 
   if (reduc_type != FOLD_LEFT_REDUCTION
       && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
@@ -7871,7 +7874,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
 
   code_helper code = canonicalize_code (op.code, op.type);
-  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+  internal_fn cond_fn
+    = ((code.is_internal_fn ()
+       && internal_fn_mask_index ((internal_fn)code) != -1)
+       ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
 
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
@@ -8119,17 +8125,26 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
             yet.  */
          gcc_assert (!lane_reducing);
 
-         /* Make sure that the reduction accumulator is vop[0].  */
-         if (reduc_index == 1)
-           {
-             gcc_assert (commutative_binary_op_p (code, op.type));
-             std::swap (vop[0], vop[1]);
-           }
          tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
                                          vec_num, vectype_in,
                                          mask_index++);
-         gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
-                                                   vop[0], vop[1], vop[0]);
+         gcall *call;
+         if (code.is_internal_fn () && cond_fn_p)
+           {
+             gcc_assert (op.num_ops >= 3
+                         && internal_fn_mask_index (internal_fn (code)) == 0);
+             vop[2] = vec_oprnds[2][i];
+             mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
+                                      mask, vop[0], gsi);
+             call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
+                                                vop[2], vop[reduc_index]);
+           }
+         else
+           {
+             gcc_assert (code.is_tree_code ());
+             call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
+                                                vop[1], vop[reduc_index]);
+           }
          new_temp = make_ssa_name (vec_dest, call);
          gimple_call_set_lhs (call, new_temp);
          gimple_call_set_nothrow (call, true);
-- 
2.51.0

Reply via email to