https://gcc.gnu.org/g:3602ebc86473a9ff9c68996470600e372a8b9379

commit r16-4499-g3602ebc86473a9ff9c68996470600e372a8b9379
Author: Richard Biener <[email protected]>
Date:   Thu Oct 9 14:03:29 2025 +0200

    Implement bool reduction vectorization
    
    Currently we mess up here in two places.  One is pattern recognition
    which computes a mask-precision for a bool reduction PHI that's
    inconsistent with that of the latch definition.  This is solved by
    iterating the mask-precision computation.  The second is that the
    reduction epilogue generation and the code querying support for it
    aren't ready for mask inputs.  The following fixes this by falling
    back to doing all the epilogue processing on a data type again, if
    the target does not support a direct mask reduction.  For that we
    utilize the newly added reduc_sbool_{and,ior,xor}_scal optabs
    so we can go the direct IFN path on masks if the target supports
    that.  In the future we can also implement an additional fallback
    for IOR and AND reductions using a scalar cond-expr like
    mask != 0 ? true : false, but the new optabs provide more information
    to the target.
    
            PR tree-optimization/101639
            PR tree-optimization/103495
            * tree-vectorizer.h (vect_reduc_info_s): Add reduc_vectype_for_mask.
            (VECT_REDUC_INFO_VECTYPE_FOR_MASK): New.
            * tree-vect-patterns.cc (vect_determine_mask_precision):
            Return whether the mask precision changed.
            (vect_determine_precisions): Iterate mask precision computation
            for loop vectorization.
            * tree-vect-loop.cc (get_initial_defs_for_reduction): Properly
            convert non-mask initial values to a mask initial def for
            the reduction.
            (sbool_reduction_fn_for_fn): New function.
            (vect_create_epilog_for_reduction): For a mask input convert
            it to the vector type analysis decided to use.  Use a regular
            conversion for the final convert to the scalar code type.
            (vectorizable_reduction): Support mask reductions.  Verify
            we can compute a data vector from the mask result or a direct
            mask reduction is provided by the target.
    
            * gcc.dg/vect/vect-reduc-bool-1.c: New testcase.
            * gcc.dg/vect/vect-reduc-bool-2.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-3.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-4.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-5.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-6.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-7.c: Likewise.
            * gcc.dg/vect/vect-reduc-bool-8.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c |  52 +++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c |  52 +++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c |  52 +++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c |  52 +++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c |  50 ++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c |  50 ++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c |  50 ++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c |  50 ++++++++++
 gcc/tree-vect-loop.cc                         | 129 +++++++++++++++++++++-----
 gcc/tree-vect-patterns.cc                     |  82 ++++++++++------
 gcc/tree-vectorizer.h                         |   5 +
 11 files changed, 574 insertions(+), 50 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
new file mode 100644
index 000000000000..38aead8a1c7c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+char p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
new file mode 100644
index 000000000000..2949b8308a20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+short p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
new file mode 100644
index 000000000000..893aa4bb290a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+int p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c
new file mode 100644
index 000000000000..dc37e06133bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+long long p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c
new file mode 100644
index 000000000000..9bafc09927c2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+char p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c
new file mode 100644
index 000000000000..ee1b9649e555
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+short p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c
new file mode 100644
index 000000000000..ab5f3ae89b5e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+int p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c 
b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c
new file mode 100644
index 000000000000..6b0a65659518
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+long long p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
target { vect_int && vect_condition } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 568353ae5bf5..de335b1ac6fb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3297,6 +3297,28 @@ reduction_fn_for_scalar_code (code_helper code, 
internal_fn *reduc_fn)
       }
 }
 
+/* Set *SBOOL_FN to the corresponding function working on vector masks
+   for REDUC_FN.  Return true if that exists, false otherwise.  */
+
+static bool
+sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
+{
+  switch (reduc_fn)
+    {
+    case IFN_REDUC_AND:
+      *sbool_fn = IFN_REDUC_SBOOL_AND;
+      return true;
+    case IFN_REDUC_IOR:
+      *sbool_fn = IFN_REDUC_SBOOL_IOR;
+      return true;
+    case IFN_REDUC_XOR:
+      *sbool_fn = IFN_REDUC_SBOOL_XOR;
+      return true;
+    default:
+      return false;
+    }
+}
+
 /* If there is a neutral value X such that a reduction would not be affected
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
@@ -4902,17 +4924,16 @@ get_initial_defs_for_reduction (loop_vec_info 
loop_vinfo,
   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
     nunits = group_size;
 
+  tree vector_elt_type = TREE_TYPE (vector_type);
   number_of_places_left_in_vector = nunits;
   bool constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
   gimple_seq ctor_seq = NULL;
   if (neutral_op
-      && !useless_type_conversion_p (TREE_TYPE (vector_type),
+      && !useless_type_conversion_p (vector_elt_type,
                                     TREE_TYPE (neutral_op)))
-    neutral_op = gimple_convert (&ctor_seq,
-                                TREE_TYPE (vector_type),
-                                neutral_op);
+    neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
   for (j = 0; j < nunits * number_of_vectors; ++j)
     {
       tree op;
@@ -4924,11 +4945,22 @@ get_initial_defs_for_reduction (loop_vec_info 
loop_vinfo,
        op = neutral_op;
       else
        {
-         if (!useless_type_conversion_p (TREE_TYPE (vector_type),
+         if (!useless_type_conversion_p (vector_elt_type,
                                          TREE_TYPE (initial_values[i])))
-           initial_values[i] = gimple_convert (&ctor_seq,
-                                               TREE_TYPE (vector_type),
-                                               initial_values[i]);
+           {
+             if (VECTOR_BOOLEAN_TYPE_P (vector_type))
+               initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
+                                                 vector_elt_type,
+                                                 initial_values[i],
+                                                 build_all_ones_cst
+                                                   (vector_elt_type),
+                                                 build_zero_cst
+                                                   (vector_elt_type));
+             else
+               initial_values[i] = gimple_convert (&ctor_seq,
+                                                   vector_elt_type,
+                                                   initial_values[i]);
+           }
          op = initial_values[i];
        }
 
@@ -5549,6 +5581,22 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   /* Shouldn't be used beyond this point.  */
   exit_bb = nullptr;
 
+  /* If we are operating on a mask vector and do not support direct mask
+     reduction, work on a bool data vector instead of a mask vector.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype)
+      && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
+      && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
+    {
+      gcc_assert (reduc_inputs.length () == 1);
+      vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
+      gimple_seq stmts = NULL;
+      reduc_inputs[0] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
+                                     reduc_inputs[0],
+                                     build_one_cst (vectype),
+                                     build_zero_cst (vectype));
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+    }
+
   if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
@@ -5943,8 +5991,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 
          new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
                                   new_temp, bitsize, bitsize_zero_node);
-         new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
-                                  scalar_type, new_temp);
+         new_temp = gimple_convert (&stmts, scalar_type, new_temp);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
          scalar_results.safe_push (new_temp);
         }
@@ -7017,15 +7064,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
   VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
 
-  /* We do not handle mask reductions correctly in the epilogue.  */
-  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "mask reduction not supported.\n");
-      return false;
-    }
-
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7343,6 +7381,23 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       return false;
     }
 
+  /* See if we can convert a mask vector to a corresponding bool data vector
+     to perform the epilogue reduction.  */
+  tree alt_vectype_out = NULL_TREE;
+  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
+    {
+      alt_vectype_out
+       = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+                                              TREE_TYPE (vectype_out),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (vectype_out));
+      if (!alt_vectype_out
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
+                      TYPE_VECTOR_SUBPARTS (vectype_out))
+         || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
+       alt_vectype_out = NULL_TREE;
+    }
+
   internal_fn reduc_fn = IFN_LAST;
   if (reduction_type == TREE_CODE_REDUCTION
       || reduction_type == FOLD_LEFT_REDUCTION
@@ -7353,9 +7408,26 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
          ? fold_left_reduction_fn (orig_code, &reduc_fn)
          : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
        {
-         if (reduc_fn != IFN_LAST
-             && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
-                                                 OPTIMIZE_FOR_SPEED))
+         internal_fn sbool_fn = IFN_LAST;
+         if (reduc_fn == IFN_LAST)
+           ;
+         else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
+                   || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
+                       == MODE_VECTOR_BOOL))
+                  && direct_internal_fn_supported_p (reduc_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           ;
+         else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
+                  && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
+                  && direct_internal_fn_supported_p (sbool_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           reduc_fn = sbool_fn;
+         else if (reduction_type != FOLD_LEFT_REDUCTION
+                  && alt_vectype_out
+                  && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
+         else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7372,6 +7444,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
          return false;
        }
+      if (reduc_fn == IFN_LAST
+         && VECTOR_BOOLEAN_TYPE_P (vectype_out))
+       {
+         if (!alt_vectype_out)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "cannot turn mask into bool data vector for "
+                                "reduction epilogue.\n");
+             return false;
+           }
+         VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
+       }
     }
   else if (reduction_type == COND_REDUCTION)
     {
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index c92fbcd143dc..878a045c4364 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -7157,13 +7157,14 @@ possible_vector_mask_operation_p (stmt_vec_info 
stmt_info)
 
 /* If STMT_INFO sets a boolean SSA_NAME, see whether we should use
    a vector mask type instead of a normal vector type.  Record the
-   result in STMT_INFO->mask_precision.  */
+   result in STMT_INFO->mask_precision.  Returns true when the
+   precision changed.  */
 
-static void
+static bool
 vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   if (!possible_vector_mask_operation_p (stmt_info))
-    return;
+    return false;
 
   /* If at least one boolean input uses a vector mask type,
      pick the mask type with the narrowest elements.
@@ -7245,8 +7246,11 @@ vect_determine_mask_precision (vec_info *vinfo, 
stmt_vec_info stmt_info)
          scalar_mode mode;
          tree vectype, mask_type;
          if (is_a <scalar_mode> (TYPE_MODE (op0_type), &mode)
-             && (vectype = get_vectype_for_scalar_type (vinfo, op0_type))
-             && (mask_type = get_mask_type_for_scalar_type (vinfo, op0_type))
+             /* Do not allow this to set vinfo->vector_mode, this might
+                disrupt the result for the next iteration.  */
+             && (vectype = get_related_vectype_for_scalar_type
+                                               (vinfo->vector_mode, op0_type))
+             && (mask_type = truth_type_for (vectype))
              && expand_vec_cmp_expr_p (vectype, mask_type, code))
            precision = GET_MODE_BITSIZE (mode);
        }
@@ -7272,19 +7276,30 @@ vect_determine_mask_precision (vec_info *vinfo, 
stmt_vec_info stmt_info)
        }
     }
 
-  if (dump_enabled_p ())
+  if (stmt_info->mask_precision != precision)
     {
-      if (precision == ~0U)
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using normal nonmask vectors for %G",
-                        stmt_info->stmt);
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using boolean precision %d for %G",
-                        precision, stmt_info->stmt);
-    }
+      if (dump_enabled_p ())
+       {
+         if (precision == ~0U)
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using normal nonmask vectors for %G",
+                            stmt_info->stmt);
+         else
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using boolean precision %d for %G",
+                            precision, stmt_info->stmt);
+       }
 
-  stmt_info->mask_precision = precision;
+      /* ???  We'd like to assert stmt_info->mask_precision == 0
+        || stmt_info->mask_precision > precision, thus that we only
+        decrease mask precisions throughout iteration, but the
+        tcc_comparison handling above means for comparisons of bools
+        we start with 8 but might increase in case the bools get mask
+        precision on their own.  */
+      stmt_info->mask_precision = precision;
+      return true;
+    }
+  return false;
 }
 
 /* Handle vect_determine_precisions for STMT_INFO, given that we
@@ -7317,22 +7332,33 @@ vect_determine_precisions (vec_info *vinfo)
 
   DUMP_VECT_SCOPE ("vect_determine_precisions");
 
-  for (unsigned int i = 0; i < nbbs; i++)
+  /* For mask precisions we have to iterate since otherwise we do not
+     get reduction PHI precision correct.  For now do this only for
+     loop vectorization.  */
+  bool changed;
+  do
     {
-      basic_block bb = bbs[i];
-      for (auto gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
-       }
-      for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      changed = false;
+      for (unsigned int i = 0; i < nbbs; i++)
        {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
+         basic_block bb = bbs[i];
+         for (auto gsi = gsi_start_phis (bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
+         for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
        }
     }
+  while (changed && is_a <loop_vec_info> (vinfo));
+
   for (unsigned int i = 0; i < nbbs; i++)
     {
       basic_block bb = bbs[nbbs - i - 1];
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4785cbdd61df..905a29142d3e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -855,6 +855,10 @@ public:
   /* The vector type for performing the actual reduction operation.  */
   tree reduc_vectype;
 
+  /* The vector type we should use for the final reduction in the epilogue
+     when we reduce a mask.  */
+  tree reduc_vectype_for_mask;
+
   /* For INTEGER_INDUC_COND_REDUCTION, the initial value to be used.  */
   tree induc_cond_initial_val;
 
@@ -888,6 +892,7 @@ typedef class vect_reduc_info_s *vect_reduc_info;
 #define VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL(I) ((I)->induc_cond_initial_val)
 #define VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT(I) ((I)->reduc_epilogue_adjustment)
 #define VECT_REDUC_INFO_VECTYPE(I) ((I)->reduc_vectype)
+#define VECT_REDUC_INFO_VECTYPE_FOR_MASK(I) ((I)->reduc_vectype_for_mask)
 #define VECT_REDUC_INFO_FORCE_SINGLE_CYCLE(I) ((I)->force_single_cycle)
 #define VECT_REDUC_INFO_RESULT_POS(I) ((I)->reduc_result_pos)

Reply via email to