Currently we mess up here in two places.  One is pattern recognition
which computes a mask-precision for a bool reduction PHI that's
inconsistent with that of the latch definition.  This is solved by
iterating the mask-precision computation.  The second is that the
reduction epilogue generation and the code querying support for it
isn't ready for mask inputs.  This should be fixed as well, mainly
by doing all the epilogue processing on a data type again, for now
at least.  We probably want reduc_mask_{and,ior,xor}_scal optabs
so we can go the direct IFN path on masks if the target supports
that.

[2/3] adds these optabs and [3/3] is a way to use them but with no
actual target implementation (I stubbed a x86 one for one case to
get the code miscompiled^Wexercised).

I wonder if there's any feedback on the epilogue handling, esp. how
SVE or RVV would be able to handle vector mask reduction to a
scalar bool in the epilogue — in particular whether you think there
is already sufficient functionality that I just didn't see.

The first patch passed bootstrap & regtest on x86_64-unknown-linux-gnu.
It plugs quite some holes where we didn't think of masks in reduction
handling and at least on x86 it works with epilogue vectorization as
well.

Thanks,
Richard.

        PR tree-optimization/101639
        PR tree-optimization/103495
        * tree-vect-patterns.cc (vect_determine_mask_precision):
        Return whether the mask precision changed.
        (vect_determine_precisions): Iterate mask precision computation.
        * tree-vect-loop.cc (get_initial_defs_for_reduction): Properly
        convert non-mask initial values to a mask initial def for
        the reduction.
        (vect_create_epilog_for_reduction): For a mask input convert
        it to a corresponding data type.  Use a regular conversion
        for the final convert to the scalar code type.
        (vectorizable_reduction): Support mask reductions.  Verify
        we can compute a data vector from the mask result.

        * gcc.dg/vect/vect-reduc-bool-1.c: New testcase.
        * gcc.dg/vect/vect-reduc-bool-2.c: New testcase.
        * gcc.dg/vect/vect-reduc-bool-3.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c | 32 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c | 32 ++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c | 32 ++++++++
 gcc/tree-vect-loop.cc                         | 80 +++++++++++++-----
 gcc/tree-vect-patterns.cc                     | 81 ++++++++++++-------
 5 files changed, 210 insertions(+), 47 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
new file mode 100644
index 00000000000..2460a14b664
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
@@ -0,0 +1,32 @@
+#include "tree-vect.h"
+
+char p[128];
+
+bool __attribute__((noipa))
+f (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!f (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (f (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
new file mode 100644
index 00000000000..d383d3bd2f0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
@@ -0,0 +1,32 @@
+#include "tree-vect.h"
+
+short p[128];
+
+bool __attribute__((noipa))
+f (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!f (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (f (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
new file mode 100644
index 00000000000..f59ffd01a87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
@@ -0,0 +1,32 @@
+#include "tree-vect.h"
+
+int p[128];
+
+bool __attribute__((noipa))
+f (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!f (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (f (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 8ff8dc2ddda..0f890fae22c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4908,17 +4908,16 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
     nunits = group_size;
 
+  tree vector_elt_type = TREE_TYPE (vector_type);
   number_of_places_left_in_vector = nunits;
   bool constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
   gimple_seq ctor_seq = NULL;
   if (neutral_op
-      && !useless_type_conversion_p (TREE_TYPE (vector_type),
+      && !useless_type_conversion_p (vector_elt_type,
                                     TREE_TYPE (neutral_op)))
-    neutral_op = gimple_convert (&ctor_seq,
-                                TREE_TYPE (vector_type),
-                                neutral_op);
+    neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
   for (j = 0; j < nunits * number_of_vectors; ++j)
     {
       tree op;
@@ -4930,11 +4929,22 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
        op = neutral_op;
       else
        {
-         if (!useless_type_conversion_p (TREE_TYPE (vector_type),
+         if (!useless_type_conversion_p (vector_elt_type,
                                          TREE_TYPE (initial_values[i])))
-           initial_values[i] = gimple_convert (&ctor_seq,
-                                               TREE_TYPE (vector_type),
-                                               initial_values[i]);
+           {
+             if (VECTOR_BOOLEAN_TYPE_P (vector_type))
+               initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
+                                                 vector_elt_type,
+                                                 initial_values[i],
+                                                 build_all_ones_cst
+                                                   (vector_elt_type),
+                                                 build_zero_cst
+                                                   (vector_elt_type));
+             else
+               initial_values[i] = gimple_convert (&ctor_seq,
+                                                   vector_elt_type,
+                                                   initial_values[i]);
+           }
          op = initial_values[i];
        }
 
@@ -5555,6 +5565,23 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   /* Shouldn't be used beyond this point.  */
   exit_bb = nullptr;
 
+  /* For the actual reduction work on a bool data vector instead of a
+     mask vector.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+    {
+      gcc_assert (reduc_inputs.length () == 1);
+      vectype = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+                                                    TREE_TYPE (vectype),
+                                                    TYPE_VECTOR_SUBPARTS
+                                                      (vectype));
+      gimple_seq stmts = NULL;
+      reduc_inputs[0] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
+                                     reduc_inputs[0],
+                                     build_one_cst (vectype),
+                                     build_zero_cst (vectype));
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+    }
+
   if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
@@ -5949,8 +5976,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
          new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
                                   new_temp, bitsize, bitsize_zero_node);
-         new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
-                                  scalar_type, new_temp);
+         new_temp = gimple_convert (&stmts, scalar_type, new_temp);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
          scalar_results.safe_push (new_temp);
         }
@@ -7023,15 +7049,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
   VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
 
-  /* We do not handle mask reductions correctly in the epilogue.  */
-  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "mask reduction not supported.\n");
-      return false;
-    }
-
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7284,6 +7301,29 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
   VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
 
+  /* For now implement the epilogue reduction on a bool data vector,
+     not the mask type.  */
+  tree orig_vectype_out = vectype_out;
+  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
+    {
+      vectype_out
+       = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+                                              TREE_TYPE (vectype_out),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (orig_vectype_out));
+      if (!vectype_out
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_out),
+                      TYPE_VECTOR_SUBPARTS (orig_vectype_out))
+         || !expand_vec_cond_expr_p (vectype_out, orig_vectype_out))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "cannot turn mask into bool data vector for "
+                            "reduction epilogue.\n");
+         return false;
+       }
+    }
+
   reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
@@ -7404,6 +7444,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       return false;
     }
 
+  vectype_out = orig_vectype_out;
+
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   tree initial_value = NULL_TREE;
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index becee62a9f5..5b536bb13b3 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -6828,13 +6828,14 @@ possible_vector_mask_operation_p (stmt_vec_info stmt_info)
 
 /* If STMT_INFO sets a boolean SSA_NAME, see whether we should use
    a vector mask type instead of a normal vector type.  Record the
-   result in STMT_INFO->mask_precision.  */
+   result in STMT_INFO->mask_precision.  Returns true when the
+   precision changed.  */
 
-static void
+static bool
 vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   if (!possible_vector_mask_operation_p (stmt_info))
-    return;
+    return false;
 
   /* If at least one boolean input uses a vector mask type,
      pick the mask type with the narrowest elements.
@@ -6916,8 +6917,11 @@ vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
          scalar_mode mode;
          tree vectype, mask_type;
          if (is_a <scalar_mode> (TYPE_MODE (op0_type), &mode)
-             && (vectype = get_vectype_for_scalar_type (vinfo, op0_type))
-             && (mask_type = get_mask_type_for_scalar_type (vinfo, op0_type))
+             /* Do not allow this to set vinfo->vector_mode, this might
+                disrupt the result for the next iteration.  */
+             && (vectype = get_related_vectype_for_scalar_type
+                                               (vinfo->vector_mode, op0_type))
+             && (mask_type = truth_type_for (vectype))
              && expand_vec_cmp_expr_p (vectype, mask_type, code))
            precision = GET_MODE_BITSIZE (mode);
        }
@@ -6943,19 +6947,30 @@ vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
        }
     }
 
-  if (dump_enabled_p ())
+  if (stmt_info->mask_precision != precision)
     {
-      if (precision == ~0U)
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using normal nonmask vectors for %G",
-                        stmt_info->stmt);
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using boolean precision %d for %G",
-                        precision, stmt_info->stmt);
-    }
+      if (dump_enabled_p ())
+       {
+         if (precision == ~0U)
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using normal nonmask vectors for %G",
+                            stmt_info->stmt);
+         else
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using boolean precision %d for %G",
+                            precision, stmt_info->stmt);
+       }
 
-  stmt_info->mask_precision = precision;
+      /* ???  We'd like to assert stmt_info->mask_precision == 0
+        || stmt_info->mask_precision > precision, so that we only
+        decrease mask precisions throughout iteration, but the
+        tcc_comparison handling above means for comparisons of bools
+        we start with 8 but might increase in case the bools get mask
+        precision on their own.  */
+      stmt_info->mask_precision = precision;
+      return true;
+    }
+  return false;
 }
 
 /* Handle vect_determine_precisions for STMT_INFO, given that we
@@ -6988,22 +7003,32 @@ vect_determine_precisions (vec_info *vinfo)
 
   DUMP_VECT_SCOPE ("vect_determine_precisions");
 
-  for (unsigned int i = 0; i < nbbs; i++)
+  /* For mask precisions we have to iterate since otherwise we do not
+     get reduction PHI precision correct.  */
+  bool changed;
+  do
     {
-      basic_block bb = bbs[i];
-      for (auto gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
-       }
-      for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      changed = false;
+      for (unsigned int i = 0; i < nbbs; i++)
        {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
+         basic_block bb = bbs[i];
+         for (auto gsi = gsi_start_phis (bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
+         for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
        }
     }
+  while (changed);
+
   for (unsigned int i = 0; i < nbbs; i++)
     {
       basic_block bb = bbs[nbbs - i - 1];
-- 
2.51.0

Reply via email to