On 10/11/2025 16:13, Christopher Bazley wrote:

On 10/11/2025 14:59, Christopher Bazley wrote:

On 07/11/2025 13:57, Richard Biener wrote:
On Wed, 5 Nov 2025, Christopher Bazley wrote:

On 28/10/2025 13:29, Richard Biener wrote:
On Tue, 28 Oct 2025, Christopher Bazley wrote:

+/* Materialize length number INDEX for a group of scalar stmts in SLP_NODE that
+   operate on NVECTORS vectors of type VECTYPE, where 0 <= INDEX < NVECTORS.  A
+   length limit is only required for the tail, therefore NULL_TREE is returned
+   for every value of INDEX except the last; otherwise, return a value that
+   contains FACTOR multiplied by the number of elements that should be
+   processed.  */
+
+tree
+vect_slp_get_bb_len (slp_tree slp_node, unsigned int nvectors, tree vectype,
+		     unsigned int index, unsigned int factor)
+{
+  gcc_checking_assert (SLP_TREE_CAN_USE_LEN_P (slp_node));
+
+  /* Only the last vector can be a partial vector.  */
+  if (index < nvectors - 1)
+    return NULL_TREE;
+
+  /* vect_get_num_copies only allows a partial vector if it is the only
+     vector.  */
+  if (nvectors > 1)
+    return NULL_TREE;
+
+  gcc_checking_assert (nvectors == 1);
+
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  unsigned int group_size = SLP_TREE_LANES (slp_node);
+
+  /* A single vector can be a full vector, in which case no length limit is
+     needed.  */
+  if (known_eq (nunits, group_size))
+    return NULL_TREE;
+
+  /* Return the scaled length of a single partial vector.  */
+  gcc_checking_assert (known_lt (group_size, nunits));
+  return size_int (group_size * factor);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 3115c610736..5ec65b2b2de 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1408,7 +1408,9 @@ vectorizable_internal_function (combined_fn cfn, tree fndecl,
   /* Record that a complete set of masks associated with VINFO would need to
      contain a sequence of NVECTORS masks that each control a vector of type
      VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
-   these vector masks with the vector version of SCALAR_MASK.  */
+   these vector masks with the vector version of SCALAR_MASK.  Alternatively,
+   if doing basic block vectorization, record that an equivalent mask would be
+   required to vectorize SLP_NODE.  */
   static void
   vect_record_mask (vec_info *vinfo, slp_tree slp_node, unsigned int
   nvectors,
                 tree vectype, tree scalar_mask)
@@ -1418,7 +1420,10 @@ vect_record_mask (vec_info *vinfo, slp_tree slp_node, unsigned int nvectors,
       vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
       nvectors,
                          vectype, scalar_mask);
     else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_LEN_P (slp_node));
+      SLP_TREE_CAN_USE_MASK_P (slp_node) = true;
+    }
   }

   /* Given a complete set of masks associated with VINFO, extract mask
number
@@ -1436,16 +1441,15 @@ vect_get_mask (vec_info *vinfo, slp_tree slp_node, gimple_stmt_iterator *gsi,
       return vect_get_loop_mask (loop_vinfo, gsi, &LOOP_VINFO_MASKS
       (loop_vinfo),
                              nvectors, vectype, index);
     else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_mask (slp_node, gsi, nvectors, vectype, index);
   }

   /* Record that a complete set of lengths associated with VINFO would need to
      contain a sequence of NVECTORS lengths for controlling an operation on
      VECTYPE.  The operation splits each element of VECTYPE into FACTOR separate
-   subelements, measuring the length as a number of these subelements.  */
+   subelements, measuring the length as a number of these subelements.
+   Alternatively, if doing basic block vectorization, record that an equivalent
+   length would be required to vectorize SLP_NODE.  */
   static void
   vect_record_len (vec_info *vinfo, slp_tree slp_node, unsigned int
   nvectors,
                tree vectype, unsigned int factor)
@@ -1455,7 +1459,10 @@ vect_record_len (vec_info *vinfo, slp_tree slp_node, unsigned int nvectors,
       vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo),
       nvectors,
                         vectype, factor);
     else
-    (void) slp_node; // FORNOW
+    {
+      gcc_checking_assert (!SLP_TREE_CAN_USE_MASK_P (slp_node));
+      SLP_TREE_CAN_USE_LEN_P (slp_node) = true;
+    }
   }

   /* Given a complete set of lengths associated with VINFO, extract length
number
@@ -1476,10 +1483,7 @@ vect_get_len (vec_info *vinfo, slp_tree slp_node, gimple_stmt_iterator *gsi,
       return vect_get_loop_len (loop_vinfo, gsi, &LOOP_VINFO_LENS
       (loop_vinfo),
                             nvectors, vectype, index, factor);
     else
-    {
-      (void) slp_node; // FORNOW
-      return NULL_TREE;
-    }
+    return vect_slp_get_bb_len (slp_node, nvectors, vectype, index, factor);
   }

   static tree permute_vec_elements (vec_info *, tree, tree, tree,
stmt_vec_info,
@@ -14252,24 +14256,35 @@ supportable_indirect_convert_operation (code_helper code,
      mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
      Add the statements to SEQ.  */

+void
+vect_gen_while_ssa_name (gimple_seq *seq, tree mask_type, tree start_index,
+                      tree end_index, tree ssa_name)
+{
+  tree cmp_type = TREE_TYPE (start_index);
+  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT, cmp_type,
+						       mask_type,
+						       OPTIMIZE_FOR_SPEED));
+  gcall *call
+    = gimple_build_call_internal (IFN_WHILE_ULT, 3, start_index, end_index,
+                               build_zero_cst (mask_type));
That's quite restrictive, for constant_p nunits you should be able to
create a VECTOR_CST.  How do you ensure that the actual vector length
is big enough btw?
CCing the list on my reply.

I think that an existing function named fold_while_ult already optimises
all IFN_WHILE_ULT usage that can be optimised safely, so I'm reluctant
to duplicate a version of that logic here.
x86 does not implement IFN_WHILE_ULT, that's what I wanted to say.
I do not remember any check that ensures this is only called for
targets that do?

The function vect_gen_while_ssa_name is called by

{vectorizable_call|vectorizable_operation|vectorizable_load|vectorizable_store|vectorizable_simd_clone_call} -> vect_get_mask (if and only if vect_can_use_mask_p) -> vect_slp_get_bb_mask

or

{vect_do_peeling|vect_transform_loop} -> vect_set_loop_condition -> vect_set_loop_condition_partial_vectors -> vect_set_loop_controls_directly (if and only if LOOP_VINFO_FULLY_MASKED_P) -> vect_gen_while.

For loop vectorisation, vect_can_use_mask_p is equivalent to LOOP_VINFO_FULLY_MASKED_P; for BB SLP, it is instead equivalent to SLP_TREE_CAN_USE_MASK_P.

SLP_TREE_CAN_USE_MASK_P is initialised to false and set to true in vect_record_mask. I intended it to be a BB-SLP-specific alternative to the !LOOP_VINFO_MASKS (L).is_empty () precondition in LOOP_VINFO_FULLY_MASKED_P; however, LOOP_VINFO_FULLY_MASKED_P also has another precondition that has no equivalent in the definition of SLP_TREE_CAN_USE_MASK_P: LOOP_VINFO_USING_PARTIAL_VECTORS_P.

LOOP_VINFO_USING_PARTIAL_VECTORS_P is true if both LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P and LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P are true. LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is true if param_vect_partial_vector_usage was true (which does not depend on the target AFAIK) and it has not subsequently been set to false; LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P is true for VLA modes or if peeling is insufficient.

I created a BB SLP equivalent of LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P, named SLP_TREE_CAN_USE_PARTIAL_VECTORS_P, which is set to true during construction of an SLP node (similar to how LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P is initialised to true if param_vect_partial_vector_usage).

Target support for the appropriate load/store-lanes instructions is checked in check_load_store_for_partial_vectors (iff vect_can_use_partial_vectors_p). This function can call vect_cannot_use_partial_vectors (often as an alternative to vect_record_len or vect_record_mask). For loop vectorisation, calling vect_cannot_use_partial_vectors is equivalent to LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P = false; for BB SLP, it is equivalent to SLP_TREE_CAN_USE_PARTIAL_VECTORS_P = false. Consequently, the circumstances in which vect_can_use_mask_p returns false in vect_get_mask are similar to those in which LOOP_VINFO_FULLY_MASKED_P returns false in vect_set_loop_controls_directly, although LOOP_VINFO_FULLY_MASKED_P has additional preconditions.

A robust definition of vect_can_use_mask_p for BB SLP should probably have SLP_TREE_CAN_USE_PARTIAL_VECTORS_P as a precondition for returning true, in conjunction with
Actually, I think vect_get_mask and vect_get_len are only called during the transform phase. Analysis of the SLP tree should fail earlier than that, during the analysis phase, if the target does not support the required instructions (e.g., reflected by SLP_TREE_CAN_USE_PARTIAL_VECTORS_P == false). I think that condition is already checked in vect_analyze_stmt, although vect_can_use_mask_p and vect_can_use_len_p are not part of the check.
SLP_TREE_CAN_USE_MASK_P (and similar for vect_can_use_len_p). At the moment, there is a maintenance hazard where vect_record_mask or vect_record_len could be called despite SLP_TREE_CAN_USE_PARTIAL_VECTORS_P == false (although that never happens in my patches) or either of the vect_record_* functions could be called before vect_cannot_use_partial_vectors (e.g., because vect_load_lanes_supported succeeded for one value of group_size but not another).

It seems as though GCC is currently at least somewhat robust against different calls to vect_load_lanes_supported returning different answers, as evidenced by the following code in vect_analyze_loop_2:
Oh, but it looks as though that cannot happen for a single SLP node: check_load_store_for_partial_vectors, vectorizable_operation and vectorizable_call only call either vect_record_len or vect_record_mask, never both. That means my existing assertions are correct...

  /* For now, we don't expect to mix both masking and length approaches for one
     loop, disable it if both are recorded.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    {
      if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
             "can't vectorize a loop with partial vectors"
             " because we don't expect to mix different"
             " approaches with partial vectors for the"
             " same loop.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

and in its callee, vect_determine_partial_vectors_and_peeling:

  if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    return opt_result::failure_at (vect_location,
                   "not vectorized: loop needs but cannot "
                   "use partial vectors\n");

I should probably try to retain that robustness for BB SLP by allowing both SLP_TREE_CAN_USE_LEN_P and SLP_TREE_CAN_USE_MASK_P to be set concurrently, until the end of the analysis phase. That would preclude use of an enumeration, which you suggested.
... and this could be feasible.

--
Christopher Bazley
Staff Software Engineer, GNU Tools Team.
Arm Ltd, 110 Fulbourn Road, Cambridge, CB1 9NJ, UK.
http://www.arm.com/

Reply via email to