This enables the use of a predicate mask or length limit when
vectorizing basic blocks, in cases where previously only the
equivalent rolled (i.e. loop) form of the source code would have
been vectorized.  Predication is used only for groups whose size
is not a multiple of any vector length that the target supports
directly.
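
For example, a straight-line block like the following (an
illustrative fragment, not taken from the testsuite) has a store
group of three lanes; with only 2- or 4-lane vector types available
it previously had to stay scalar unless written as a loop, but it
can now be covered by vector operations whose excess lane is
disabled by a constant predicate mask:

    void
    f (int *a, int *b)
    {
      a[0] = b[0] + 1;
      a[1] = b[1] + 1;
      a[2] = b[2] + 1;
    }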

The whole change is enabled by wiring up vect_can_use_partial_vectors_p
to SLP_TREE_CAN_USE_PARTIAL_VECTORS_P, so that when this function is
used for BB SLP, it reads from the SLP node rather than from the loop
vectorizer's state whether we still have the option of vectorizing
using lengths or masks to prevent use of inactive lanes.
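
For reference, the resulting query looks like this (from the final
hunk of this patch; the function's opening lines are reconstructed
from context and may differ slightly):

    inline bool
    vect_can_use_partial_vectors_p (vec_info *vinfo, slp_tree slp_node)
    {
      loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
      if (loop_vinfo)
        return LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);

      return SLP_TREE_CAN_USE_PARTIAL_VECTORS_P (slp_node);
    }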

vect_record_nunits is updated so that it no longer returns failure
for BB SLP when the group size is not an integral multiple of the
number of lanes in the vector type; such cases are now allowed
when the group is known not to exceed the vector length.  At the
same time, vect_get_num_copies is updated to return 1 early if a
single vector of the node's type is long enough to hold all of its
lanes.  This avoids an ICE in vect_get_num_vectors, whose exact
division cannot cope with such cases for SVE vector types.
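
The following standalone sketch (plain C++ with a hypothetical name;
the real code works on poly_uint64 via known_ge and exact_div)
models the new early-out:

    #include <cassert>

    /* Model of vect_get_num_copies for constant-width vectors: if one
       vector already covers every lane, the answer is 1 and the exact
       division below is never attempted.  */
    static unsigned
    model_num_copies (unsigned lanes, unsigned nunits)
    {
      if (nunits >= lanes)           /* known_ge (subparts, vf)  */
        return 1;
      assert (lanes % nunits == 0);  /* precondition of exact_div  */
      return lanes / nunits;
    }

E.g. model_num_copies (3, 4) is now 1, where previously the
equivalent of exact_div (3, 4) would have been attempted and failed.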

Instead of giving up if vect_get_vector_types_for_stmt
fails for the specified group size, vect_build_slp_tree_1
now calls vect_get_vector_types_for_stmt again without
a group size (which defaults to 0) as a fallback.
If this succeeds then the initial failure is treated as a
'soft' failure that results in the group being split.
Consequently, assertions that "For BB vectorization, we
should always have a group size once we've constructed the
SLP tree" were deleted in get_vectype_for_scalar_type and
vect_get_vector_types_for_stmt.
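
The following standalone sketch (hypothetical numbers) models where
the resulting split lands, mirroring the maybe_soft_fail handling in
vect_build_slp_tree_1 below:

    #include <cstdio>
    #include <cstring>

    int
    main ()
    {
      /* A group of 6 scalars and a fallback vector type with 4 lanes.  */
      unsigned group_size = 6, const_nunits = 4;
      bool matches[6];
      memset (matches, 1, sizeof matches);

      /* Simulate a mismatch at the point we need to split.  */
      unsigned tail = group_size & (const_nunits - 1);
      if (tail == 0)
        tail = const_nunits;
      memset (&matches[group_size - tail], 0, sizeof (bool) * tail);

      for (unsigned i = 0; i < group_size; i++)
        printf ("%d", (int) matches[i]);  /* prints 111100  */
    }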

vect_create_vectorized_promotion_stmts no longer pushes more stmts
than vect_get_num_copies implies, because doing so could overrun the
number of slots allocated for an SLP node (based on its number of
lanes and vector type).  E.g., four defs used to be pushed for a
promotion from V8HI to V2DI (8/2 = 4) even if only two lanes of the
V8HI were active.  Allowing that later caused an ICE in
vectorizable_operation for a parent node, because binary operations
require both operands to have the same number of vector defs.
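
A standalone sketch (hypothetical name, constant widths) of the new
cap on pushed defs:

    /* Model: push widened halves until the node's def count is
       reached, rather than two halves per input vector.  */
    static unsigned
    model_promotion_defs (unsigned input_vectors, unsigned ncopies)
    {
      unsigned pushed = 0;
      for (unsigned i = 0; i < input_vectors && pushed < ncopies; i++)
        {
          pushed++;             /* first half  */
          if (pushed < ncopies)
            pushed++;           /* second half  */
        }
      return pushed;
    }

At the final widening step of the example above (two input vectors,
one active V2DI's worth of lanes), model_promotion_defs (2, 1) == 1,
where the old code unconditionally pushed 2 * 2 == 4.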

Since promotion no longer produces redundant definitions,
vectorizable_conversion also had to be modified so that demotion no
longer relies on an even number of defs being produced. If
necessary, it now pushes a single constant zero def.
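
The padding itself is the small hunk quoted below from
vectorizable_conversion: demotion consumes defs in pairs, so an odd
count gets one zero vector appended, whose lanes are never active:

      if (vec_oprnds0.length () % 2 != 0)
        {
          tree vectype = TREE_TYPE (vec_oprnds0[0]);
          vec_oprnds0.safe_push (build_zero_cst (vectype));
        }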
---
 gcc/tree-vect-loop.cc  |  10 ++
 gcc/tree-vect-slp.cc   | 105 ++++++++++++++++-----
 gcc/tree-vect-stmts.cc | 206 ++++++++++++++++++++++++++++++-----------
 gcc/tree-vectorizer.h  |  17 +++-
 4 files changed, 256 insertions(+), 82 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 50cdc2a90fa..1a14ff2c5c8 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -907,6 +907,11 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
   rgroup_controls *rgm;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
     res = MAX (res, rgm->max_nscalars_per_iter);
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "max_nscalars_per_iter=%u\n",
+                    res);
+
   return res;
 }
 
@@ -1015,6 +1020,11 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
       unsigned int nscalars_per_iter
          = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                       LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+      if (dump_enabled_p ())
+       dump_printf_loc (
+         MSG_NOTE, vect_location,
+         "verify_full_masking: nvectors=%u, nscalars_per_iter=%u\n", nvectors,
+         nscalars_per_iter);
 
       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
        {
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b0c4c05a447..64783a027eb 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1085,8 +1085,12 @@ vect_record_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
     }
 
   /* If populating the vector type requires unrolling then fail
-     before adjusting *nunits for basic-block vectorization.  */
+     before adjusting *nunits for basic-block vectorization.
+     Allow group sizes that are indivisible by the vector length only if they
+     are known not to exceed the vector length.  We may be able to support such
+     cases by generating constant masks.  */
   if (is_a <bb_vec_info> (vinfo)
+      && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))
       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
     {
       if (dump_enabled_p ())
@@ -1138,12 +1142,29 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
   tree soft_fail_nunits_vectype = NULL_TREE;
 
   tree vectype, nunits_vectype;
+  bool unsupported_datatype = false;
   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
-                                      &nunits_vectype, group_size))
+                                      &nunits_vectype, &unsupported_datatype,
+                                      group_size))
     {
-      /* Fatal mismatch.  */
-      matches[0] = false;
-      return false;
+      /* Try to get fallback vector types and continue analysis, producing
+        matches[] as if vectype was not an issue.  This allows splitting of
+        groups to happen.  */
+      if (unsupported_datatype
+         && vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
+                                            &nunits_vectype,
+                                            &unsupported_datatype))
+       {
+         gcc_assert (is_a<bb_vec_info> (vinfo));
+         maybe_soft_fail = true;
+         soft_fail_nunits_vectype = nunits_vectype;
+       }
+      else
+       {
+         /* Fatal mismatch.  */
+         matches[0] = false;
+         return false;
+       }
     }
   if (is_a <bb_vec_info> (vinfo)
       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
@@ -1653,16 +1674,22 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 
   if (maybe_soft_fail)
     {
-      unsigned HOST_WIDE_INT const_nunits;
-      if (!TYPE_VECTOR_SUBPARTS
-           (soft_fail_nunits_vectype).is_constant (&const_nunits)
-         || const_nunits > group_size)
+      /* Use the known minimum number of subparts for VLA because we still need
+        to choose a splitting point although the choice is more arbitrary.  */
+      unsigned HOST_WIDE_INT const_nunits = constant_lower_bound (
+         TYPE_VECTOR_SUBPARTS (soft_fail_nunits_vectype));
+
+      if (const_nunits > group_size)
        matches[0] = false;
       else
        {
          /* With constant vector elements simulate a mismatch at the
             point we need to split.  */
+         gcc_assert ((const_nunits % 2) == 0);
          unsigned tail = group_size & (const_nunits - 1);
+         if (tail == 0)
+           tail = const_nunits;
+         gcc_assert (group_size >= tail);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
        }
       return false;
@@ -2393,13 +2420,21 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                  /* Check whether we can build the invariant.  If we can't
                     we never will be able to.  */
                  tree type = TREE_TYPE (chains[0][n].op);
-                 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
-                     && (TREE_CODE (type) == BOOLEAN_TYPE
-                         || !can_duplicate_and_interleave_p (vinfo, group_size,
-                                                             type)))
+                 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
                    {
-                     matches[0] = false;
-                     goto out;
+                     if (TREE_CODE (type) == BOOLEAN_TYPE)
+                       {
+                         matches[0] = false;
+                         goto out;
+                       }
+
+                     if (!is_a<bb_vec_info> (vinfo)
+                         && !can_duplicate_and_interleave_p (vinfo, group_size,
+                                                             type))
+                       {
+                         matches[0] = false;
+                         goto out;
+                       }
                    }
                }
              else if (dt != vect_internal_def)
@@ -2828,7 +2863,7 @@ out:
                    uniform_val = NULL_TREE;
                    break;
                  }
-             if (!uniform_val
+             if (!uniform_val && !is_a<bb_vec_info> (vinfo)
                  && !can_duplicate_and_interleave_p (vinfo,
                                                      oprnd_info->ops.length (),
                                                      TREE_TYPE (op0)))
@@ -4877,9 +4912,10 @@ vect_analyze_slp_instance (vec_info *vinfo,
            = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
          tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
                                                      1 << floor_log2 (i));
-         unsigned HOST_WIDE_INT const_nunits;
-         if (vectype
-             && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
+         unsigned HOST_WIDE_INT const_nunits
+           = vectype ? constant_lower_bound (TYPE_VECTOR_SUBPARTS (vectype))
+                     : 0;
+         if (const_nunits > 1 && (i % const_nunits) == 0)
            {
              /* Split into two groups at the first vector boundary.  */
              gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
@@ -8534,6 +8570,12 @@ vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
      When all elements are the same we can use a splat.  */
   tree vectype = SLP_TREE_VECTYPE (node);
   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "vect_prologue_cost_for_slp: node %p, vector type %T, "
+                    "group_size %u\n",
+                    (void *) node, vectype, group_size);
+
   unsigned HOST_WIDE_INT const_nunits;
   unsigned nelt_limit;
   unsigned nvectors = vect_get_num_copies (vinfo, node);
@@ -11497,7 +11539,21 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
       unpack_factor = 1;
     }
   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
-  gcc_assert (repeating_p || multiple_p (olanes, nunits));
+
+  /* With fully-predicated BB-SLP, an external node's number of lanes can be
+     incompatible with the chosen vector width (e.g., lane packs of 3 with a
+     natural 2-lane vector type).  */
+  if (!repeating_p && !multiple_p (olanes, nunits))
+    {
+      if (dump_p)
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "unsupported permutation %p: vector type %T,"
+                        " nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
+                        " ncopies=%" PRIu64 ", lanes=%u and unpack=%u\n",
+                        (void *) node, vectype, estimated_poly_value (nunits),
+                        ncopies, SLP_TREE_LANES (node), unpack_factor);
+      return -1;
+    }
 
   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
      from the { SLP operand, scalar lane } permutation as recorded in the
@@ -11758,7 +11814,14 @@ vect_schedule_slp_node (vec_info *vinfo,
 
   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   if (SLP_TREE_VECTYPE (node))
-    SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
+    {
+      unsigned number_of_vectors = vect_get_num_copies (vinfo, node);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Allocating %u defs to schedule node %p\n",
+                        number_of_vectors, (void *) node);
+      SLP_TREE_VEC_DEFS (node).create (number_of_vectors);
+    }
 
   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
     {
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5ec65b2b2de..3da91ef93fe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1637,18 +1637,22 @@ check_load_store_for_partial_vectors (vec_info *vinfo, tree vectype,
     unsigned int nvectors;
     if (can_div_away_from_zero_p (size, nunits, &nvectors))
       return nvectors;
-    gcc_unreachable ();
+
+    gcc_assert (known_le (size, nunits));
+    return 1u;
   };
 
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  poly_uint64 size = loop_vinfo
+                      ? group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                      : SLP_TREE_LANES (slp_node);
   machine_mode mask_mode;
   machine_mode vmode;
   bool using_partial_vectors_p = false;
   if (get_len_load_store_mode
       (vecmode, is_load, nullptr, elsvals).exists (&vmode))
     {
-      nvectors = group_memory_nvectors (group_size * vf, nunits);
+      nvectors = group_memory_nvectors (size, nunits);
       unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
       vect_record_len (vinfo, slp_node, nvectors, vectype, factor);
       using_partial_vectors_p = true;
@@ -1657,7 +1661,7 @@ check_load_store_for_partial_vectors (vec_info *vinfo, tree vectype,
           && can_vec_mask_load_store_p (vecmode, mask_mode, is_load, NULL,
                                         elsvals))
     {
-      nvectors = group_memory_nvectors (group_size * vf, nunits);
+      nvectors = group_memory_nvectors (size, nunits);
       vect_record_mask (vinfo, slp_node, nvectors, vectype, scalar_mask);
       using_partial_vectors_p = true;
     }
@@ -3260,12 +3264,11 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info, slp_tree node,
 
 static tree
 vect_get_loop_variant_data_ptr_increment (
-  vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
+  loop_vec_info loop_vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
   vec_loop_lens *loop_lens, dr_vec_info *dr_info,
   vect_memory_access_type memory_access_type)
 {
-  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
-  tree step = vect_dr_behavior (vinfo, dr_info)->step;
+  tree step = vect_dr_behavior (loop_vinfo, dr_info)->step;
 
   /* gather/scatter never reach here.  */
   gcc_assert (!mat_gather_scatter_p (memory_access_type));
@@ -3309,7 +3312,7 @@ vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
 
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
   if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
-    return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
+    return vect_get_loop_variant_data_ptr_increment (loop_vinfo, aggr_type, gsi,
                                                     loop_lens, dr_info,
                                                     memory_access_type);
 
@@ -5008,7 +5011,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
        }
 
       if (gimple_get_lhs (new_stmt))
-       SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (
+             MSG_NOTE, vect_location,
+             "vectorizable_simd_clone_call: push def to node %p\n",
+             (void *) slp_node);
+         SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
+       }
     }
 
   for (i = 0; i < nargs; ++i)
@@ -5122,7 +5132,7 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec<tree> *vec_oprnds,
    call the function recursively.  */
 
 static void
-vect_create_vectorized_promotion_stmts (vec_info *vinfo,
+vect_create_vectorized_promotion_stmts (vec_info *vinfo, slp_tree slp_node,
                                        vec<tree> *vec_oprnds0,
                                        vec<tree> *vec_oprnds1,
                                        stmt_vec_info stmt_info, tree vec_dest,
@@ -5135,37 +5145,39 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
   gimple *new_stmt1, *new_stmt2;
   vec<tree> vec_tmp = vNULL;
 
-  vec_tmp.create (vec_oprnds0->length () * 2);
+  const unsigned ncopies = vect_get_num_copies (vinfo, slp_node);
+  vec_tmp.create (ncopies);
+  gcc_assert (vec_oprnds0->length () <= ncopies);
   FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
     {
+      if (vec_tmp.length () >= ncopies)
+       break;
+
       if (op_type == binary_op)
        vop1 = (*vec_oprnds1)[i];
       else
        vop1 = NULL_TREE;
 
       /* Generate the two halves of promotion operation.  */
-      new_stmt1 = vect_gen_widened_results_half (vinfo, ch1, vop0, vop1,
-                                                op_type, vec_dest, gsi,
-                                                stmt_info);
-      new_stmt2 = vect_gen_widened_results_half (vinfo, ch2, vop0, vop1,
-                                                op_type, vec_dest, gsi,
-                                                stmt_info);
-      if (is_gimple_call (new_stmt1))
-       {
-         new_tmp1 = gimple_call_lhs (new_stmt1);
-         new_tmp2 = gimple_call_lhs (new_stmt2);
-       }
-      else
+      new_stmt1
+       = vect_gen_widened_results_half (vinfo, ch1, vop0, vop1, op_type,
+                                        vec_dest, gsi, stmt_info);
+      new_tmp1 = is_gimple_call (new_stmt1) ? gimple_call_lhs (new_stmt1)
+                                           : gimple_assign_lhs (new_stmt1);
+      vec_tmp.quick_push (new_tmp1);
+
+      if (vec_tmp.length () < ncopies)
        {
-         new_tmp1 = gimple_assign_lhs (new_stmt1);
-         new_tmp2 = gimple_assign_lhs (new_stmt2);
+         new_stmt2
+           = vect_gen_widened_results_half (vinfo, ch2, vop0, vop1, op_type,
+                                            vec_dest, gsi, stmt_info);
+         new_tmp2 = is_gimple_call (new_stmt2) ? gimple_call_lhs (new_stmt2)
+                                               : gimple_assign_lhs (new_stmt2);
+         vec_tmp.quick_push (new_tmp2);
        }
-
-      /* Store the results for the next step.  */
-      vec_tmp.quick_push (new_tmp1);
-      vec_tmp.quick_push (new_tmp2);
     }
 
+  gcc_assert (vec_tmp.length () <= ncopies);
   vec_oprnds0->release ();
   *vec_oprnds0 = vec_tmp;
 }
@@ -5379,7 +5391,13 @@ vectorizable_conversion (vec_info *vinfo,
   /* If op0 is an external or constant def, infer the vector type
      from the scalar type.  */
   if (!vectype_in)
-    vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
+    {
+      vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "inferred vector type %T\n",
+                        vectype_in);
+    }
+
   if (!cost_vec)
     gcc_assert (vectype_in);
   if (!vectype_in)
@@ -5767,12 +5785,15 @@ vectorizable_conversion (vec_info *vinfo,
                                             stmt_info, this_dest, gsi, c1,
                                             op_type);
          else
-           vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
-                                                   &vec_oprnds1, stmt_info,
-                                                   this_dest, gsi,
+           vect_create_vectorized_promotion_stmts (vinfo, slp_node,
+                                                   &vec_oprnds0, &vec_oprnds1,
+                                                   stmt_info, this_dest, gsi,
                                                    c1, c2, op_type);
        }
 
+      gcc_assert (vec_oprnds0.length ()
+                 == vect_get_num_copies (vinfo, slp_node));
+
       FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
        {
          gimple *new_stmt;
@@ -5796,6 +5817,16 @@ vectorizable_conversion (vec_info *vinfo,
         generate more than one vector stmt - i.e - we need to "unroll"
         the vector stmt by a factor VF/nunits.  */
       vect_get_vec_defs (vinfo, slp_node, op0, &vec_oprnds0);
+
+      /* Promotion no longer produces redundant defs (since support was
+       added for length/mask-predicated BB SLP of awkward-sized groups),
+       therefore demotion now has to handle that case too.  */
+      if (vec_oprnds0.length () % 2 != 0)
+       {
+         tree vectype = TREE_TYPE (vec_oprnds0[0]);
+         vec_oprnds0.safe_push (build_zero_cst (vectype));
+       }
+
       /* Arguments are ready.  Create the new vector stmts.  */
       if (cvt_type && modifier == NARROW_DST)
        FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
@@ -5814,6 +5845,8 @@ vectorizable_conversion (vec_info *vinfo,
       /* After demoting op0 to cvt_type, convert it to dest.  */
       if (cvt_type && code == FLOAT_EXPR)
        {
+         SLP_TREE_VEC_DEFS (slp_node).reserve (vec_oprnds0.length ());
+
          for (unsigned int i = 0; i != vec_oprnds0.length() / 2;  i++)
            {
              /* Arguments are ready, create the new vector stmt.  */
@@ -10448,7 +10481,7 @@ vectorizable_load (vec_info *vinfo,
 
       aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
       if (!costing_p)
-       bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
+       bump = vect_get_data_ptr_increment (loop_vinfo, gsi, dr_info, aggr_type,
                                            memory_access_type, loop_lens);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
@@ -12349,6 +12382,9 @@ vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
   if (!vectype)
     {
       vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1), slp_node);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "invariant comparison, guessed type %T.\n", vectype);
       if (!vectype || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits))
        return false;
     }
@@ -12951,6 +12987,21 @@ vect_analyze_stmt (vec_info *vinfo,
                                   " live stmt not supported: %G",
                                   stmt_info->stmt);
 
+  if (bb_vinfo)
+    {
+      unsigned int group_size = SLP_TREE_LANES (node);
+      tree vectype = SLP_TREE_VECTYPE (node);
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      bool needs_partial = known_lt (group_size, nunits);
+      if (needs_partial && !SLP_TREE_CAN_USE_PARTIAL_VECTORS_P (node))
+       return opt_result::failure_at (stmt_info->stmt,
+                                      "not vectorized: SLP node needs but "
+                                      "cannot use partial vectors: %G",
+                                      stmt_info->stmt);
+      if (maybe_gt (group_size, nunits))
+       gcc_assert (multiple_p (group_size, nunits));
+    }
+
   return opt_result::success ();
 }
 
@@ -13253,13 +13304,7 @@ tree
 get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
                             unsigned int group_size)
 {
-  /* For BB vectorization, we should always have a group size once we've
-     constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
-     are tentative requests during things like early data reference
-     analysis and pattern recognition.  */
-  if (is_a <bb_vec_info> (vinfo))
-    gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
-  else
+  if (!is_a <bb_vec_info> (vinfo))
     group_size = 0;
 
   tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
@@ -13270,13 +13315,31 @@ get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
   /* Register the natural choice of vector type, before the group size
      has been applied.  */
   if (vectype)
+  {
+    if (dump_enabled_p ())
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "get_vectype_for_scalar_type: natural type for %T "
+                      "(ignoring group size %u): %T\n",
+                      scalar_type, group_size, vectype);
     vinfo->used_vector_modes.add (TYPE_MODE (vectype));
+  }
+  else if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "get_vectype_for_scalar_type: no natural type for %T "
+                    "(ignoring group size %u)\n",
+                    scalar_type, group_size);
 
   /* If the natural choice of vector type doesn't satisfy GROUP_SIZE,
-     try again with an explicit number of elements.  */
+     try again with an explicit number of elements.  A vector type satisfies
+     GROUP_SIZE if it could be long enough to store the whole group but we don't
+     know for sure.  (If we know that the vector type is long enough, then we
+     can generate masks to handle the excess lanes; if we aren't sure then we
+     must substitute a vector type that can be used to carve up the group.)
+   */
   if (vectype
       && group_size
-      && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size))
+      && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size)
+      && !known_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size))
     {
       /* Start with the biggest number of units that fits within
         GROUP_SIZE and halve it until we find a valid vector type.
@@ -13295,6 +13358,11 @@ get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
        {
          vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
                                                         scalar_type, nunits);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "get_vectype_for_scalar_type: trying %u elements "
+                            "of type %T: %T\n",
+                            nunits, scalar_type, vectype);
          nunits /= 2;
        }
       while (nunits > 1 && !vectype);
@@ -13567,6 +13635,13 @@ vect_is_simple_use (vec_info *vinfo, slp_tree slp_node,
     {
       if (def_stmt_info_out)
        *def_stmt_info_out = NULL;
+      if (SLP_TREE_SCALAR_OPS (child).is_empty ())
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Child has no scalar operands.\n");
+         return false;
+       }
       *op = SLP_TREE_SCALAR_OPS (child)[0];
       *dt = SLP_TREE_DEF_TYPE (child);
       return true;
@@ -13592,7 +13667,22 @@ vect_maybe_update_slp_op_vectype (slp_tree op, tree vectype)
       && SLP_TREE_DEF_TYPE (op) == vect_external_def
       && SLP_TREE_LANES (op) > 1)
     return false;
+
+  /* When the vectorizer falls back to building vector operands from scalars,
+     it can create SLP trees with external defs that have a number of lanes not
+     divisible by the number of subparts in a vector type naively inferred from
+     the scalar type.  Reject such types to avoid ICE when later computing the
+     prologue cost for invariant operands.  */
+  if (SLP_TREE_DEF_TYPE (op) == vect_external_def
+      && maybe_lt (TYPE_VECTOR_SUBPARTS (vectype), SLP_TREE_LANES (op))
+      && !multiple_p (SLP_TREE_LANES (op), TYPE_VECTOR_SUBPARTS (vectype)))
+    return false;
+
   SLP_TREE_VECTYPE (op) = vectype;
+  if (dump_enabled_p ())
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "updated vectype of operand %p with %u lanes to %T\n",
+                      (void *) op, SLP_TREE_LANES (op), vectype);
   return true;
 }
 
@@ -14313,24 +14403,25 @@ vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
 
    - Set *NUNITS_VECTYPE_OUT to the vector type that contains the maximum
      number of units needed to vectorize STMT_INFO, or NULL_TREE if the
-     statement does not help to determine the overall number of units.  */
+     statement does not help to determine the overall number of units.
+
+   On failure:
+
+   - Set *UNSUPPORTED_DATATYPE to true if the statement can't be vectorized
+     because it uses a data type that the target doesn't support in vector form
+     for a group of the given GROUP_SIZE.
+ */
 
 opt_result
 vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
                                tree *stmt_vectype_out,
                                tree *nunits_vectype_out,
+                               bool *unsupported_datatype,
                                unsigned int group_size)
 {
   gimple *stmt = stmt_info->stmt;
 
-  /* For BB vectorization, we should always have a group size once we've
-     constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
-     are tentative requests during things like early data reference
-     analysis and pattern recognition.  */
-  if (is_a <bb_vec_info> (vinfo))
-    gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
-  else
-    group_size = 0;
+  if (!is_a <bb_vec_info> (vinfo))
+    group_size = 0;
 
   *stmt_vectype_out = NULL_TREE;
   *nunits_vectype_out = NULL_TREE;
@@ -14406,10 +14497,13 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
        }
       vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
       if (!vectype)
-       return opt_result::failure_at (stmt,
-                                      "not vectorized:"
-                                      " unsupported data-type %T\n",
-                                      scalar_type);
+       {
+         *unsupported_datatype = true;
+         return opt_result::failure_at (stmt,
+                                        "not vectorized:"
+                                        " unsupported data-type %T\n",
+                                        scalar_type);
+       }
 
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 92c7ee6e46d..de778f1333c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2331,6 +2331,14 @@ vect_get_num_copies (vec_info *vinfo, slp_tree node)
 
   vf *= SLP_TREE_LANES (node);
   tree vectype = SLP_TREE_VECTYPE (node);
+  if (known_ge (TYPE_VECTOR_SUBPARTS (vectype), vf))
+    {
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE,
+                    "%u lanes for node %p fit in one vector of type %T\n",
+                    SLP_TREE_LANES (node), (void *) node, vectype);
+      return 1;
+    }
 
   return vect_get_num_vectors (vf, vectype);
 }
@@ -2629,9 +2637,9 @@ extern tree vect_gen_while (gimple_seq *, tree, tree, tree,
                            const char * = nullptr);
 extern void vect_gen_while_ssa_name (gimple_seq *, tree, tree, tree, tree);
 extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree);
-extern opt_result vect_get_vector_types_for_stmt (vec_info *,
-                                                 stmt_vec_info, tree *,
-                                                 tree *, unsigned int = 0);
+extern opt_result vect_get_vector_types_for_stmt (vec_info *, stmt_vec_info,
+                                                 tree *, tree *,
+                                                 bool *, unsigned int = 0);
 extern opt_tree vect_get_mask_type_for_stmt (stmt_vec_info, unsigned int = 0);
 
 /* In tree-if-conv.cc.  */
@@ -2964,8 +2972,7 @@ vect_can_use_partial_vectors_p (vec_info *vinfo, slp_tree slp_node)
   if (loop_vinfo)
     return LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
 
-  (void) slp_node; // FORNOW
-  return false;
+  return SLP_TREE_CAN_USE_PARTIAL_VECTORS_P (slp_node);
 }
 
 /* If VINFO is vectorizer state for loop vectorization then record that we no
-- 
2.43.0

