Hi Richard,

> I think it would be better to fold this into the existing documentation
> a bit more:
[..]

done.

Fixed the remaining nits in the attached v5.

Bootstrap and regtest are good on s390x, Power9 and i386.

Regards
 Robin

--

gcc/ChangeLog:

        * config/rs6000/vsx.md: Use const0 bias predicate.
        * doc/md.texi: Document bias value.
        * internal-fn.c (expand_partial_load_optab_fn): Add bias.
        (expand_partial_store_optab_fn): Likewise.
        (internal_len_load_store_bias): New function.
        * internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define.
        (internal_len_load_store_bias): New function.
        * tree-vect-loop-manip.c (vect_set_loop_controls_directly): Add
        header_seq parameter.  Set bias-adjusted length.
        (vect_set_loop_condition_partial_vectors): Pass header_seq.
        * tree-vect-loop.c (vect_verify_loop_lens): Verify bias.
        (vect_estimate_min_profitable_iters): Account for bias.
        (vect_get_loop_len): Add bias-adjusted length.
        * tree-vect-stmts.c (vectorizable_store): Pass bias to IFN_LEN_STORE.
        (vectorizable_load): Pass bias to IFN_LEN_LOAD.
        * tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted
        length.
        (LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro.
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 83d6c7b76f3..9da166f0502 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5618,7 +5618,8 @@
 (define_expand "len_load_v16qi"
   [(match_operand:V16QI 0 "vlogical_operand")
    (match_operand:V16QI 1 "memory_operand")
-   (match_operand:QI 2 "gpc_reg_operand")]
+   (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
   rtx mem = XEXP (operands[1], 0);
@@ -5632,6 +5633,7 @@
   [(match_operand:V16QI 0 "memory_operand")
    (match_operand:V16QI 1 "vlogical_operand")
    (match_operand:QI 2 "gpc_reg_operand")
+   (match_operand:QI 3 "zero_constant")
   ]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8fd0f8d2fe1..2af3e68ca7e 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5206,25 +5206,43 @@ This pattern is not allowed to @code{FAIL}.
 
 @cindex @code{len_load_@var{m}} instruction pattern
 @item @samp{len_load_@var{m}}
-Load the number of vector elements specified by operand 2 from memory
-operand 1 into vector register operand 0, setting the other elements of
+Load (operand 2 - operand 3) elements from vector memory operand 1
+into vector register operand 0, setting the other elements of
 operand 0 to undefined values.  Operands 0 and 1 have mode @var{m},
 which must be a vector mode.  Operand 2 has whichever integer mode the
-target prefers.  If operand 2 exceeds the number of elements in mode
-@var{m}, the behavior is undefined.  If the target prefers the length
-to be measured in bytes rather than elements, it should only implement
-this pattern for vectors of @code{QI} elements.
+target prefers.  Operand 3 conceptually has mode @code{QI}.
+
+Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
+constant bias: it is either a constant 0 or a constant -1.  The predicate on
+operand 3 must only accept the bias values that the target actually supports.
+GCC handles a bias of 0 more efficiently than a bias of -1.
+
+If (operand 2 - operand 3) exceeds the number of elements in mode
+@var{m}, the behavior is undefined.
+
+If the target prefers the length to be measured in bytes rather than
+elements, it should only implement this pattern for vectors of @code{QI}
+elements.
 
 This pattern is not allowed to @code{FAIL}.
 
 @cindex @code{len_store_@var{m}} instruction pattern
 @item @samp{len_store_@var{m}}
-Store the number of vector elements specified by operand 2 from vector
-register operand 1 into memory operand 0, leaving the other elements of
+Store (operand 2 - operand 3) vector elements from vector register operand 1
+into memory operand 0, leaving the other elements of
 operand 0 unchanged.  Operands 0 and 1 have mode @var{m}, which must be
 a vector mode.  Operand 2 has whichever integer mode the target prefers.
-If operand 2 exceeds the number of elements in mode @var{m}, the behavior
-is undefined.  If the target prefers the length to be measured in bytes
+Operand 3 conceptually has mode @code{QI}.
+
+Operand 2 can be a variable or a constant amount.  Operand 3 specifies a
+constant bias: it is either a constant 0 or a constant -1.  The predicate on
+operand 3 must only accept the bias values that the target actually supports.
+GCC handles a bias of 0 more efficiently than a bias of -1.
+
+If (operand 2 - operand 3) exceeds the number of elements in mode
+@var{m}, the behavior is undefined.
+
+If the target prefers the length to be measured in bytes
 rather than elements, it should only implement this pattern for vectors
 of @code{QI} elements.
 
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 08f94b7a17a..655c04bfa5a 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
 static void
 expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, target, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, target, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_output_operand (&ops[0], target, TYPE_MODE (type));
   create_fixed_operand (&ops[1], mem);
   if (optab == len_load_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 3);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
-    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+    {
+      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 3, ops);
+    }
+
   if (!rtx_equal_p (target, ops[0].value))
     emit_move_insn (target, ops[0].value);
 }
@@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 static void
 expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 {
-  class expand_operand ops[3];
-  tree type, lhs, rhs, maskt;
-  rtx mem, reg, mask;
+  class expand_operand ops[4];
+  tree type, lhs, rhs, maskt, biast;
+  rtx mem, reg, mask, bias;
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
@@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   create_fixed_operand (&ops[0], mem);
   create_input_operand (&ops[1], reg, TYPE_MODE (type));
   if (optab == len_store_optab)
-    create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
-				 TYPE_UNSIGNED (TREE_TYPE (maskt)));
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      biast = gimple_call_arg (stmt, 4);
+      bias = expand_normal (biast);
+      create_input_operand (&ops[3], bias, QImode);
+      expand_insn (icode, 4, ops);
+    }
   else
-    create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
-  expand_insn (icode, 3, ops);
+    {
+      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 3, ops);
+    }
 }
 
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
@@ -4287,6 +4304,30 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
 	  && insn_operand_matches (icode, 4, GEN_INT (align)));
 }
 
+/* Return the supported bias for IFN which is either IFN_LEN_LOAD
+   or IFN_LEN_STORE.  For now we only support the biases of 0 and -1
+   (in case 0 is not an allowable length for len_load or len_store).
+   If none of the biases match what the backend provides, return
+   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
+
+signed char
+internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
+{
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      /* For now we only support biases of 0 or -1.  Try both of them.  */
+      if (insn_operand_matches (icode, 3, GEN_INT (0)))
+	return 0;
+      if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+	return -1;
+    }
+
+  return VECT_PARTIAL_BIAS_UNSUPPORTED;
+}
+
 /* Expand STMT as though it were a call to internal function FN.  */
 
 void
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index c96b9a79005..4d41ec3861b 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -230,6 +230,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
 						poly_uint64, unsigned int);
+#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
+
+extern signed char internal_len_load_store_bias (internal_fn ifn,
+						 machine_mode);
 
 extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
 				    bool, bool, bool, bool, tree *);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index f788deb3d72..69eb22ec819 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
 static tree
 vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 				 gimple_seq *preheader_seq,
+				 gimple_seq *header_seq,
 				 gimple_stmt_iterator loop_cond_gsi,
 				 rgroup_controls *rgc, tree niters,
 				 tree niters_skip, bool might_wrap_p)
@@ -664,6 +665,19 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
 
       vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
     }
+
+  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+  if (partial_load_bias != 0)
+    {
+      tree adjusted_len = rgc->bias_adjusted_ctrl;
+      gassign *minus = gimple_build_assign (adjusted_len, PLUS_EXPR,
+					    rgc->controls[0],
+					    build_int_cst
+					    (TREE_TYPE (rgc->controls[0]),
+					     partial_load_bias));
+      gimple_seq_add_stmt (header_seq, minus);
+    }
+
   return next_ctrl;
 }
 
@@ -744,6 +758,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 	/* Set up all controls for this group.  */
 	test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
 						     &preheader_seq,
+						     &header_seq,
 						     loop_cond_gsi, rgc,
 						     niters, niters_skip,
 						     might_wrap_p);
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index f700d5e7ac2..6f908676c2e 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1165,6 +1165,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
+  machine_mode len_load_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, true).require ();
+  machine_mode len_store_mode = get_len_load_store_mode
+    (loop_vinfo->vector_mode, false).require ();
+
+  signed char partial_load_bias = internal_len_load_store_bias
+    (IFN_LEN_LOAD, len_load_mode);
+
+  signed char partial_store_bias = internal_len_load_store_bias
+    (IFN_LEN_STORE, len_store_mode);
+
+  gcc_assert (partial_load_bias == partial_store_bias);
+
+  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
+    return false;
+
+  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+     len_loads with a length of zero.  In order to avoid that we prohibit
+     more than one loop length here.  */
+  if (partial_load_bias == -1
+      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+    return false;
+
+  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
   rgroup_controls *rgl;
@@ -4081,6 +4106,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	 here.  */
 
       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      signed char partial_load_store_bias
+	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
       bool need_iterate_p
 	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
 	   && !vect_known_niters_smaller_than_vf (loop_vinfo));
@@ -4113,6 +4140,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	       for each since start index is zero.  */
 	    prologue_stmts += num_vectors;
 
+	    /* If we have a non-zero partial load bias, we need one PLUS
+	       to adjust the load length.  */
+	    if (partial_load_store_bias != 0)
+	      body_stmts += 1;
+
 	    /* Each may need two MINs and one MINUS to update lengths in body
 	       for next iteration.  */
 	    if (need_iterate_p)
@@ -9165,6 +9197,8 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 		   unsigned int nvectors, unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
+  bool use_bias_adjusted_len =
+    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -9175,15 +9209,28 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 	{
 	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
 	  gcc_assert (len_type != NULL_TREE);
+
 	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
 
 	  /* Provide a dummy definition until the real one is available.  */
 	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
 	  rgl->controls[i] = len;
+
+	  if (use_bias_adjusted_len)
+	    {
+	      gcc_assert (i == 0);
+	      tree adjusted_len =
+		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
+	      rgl->bias_adjusted_ctrl = adjusted_len;
+	    }
 	}
     }
 
-  return rgl->controls[index];
+  if (use_bias_adjusted_len)
+    return rgl->bias_adjusted_ctrl;
+  else
+    return rgl->controls[index];
 }
 
 /* Scale profiling counters by estimation for LOOP which is vectorized
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 8c427174b37..2b3dac6d5c1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8414,9 +8414,15 @@ vectorizable_store (vec_info *vinfo,
 						   gsi);
 		      vec_oprnd = var;
 		    }
+
+		  signed char biasval =
+		    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+		  tree bias = build_int_cst (intQI_type_node, biasval);
 		  gcall *call
-		    = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
-						  ptr, final_len, vec_oprnd);
+		    = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
+						  ptr, final_len, vec_oprnd,
+						  bias);
 		  gimple_call_set_nothrow (call, true);
 		  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
 		  new_stmt = call;
@@ -9720,22 +9726,29 @@ vectorizable_load (vec_info *vinfo,
 					       vec_num * j + i);
 			tree ptr = build_int_cst (ref_type,
 						  align * BITS_PER_UNIT);
+
+			machine_mode vmode = TYPE_MODE (vectype);
+			opt_machine_mode new_ovmode
+			  = get_len_load_store_mode (vmode, true);
+			machine_mode new_vmode = new_ovmode.require ();
+			tree qi_type = unsigned_intQI_type_node;
+
+			signed char biasval =
+			  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+			tree bias = build_int_cst (intQI_type_node, biasval);
+
 			gcall *call
-			  = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+			  = gimple_build_call_internal (IFN_LEN_LOAD, 4,
 							dataref_ptr, ptr,
-							final_len);
+							final_len, bias);
 			gimple_call_set_nothrow (call, true);
 			new_stmt = call;
 			data_ref = NULL_TREE;
 
 			/* Need conversion if it's wrapped with VnQI.  */
-			machine_mode vmode = TYPE_MODE (vectype);
-			opt_machine_mode new_ovmode
-			  = get_len_load_store_mode (vmode, true);
-			machine_mode new_vmode = new_ovmode.require ();
 			if (vmode != new_vmode)
 			  {
-			    tree qi_type = unsigned_intQI_type_node;
 			    tree new_vtype
 			      = build_vector_type_for_mode (qi_type, new_vmode);
 			    tree var = vect_get_new_ssa_name (new_vtype,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 2f6e1e268fb..8d4932eefe4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -555,6 +555,10 @@ struct rgroup_controls {
 
   /* A vector of nV controls, in iteration order.  */
   vec<tree> controls;
+
+  /* In case of len_load and len_store with a bias there is only one
+     rgroup.  This holds the adjusted loop length for this rgroup.  */
+  tree bias_adjusted_ctrl;
 };
 
 typedef auto_vec<rgroup_controls> vec_loop_masks;
@@ -759,6 +763,11 @@ public:
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
 
+  /* The bias for len_load and len_store.  For now, only 0 and -1 are
+     supported.  -1 must be used when a backend does not support
+     len_load/len_store with a length of zero.  */
+  signed char partial_load_store_bias;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -824,6 +833,7 @@ public:
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             \
   (L)->epil_using_partial_vectors_p
+#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
 #define LOOP_VINFO_VECT_FACTOR(L)          (L)->vectorization_factor
 #define LOOP_VINFO_MAX_VECT_FACTOR(L)      (L)->max_vectorization_factor
 #define LOOP_VINFO_MASKS(L)                (L)->masks

Reply via email to