Hi Robin,

on 2021/11/12 下午5:56, Robin Dapp wrote:
> Hi Kewen and Richard,
> 
> the attached v3 addresses the comments to v2, among others:
> 
>  - Rename to load_store where appropriate.
>  - Save the adjusted length as a separate control that is used instead
> of loop_len with a bias != 0 and added to the loop header.
>  - Update the costs to reflect a bias.
> 
> Bootstrap and regtest were fine on z15 and p9.
> 

Nice!  Some minor comments are inlined below.

> Regards
>  Robin
> 
> 
> vll-v3.patch
> 

...

>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>                                   bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 4988c93fdb6..931378820ac 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
>  static tree
>  vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>                                gimple_seq *preheader_seq,
> +                              gimple_seq *header_seq,
>                                gimple_stmt_iterator loop_cond_gsi,
>                                rgroup_controls *rgc, tree niters,
>                                tree niters_skip, bool might_wrap_p)
> @@ -436,7 +437,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>    tree length_limit = NULL_TREE;
>    /* For length, we need length_limit to ensure length in range.  */
>    if (!use_masks_p)
> -    length_limit = build_int_cst (compare_type, nitems_per_ctrl);
> +      length_limit = build_int_cst (compare_type, nitems_per_ctrl);
>  

Nit: this looks like an unintentional indentation-only change.

>    /* Calculate the maximum number of item values that the rgroup
>       handles in total, the number that it handles for each iteration
> @@ -560,8 +561,9 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>      {
>        /* Previous controls will cover BIAS items.  This control covers the
>        next batch.  */
> +      tree bias_tree;
>        poly_uint64 bias = nitems_per_ctrl * i;
> -      tree bias_tree = build_int_cst (compare_type, bias);
> +      bias_tree = build_int_cst (compare_type, bias);
>  

Same as above: splitting the declaration of bias_tree from its initialization
looks like an unintentional change.

>        /* See whether the first iteration of the vector loop is known
>        to have a full control.  */
> @@ -664,6 +666,20 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
>  
>        vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
>      }
> +
> +  int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +  if (partial_load_bias != 0
> +      && partial_load_bias != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    {

IIUC, we don't need to check VECT_PARTIAL_BIAS_UNSUPPORTED again here, do we?
Since this is the transformation stage, it must already have been checked.

> +      tree adjusted_len = rgc->bias_adjusted_ctrl;
> +      gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR,
> +                                         rgc->controls[0],
> +                                         build_int_cst
> +                                         (TREE_TYPE (rgc->controls[0]),
> +                                          -partial_load_bias));
> +      gimple_seq_add_stmt (header_seq, minus);
> +    }
> +
>    return next_ctrl;
>  }
>  
> @@ -744,6 +760,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>       /* Set up all controls for this group.  */
>       test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
>                                                    &preheader_seq,
> +                                                  &header_seq,
>                                                    loop_cond_gsi, rgc,
>                                                    niters, niters_skip,
>                                                    might_wrap_p);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..ceeb6920871 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>  
> +  machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, true).require ();
> +  machine_mode len_store_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false).require ();
> +
> +  signed char partial_load_bias = internal_len_load_store_bias
> +    (IFN_LEN_LOAD, len_load_mode);
> +
> +  signed char partial_store_bias = internal_len_load_store_bias
> +    (IFN_LEN_STORE, len_store_mode);
> +
> +  gcc_assert (partial_load_bias == partial_store_bias);
> +
> +  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
> +    return false;
> +
> +  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
> +

Nit: it seems better to move this assignment after the early return below.

> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (partial_load_bias == -1
> +      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
> +      return false;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> @@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>        here.  */
>  
>        bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
> +      signed char partial_load_store_bias
> +     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        bool need_iterate_p
>       = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>          && !vect_known_niters_smaller_than_vf (loop_vinfo));
> @@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>              for each since start index is zero.  */
>           prologue_stmts += num_vectors;
>  
> +         /* If we have a non-zero partial load bias, we need one MINUS
> +            and a MAX to adjust the load length.  */
> +         if (partial_load_store_bias != 0)
> +           prologue_stmts += 2;
> +

IIUC, since we now compute the biased length from the PHI-ed loop_len, there is
only one extra unit of cost (for the MINUS), and it is part of the cost of the
loop body rather than the prologue?

>           /* Each may need two MINs and one MINUS to update lengths in body
>              for next iteration.  */
>           if (need_iterate_p)
> @@ -9226,6 +9258,13 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>  {
>    rgroup_controls *rgl = &(*lens)[nvectors - 1];
>  
> +  signed char partial_load_store_bias =
> +    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +
> +  bool use_bias_adjusted_len =
> +    partial_load_store_bias != VECT_PARTIAL_BIAS_UNSUPPORTED
> +    && partial_load_store_bias != 0;
> +

Nit: the VECT_PARTIAL_BIAS_UNSUPPORTED check seems redundant here too.

BR,
Kewen

Reply via email to