On Fri, 10 Nov 2023, Juzhe-Zhong wrote:

> PR: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112438
> 
> 1. Since SELECT_VL result is not necessary always VF in non-final iteration.
> 
> Current GIMPLE IR is wrong:
> 
> # vect_vec_iv_.8_22 = PHI <_21(4), { 0, 1, 2, ... }(3)>
> ...
> _35 = .SELECT_VL (ivtmp_33, VF);
> _21 = vect_vec_iv_.8_22 + { VF, ... };
> 
> E.g. Consider the total iterations N = 6, the VF = 4.
> Since SELECT_VL output is defined as not always to be VF in non-final 
> iteration
> which needs to depend on hardware implementation.
> 
> Suppose we have a RVV CPU core with vsetvl doing even distribution workload 
> optimization.
> It may process 3 elements at the 1st iteration and 3 elements at the last 
> iteration.
> Then the induction variable here: _21 = vect_vec_iv_.8_22 + { POLY_INT_CST 
> [4, 4], ... }; 
> is wrong which is adding VF, which is 4, actually, we didn't process 4 
> elements.
> 
> It should be adding 3 elements which is the result of SELECT_VL.
> So, here the correct IR should be:
> 
>   _36 = .SELECT_VL (ivtmp_34, VF);
>   _22 = (int) _36;
>   vect_cst__21 = [vec_duplicate_expr] _22;
> 
> 2. This issue only happens on non-SLP vectorization single rgroup since:
>    
>      if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
>     {
>       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
>                                         OPTIMIZE_FOR_SPEED)
>         && LOOP_VINFO_LENS (loop_vinfo).length () == 1
>         && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
>         && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
>             || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
>       LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
>     }
> 
> 3. This issue doesn't appears on nested loop no matter 
> LOOP_VINFO_USING_SELECT_VL_P is true or false.
> 
> Since:
> 
>   # vect_vec_iv_.6_5 = PHI <_19(3), { 0, ... }(5)>
>   # vect_diff_15.7_20 = PHI <vect_diff_9.8_22(3), vect_diff_18.5_11(5)>
>   _19 = vect_vec_iv_.6_5 + { 1, ... };
>   vect_diff_9.8_22 = .COND_LEN_ADD ({ -1, ... }, vect_vec_iv_.6_5, 
> vect_diff_15.7_20, vect_diff_15.7_20, _28, 0);
>   ivtmp_1 = ivtmp_4 + 4294967295;
>   ....
>   <bb 5> [local count: 6549826]:
>   # vect_diff_18.5_11 = PHI <vect_diff_9.8_22(4), { 0, ... }(2)>
>   # ivtmp_26 = PHI <ivtmp_27(4), 40(2)>
>   _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
>   goto <bb 3>; [100.00%]
> 
> Note the induction variable IR: _21 = vect_vec_iv_.8_22 + { POLY_INT_CST [4, 
> 4], ... }; update induction variable
> independent on VF (or don't care about how many elements are processed in the 
> iteration).
> 
> The update is loop invariant. So it won't be the problem even if 
> LOOP_VINFO_USING_SELECT_VL_P is true.
>    
> Testing passed, Ok for trunk ?

OK.

Richard.

>       PR tree-optimization/112438
> 
> gcc/ChangeLog:
> 
>       * tree-vect-loop.cc (vectorizable_induction):
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/riscv/rvv/autovec/pr112438.c: New test.
> 
> ---
>  .../gcc.target/riscv/rvv/autovec/pr112438.c   | 33 +++++++++++++++++++
>  gcc/tree-vect-loop.cc                         | 30 ++++++++++++++++-
>  2 files changed, 62 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112438.c
> 
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112438.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112438.c
> new file mode 100644
> index 00000000000..51f90df38a0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112438.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-vect-cost-model 
> -ffast-math -fdump-tree-optimized-details" } */
> +
> +void
> +foo (int n, int *__restrict in, int *__restrict out)
> +{
> +  for (int i = 0; i < n; i += 1)
> +    {
> +      out[i] = in[i] + i;
> +    }
> +}
> +
> +void
> +foo2 (int n, float * __restrict in, 
> +float * __restrict out)
> +{
> +  for (int i = 0; i < n; i += 1)
> +    {
> +      out[i] = in[i] + i;
> +    }
> +}
> +
> +void
> +foo3 (int n, float * __restrict in, 
> +float * __restrict out, float x)
> +{
> +  for (int i = 0; i < n; i += 1)
> +    {
> +      out[i] = in[i] + i* i;
> +    }
> +}
> +
> +/* We don't want to see vect_vec_iv_.21_25 + { POLY_INT_CST [4, 4], ... }.  
> */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 8abc1937d74..b152072c969 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10306,10 +10306,36 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>  
>  
>    /* Create the vector that holds the step of the induction.  */
> +  gimple_stmt_iterator *step_iv_si = NULL;
>    if (nested_in_vect_loop)
>      /* iv_loop is nested in the loop to be vectorized. Generate:
>         vec_step = [S, S, S, S]  */
>      new_name = step_expr;
> +  else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +    {
> +      /* When we're using loop_len produced by SELEC_VL, the non-final
> +      iterations are not always processing VF elements.  So vectorize
> +      induction variable instead of
> +
> +        _21 = vect_vec_iv_.6_22 + { VF, ... };
> +
> +      We should generate:
> +
> +        _35 = .SELECT_VL (ivtmp_33, VF);
> +        vect_cst__22 = [vec_duplicate_expr] _35;
> +        _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
> +      gcc_assert (!slp_node);
> +      gimple_seq seq = NULL;
> +      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> +      tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 
> 0);
> +      expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
> +                                              unshare_expr (len)),
> +                                &seq, true, NULL_TREE);
> +      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
> +                            step_expr);
> +      gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
> +      step_iv_si = &si;
> +    }
>    else
>      {
>        /* iv_loop is the loop to be vectorized. Generate:
> @@ -10336,7 +10362,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>             || TREE_CODE (new_name) == SSA_NAME);
>    new_vec = build_vector_from_val (step_vectype, t);
>    vec_step = vect_init_vector (loop_vinfo, stmt_info,
> -                            new_vec, step_vectype, NULL);
> +                            new_vec, step_vectype, step_iv_si);
>  
>  
>    /* Create the following def-use cycle:
> @@ -10382,6 +10408,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>        gimple_seq seq = NULL;
>        /* FORNOW. This restriction should be relaxed.  */
>        gcc_assert (!nested_in_vect_loop);
> +      /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  
> */
> +      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>  
>        /* Create the vector that holds the step of the induction.  */
>        if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to