Re: [PATCH] RISC-V: VLA preempts VLS on unknown NITERS loop

Kito Cheng Wed, 10 Jan 2024 21:56:31 -0800

The idea makes sense to me, LGTM :)


On Thu, Jan 11, 2024 at 10:43 AM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote:
>
> This patch fixes the known issues on SLP cases:
>
>         ble     a2,zero,.L11
>         addiw   t1,a2,-1
>         li      a5,15
>         bleu    t1,a5,.L9
>         srliw   a7,t1,4
>         slli    a7,a7,7
>         lui     t3,%hi(.LANCHOR0)
>         lui     a6,%hi(.LANCHOR0+128)
>         addi    t3,t3,%lo(.LANCHOR0)
>         li      a4,128
>         addi    a6,a6,%lo(.LANCHOR0+128)
>         add     a7,a7,a0
>         addi    a3,a1,37
>         mv      a5,a0
>         vsetvli zero,a4,e8,m8,ta,ma
>         vle8.v  v24,0(t3)
>         vle8.v  v16,0(a6)
> .L4:
>         li      a6,128
>         vle8.v  v0,0(a3)
>         vrgather.vv     v8,v0,v24
>         vadd.vv v8,v8,v16
>         vse8.v  v8,0(a5)
>         add     a5,a5,a6
>         add     a3,a3,a6
>         bne     a5,a7,.L4
>         andi    a5,t1,-16
>         mv      t1,a5
> .L3:
>         subw    a2,a2,a5
>         li      a4,1
>         beq     a2,a4,.L5
>         slli    a5,a5,32
>         srli    a5,a5,32
>         addiw   a2,a2,-1
>         slli    a5,a5,3
>         csrr    a4,vlenb
>         slli    a6,a2,32
>         addi    t3,a5,37
>         srli    a3,a6,29
>         slli    a4,a4,2
>         add     t3,a1,t3
>         add     a5,a0,a5
>         mv      t5,a3
>         bgtu    a3,a4,.L14
> .L6:
>         li      a4,50790400
>         addi    a4,a4,1541
>         li      a6,67633152
>         addi    a6,a6,513
>         slli    a4,a4,32
>         add     a4,a4,a6
>         vsetvli t4,zero,e64,m4,ta,ma
>         vmv.v.x v16,a4
>         vsetvli a6,zero,e16,m8,ta,ma
>         vid.v   v8
>         vsetvli zero,t5,e8,m4,ta,ma
>         vle8.v  v20,0(t3)
>         vsetvli a6,zero,e16,m8,ta,ma
>         csrr    a7,vlenb
>         vand.vi v8,v8,-8
>         vsetvli zero,zero,e8,m4,ta,ma
>         slli    a4,a7,2
>         vrgatherei16.vv v4,v20,v8
>         vadd.vv v4,v4,v16
>         vsetvli zero,t5,e8,m4,ta,ma
>         vse8.v  v4,0(a5)
>         bgtu    a3,a4,.L15
> .L7:
>         addw    t1,a2,t1
> .L5:
>         slliw   a5,t1,3
>         add     a1,a1,a5
>         lui     a4,%hi(.LC2)
>         add     a0,a0,a5
>         lbu     a3,37(a1)
>         addi    a5,a4,%lo(.LC2)
>         vsetivli        zero,8,e8,mf2,ta,ma
>         vmv.v.x v1,a3
>         vle8.v  v2,0(a5)
>         vadd.vv v1,v1,v2
>         vse8.v  v1,0(a0)
> .L11:
>         ret
> .L15:
>         sub     a3,a3,a4
>         bleu    a3,a4,.L8
>         mv      a3,a4
> .L8:
>         li      a7,50790400
>         csrr    a4,vlenb
>         slli    a4,a4,2
>         addi    a7,a7,1541
>         li      t4,67633152
>         add     t3,t3,a4
>         vsetvli zero,a3,e8,m4,ta,ma
>         slli    a7,a7,32
>         addi    t4,t4,513
>         vle8.v  v20,0(t3)
>         add     a4,a5,a4
>         add     a7,a7,t4
>         vsetvli a5,zero,e64,m4,ta,ma
>         vmv.v.x v16,a7
>         vsetvli a6,zero,e16,m8,ta,ma
>         vid.v   v8
>         vand.vi v8,v8,-8
>         vsetvli zero,zero,e8,m4,ta,ma
>         vrgatherei16.vv v4,v20,v8
>         vadd.vv v4,v4,v16
>         vsetvli zero,a3,e8,m4,ta,ma
>         vse8.v  v4,0(a4)
>         j       .L7
> .L14:
>         mv      t5,a4
>         j       .L6
> .L9:
>         li      a5,0
>         li      t1,0
>         j       .L3
>
> The vectorization codegen is quite inefficient since we choose a VLS mode to
> vectorize the loop body, with the epilogue choosing a VLA mode.
>
> cost.c:6:21: note:  ***** Choosing vector mode V128QI
> cost.c:6:21: note:  ***** Choosing epilogue vector mode RVVM4QI
>
> As we know, on the RVV side, we have VLA modes and VLS modes.  VLA modes
> support partial vectors whereas VLS modes only support full vectors.  The
> goal of adding VLS modes is to improve the codegen of known-NITERS loops
> or SLP code.
>
> If NITERS is unknown (that is, the loop condition is i < n and n is unknown),
> we will always have partial-vector vectorization, either in the loop body or
> in the epilogue.  In this case, it's always more efficient to apply VLA
> partial vectorization on the loop body, so that no epilogue is needed.
>
> After this patch:
>
> f:
>         ble     a2,zero,.L7
>         li      a5,1
>         beq     a2,a5,.L5
>         li      a6,50790400
>         addi    a6,a6,1541
>         li      a4,67633152
>         addi    a4,a4,513
>         csrr    a5,vlenb
>         addiw   a2,a2,-1
>         slli    a6,a6,32
>         add     a6,a6,a4
>         slli    a5,a5,2
>         slli    a4,a2,32
>         vsetvli t1,zero,e64,m4,ta,ma
>         srli    a3,a4,29
>         neg     t4,a5
>         addi    a7,a1,37
>         mv      a4,a0
>         vmv.v.x v12,a6
>         vsetvli t3,zero,e16,m8,ta,ma
>         vid.v   v16
>         vand.vi v16,v16,-8
> .L4:
>         minu    a6,a3,a5
>         vsetvli zero,a6,e8,m4,ta,ma
>         vle8.v  v8,0(a7)
>         vsetvli t3,zero,e8,m4,ta,ma
>         mv      t1,a3
>         vrgatherei16.vv v4,v8,v16
>         vsetvli zero,a6,e8,m4,ta,ma
>         vadd.vv v4,v4,v12
>         vse8.v  v4,0(a4)
>         add     a7,a7,a5
>         add     a4,a4,a5
>         add     a3,a3,t4
>         bgtu    t1,a5,.L4
> .L3:
>         slliw   a2,a2,3
>         add     a1,a1,a2
>         lui     a5,%hi(.LC0)
>         lbu     a4,37(a1)
>         add     a0,a0,a2
>         addi    a5,a5,%lo(.LC0)
>         vsetivli        zero,8,e8,mf2,ta,ma
>         vmv.v.x v1,a4
>         vle8.v  v2,0(a5)
>         vadd.vv v1,v1,v2
>         vse8.v  v1,0(a0)
> .L7:
>         ret
>
> Tested on both RV32 and RV64 with no regressions. OK for trunk?
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-vector-costs.cc 
> (costs::better_main_loop_than_p): VLA preempt VLS on unknown NITERS loop.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/partial/slp-1.c: Remove xfail.
>         * gcc.target/riscv/rvv/autovec/partial/slp-16.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/partial/slp-3.c: Ditto.
>         * gcc.target/riscv/rvv/autovec/partial/slp-5.c: Ditto.
>
> ---
>  gcc/config/riscv/riscv-vector-costs.cc                   | 9 +++++++++
>  .../gcc.target/riscv/rvv/autovec/partial/slp-1.c         | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-16.c        | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-3.c         | 2 +-
>  .../gcc.target/riscv/rvv/autovec/partial/slp-5.c         | 2 +-
>  5 files changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
> b/gcc/config/riscv/riscv-vector-costs.cc
> index e53f4a186f3..58ec0b9b503 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -1042,6 +1042,15 @@ costs::better_main_loop_than_p (const vector_costs 
> *uncast_other) const
>             }
>         }
>      }
> +  /* If NITERS is unknown, we should not use VLS modes to vectorize
> +     the loop since we don't support partial vectors for VLS modes,
> +     that is, we will have full vectors (VLSmodes) on loop body
> +     and partial vectors (VLAmodes) on loop epilogue which is very
> +     inefficient.  Instead, we should apply partial vectors (VLAmodes)
> +     on loop body without an epilogue on unknown NITERS loop.  */
> +  else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
> +          && m_cost_type == VLS_VECTOR_COST)
> +    return false;
>
>    return vector_costs::better_main_loop_than_p (other);
>  }
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> index 948b20b68d3..0a1d1f72e6b 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c
> @@ -21,6 +21,6 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES 
> are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } 
> } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param 
> riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvand} { xfail { any-opts "--param 
> riscv-autovec-lmul=m1" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> index 7b23cafab3f..05220c32c5d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
> @@ -21,6 +21,6 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES 
> are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } 
> } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" } } } } */
>  /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param 
> riscv-autovec-lmul=m1"} } } } */
>  /* { dg-final { scan-assembler-not {\tvmul} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> index 3622c59c439..5e64231b37d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c
> @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES 
> are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } 
> } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> index 5c0a6775474..c78b3709078 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
> @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
>
>  /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES 
> are chosen
>     instead of SLP when riscv-autovec-lmul=m1.  */
> -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } 
> } } */
> +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { 
> any-opts "--param riscv-autovec-lmul=m1" } } } } */
> --
> 2.36.3
>

Re: [PATCH] RISC-V: VLA preempts VLS on unknown NITERS loop

Reply via email to