The idea makes sense to me, LGTM :)
On Thu, Jan 11, 2024 at 10:43 AM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote: > > This patch fixes the known issues on SLP cases: > > ble a2,zero,.L11 > addiw t1,a2,-1 > li a5,15 > bleu t1,a5,.L9 > srliw a7,t1,4 > slli a7,a7,7 > lui t3,%hi(.LANCHOR0) > lui a6,%hi(.LANCHOR0+128) > addi t3,t3,%lo(.LANCHOR0) > li a4,128 > addi a6,a6,%lo(.LANCHOR0+128) > add a7,a7,a0 > addi a3,a1,37 > mv a5,a0 > vsetvli zero,a4,e8,m8,ta,ma > vle8.v v24,0(t3) > vle8.v v16,0(a6) > .L4: > li a6,128 > vle8.v v0,0(a3) > vrgather.vv v8,v0,v24 > vadd.vv v8,v8,v16 > vse8.v v8,0(a5) > add a5,a5,a6 > add a3,a3,a6 > bne a5,a7,.L4 > andi a5,t1,-16 > mv t1,a5 > .L3: > subw a2,a2,a5 > li a4,1 > beq a2,a4,.L5 > slli a5,a5,32 > srli a5,a5,32 > addiw a2,a2,-1 > slli a5,a5,3 > csrr a4,vlenb > slli a6,a2,32 > addi t3,a5,37 > srli a3,a6,29 > slli a4,a4,2 > add t3,a1,t3 > add a5,a0,a5 > mv t5,a3 > bgtu a3,a4,.L14 > .L6: > li a4,50790400 > addi a4,a4,1541 > li a6,67633152 > addi a6,a6,513 > slli a4,a4,32 > add a4,a4,a6 > vsetvli t4,zero,e64,m4,ta,ma > vmv.v.x v16,a4 > vsetvli a6,zero,e16,m8,ta,ma > vid.v v8 > vsetvli zero,t5,e8,m4,ta,ma > vle8.v v20,0(t3) > vsetvli a6,zero,e16,m8,ta,ma > csrr a7,vlenb > vand.vi v8,v8,-8 > vsetvli zero,zero,e8,m4,ta,ma > slli a4,a7,2 > vrgatherei16.vv v4,v20,v8 > vadd.vv v4,v4,v16 > vsetvli zero,t5,e8,m4,ta,ma > vse8.v v4,0(a5) > bgtu a3,a4,.L15 > .L7: > addw t1,a2,t1 > .L5: > slliw a5,t1,3 > add a1,a1,a5 > lui a4,%hi(.LC2) > add a0,a0,a5 > lbu a3,37(a1) > addi a5,a4,%lo(.LC2) > vsetivli zero,8,e8,mf2,ta,ma > vmv.v.x v1,a3 > vle8.v v2,0(a5) > vadd.vv v1,v1,v2 > vse8.v v1,0(a0) > .L11: > ret > .L15: > sub a3,a3,a4 > bleu a3,a4,.L8 > mv a3,a4 > .L8: > li a7,50790400 > csrr a4,vlenb > slli a4,a4,2 > addi a7,a7,1541 > li t4,67633152 > add t3,t3,a4 > vsetvli zero,a3,e8,m4,ta,ma > slli a7,a7,32 > addi t4,t4,513 > vle8.v v20,0(t3) > add a4,a5,a4 > add a7,a7,t4 > vsetvli a5,zero,e64,m4,ta,ma > vmv.v.x v16,a7 > vsetvli a6,zero,e16,m8,ta,ma > vid.v v8 > vand.vi v8,v8,-8 > vsetvli 
zero,zero,e8,m4,ta,ma > vrgatherei16.vv v4,v20,v8 > vadd.vv v4,v4,v16 > vsetvli zero,a3,e8,m4,ta,ma > vse8.v v4,0(a4) > j .L7 > .L14: > mv t5,a4 > j .L6 > .L9: > li a5,0 > li t1,0 > j .L3 > > The vectorization codegen is quite inefficient since we choose VLS modes to > vectorize the loop body > with epilogue choosing a VLA mode. > > cost.c:6:21: note: ***** Choosing vector mode V128QI > cost.c:6:21: note: ***** Choosing epilogue vector mode RVVM4QI > > As we know, in RVV side, we have VLA modes and VLS modes. VLAmodes support > partial vectors whereas > VLSmodes support full vectors. The goal of adding VLSmodes is to improve the > codegen of known NITERS > or SLP codes. > > If NITERS is unknown, that is i < n, n is unknown. We will always have > partial vectors vectorization. > It can be loop body or epilogue. In this case, it's always more efficient to > apply VLA partial vectorization > on loop body which doesn't have epilogue. > > After this patch: > > f: > ble a2,zero,.L7 > li a5,1 > beq a2,a5,.L5 > li a6,50790400 > addi a6,a6,1541 > li a4,67633152 > addi a4,a4,513 > csrr a5,vlenb > addiw a2,a2,-1 > slli a6,a6,32 > add a6,a6,a4 > slli a5,a5,2 > slli a4,a2,32 > vsetvli t1,zero,e64,m4,ta,ma > srli a3,a4,29 > neg t4,a5 > addi a7,a1,37 > mv a4,a0 > vmv.v.x v12,a6 > vsetvli t3,zero,e16,m8,ta,ma > vid.v v16 > vand.vi v16,v16,-8 > .L4: > minu a6,a3,a5 > vsetvli zero,a6,e8,m4,ta,ma > vle8.v v8,0(a7) > vsetvli t3,zero,e8,m4,ta,ma > mv t1,a3 > vrgatherei16.vv v4,v8,v16 > vsetvli zero,a6,e8,m4,ta,ma > vadd.vv v4,v4,v12 > vse8.v v4,0(a4) > add a7,a7,a5 > add a4,a4,a5 > add a3,a3,t4 > bgtu t1,a5,.L4 > .L3: > slliw a2,a2,3 > add a1,a1,a2 > lui a5,%hi(.LC0) > lbu a4,37(a1) > add a0,a0,a2 > addi a5,a5,%lo(.LC0) > vsetivli zero,8,e8,mf2,ta,ma > vmv.v.x v1,a4 > vle8.v v2,0(a5) > vadd.vv v1,v1,v2 > vse8.v v1,0(a0) > .L7: > ret > > Tested on both RV32 and RV64 no regression. Ok for trunk ? 
> > gcc/ChangeLog: > > * config/riscv/riscv-vector-costs.cc > (costs::better_main_loop_than_p): VLA preempt VLS on unknown NITERS loop. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/partial/slp-1.c: Remove xfail. > * gcc.target/riscv/rvv/autovec/partial/slp-16.c: Ditto. > * gcc.target/riscv/rvv/autovec/partial/slp-3.c: Ditto. > * gcc.target/riscv/rvv/autovec/partial/slp-5.c: Ditto. > > --- > gcc/config/riscv/riscv-vector-costs.cc | 9 +++++++++ > .../gcc.target/riscv/rvv/autovec/partial/slp-1.c | 2 +- > .../gcc.target/riscv/rvv/autovec/partial/slp-16.c | 2 +- > .../gcc.target/riscv/rvv/autovec/partial/slp-3.c | 2 +- > .../gcc.target/riscv/rvv/autovec/partial/slp-5.c | 2 +- > 5 files changed, 13 insertions(+), 4 deletions(-) > > diff --git a/gcc/config/riscv/riscv-vector-costs.cc > b/gcc/config/riscv/riscv-vector-costs.cc > index e53f4a186f3..58ec0b9b503 100644 > --- a/gcc/config/riscv/riscv-vector-costs.cc > +++ b/gcc/config/riscv/riscv-vector-costs.cc > @@ -1042,6 +1042,15 @@ costs::better_main_loop_than_p (const vector_costs > *uncast_other) const > } > } > } > + /* If NITERS is unknown, we should not use VLS modes to vectorize > + the loop since we don't support partial vectors for VLS modes, > + that is, we will have full vectors (VLSmodes) on loop body > + and partial vectors (VLAmodes) on loop epilogue which is very > + inefficient. Instead, we should apply partial vectors (VLAmodes) > + on loop body without an epilogue on unknown NITERS loop. 
*/ > + else if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo) > + && m_cost_type == VLS_VECTOR_COST) > + return false; > > return vector_costs::better_main_loop_than_p (other); > } > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c > index 948b20b68d3..0a1d1f72e6b 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-1.c > @@ -21,6 +21,6 @@ f (int8_t *restrict a, int8_t *restrict b, int n) > > /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES > are chosen > instead of SLP when riscv-autovec-lmul=m1. */ > -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } > } } */ > +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" } } } } */ > /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param > riscv-autovec-lmul=m1" } } } } */ > /* { dg-final { scan-assembler {\tvand} { xfail { any-opts "--param > riscv-autovec-lmul=m1" } } } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c > index 7b23cafab3f..05220c32c5d 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c > @@ -21,6 +21,6 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n) > > /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES > are chosen > instead of SLP when riscv-autovec-lmul=m1. 
*/ > -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } > } } */ > +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" } } } } */ > /* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts "--param > riscv-autovec-lmul=m1"} } } } */ > /* { dg-final { scan-assembler-not {\tvmul} } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c > index 3622c59c439..5e64231b37d 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-3.c > @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n) > > /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES > are chosen > instead of SLP when riscv-autovec-lmul=m1. */ > -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } > } } */ > +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" } } } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c > index 5c0a6775474..c78b3709078 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c > @@ -21,4 +21,4 @@ f (int8_t *restrict a, int8_t *restrict b, int n) > > /* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES > are chosen > instead of SLP when riscv-autovec-lmul=m1. 
*/ > -/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" "--param riscv-autovec-lmul=m8" } } > } } */ > +/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail { > any-opts "--param riscv-autovec-lmul=m1" } } } } */ > -- > 2.36.3 >