Re: [PATCH] RISC-V: Support widening register overlap for vf4/vf8

2023-11-29 Thread Kito Cheng
LGTM, thanks :)

On Thu, Nov 30, 2023 at 2:49 PM Juzhe-Zhong  wrote:
>
>
> size_t
> foo (char const *buf, size_t len)
> {
>   size_t sum = 0;
>   size_t vl = __riscv_vsetvlmax_e8m8 ();
>   size_t step = vl * 4;
>   const char *it = buf, *end = buf + len;
>   for (; it + step <= end;)
> {
>   vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>   it += vl;
>   vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>   it += vl;
>   vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>   it += vl;
>   vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
>   it += vl;
>
>   asm volatile("nop" ::: "memory");
>   vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl);
>   vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl);
>   vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 (v2, vl);
>   vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl);
>
>   asm volatile("nop" ::: "memory");
>   size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0);
>   size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1);
>   size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2);
>   size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3);
>
>   sum += sumation (sum0, sum1, sum2, sum3);
> }
>   return sum;
> }
>
> Before this patch:
>
> add a3,s0,s1
> add a4,s6,s1
> add a5,s7,s1
> vsetvli zero,s0,e64,m8,ta,ma
> vle8.v  v4,0(s1)
> vle8.v  v3,0(a3)
> mv  s1,s2
> vle8.v  v2,0(a4)
> vle8.v  v1,0(a5)
> nop
> vsext.vf8   v8,v4
> vsext.vf8   v16,v2
> vs8r.v  v8,0(sp)
> vsext.vf8   v24,v1
> vsext.vf8   v8,v3
> nop
> vmv.x.s a1,v8
> vl8re64.v   v8,0(sp)
> vmv.x.s a3,v24
> vmv.x.s a2,v16
> vmv.x.s a0,v8
> add s2,s2,s5
> call    sumation
> add s3,s3,a0
> bgeu    s4,s2,.L5
>
> After this patch:
>
> add a3,s0,s1
> add a4,s6,s1
> add a5,s7,s1
> vsetvli zero,s0,e64,m8,ta,ma
> vle8.v  v15,0(s1)
> vle8.v  v23,0(a3)
> mv  s1,s2
> vle8.v  v31,0(a4)
> vle8.v  v7,0(a5)
> vsext.vf8   v8,v15
> vsext.vf8   v16,v23
> vsext.vf8   v24,v31
> vsext.vf8   v0,v7
> vmv.x.s a3,v0
> vmv.x.s a2,v24
> vmv.x.s a1,v16
> vmv.x.s a0,v8
> add s2,s2,s5
> call    sumation
> add s3,s3,a0
> bgeu    s4,s2,.L5
>
> PR target/112431
>
> gcc/ChangeLog:
>
> * config/riscv/vector.md: Add widening overlap of vf4/vf8.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/pr112431-16.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-17.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-18.c: New test.
>
> ---
>  gcc/config/riscv/vector.md| 38 ++-
>  .../gcc.target/riscv/rvv/base/pr112431-16.c   | 68 +++
>  .../gcc.target/riscv/rvv/base/pr112431-17.c   | 51 ++
>  .../gcc.target/riscv/rvv/base/pr112431-18.c   | 51 ++
>  4 files changed, 190 insertions(+), 18 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index 6b891c11324..e5d62c6e58b 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -3704,43 +3704,45 @@
>
>  ;; Vector Quad-Widening Sign-extend and Zero-extend.
>  (define_insn "@pred_<optab><mode>_vf4"
> -  [(set (match_operand:VQEXTI 0 "register_operand"  "=&vr,&vr")
> +  [(set (match_operand:VQEXTI 0 "register_operand"   "=vr,   vr,   vr,   vr, ?&vr, ?&vr")
> (if_then_else:VQEXTI
>   (unspec:<VM>
> -   [(match_operand:<VM> 1 "vector_mask_operand"   "vmWc1,vmWc1")
> -(match_operand 4 "vector_length_operand"  "   rK,   rK")
> -(match_operand 5 "const_int_operand"  "i,i")
> -(match_operand 6 "const_int_operand"  "i,i")
> -(match_operand 7 "const_int_operand"  "i,i")
> +   [(match_operand:<VM> 1 "vector_mask_operand"   "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
> +(match_operand 4 "vector_length_operand"  "   rK,   rK,   rK,   rK,   rK,   rK")
> +(match_operand 5 "const_int_operand"  "i,i,i,i,i,i")
> +(match_operand 6 "const_int_operand"  "i,i,i,i,i,i")
> +(match_operand 7 "const_int_operand"  "i,i,i,i,i,i")
>  (reg:SI VL_REGNUM)
>  (reg:SI VTYPE_REGNUM)] 

[PATCH] RISC-V: Support widening register overlap for vf4/vf8

2023-11-29 Thread Juzhe-Zhong


/* Reproducer from this patch description.  Walks BUF in groups of four
   vl-byte chunks, widens each chunk from i8m1 to i64m8, takes one scalar
   from each widened vector and adds the result of sumation (declared
   elsewhere, not shown) to the running total.  Bytes left over once fewer
   than 4 * vl remain before END are not processed.  */
size_t
foo (char const *buf, size_t len)
{
  size_t sum = 0;
  size_t vl = __riscv_vsetvlmax_e8m8 ();
  /* Each iteration advances IT by VL four times, i.e. by STEP bytes.  */
  size_t step = vl * 4;
  const char *it = buf, *end = buf + len;
  /* Only run while a whole STEP-sized group still fits before END.  */
  for (; it + step <= end;)
{
  /* Four consecutive vl-byte loads (the vle8.v instructions in the
     listings below).  */
  vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
  it += vl;
  vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
  it += vl;
  vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
  it += vl;
  vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
  it += vl;
  
  /* NOTE(review): the nop with a "memory" clobber presumably keeps all
     four loads ahead of the extensions so every input is live at the
     same time -- confirm.  */
  asm volatile("nop" ::: "memory");
  /* Widen every input to i64m8 (the vsext.vf8 instructions in the
     listings below).  The "Before this patch" listing spills one of these
     results (vs8r.v / vl8re64.v); the "After this patch" listing does
     not.  */
  vint64m8_t vw0 = __riscv_vsext_vf8_i64m8 (v0, vl);
  vint64m8_t vw1 = __riscv_vsext_vf8_i64m8 (v1, vl);
  vint64m8_t vw2 = __riscv_vsext_vf8_i64m8 (v2, vl);
  vint64m8_t vw3 = __riscv_vsext_vf8_i64m8 (v3, vl);

  /* NOTE(review): second barrier, presumably so all four widened values
     are still live when the scalar moves start -- confirm.  */
  asm volatile("nop" ::: "memory");
  /* Take one scalar out of each widened vector (vmv.x.s in the
     listings).  */
  size_t sum0 = __riscv_vmv_x_s_i64m8_i64 (vw0);
  size_t sum1 = __riscv_vmv_x_s_i64m8_i64 (vw1);
  size_t sum2 = __riscv_vmv_x_s_i64m8_i64 (vw2);
  size_t sum3 = __riscv_vmv_x_s_i64m8_i64 (vw3);

  sum += sumation (sum0, sum1, sum2, sum3);
}
  return sum;
}

Before this patch:

add a3,s0,s1
add a4,s6,s1
add a5,s7,s1
vsetvli zero,s0,e64,m8,ta,ma
vle8.v  v4,0(s1)
vle8.v  v3,0(a3)
mv  s1,s2
vle8.v  v2,0(a4)
vle8.v  v1,0(a5)
nop
vsext.vf8   v8,v4
vsext.vf8   v16,v2
vs8r.v  v8,0(sp)
vsext.vf8   v24,v1
vsext.vf8   v8,v3
nop
vmv.x.s a1,v8
vl8re64.v   v8,0(sp)
vmv.x.s a3,v24
vmv.x.s a2,v16
vmv.x.s a0,v8
add s2,s2,s5
call    sumation
add s3,s3,a0
bgeu    s4,s2,.L5

After this patch:

add a3,s0,s1
add a4,s6,s1
add a5,s7,s1
vsetvli zero,s0,e64,m8,ta,ma
vle8.v  v15,0(s1)
vle8.v  v23,0(a3)
mv  s1,s2
vle8.v  v31,0(a4)
vle8.v  v7,0(a5)
vsext.vf8   v8,v15
vsext.vf8   v16,v23
vsext.vf8   v24,v31
vsext.vf8   v0,v7
vmv.x.s a3,v0
vmv.x.s a2,v24
vmv.x.s a1,v16
vmv.x.s a0,v8
add s2,s2,s5
call    sumation
add s3,s3,a0
bgeu    s4,s2,.L5

PR target/112431

gcc/ChangeLog:

* config/riscv/vector.md: Add widening overlap of vf4/vf8.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-16.c: New test.
* gcc.target/riscv/rvv/base/pr112431-17.c: New test.
* gcc.target/riscv/rvv/base/pr112431-18.c: New test.

---
 gcc/config/riscv/vector.md| 38 ++-
 .../gcc.target/riscv/rvv/base/pr112431-16.c   | 68 +++
 .../gcc.target/riscv/rvv/base/pr112431-17.c   | 51 ++
 .../gcc.target/riscv/rvv/base/pr112431-18.c   | 51 ++
 4 files changed, 190 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-16.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-17.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-18.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 6b891c11324..e5d62c6e58b 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3704,43 +3704,45 @@
 
 ;; Vector Quad-Widening Sign-extend and Zero-extend.
 (define_insn "@pred_<optab><mode>_vf4"
-  [(set (match_operand:VQEXTI 0 "register_operand"  "=&vr,&vr")
+  [(set (match_operand:VQEXTI 0 "register_operand"   "=vr,   vr,   vr,   vr, ?&vr, ?&vr")
(if_then_else:VQEXTI
  (unspec:<VM>
-   [(match_operand:<VM> 1 "vector_mask_operand"   "vmWc1,vmWc1")
-(match_operand 4 "vector_length_operand"  "   rK,   rK")
-(match_operand 5 "const_int_operand"  "i,i")
-(match_operand 6 "const_int_operand"  "i,i")
-(match_operand 7 "const_int_operand"  "i,i")
+   [(match_operand:<VM> 1 "vector_mask_operand"   "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
+(match_operand 4 "vector_length_operand"  "   rK,   rK,   rK,   rK,   rK,   rK")
+(match_operand 5 "const_int_operand"  "i,i,i,i,i,i")
+(match_operand 6 "const_int_operand"  "i,i,i,i,i,i")
+(match_operand 7 "const_int_operand"  "i,i,i,i,i,i")
 (reg:SI VL_REGNUM)
 (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
  (any_extend:VQEXTI
-   (match_operand:<V_QUAD_TRUNC> 3 "register_operand" "   vr,   vr"))
- (match_operand:VQEXTI 2 "vector_merge_operand"   "   vu,0")))]
+   (match_operand:<V_QUAD_TRUNC> 3 "register_operand" "  W43,  W43,  W86,  W86,   vr,   vr"))
+ (match_operand:VQEXTI 2