OK, thanks for moving that forward!

Just one nit: please don't include godbolt.org URLs if possible, since
they are not permanently valid.

On Wed, Nov 29, 2023 at 4:36 PM juzhe.zh...@rivai.ai
<juzhe.zh...@rivai.ai> wrote:
>
> Sorry for sending it twice.
>
> Add
>
> Co-authored-by: kito-cheng <kito.ch...@sifive.com>
> Co-authored-by: kito-cheng <kito.ch...@gmail.com>
>
> to the ChangeLog.
>
>
> No other difference.
>
> ________________________________
> juzhe.zh...@rivai.ai
>
>
> From: Juzhe-Zhong
> Date: 2023-11-29 16:34
> To: gcc-patches
> CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
> Subject: [PATCH] RISC-V: Support highpart register overlap for vwcvt
> Since Richard recently added support for register filters, we are able to
> support highpart register overlap for widening RVV instructions.
>
> This patch supports it for the vwcvt intrinsics.
>
> I leverage real application code for vwcvt:
> https://github.com/riscv/riscv-v-spec/issues/929
> https://godbolt.org/z/xoeGnzd8q
>
> This is real application code that uses LMUL = 8 with unrolling to gain
> optimal performance for a specific library.
>
> You can see in the codegen that GCC already generates optimal code here,
> since we support register lowpart overlap for narrowing instructions
> (dest EEW < source EEW).
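>
> For illustration only (not part of this patch), here is a minimal sketch of
> the lowpart overlap rule for narrowing, assuming the standard vncvt intrinsic
> naming; the register numbers in the comments are just one legal assignment
> the register allocator may pick:
>
> #include <stddef.h>
> #include "riscv_vector.h"
>
> /* Narrowing: dest EEW < source EEW.  The destination may overlap the
>    lowest-numbered part of the source register group, e.g. source
>    v8-v15 (e16, LMUL = 8) and destination v8-v11 (e8, LMUL = 4).  */
> vint8m4_t
> narrow_lowpart_overlap (vint16m8_t src, size_t vl)
> {
>   return __riscv_vncvt_x_x_w_i8m4 (src, vl);
> }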
>
> With this patch, we start to support highpart register overlap for
> widening instructions (dest EEW > source EEW).
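>
> Continuing the sketch above (again not part of this patch; register numbers
> are illustrative), the corresponding widening case looks like this:
>
> /* Widening: dest EEW > source EEW.  The source may overlap the
>    highest-numbered part of the destination register group, e.g.
>    destination v8-v15 (e16, LMUL = 8) and source v12-v15 (e8, LMUL = 4).  */
> vint16m8_t
> widen_highpart_overlap (vint8m4_t src, size_t vl)
> {
>   return __riscv_vwcvt_x_x_v_i16m8 (src, vl);
> }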
>
> The same style of intrinsic code as above, but using vwcvt:
>
> https://godbolt.org/z/1TMPE5Wfr
>
> size_t
> foo (char const *buf, size_t len)
> {
>   size_t sum = 0;
>   size_t vl = __riscv_vsetvlmax_e8m8 ();
>   size_t step = vl * 4;
>   const char *it = buf, *end = buf + len;
>   for (; it + step <= end;)
>     {
>       vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>       vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
>       it += vl;
>
>       asm volatile("nop" ::: "memory");
>       vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
>       vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
>       vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
>       vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
>
>       asm volatile("nop" ::: "memory");
>       size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
>       size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
>       size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
>       size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
>
>       sum += sumation (sum0, sum1, sum2, sum3);
>     }
>   return sum;
> }
>
> Before this patch:
>
> ...
> csrr    t0,vlenb
> ...
>         vwcvt.x.x.v     v16,v8
>         vwcvt.x.x.v     v8,v28
>         vs8r.v  v16,0(sp)               ---> spill
>         vwcvt.x.x.v     v16,v24
>         vwcvt.x.x.v     v24,v4
>         nop
>         vsetvli zero,zero,e16,m8,ta,ma
>         vmv.x.s a2,v16
>         vl8re16.v       v16,0(sp)      --->  reload
> ...
> csrr    t0,vlenb
> ...
>
> You can see the heavy spills and reloads inside the loop body.
>
> After this patch:
>
> ...
> vwcvt.x.x.v v8,v12
> vwcvt.x.x.v v16,v20
> vwcvt.x.x.v v24,v28
> vwcvt.x.x.v v0,v4
> ...
>
> The codegen is optimal after this patch: each source group now overlaps the
> highest-numbered part of its destination group (e.g. source v12-v15 at
> LMUL = 4 within destination v8-v15 at LMUL = 8), so no spills or vector
> register moves are needed.
>
> Tested on zvl128b with no regressions.
>
> I am also going to test zve64d/zvl256b/zvl512b/zvl1024b.
>
> OK for trunk if there are no regressions in the testing above?
>
> Co-authored-by: kito-cheng <kito.ch...@sifive.com>
> Co-authored-by: kito-cheng <kito.ch...@gmail.com>
>
> PR target/112431
>
> gcc/ChangeLog:
>
> * config/riscv/constraints.md (TARGET_VECTOR ? V_REGS : NO_REGS): New 
> register filters.
> * config/riscv/riscv.md (no,W21,W42,W84,W41,W81,W82): Ditto.
> (no,yes): Ditto.
> * config/riscv/vector.md: Support highpart register overlap for vwcvt.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/pr112431-1.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-2.c: New test.
> * gcc.target/riscv/rvv/base/pr112431-3.c: New test.
>
> ---
> gcc/config/riscv/constraints.md               |  23 ++++
> gcc/config/riscv/riscv.md                     |  24 ++++
> gcc/config/riscv/vector.md                    |  19 ++--
> .../gcc.target/riscv/rvv/base/pr112431-1.c    | 104 ++++++++++++++++++
> .../gcc.target/riscv/rvv/base/pr112431-2.c    |  68 ++++++++++++
> .../gcc.target/riscv/rvv/base/pr112431-3.c    |  51 +++++++++
> 6 files changed, 280 insertions(+), 9 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
>
> diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
> index 68be4515c04..19bb36616bf 100644
> --- a/gcc/config/riscv/constraints.md
> +++ b/gcc/config/riscv/constraints.md
> @@ -169,6 +169,29 @@
> (define_register_constraint "vm" "TARGET_VECTOR ? VM_REGS : NO_REGS"
>    "A vector mask register (if available).")
> +;; The following constraints are used by RVV instructions with dest EEW > src EEW.
> +;; RISC-V 'V' Spec 5.2. Vector Operands:
> +;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
> +;; and the overlap is in the highest-numbered part of the destination register group.
> +;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
> +(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 2 == 1." "regno % 2 == 1")
> +
> +(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 2." "regno % 4 == 2")
> +
> +(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 4." "regno % 8 == 4")
> +
> +(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 4 == 1." "regno % 4 == 1")
> +
> +(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 1." "regno % 8 == 1")
> +
> +(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
> +  "A vector register has register number % 8 == 2." "regno % 8 == 2")
> +
> ;; This constraint is used to match instruction "csrr %0, vlenb" which is generated in "mov<mode>".
> ;; VLENB is a run-time constant which represent the vector register length in bytes.
> ;; BYTES_PER_RISCV_VECTOR represent runtime invariant of vector register length in bytes.
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 935eeb7fd8e..6bf2dfdf9b4 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -501,6 +501,27 @@
>    ]
>    (const_string "no")))
> +(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
> +  (const_string "no"))
> +
> +(define_attr "vconstraint_enabled" "no,yes"
> +  (cond [(eq_attr "vconstraint" "no")
> +         (const_string "yes")
> +
> +         (and (eq_attr "vconstraint" "W21")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W42,W41")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
> + (const_string "no")
> +
> +         (and (eq_attr "vconstraint" "W84,W81,W82")
> +       (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
> + (const_string "no")
> +        ]
> +       (const_string "yes")))
> +
> ;; Attribute to control enable or disable instructions.
> (define_attr "enabled" "no,yes"
>    (cond [
> @@ -509,6 +530,9 @@
>      (eq_attr "fp_vector_disabled" "yes")
>      (const_string "no")
> +
> +    (eq_attr "vconstraint_enabled" "no")
> +    (const_string "no")
>    ]
>    (const_string "yes")))
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index ba9c9e5a9b6..bace900fee5 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -3898,22 +3898,22 @@
> ;; vwcvt<u>.x.x.v
> (define_insn "@pred_<optab><mode>"
> -  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
> +  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
> (if_then_else:VWEXTI
>   (unspec:<VM>
> -     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1")
> -      (match_operand 4 "vector_length_operand"              "   rK,   rK")
> -      (match_operand 5 "const_int_operand"                  "    i,    i")
> -      (match_operand 6 "const_int_operand"                  "    i,    i")
> -      (match_operand 7 "const_int_operand"                  "    i,    i")
> +     [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
> +      (match_operand 4 "vector_length_operand"              "   rK,   rK,   rK,   rK,   rK,   rK,   rK,   rK")
> +      (match_operand 5 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
> +      (match_operand 6 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
> +      (match_operand 7 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
>      (reg:SI VL_REGNUM)
>      (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
>   (plus:VWEXTI
>     (any_extend:VWEXTI
> -       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,   vr"))
> +       (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "  W21,  W21,  W42,  W42,  W84,  W84,   vr,   vr"))
>     (vec_duplicate:VWEXTI
>       (reg:<VEL> X0_REGNUM)))
> -   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
> +   (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0,   vu,    0,   vu,    0,   vu,    0")))]
>    "TARGET_VECTOR"
>    "vwcvt<u>.x.x.v\t%0,%3%p1"
>    [(set_attr "type" "viwalu")
> @@ -3921,7 +3921,8 @@
>     (set_attr "vl_op_idx" "4")
>     (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[5])"))
>     (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
> -   (set (attr "avl_type_idx") (const_int 7))])
> +   (set (attr "avl_type_idx") (const_int 7))
> +   (set_attr "vconstraint" "W21,W21,W42,W42,W84,W84,no,no")])
> ;; -------------------------------------------------------------------------------
> ;; ---- Predicated integer Narrowing operations
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> new file mode 100644
> index 00000000000..6b9a7c448f0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
> @@ -0,0 +1,104 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
> +   size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
> +   size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
> +   size_t sum15)
> +{
> +  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
> + + sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v8 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v9 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v10 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v11 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v12 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v13 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v14 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +      vint8m1_t v15 = __riscv_vle8_v_i8m1 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m2_t vw0 = __riscv_vwcvt_x_x_v_i16m2 (v0, vl);
> +      vint16m2_t vw1 = __riscv_vwcvt_x_x_v_i16m2 (v1, vl);
> +      vint16m2_t vw2 = __riscv_vwcvt_x_x_v_i16m2 (v2, vl);
> +      vint16m2_t vw3 = __riscv_vwcvt_x_x_v_i16m2 (v3, vl);
> +      vint16m2_t vw4 = __riscv_vwcvt_x_x_v_i16m2 (v4, vl);
> +      vint16m2_t vw5 = __riscv_vwcvt_x_x_v_i16m2 (v5, vl);
> +      vint16m2_t vw6 = __riscv_vwcvt_x_x_v_i16m2 (v6, vl);
> +      vint16m2_t vw7 = __riscv_vwcvt_x_x_v_i16m2 (v7, vl);
> +      vint16m2_t vw8 = __riscv_vwcvt_x_x_v_i16m2 (v8, vl);
> +      vint16m2_t vw9 = __riscv_vwcvt_x_x_v_i16m2 (v9, vl);
> +      vint16m2_t vw10 = __riscv_vwcvt_x_x_v_i16m2 (v10, vl);
> +      vint16m2_t vw11 = __riscv_vwcvt_x_x_v_i16m2 (v11, vl);
> +      vint16m2_t vw12 = __riscv_vwcvt_x_x_v_i16m2 (v12, vl);
> +      vint16m2_t vw13 = __riscv_vwcvt_x_x_v_i16m2 (v13, vl);
> +      vint16m2_t vw14 = __riscv_vwcvt_x_x_v_i16m2 (v14, vl);
> +      vint16m2_t vw15 = __riscv_vwcvt_x_x_v_i16m2 (v15, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m2_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m2_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m2_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m2_i16 (vw3);
> +      size_t sum4 = __riscv_vmv_x_s_i16m2_i16 (vw4);
> +      size_t sum5 = __riscv_vmv_x_s_i16m2_i16 (vw5);
> +      size_t sum6 = __riscv_vmv_x_s_i16m2_i16 (vw6);
> +      size_t sum7 = __riscv_vmv_x_s_i16m2_i16 (vw7);
> +      size_t sum8 = __riscv_vmv_x_s_i16m2_i16 (vw8);
> +      size_t sum9 = __riscv_vmv_x_s_i16m2_i16 (vw9);
> +      size_t sum10 = __riscv_vmv_x_s_i16m2_i16 (vw10);
> +      size_t sum11 = __riscv_vmv_x_s_i16m2_i16 (vw11);
> +      size_t sum12 = __riscv_vmv_x_s_i16m2_i16 (vw12);
> +      size_t sum13 = __riscv_vmv_x_s_i16m2_i16 (vw13);
> +      size_t sum14 = __riscv_vmv_x_s_i16m2_i16 (vw14);
> +      size_t sum15 = __riscv_vmv_x_s_i16m2_i16 (vw15);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8,
> +        sum9, sum10, sum11, sum12, sum13, sum14, sum15);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> new file mode 100644
> index 00000000000..da92d59406f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
> +   size_t sum5, size_t sum6, size_t sum7)
> +{
> +  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v4 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v5 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v6 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +      vint8m2_t v7 = __riscv_vle8_v_i8m2 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m4_t vw0 = __riscv_vwcvt_x_x_v_i16m4 (v0, vl);
> +      vint16m4_t vw1 = __riscv_vwcvt_x_x_v_i16m4 (v1, vl);
> +      vint16m4_t vw2 = __riscv_vwcvt_x_x_v_i16m4 (v2, vl);
> +      vint16m4_t vw3 = __riscv_vwcvt_x_x_v_i16m4 (v3, vl);
> +      vint16m4_t vw4 = __riscv_vwcvt_x_x_v_i16m4 (v4, vl);
> +      vint16m4_t vw5 = __riscv_vwcvt_x_x_v_i16m4 (v5, vl);
> +      vint16m4_t vw6 = __riscv_vwcvt_x_x_v_i16m4 (v6, vl);
> +      vint16m4_t vw7 = __riscv_vwcvt_x_x_v_i16m4 (v7, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m4_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m4_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m4_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m4_i16 (vw3);
> +      size_t sum4 = __riscv_vmv_x_s_i16m4_i16 (vw4);
> +      size_t sum5 = __riscv_vmv_x_s_i16m4_i16 (vw5);
> +      size_t sum6 = __riscv_vmv_x_s_i16m4_i16 (vw6);
> +      size_t sum7 = __riscv_vmv_x_s_i16m4_i16 (vw7);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
> new file mode 100644
> index 00000000000..46f93a9049b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
> +
> +#include "riscv_vector.h"
> +
> +size_t __attribute__ ((noinline))
> +sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
> +{
> +  return sum0 + sum1 + sum2 + sum3;
> +}
> +
> +size_t
> +foo (char const *buf, size_t len)
> +{
> +  size_t sum = 0;
> +  size_t vl = __riscv_vsetvlmax_e8m8 ();
> +  size_t step = vl * 4;
> +  const char *it = buf, *end = buf + len;
> +  for (; it + step <= end;)
> +    {
> +      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
> +      it += vl;
> +
> +      asm volatile("nop" ::: "memory");
> +      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
> +      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
> +      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
> +      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
> +
> +      asm volatile("nop" ::: "memory");
> +      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
> +      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
> +      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
> +      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
> +
> +      sum += sumation (sum0, sum1, sum2, sum3);
> +    }
> +  return sum;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmv1r} } } */
> +/* { dg-final { scan-assembler-not {vmv2r} } } */
> +/* { dg-final { scan-assembler-not {vmv4r} } } */
> +/* { dg-final { scan-assembler-not {vmv8r} } } */
> +/* { dg-final { scan-assembler-not {csrr} } } */
> --
> 2.36.3
>
