Hi Juzhe,

I'm seeing that this patch introduces failures with rv32gcv-ilp32d, as seen here: https://github.com/ewlu/gcc-precommit-ci/issues/1194. Digging a little deeper, it appears that a shared library (libc.so.6) contains an illegal instruction: at least for "FAIL: gcc.c-torture/execute/920501-8.c -O2 execution test", __printf_buffer executes vmv.v.i without a prior vsetvl. I believe the other failures may be similar.

Logs:
spawn -ignore SIGHUP /scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/xgcc -B/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/ /scratch/ewlu/ci/triage/compare/gcc/gcc/testsuite/gcc.c-torture/execute/920501-8.c -march=rv32gcv -mabi=ilp32d -mtune=rocket -mcmodel=medlow -fdiagnostics-plain-output -O2 -w -lm -o ./920501-8.exe
PASS: gcc.c-torture/execute/920501-8.c   -O2  (test for excess errors)
spawn riscv64-unknown-linux-gnu-run ./920501-8.exe
/scratch/ewlu/ci/triage/compare/build-407-errors/../scripts/wrapper/qemu/riscv64-unknown-linux-gnu-run: line 15: 584664 Illegal instruction (core dumped) QEMU_CPU="$(march-to-cpu-opt --get-riscv-tag $1)" qemu-riscv$xlen -r 5.10 "${qemu_args[@]}" -L ${RISC_V_SYSROOT} "$@"
FAIL: gcc.c-torture/execute/920501-8.c   -O2  execution test

Execution:
QEMU_CPU="rv32,vlen=128,v=true,vext_spec=v1.0,Zve32f=true,Zve64f=true" ./bin/qemu-riscv32 ./920501-8.exe

GDB output:
Program received signal SIGILL, Illegal instruction.
0x2b3d0f3e in __printf_buffer () from /scratch/ewlu/ci/triage/compare/build-407-errors/sysroot/lib32/ilp32d/libc.so.6
1: x/i $pc
=> 0x2b3d0f3e <__printf_buffer+410>:    vmv.v.i v1,0

I've included the first 150ish lines of the function's objdump below.

Edwin

$ ./bin/riscv64-unknown-linux-gnu-objdump -d sysroot/lib32/ilp32d/libc.so.6 > dump

00046da4 <__printf_buffer>:
   46da4:       000f8797                auipc   a5,0xf8
   46da8:       1ac7a783                lw      a5,428(a5) # 13ef50 <_GLOBAL_OFFSET_TABLE_+0x64>
   46dac:       b3010113                addi    sp,sp,-1232
   46db0:       4c812423                sw      s0,1224(sp)
   46db4:       c6be                    sw      a5,76(sp)
   46db6:       9792                    add     a5,a5,tp
   46db8:       439c                    lw      a5,0(a5)
   46dba:       842a                    mv      s0,a0
   46dbc:       4d212023                sw      s2,1216(sp)
   46dc0:       c2ae                    sw      a1,68(sp)
   46dc2:       892e                    mv      s2,a1
   46dc4:       852e                    mv      a0,a1
   46dc6:       02500593                li      a1,37
   46dca:       de3e                    sw      a5,60(sp)
   46dcc:       4c112623                sw      ra,1228(sp)
   46dd0:       4c912223                sw      s1,1220(sp)
   46dd4:       4b312e23                sw      s3,1212(sp)
   46dd8:       4b812423                sw      s8,1192(sp)
   46ddc:       84b2                    mv      s1,a2
   46dde:       dcb2                    sw      a2,120(sp)
   46de0:       89b6                    mv      s3,a3
   46de2:       c0b6                    sw      a3,64(sp)
   46de4:       60a300ef                jal     773ee <strchrnul>
   46de8:       c4aa                    sw      a0,72(sp)
   46dea:       41250633                sub     a2,a0,s2
   46dee:       8c2a                    mv      s8,a0
   46df0:       85ca                    mv      a1,s2
   46df2:       8522                    mv      a0,s0
   46df4:       933f90ef                jal     40726 <__printf_buffer_write>
   46df8:       4c1c                    lw      a5,24(s0)
   46dfa:       c3dd                    beqz    a5,46ea0 <__printf_buffer+0xfc>
   46dfc:       000c4783                lbu     a5,0(s8)
   46e00:       c3c5                    beqz    a5,46ea0 <__printf_buffer+0xfc>
   46e02:       000fa797                auipc   a5,0xfa
   46e06:       85a7a783                lw      a5,-1958(a5) # 14065c <__printf_function_table>
   46e0a:       4b512a23                sw      s5,1204(sp)
   46e0e:       da3e                    sw      a5,52(sp)
   46e10:       c399                    beqz    a5,46e16 <__printf_buffer+0x72>
   46e12:       2680106f                j       4807a <__printf_buffer+0x12d6>
   46e16:       000fa797                auipc   a5,0xfa
   46e1a:       8367a783                lw      a5,-1994(a5) # 14064c <__printf_modifier_table>
   46e1e:       740797e3                bnez    a5,47d6c <__printf_buffer+0xfc8>
   46e22:       000f9797                auipc   a5,0xf9
   46e26:       e927a783                lw      a5,-366(a5) # 13fcb4 <__printf_va_arg_table>
   46e2a:       740791e3                bnez    a5,47d6c <__printf_buffer+0xfc8>
   46e2e:       57fd                    li      a5,-1
   46e30:       d0be                    sw      a5,96(sp)
   46e32:       0019f793                andi    a5,s3,1
   46e36:       d4be                    sw      a5,104(sp)
   46e38:       111c                    addi    a5,sp,160
   46e3a:       4b412c23                sw      s4,1208(sp)
   46e3e:       4b612823                sw      s6,1200(sp)
   46e42:       4b712623                sw      s7,1196(sp)
   46e46:       4b912223                sw      s9,1188(sp)
   46e4a:       4ba12023                sw      s10,1184(sp)
   46e4e:       49b12e23                sw      s11,1180(sp)
   46e52:       ce82                    sw      zero,92(sp)
   46e54:       4a81                    li      s5,0
   46e56:       000b9a17                auipc   s4,0xb9
   46e5a:       2dea0a13                addi    s4,s4,734 # 100134 <step4_jumps.0>
   46e5e:       d6be                    sw      a5,108(sp)
   46e60:       001c4d03                lbu     s10,1(s8)
   46e64:       fe0d0793                addi    a5,s10,-32
   46e68:       0ff7f793                zext.b  a5,a5
   46e6c:       05a00713                li      a4,90
   46e70:       04f77763                bgeu    a4,a5,46ebe <__printf_buffer+0x11a>
   46e74:       520d17e3                bnez    s10,47ba2 <__printf_buffer+0xdfe>
   46e78:       47b6                    lw      a5,76(sp)
   46e7a:       9792                    add     a5,a5,tp
   46e7c:       4759                    li      a4,22
   46e7e:       00042c23                sw      zero,24(s0)
   46e82:       c398                    sw      a4,0(a5)
   46e84:       4b812a03                lw      s4,1208(sp)
   46e88:       4b412a83                lw      s5,1204(sp)
   46e8c:       4b012b03                lw      s6,1200(sp)
   46e90:       4ac12b83                lw      s7,1196(sp)
   46e94:       4a412c83                lw      s9,1188(sp)
   46e98:       4a012d03                lw      s10,1184(sp)
   46e9c:       49c12d83                lw      s11,1180(sp)
   46ea0:       4cc12083                lw      ra,1228(sp)
   46ea4:       4c812403                lw      s0,1224(sp)
   46ea8:       4c412483                lw      s1,1220(sp)
   46eac:       4c012903                lw      s2,1216(sp)
   46eb0:       4bc12983                lw      s3,1212(sp)
   46eb4:       4a812c03                lw      s8,1192(sp)
   46eb8:       4d010113                addi    sp,sp,1232
   46ebc:       8082                    ret
   46ebe:       01aa07b3                add     a5,s4,s10
   46ec2:       05c7c783                lbu     a5,92(a5)
   46ec6:       078a                    slli    a5,a5,0x2
   46ec8:       97d2                    add     a5,a5,s4
   46eca:       0d87a703                lw      a4,216(a5)
   46ece:       00000797                auipc   a5,0x0
   46ed2:       fa678793                addi    a5,a5,-90 # 46e74 <__printf_buffer+0xd0>
   46ed6:       973e                    add     a4,a4,a5
   46ed8:       02000793                li      a5,32
   46edc:       d63e                    sw      a5,44(sp)
   46ede:       0c05                    addi    s8,s8,1
   46ee0:       4b01                    li      s6,0
   46ee2:       59fd                    li      s3,-1
   46ee4:       4c81                    li      s9,0
   46ee6:       4381                    li      t2,0
   46ee8:       4781                    li      a5,0
   46eea:       4b81                    li      s7,0
   46eec:       4901                    li      s2,0
   46eee:       4e01                    li      t3,0
   46ef0:       ce02                    sw      zero,28(sp)
   46ef2:       4d81                    li      s11,0
   46ef4:       d202                    sw      zero,36(sp)
   46ef6:       d402                    sw      zero,40(sp)
   46ef8:       8702                    jr      a4
   46efa:       5726                    lw      a4,104(sp)
   46efc:       c319                    beqz    a4,46f02 <__printf_buffer+0x15e>
   46efe:       23c0106f                j       4813a <__printf_buffer+0x1396>
   46f02:       874a                    mv      a4,s2
   46f04:       001b9693                slli    a3,s7,0x1
   46f08:       8f55                    or      a4,a4,a3
   46f0a:       078a                    slli    a5,a5,0x2
   46f0c:       8fd9                    or      a5,a5,a4
   46f0e:       5722                    lw      a4,40(sp)
   46f10:       070e                    slli    a4,a4,0x3
   46f12:       8fd9                    or      a5,a5,a4
   46f14:       5712                    lw      a4,36(sp)
   46f16:       0712                    slli    a4,a4,0x4
   46f18:       8fd9                    or      a5,a5,a4
   46f1a:       005d9f13                slli    t5,s11,0x5
   46f1e:       4772                    lw      a4,28(sp)
   46f20:       071a                    slli    a4,a4,0x6
   46f22:       01e7e7b3                or      a5,a5,t5
   46f26:       8fd9                    or      a5,a5,a4
   46f28:       0e1e                    slli    t3,t3,0x7
   46f2a:       01c7e7b3                or      a5,a5,t3
   46f2e:       0b2e                    slli    s6,s6,0xb
   46f30:       6705                    lui     a4,0x1
   46f32:       8ff70713                addi    a4,a4,-1793 # 8ff <current+0x8b7>
   46f36:       0167e7b3                or      a5,a5,s6
   46f3a:       8ff9                    and     a5,a5,a4
   46f3c:       5736                    lw      a4,108(sp)
   46f3e:       5e0030d7                vmv.v.i v1,0
   46f42:       020700a7                vse8.v  v1,(a4)
   46f46:       5732                    lw      a4,44(sp)
   46f48:       cb4e                    sw      s3,148(sp)
   46f4a:       cd66                    sw      s9,152(sp)
   46f4c:       cf6a                    sw      s10,156(sp)

On 1/15/2024 4:00 AM, Juzhe-Zhong wrote:
This patch fixes a 70% performance drop from GCC-13.2 to GCC-14 with
-march=rv64gcv on real hardware.

The root cause is an incorrect cost model that causes inefficient vectorization,
which makes performance drop significantly.

So this patch does:

1. Adjust the vector-to-scalar cost by introducing a vector-to-scalar register move cost.
2. Adjust the vec_construct cost, since we do spend NUNITS instructions to
construct the vector.

Tested on both RV32 and RV64 with no regressions. Rebased onto trunk and
committed, as it has been approved by Robin.

        PR target/113247

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector 
to scalar regmove.
        * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
        * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust 
vec_construct cost.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
        * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
        * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.

---
  gcc/config/riscv/riscv-protos.h               |   2 +
  gcc/config/riscv/riscv-vector-costs.cc        |   3 +
  gcc/config/riscv/riscv.cc                     |   4 +-
  .../vect/costmodel/riscv/rvv/pr113247-1.c     | 195 ++++++++++++++++++
  .../vect/costmodel/riscv/rvv/pr113247-2.c     |   6 +
  .../vect/costmodel/riscv/rvv/pr113247-3.c     |   6 +
  .../vect/costmodel/riscv/rvv/pr113247-4.c     |   6 +
  .../riscv/rvv/autovec/vls/reduc-19.c          |   2 +-
  .../riscv/rvv/autovec/vls/reduc-20.c          |   2 +-
  .../riscv/rvv/autovec/vls/reduc-21.c          |   2 +-
  10 files changed, 224 insertions(+), 4 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
  create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4f3b677f4f9..21f6dadf113 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -255,6 +255,8 @@ struct regmove_vector_cost
  {
    const int GR2VR;
    const int FR2VR;
+  const int VR2GR;
+  const int VR2FR;
  };
/* Cost for vector insn classes. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 90ab93b7506..7c9840df4e9 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
      case scalar_to_vec:
        return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
                                                  : costs->regmove->GR2VR);
+    case vec_to_scalar:
+      return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
+                                                 : costs->regmove->VR2GR);
      default:
        break;
      }
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee1a57b321d..568db90a27d 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
  static const regmove_vector_cost rvv_regmove_vector_cost = {
    2, /* GR2VR  */
    2, /* FR2VR  */
+  2, /* VR2GR  */
+  2, /* VR2FR  */
  };
/* Generic costs for vector insn classes. It is supposed to be the vector cost
@@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
        return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
case vec_construct:
-      return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
+      return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
default:
        gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
new file mode 100644
index 00000000000..0d09a624a00
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
@@ -0,0 +1,195 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */
+
+#include <stdint-gcc.h>
+
+#define Ch(x,y,z)   (z ^ (x & (y ^ z)))
+#define Maj(x,y,z)  ((x & y) | (z & (x | y)))
+
+#define SHR(x, n)    (x >> n)
+#define ROTR(x,n)    (SHR(x,n) | (x << (32 - n)))
+#define S1(x)        (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
+#define S0(x)        (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
+
+#define s1(x)        (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))
+#define s0(x)        (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))
+
+#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K)                 \
+{                                                        \
+    tmp1 = h + S1(e) + Ch(e,f,g) + K + x;                \
+    tmp2 = S0(a) + Maj(a,b,c);                           \
+    h  = tmp1 + tmp2;                                    \
+    d += tmp1;                                           \
+}
+
+#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
+
+static uint32_t byteswap(uint32_t x)
+{
+    x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
+    x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
+
+    return x;
+}
+
+void sha256 (const uint8_t *in, uint32_t out[8])
+{
+    uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
+    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+
+    tmp1 = tmp2 = 0;
+    w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = w14 = w15 = 0;
+
+    BE_LOAD32 (  w0, in,  0 );
+    BE_LOAD32 (  w1, in,  4 );
+    BE_LOAD32 (  w2, in,  8 );
+    BE_LOAD32 (  w3, in, 12 );
+    BE_LOAD32 (  w4, in, 16 );
+    BE_LOAD32 (  w5, in, 20 );
+    BE_LOAD32 (  w6, in, 24 );
+    BE_LOAD32 (  w7, in, 28 );
+    BE_LOAD32 (  w8, in, 32 );
+    BE_LOAD32 (  w9, in, 36 );
+    BE_LOAD32 ( w10, in, 40 );
+    BE_LOAD32 ( w11, in, 44 );
+    BE_LOAD32 ( w12, in, 48 );
+    BE_LOAD32 ( w13, in, 52 );
+    BE_LOAD32 ( w14, in, 56 );
+    BE_LOAD32 ( w15, in, 60 );
+
+    a = out[0];
+    b = out[1];
+    c = out[2];
+    d = out[3];
+    e = out[4];
+    f = out[5];
+    g = out[6];
+    h = out[7];
+
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x428a2f98);
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x71374491);
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0xb5c0fbcf);
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0xe9b5dba5);
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x3956c25b);
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x59f111f1);
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x923f82a4);
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0xab1c5ed5);
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xd807aa98);
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x12835b01);
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0xe49b69c1);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0xefbe4786);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x0fc19dc6);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x240ca1cc);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x2de92c6f);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4a7484aa);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5cb0a9dc);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x76f988da);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x983e5152);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa831c66d);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x27b70a85);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x2e1b2138);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x4d2c6dfc);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x53380d13);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x650a7354);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x766a0abb);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x81c2c92e);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x92722c85);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xa2bfe8a1);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa81a664b);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x19a4c116);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x1e376c08);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x2748774c);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x34b0bcb5);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x391c0cb3);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4ed8aa4a);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5b9cca4f);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x682e6ff3);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x748f82ee);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x78a5636f);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
+
+    out[0] += a;
+    out[1] += b;
+    out[2] += c;
+    out[3] += d;
+    out[4] += e;
+    out[5] += f;
+    out[6] += g;
+    out[7] += h;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
new file mode 100644
index 00000000000..64a53cfca88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic --param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
new file mode 100644
index 00000000000..423c90e4154
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
new file mode 100644
index 00000000000..c2a46d848e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
index 2048b636910..5130fe5f2e3 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
  DEF_REDUC_PLUS (_Float16, 1024)
  DEF_REDUC_PLUS (_Float16, 2048)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
index bfc328da568..819104a8cdf 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
@@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
  DEF_REDUC_PLUS (float, 512)
  DEF_REDUC_PLUS (float, 1024)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
index 8228590fa3b..2b61e0ac71a 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
@@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
  DEF_REDUC_PLUS (float, 256)
  DEF_REDUC_PLUS (float, 512)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
  /* { dg-final { scan-assembler-not {csrr} } } */
  /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
  /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */

Reply via email to