This patch fixes a 70% performance drop from GCC-13.2 to GCC-14 with 
-march=rv64gcv on real hardware.

The root cause is an incorrect cost model that causes inefficient 
vectorization, which makes performance drop significantly.

So this patch does:

1. Adjust the vector-to-scalar cost by introducing a vector-to-scalar 
register move cost.
2. Adjust the vec_construct cost, since we do spend NUNITS instructions to 
construct the vector.

Tested on both RV32 and RV64 with no regressions.  Rebased onto trunk and 
committed, as it was approved by Robin.

        PR target/113247

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector 
to scalar regmove.
        * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
        * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust 
vec_construct cost.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
        * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
        * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.

---
 gcc/config/riscv/riscv-protos.h               |   2 +
 gcc/config/riscv/riscv-vector-costs.cc        |   3 +
 gcc/config/riscv/riscv.cc                     |   4 +-
 .../vect/costmodel/riscv/rvv/pr113247-1.c     | 195 ++++++++++++++++++
 .../vect/costmodel/riscv/rvv/pr113247-2.c     |   6 +
 .../vect/costmodel/riscv/rvv/pr113247-3.c     |   6 +
 .../vect/costmodel/riscv/rvv/pr113247-4.c     |   6 +
 .../riscv/rvv/autovec/vls/reduc-19.c          |   2 +-
 .../riscv/rvv/autovec/vls/reduc-20.c          |   2 +-
 .../riscv/rvv/autovec/vls/reduc-21.c          |   2 +-
 10 files changed, 224 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4f3b677f4f9..21f6dadf113 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -255,6 +255,8 @@ struct regmove_vector_cost
 {
   const int GR2VR;
   const int FR2VR;
+  const int VR2GR;
+  const int VR2FR;
 };
 
 /* Cost for vector insn classes.  */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
b/gcc/config/riscv/riscv-vector-costs.cc
index 90ab93b7506..7c9840df4e9 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree 
vectype, int stmt_cost)
     case scalar_to_vec:
       return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
                                                  : costs->regmove->GR2VR);
+    case vec_to_scalar:
+      return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
+                                                 : costs->regmove->VR2GR);
     default:
       break;
     }
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee1a57b321d..568db90a27d 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
 static const regmove_vector_cost rvv_regmove_vector_cost = {
   2, /* GR2VR  */
   2, /* FR2VR  */
+  2, /* VR2GR  */
+  2, /* VR2FR  */
 };
 
 /* Generic costs for vector insn classes.  It is supposed to be the vector cost
@@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
       return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
 
     case vec_construct:
-      return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
+      return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
 
     default:
       gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
new file mode 100644
index 00000000000..0d09a624a00
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
@@ -0,0 +1,195 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
--param=riscv-autovec-lmul=dynamic" } */
+
+#include <stdint-gcc.h>
+
+#define Ch(x,y,z)   (z ^ (x & (y ^ z)))
+#define Maj(x,y,z)  ((x & y) | (z & (x | y)))
+
+#define SHR(x, n)    (x >> n)
+#define ROTR(x,n)    (SHR(x,n) | (x << (32 - n)))
+#define S1(x)        (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
+#define S0(x)        (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
+
+#define s1(x)        (ROTR(x,17) ^ ROTR(x,19) ^  SHR(x,10))
+#define s0(x)        (ROTR(x, 7) ^ ROTR(x,18) ^  SHR(x, 3))
+
+#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K)                 \
+{                                                        \
+    tmp1 = h + S1(e) + Ch(e,f,g) + K + x;                \
+    tmp2 = S0(a) + Maj(a,b,c);                           \
+    h  = tmp1 + tmp2;                                    \
+    d += tmp1;                                           \
+}
+
+#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
+
+static uint32_t byteswap(uint32_t x)
+{
+    x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
+    x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;  
+
+    return x;
+}
+
+void sha256 (const uint8_t *in, uint32_t out[8])
+{
+    uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
+    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, 
w15;
+
+    tmp1 = tmp2 = 0;
+    w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = 
w14 = w15 = 0;
+
+    BE_LOAD32 (  w0, in,  0 );
+    BE_LOAD32 (  w1, in,  4 );
+    BE_LOAD32 (  w2, in,  8 );
+    BE_LOAD32 (  w3, in, 12 );
+    BE_LOAD32 (  w4, in, 16 );
+    BE_LOAD32 (  w5, in, 20 );
+    BE_LOAD32 (  w6, in, 24 );
+    BE_LOAD32 (  w7, in, 28 );
+    BE_LOAD32 (  w8, in, 32 );
+    BE_LOAD32 (  w9, in, 36 );
+    BE_LOAD32 ( w10, in, 40 );
+    BE_LOAD32 ( w11, in, 44 );
+    BE_LOAD32 ( w12, in, 48 );
+    BE_LOAD32 ( w13, in, 52 );
+    BE_LOAD32 ( w14, in, 56 );
+    BE_LOAD32 ( w15, in, 60 );
+
+    a = out[0];
+    b = out[1];
+    c = out[2];
+    d = out[3];
+    e = out[4];
+    f = out[5];
+    g = out[6];
+    h = out[7];
+
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x428a2f98);
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x71374491);
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0xb5c0fbcf);
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0xe9b5dba5);
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x3956c25b);
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x59f111f1);
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x923f82a4);
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0xab1c5ed5);
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xd807aa98);
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x12835b01);
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0xe49b69c1);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0xefbe4786);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x0fc19dc6);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x240ca1cc);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x2de92c6f);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4a7484aa);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5cb0a9dc);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x76f988da);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x983e5152);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa831c66d);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x27b70a85);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x2e1b2138);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x4d2c6dfc);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x53380d13);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x650a7354);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x766a0abb);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x81c2c92e);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x92722c85);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0xa2bfe8a1);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0xa81a664b);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
+
+    w0 = s1(w14) + w9 + s0(w1) + w0;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w0, 0x19a4c116);
+    w1 = s1(w15) + w10 + s0(w2) + w1;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w1, 0x1e376c08);
+    w2 = s1(w0) + w11 + s0(w3) + w2;
+    SHA256_STEP(g, h, a, b, c, d, e, f,  w2, 0x2748774c);
+    w3 = s1(w1) + w12 + s0(w4) + w3;
+    SHA256_STEP(f, g, h, a, b, c, d, e,  w3, 0x34b0bcb5);
+    w4 = s1(w2) + w13 + s0(w5) + w4;
+    SHA256_STEP(e, f, g, h, a, b, c, d,  w4, 0x391c0cb3);
+    w5 = s1(w3) + w14 + s0(w6) + w5;
+    SHA256_STEP(d, e, f, g, h, a, b, c,  w5, 0x4ed8aa4a);
+    w6 = s1(w4) + w15 + s0(w7) + w6;
+    SHA256_STEP(c, d, e, f, g, h, a, b,  w6, 0x5b9cca4f);
+    w7 = s1(w5) + w0 + s0(w8) + w7;
+    SHA256_STEP(b, c, d, e, f, g, h, a,  w7, 0x682e6ff3);
+    w8 = s1(w6) + w1 + s0(w9) + w8;
+    SHA256_STEP(a, b, c, d, e, f, g, h,  w8, 0x748f82ee);
+    w9 = s1(w7) + w2 + s0(w10) + w9;
+    SHA256_STEP(h, a, b, c, d, e, f, g,  w9, 0x78a5636f);
+    w10 = s1(w8) + w3 + s0(w11) + w10;
+    SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
+    w11 = s1(w9) + w4 + s0(w12) + w11;
+    SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
+    w12 = s1(w10) + w5 + s0(w13) + w12;
+    SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
+    w13 = s1(w11) + w6 + s0(w14) + w13;
+    SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
+    w14 = s1(w12) + w7 + s0(w15) + w14;
+    SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
+    w15 = s1(w13) + w8 + s0(w0) + w15;
+    SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
+
+    out[0] += a;
+    out[1] += b;
+    out[2] += c;
+    out[3] += d;
+    out[4] += e;
+    out[5] += f;
+    out[6] += g;
+    out[7] += h;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
new file mode 100644
index 00000000000..64a53cfca88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
--param=riscv-autovec-lmul=dynamic 
--param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
new file mode 100644
index 00000000000..423c90e4154
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
new file mode 100644
index 00000000000..c2a46d848e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize 
--param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
index 2048b636910..5130fe5f2e3 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
 DEF_REDUC_PLUS (_Float16, 1024)
 DEF_REDUC_PLUS (_Float16, 2048)
 
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
index bfc328da568..819104a8cdf 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
@@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
 DEF_REDUC_PLUS (float, 512)
 DEF_REDUC_PLUS (float, 1024)
 
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
index 8228590fa3b..2b61e0ac71a 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
@@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
 DEF_REDUC_PLUS (float, 256)
 DEF_REDUC_PLUS (float, 512)
 
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
 /* { dg-final { scan-assembler-not {csrr} } } */
 /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
 /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
-- 
2.36.3

Reply via email to