This patch introduces multiplicative cost scaling (x2/x4/x8) to model
the higher latency and register pressure of larger LMULs (m2/m4/m8).
The scaling is applied uniformly in costs::adjust_stmt_cost to all
vector statements.
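
As a sketch of the effect (the costs below are made-up illustrative
numbers, not from any real uarch model), the adjustment reduces to:

    adjusted_cost = stmt_cost * factor
    factor: mf8/mf4/mf2/m1 -> 1, m2 -> 2, m4 -> 4, m8 -> 8

e.g. a statement costed 1 at m1 is costed 4 at m4, while fractional
LMUL modes keep their baseline cost.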

Performance impact:
3.69% uplift in SPEC2017 525.x264_r on an internal uarch.

Changes from v1, based on feedback from Robin:
- Rename "penalty" to "scaling".
- Use multiplicative factors instead of additive ones (see the worked
  example below).
- Apply the scaling uniformly and remove the NITERS checks.
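
To illustrate the multiplicative change with made-up numbers: a flat
additive penalty P charges a cheap statement (cost 1) and an expensive
one (cost 6) the same +P at m8, while the multiplicative factor gives
1 * 8 = 8 and 6 * 8 = 48, so the penalty grows with the statement's
baseline cost.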

Regarding Jeff's implementation concerns: targeting GCC 17 is fine;
there is no rush to land this in GCC 16 unless a proper fix is found.

        PR target/122558

gcc/ChangeLog:

        * config/riscv/riscv-vector-costs.cc (get_lmul_cost_scaling):
        New function to calculate multiplicative scaling factors.
        (costs::adjust_stmt_cost): Apply LMUL scaling uniformly to all
        vector statements and remove duplicate logic.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr122558.c: New test.

Signed-off-by: Zhongyao Chen <[email protected]>
---
 gcc/config/riscv/riscv-vector-costs.cc        | 50 +++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/pr122558.c   | 36 ++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 27ced61e815..1a27b05c934 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1099,6 +1099,45 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
   return 0;
 }
 
+/* Calculate LMUL-based cost scaling factor.
+   Larger LMUL values process more data but have proportionally
+   higher latency and register pressure.
+
+   Returns the cost scaling factor based on LMUL.  For LMUL > 1,
+   the factor represents the relative cost increase (2x, 4x, 8x).
+   For LMUL <= 1, returns 1 (no scaling).  */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+  if (!riscv_v_ext_vector_mode_p (mode))
+    return 1;
+
+  enum vlmul_type vlmul = get_vlmul (mode);
+
+  /* Cost scaling based on LMUL and data processed.
+     Larger LMUL values have proportionally higher latency:
+     - m1 (LMUL_1): 1x (baseline)
+     - m2 (LMUL_2): 2x (processes 2x data, ~2x latency)
+     - m4 (LMUL_4): 4x (processes 4x data, ~4x latency)
+     - m8 (LMUL_8): 8x (processes 8x data, ~8x latency)
+     - mf2/mf4/mf8: 1x (fractional LMUL, already efficient)  */
+  switch (vlmul)
+    {
+    case LMUL_2:
+      return 2;
+    case LMUL_4:
+      return 4;
+    case LMUL_8:
+      return 8;
+    case LMUL_1:
+    case LMUL_F2:
+    case LMUL_F4:
+    case LMUL_F8:
+    default:
+      return 1;
+    }
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
    For some statement, we would like to further fine-grain tweak the cost on
    top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1243,6 +1282,17 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
     default:
       break;
     }
+
+  /* Apply LMUL cost scaling uniformly to all vector operations.
+     Larger LMUL values have higher latency and register pressure,
+     which affects performance regardless of loop structure.  */
+  if (vectype)
+    {
+      unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+      if (lmul_factor > 1)
+       stmt_cost *= lmul_factor;
+    }
+
   return stmt_cost;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644
index 00000000000..c9dbba64961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] )
+{
+    int16_t tmp[16];
+    for( int i = 0; i < 4; i++ )
+    {
+        int s03 = d[i*4+0] + d[i*4+3];
+        int s12 = d[i*4+1] + d[i*4+2];
+        int d03 = d[i*4+0] - d[i*4+3];
+        int d12 = d[i*4+1] - d[i*4+2];
+        tmp[0*4+i] =   s03 +   s12;
+        tmp[1*4+i] = 2*d03 +   d12;
+        tmp[2*4+i] =   s03 -   s12;
+        tmp[3*4+i] =   d03 - 2*d12;
+    }
+    for( int i = 0; i < 4; i++ )
+    {
+        int s03 = tmp[i*4+0] + tmp[i*4+3];
+        int s12 = tmp[i*4+1] + tmp[i*4+2];
+        int d03 = tmp[i*4+0] - tmp[i*4+3];
+        int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+        dct[i*4+0] =   s03 +   s12;
+        dct[i*4+1] = 2*d03 +   d12;
+        dct[i*4+2] =   s03 -   s12;
+        dct[i*4+3] =   d03 - 2*d12;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
-- 
2.43.0
