The penalty is only applied when a loop is fully unrolled (NITERS < VF),
where a vector_stmt with a larger LMUL has higher execution overhead.
PR target/122558
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (get_lmul_cost_penalty): New
function to calculate LMUL-based cost penalty.
(costs::adjust_stmt_cost): Apply LMUL penalty to
segment/unit-stride load/store operations
and vector arithmetic operations.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr122558.c: New test.
Signed-off-by: Zhongyao Chen <[email protected]>
---
gcc/config/riscv/riscv-vector-costs.cc | 86 +++++++++++++++++++
.../gcc.target/riscv/rvv/autovec/pr122558.c | 37 ++++++++
2 files changed, 123 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 27ced61e815..d3002f1799f 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1099,6 +1099,64 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
return 0;
}
+/* Calculate LMUL-based cost adjustment factor.
+ Larger LMUL values increase execution overhead.
+
+ This penalty is only applied when the loop is completely unrolled.
+ Returns additional cost to be added based on LMUL. */
+static unsigned
+get_lmul_cost_penalty (machine_mode mode, loop_vec_info loop_vinfo)
+{
+ if (!riscv_v_ext_vector_mode_p (mode)) /* Only RVV vector modes carry an LMUL.  */
+ return 0;
+
+ /* Only apply LMUL penalty when loop is completely unrolled.
+ For non-unrolled loops, larger LMUL reduces iteration count,
+ which may provide overall benefit despite slower instructions. */
+ if (!loop_vinfo) /* Not vectorizing a loop (e.g. BB SLP): no penalty.  */
+ return 0;
+
+ /* Check if loop will be completely unrolled:
+ - NITERS must be known at compile time
+ - NITERS must be less than VF (single iteration) */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ return 0;
+
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned HOST_WIDE_INT niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+
+ /* If NITERS >= VF, loop will have multiple iterations.
+ In this case, larger LMUL reduces loop count, don't penalize. */
+ if (maybe_ge (poly_uint64 (niters), vf)) /* I.e. penalize only if known_lt (niters, vf).  */
+ return 0;
+
+ /* Loop is completely unrolled (single iteration).
+ Apply LMUL penalty since larger LMUL increases latency. */
+ enum vlmul_type vlmul = get_vlmul (mode);
+
+ /* Cost penalty increases with LMUL:
+ - m1 (LMUL_1): 0 penalty (baseline)
+ - m2 (LMUL_2): +1
+ - m4 (LMUL_4): +2
+ - m8 (LMUL_8): +3
+ - mf2/mf4/mf8: 0 (already efficient) */
+ switch (vlmul)
+ {
+ case LMUL_2:
+ return 1;
+ case LMUL_4:
+ return 2;
+ case LMUL_8:
+ return 3;
+ case LMUL_1: /* Fractional LMULs and m1 are the efficient baseline.  */
+ case LMUL_F2:
+ case LMUL_F4:
+ case LMUL_F8:
+ default:
+ return 0;
+ }
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1181,6 +1239,15 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
default:
break;
}
+
+ /* Adjust cost for all segment load/store operations based on
+ actual vectype LMUL. Only penalize when loop is completely
+ unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
}
else
{
@@ -1236,10 +1303,29 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
}
}
}
+
+ /* Apply LMUL penalty for unit-stride operations.
+ This ensures consistent cost modeling across all
+ vector load/store types when loop is unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
}
break;
}
+ case vector_stmt:
+ /* Adjust cost for all vector arithmetic operations based on LMUL.
+ Only penalize when loop is completely unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
+ break;
+
default:
break;
}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644
index 00000000000..c9dbba64961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] ) /* 4x4 forward DCT: row butterflies into tmp (transposed), then column pass.  */
+{
+ int16_t tmp[16];
+ for( int i = 0; i < 4; i++ ) /* Pass 1: rows of d -> columns of tmp (writes tmp[k*4+i]).  */
+ {
+ int s03 = d[i*4+0] + d[i*4+3];
+ int s12 = d[i*4+1] + d[i*4+2];
+ int d03 = d[i*4+0] - d[i*4+3];
+ int d12 = d[i*4+1] - d[i*4+2];
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
+ }
+ for( int i = 0; i < 4; i++ ) /* Pass 2: same butterfly over rows of tmp -> rows of dct.  */
+ {
+ int s03 = tmp[i*4+0] + tmp[i*4+3];
+ int s12 = tmp[i*4+1] + tmp[i*4+2];
+ int d03 = tmp[i*4+0] - tmp[i*4+3];
+ int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+
--
2.43.0