This patch introduces multiplicative cost scaling (x2/x4/x8) to model
the higher latency and register pressure of larger LMULs. The scaling
is applied in adjust_stmt_cost to all vector statements with VLA modes.
VLS modes should eventually get the same LMUL cost scaling as VLA
modes, but doing so currently causes too many testsuite regressions,
mostly because those tests would also need expectation updates.
This is left for future work.
All failures reported by CI should now be fixed. The testsuite changes
here are all expectation updates. The one remaining failure,
slp_run-17.c, is pre-existing; I will open a PR for it later.
PR target/122558
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (get_lmul_cost_scaling):
New function to calculate multiplicative scaling factors.
(costs::adjust_stmt_cost): Apply LMUL scaling uniformly to all
vector statements.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr122558.c: New test.
* gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c: Update expected
dump counts after VLA LMUL cost scaling.
* gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c: Likewise.
* gcc.target/riscv/rvv/autovec/partial/slp-16.c: Likewise.
* gcc.target/riscv/rvv/autovec/partial/slp-5.c: Likewise.
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: Likewise.
Signed-off-by: Zhongyao Chen <[email protected]>
---
gcc/config/riscv/riscv-vector-costs.cc | 50 +++++++++++++++++++
.../vect/costmodel/riscv/rvv/pr113112-4.c | 5 +-
.../riscv/rvv/autovec/dyn-lmul-conv-1.c | 5 +-
.../riscv/rvv/autovec/dyn-lmul-conv-2.c | 3 +-
.../riscv/rvv/autovec/partial/slp-16.c | 7 ++-
.../riscv/rvv/autovec/partial/slp-5.c | 4 +-
.../gcc.target/riscv/rvv/autovec/pr122558.c | 37 ++++++++++++++
7 files changed, 100 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
diff --git a/gcc/config/riscv/riscv-vector-costs.cc
b/gcc/config/riscv/riscv-vector-costs.cc
index f582551eba7..e678e0de766 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1235,6 +1235,45 @@ segment_loadstore_group_size (enum vect_cost_for_stmt
kind,
return 0;
}
+/* Return the multiplicative cost scaling factor for vector mode MODE.
+ Larger LMUL values process more data but have proportionally
+ higher latency and register pressure.
+
+ Modes for which riscv_vla_mode_p is false are never scaled (1).
+ Otherwise LMUL 2/4/8 yield 2/4/8, and LMUL 1, fractional LMUL and
+ any other vlmul value yield 1 (no scaling). */
+static unsigned
+get_lmul_cost_scaling (machine_mode mode)
+{
+ if (!riscv_vla_mode_p (mode))
+ return 1; /* Only VLA modes are scaled. */
+
+ enum vlmul_type vlmul = get_vlmul (mode);
+
+ /* Cost scaling based on LMUL and data processed.
+ Larger LMUL values have proportionally higher latency:
+ - m1 (LMUL_1): 1x (baseline)
+ - m2 (LMUL_2): 2x (processes 2x data, ~2x latency)
+ - m4 (LMUL_4): 4x (processes 4x data, ~4x latency)
+ - m8 (LMUL_8): 8x (processes 8x data, ~8x latency)
+ - mf2/mf4/mf8: 1x (fractional LMUL, already efficient) */
+ switch (vlmul)
+ {
+ case LMUL_2:
+ return 2;
+ case LMUL_4:
+ return 4;
+ case LMUL_8:
+ return 8;
+ case LMUL_1:
+ case LMUL_F2:
+ case LMUL_F4:
+ case LMUL_F8:
+ default: /* Any value not listed above is left unscaled too. */
+ return 1;
+ }
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1379,6 +1418,17 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind,
loop_vec_info loop,
default:
break;
}
+
+ /* Apply LMUL cost scaling uniformly to all vector operations.
+ Larger LMUL values have higher latency and register pressure,
+ which affects performance regardless of loop structure. */
+ if (vectype)
+ {
+ unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype));
+ if (lmul_factor > 1)
+ stmt_cost *= lmul_factor;
+ }
+
return stmt_cost;
}
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
index 2c91987480b..8af5cc5fc0c 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
@@ -21,8 +21,9 @@ void move_replacements (rtx *x, rtx *y, int n_replacements)
}
}
-/* { dg-final { scan-assembler-not {e64,m2} { xfail *-*-* } } } */
-/* { dg-final { scan-assembler {e64,m4} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler {e64,m1} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler-not {e64,m4} } } */
/* { dg-final { scan-assembler-not {jr} } } */
/* { dg-final { scan-assembler {ret} } } */
/* { dg-final { scan-assembler-not {sp} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
index b07bd86f76e..91d777a58a7 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-1.c
@@ -37,6 +37,7 @@ void foo8x (long *restrict a, char *restrict b, int n)
a[i] = b[i];
}
+/* { dg-final { scan-assembler-times ",m1," 3 } } */
/* { dg-final { scan-assembler-times ",m2," 3 } } */
-/* { dg-final { scan-assembler-times ",m4," 2 } } */
-/* { dg-final { scan-assembler-times ",m8," 1 } } */
+/* { dg-final { scan-assembler-times ",m4," 4 } } */
+/* { dg-final { scan-assembler-times ",m8," 2 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
index c37e4dd63f2..468f061e3b1 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/dyn-lmul-conv-2.c
@@ -37,7 +37,8 @@ void foo8x (unsigned char *restrict a, unsigned long
*restrict b, int n)
a[i] = b[i];
}
-/* { dg-final { scan-assembler-times ",m1," 6 } } */
+/* { dg-final { scan-assembler-times ",m1," 7 } } */
/* { dg-final { scan-assembler-times ",m2," 3 } } */
/* { dg-final { scan-assembler-times ",m4," 1 } } */
+/* { dg-final { scan-assembler-times ",m8," 1 } } */
/* { dg-final { scan-assembler-not ",mf2," } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
index 1c7503b770e..b31453852b2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
@@ -19,8 +19,7 @@ f (uint8_t *restrict a, uint8_t *restrict b, int n)
}
}
-/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are
chosen
- instead of SLP when rvv-autotec-max-lmul=m1. */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail {
any-opts "-mrvv-max-lmul=m1" } } } } */
-/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts
"-mrvv-max-lmul=m1"} } } } */
+/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are
chosen instead of SLP. */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail {
any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4"
"-mrvv-max-lmul=m8" } } } } */
+/* { dg-final { scan-assembler {\tvid\.v} { xfail { any-opts
"-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" "-mrvv-max-lmul=m4" "-mrvv-max-lmul=m8"
} } } } */
/* { dg-final { scan-assembler-not {\tvmul} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
index a10a7c831b1..2b2099d6e60 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-5.c
@@ -20,5 +20,5 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
}
/* FIXME: Since we don't have VECT cost model yet, LOAD_LANES/STORE_LANES are
chosen
- instead of SLP when rvv-autotec-max-lmul=m1. */
-/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail {
any-opts "-mrvv-max-lmul=m1" } } } } */
+ instead of SLP. */
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" { xfail {
any-opts "-mrvv-max-lmul=m1" "-mrvv-max-lmul=m2" } } } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644
index 00000000000..c9dbba64961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv
-mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] ) /* Two passes of sums/differences: d -> tmp -> dct. */
+{
+ int16_t tmp[16]; /* Holds the result of the first pass. */
+ for( int i = 0; i < 4; i++ ) /* Pass 1: reads d[i*4+k], stores tmp[k*4+i]. */
+ {
+ int s03 = d[i*4+0] + d[i*4+3];
+ int s12 = d[i*4+1] + d[i*4+2];
+ int d03 = d[i*4+0] - d[i*4+3];
+ int d12 = d[i*4+1] - d[i*4+2];
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
+ }
+ for( int i = 0; i < 4; i++ ) /* Pass 2: reads tmp[i*4+k], stores dct[i*4+k]. */
+ {
+ int s03 = tmp[i*4+0] + tmp[i*4+3];
+ int s12 = tmp[i*4+1] + tmp[i*4+2];
+ int d03 = tmp[i*4+0] - tmp[i*4+3];
+ int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+
--
2.43.0