https://gcc.gnu.org/g:0d4eb75eb619469b7baf8cc266f13d7c8348f46d
commit r17-677-g0d4eb75eb619469b7baf8cc266f13d7c8348f46d
Author: Wang Yaduo <[email protected]>
Date:   Fri May 22 11:26:10 2026 -0600

    [PATCH v4] RISC-V: Add per-type reduction costs to the vector cost model

    Add type-specific reduction costs for integer (i8/i16/i32/i64) and
    floating-point (f16/f32/f64) reductions. Ordered (fold-left) FP
    reductions receive separate higher costs. Use helpers is_reduction()
    and get_reduction_cost() for readability. Adjust affected tests to use
    -mmax-vectorization.

    Changes in v4:
    - Fix failure in gcc.target/riscv/rvv/autovec/cond/pr111401.c by adding
      -mmax-vectorization.

    Tested locally with qemu, all affected tests pass.

    gcc/
            * config/riscv/riscv-protos.h (common_vector_cost): Add
            reduc_i8_cost through reduc_f64_cost and
            reduc_f{16,32,64}_ordered_cost.
            * config/riscv/riscv.cc: Set costs in rvv_vls_vector_cost and
            rvv_vla_vector_cost.
            * config/riscv/riscv-vector-costs.cc (is_reduction): New helper.
            (get_reduction_cost): New helper.
            (costs::adjust_stmt_cost): Use them for vec_to_scalar kind.

    gcc/testsuite
            * gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c: New.
            * gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c: New.
            * gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c: New.
            * gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c: New.
            * gcc.target/riscv/rvv/autovec/cond/pr111401.c: Use
            -mmax-vectorization.
            * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Use
            -mmax-vectorization, adjust scan counts.
            * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Likewise.
            * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Likewise.
            * gcc.target/riscv/rvv/autovec/vls/wred-3.c: Likewise.
Diff: --- gcc/config/riscv/riscv-protos.h | 17 +++++- gcc/config/riscv/riscv-vector-costs.cc | 67 +++++++++++++++++++++- gcc/config/riscv/riscv.cc | 20 +++++++ .../vect/costmodel/riscv/rvv/reduc_vla_ordered.c | 23 ++++++++ .../vect/costmodel/riscv/rvv/reduc_vla_unordered.c | 29 ++++++++++ .../vect/costmodel/riscv/rvv/reduc_vls_ordered.c | 26 +++++++++ .../vect/costmodel/riscv/rvv/reduc_vls_unordered.c | 29 ++++++++++ .../gcc.target/riscv/rvv/autovec/cond/pr111401.c | 2 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-19.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-20.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-21.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/wred-3.c | 4 +- 12 files changed, 215 insertions(+), 14 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index fb600d60168d..234d625441d8 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -280,6 +280,21 @@ struct common_vector_cost /* Cost of an unaligned vector store. */ const int unalign_store_cost; + + /* Cost of vector reduction operations (unordered / tree reduction). + Indexed by element type. */ + const int reduc_i8_cost; + const int reduc_i16_cost; + const int reduc_i32_cost; + const int reduc_i64_cost; + const int reduc_f16_cost; + const int reduc_f32_cost; + const int reduc_f64_cost; + + /* Cost of ordered (fold-left) floating-point reductions. */ + const int reduc_f16_ordered_cost; + const int reduc_f32_ordered_cost; + const int reduc_f64_ordered_cost; }; /* scalable vectorization (VLA) specific cost. */ @@ -289,8 +304,6 @@ struct scalable_vector_cost : common_vector_cost : common_vector_cost (base) {} - /* TODO: We will need more other kinds of vector cost for VLA. - E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */ }; /* Additional costs for register copies. Cost is for one register. 
*/ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index e678e0de766e..6d37519dbfee 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1274,6 +1274,58 @@ get_lmul_cost_scaling (machine_mode mode) } } +/* Return true if STMT_INFO or NODE represents a reduction operation. */ + +static bool +is_reduction (stmt_vec_info stmt_info, slp_tree node) +{ + return (stmt_info && vect_is_reduction (stmt_info)) + || (node && vect_is_reduction (node)); +} + +/* Return the per-type reduction cost for VECTYPE, or 0 if no specific cost + applies. For FP types, distinguish ordered vs unordered reductions. */ + +static int +get_reduction_cost (vec_info *vinfo, const cpu_vector_cost *costs, + loop_vec_info loop, slp_tree node, tree vectype) +{ + const common_vector_cost *common_costs + = loop && riscv_vla_mode_p (loop->vector_mode) + ? costs->vla : costs->vls; + + bool is_ordered = false; + if (FLOAT_TYPE_P (vectype) && node) + { + int reduc_type = vect_reduc_type (vinfo, node); + is_ordered = (reduc_type == FOLD_LEFT_REDUCTION); + } + + switch (GET_MODE_INNER (TYPE_MODE (vectype))) + { + case E_QImode: + return common_costs->reduc_i8_cost; + case E_HImode: + return common_costs->reduc_i16_cost; + case E_SImode: + return common_costs->reduc_i32_cost; + case E_DImode: + return common_costs->reduc_i64_cost; + case E_HFmode: + case E_BFmode: + return is_ordered ? common_costs->reduc_f16_ordered_cost + : common_costs->reduc_f16_cost; + case E_SFmode: + return is_ordered ? common_costs->reduc_f32_ordered_cost + : common_costs->reduc_f32_cost; + case E_DFmode: + return is_ordered ? common_costs->reduc_f64_ordered_cost + : common_costs->reduc_f64_cost; + default: + return 0; + } +} + /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost. 
For some statement, we would like to further fine-grain tweak the cost on top of riscv_builtin_vectorization_cost handling which doesn't have any @@ -1292,9 +1344,18 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, += (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ()); break; case vec_to_scalar: - stmt_cost - += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ()); - break; + { + int reduc_cost = 0; + if (vectype && is_reduction (stmt_info, node)) + reduc_cost = get_reduction_cost (m_vinfo, costs, loop, node, vectype); + + if (reduc_cost) + stmt_cost = reduc_cost; + + stmt_cost + += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ()); + break; + } case vector_load: case vector_store: { diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index c66a6d1efeed..8a737bb41b66 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -397,6 +397,16 @@ static const common_vector_cost rvv_vls_vector_cost = { 1, /* align_store_cost */ 2, /* unalign_load_cost */ 2, /* unalign_store_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 20, /* reduc_f16_ordered_cost */ + 10, /* reduc_f32_ordered_cost */ + 5, /* reduc_f64_ordered_cost */ }; /* RVV costs for VLA vector operations. 
*/ @@ -420,6 +430,16 @@ static const scalable_vector_cost rvv_vla_vector_cost = { 1, /* align_store_cost */ 2, /* unalign_load_cost */ 2, /* unalign_store_cost */ + 2, /* reduc_i8_cost */ + 2, /* reduc_i16_cost */ + 2, /* reduc_i32_cost */ + 2, /* reduc_i64_cost */ + 2, /* reduc_f16_cost */ + 2, /* reduc_f32_cost */ + 2, /* reduc_f64_cost */ + 20, /* reduc_f16_ordered_cost */ + 10, /* reduc_f32_ordered_cost */ + 5, /* reduc_f64_ordered_cost */ }, }; diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c new file mode 100644 index 000000000000..367a8016c97a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -fdump-tree-vect-details" } */ + +#include <stdint-gcc.h> + +#define DEF_REDUC_PLUS(TYPE) \ + TYPE __attribute__ ((noinline, noclone)) \ + reduc_plus_##TYPE (TYPE *restrict a, int n) \ + { \ + TYPE r = 0; \ + for (int i = 0; i < n; ++i) \ + r += a[i]; \ + return r; \ + } + +DEF_REDUC_PLUS (_Float16) +DEF_REDUC_PLUS (float) +DEF_REDUC_PLUS (double) + +/* Ordered reduction cost: reduc_f*_ordered_cost + vr2fr. 
*/ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c new file mode 100644 index 000000000000..b605eef2d141 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -ffast-math -fdump-tree-vect-details" } */ + +#include <stdint-gcc.h> + +#define DEF_REDUC_PLUS(TYPE) \ +TYPE __attribute__ ((noinline, noclone)) \ +reduc_plus_##TYPE (TYPE *restrict a, int n) \ +{ \ + TYPE r = 0; \ + for (int i = 0; i < n; ++i) \ + r += a[i]; \ + return r; \ +} + +DEF_REDUC_PLUS (int8_t) +DEF_REDUC_PLUS (int16_t) +DEF_REDUC_PLUS (int32_t) +DEF_REDUC_PLUS (int64_t) +DEF_REDUC_PLUS (_Float16) +DEF_REDUC_PLUS (float) +DEF_REDUC_PLUS (double) + +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 7 "vect" } } */ +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */ +/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */ + +/* Unordered reduction cost: reduc_*_cost + vr2gr/vr2fr. */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c new file mode 100644 index 000000000000..6d7a52cd1a10 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c @@ -0,0 +1,26 @@ +/* Ordered FP reduction costs in VLS mode. 
*/ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-vect-details" } */ + +#include <stdint-gcc.h> + +#define DEF_REDUC_PLUS(TYPE, NUM) \ + TYPE __attribute__ ((noinline, noclone)) \ + reduc_plus_##TYPE##_##NUM (TYPE *restrict a) \ + { \ + TYPE r = 0; \ + for (int i = 0; i < NUM; ++i) \ + r += a[i]; \ + return r; \ + } + +DEF_REDUC_PLUS (_Float16, 8) +DEF_REDUC_PLUS (float, 8) +DEF_REDUC_PLUS (double, 8) + +/* { dg-final { scan-assembler-not {csrr} } } */ + +/* f16: 20+2=22, f32: 10+2=12, f64: 5+2=7. */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c new file mode 100644 index 000000000000..fd3350a5bbcb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -ffast-math -fdump-tree-vect-details" } */ + +#include <stdint-gcc.h> + +#define DEF_REDUC_PLUS(TYPE, NUM) \ + TYPE __attribute__ ((noinline, noclone)) \ + reduc_plus_##TYPE##_##NUM (TYPE *restrict a) \ + { \ + TYPE r = 0; \ + for (int i = 0; i < NUM; ++i) \ + r += a[i]; \ + return r; \ + } + +DEF_REDUC_PLUS (int8_t, 8) +DEF_REDUC_PLUS (int16_t, 8) +DEF_REDUC_PLUS (int32_t, 8) +DEF_REDUC_PLUS (int64_t, 8) +DEF_REDUC_PLUS (_Float16, 8) +DEF_REDUC_PLUS (float, 8) +DEF_REDUC_PLUS (double, 8) + +/* { dg-final { scan-assembler-times {vredsum\.vs} 4 } } */ +/* { dg-final { scan-assembler-times {vfredusum\.vs} 3 } } */ +/* { dg-final { scan-assembler-not {csrr} } } */ + +/* Unordered reduction cost: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4. 
*/ +/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c index 08d983997e2a..23ceb66b4b00 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c @@ -1,5 +1,5 @@ /* { dg-do run { target { riscv_v } } } */ -/* { dg-additional-options "-mrvv-vector-bits=scalable -fdump-tree-vect-details" } */ +/* { dg-additional-options "-mrvv-vector-bits=scalable -mmax-vectorization -fdump-tree-vect-details" } */ double __attribute__ ((noipa)) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c index 5a4df4824240..b09b38cce844 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */ #include "def.h" @@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512) DEF_REDUC_PLUS (_Float16, 1024) DEF_REDUC_PLUS (_Float16, 2048) -/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */ +/* { dg-final { scan-assembler-times {vfredosum\.vs} 11 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */ /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c index daf9c8a32a94..f37ebd6ea480 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c @@ -1,5 +1,5 @@ /* { 
dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */ #include "def.h" @@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256) DEF_REDUC_PLUS (float, 512) DEF_REDUC_PLUS (float, 1024) -/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */ +/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */ /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c index d1b8c2535cca..a67dda5e6fa5 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */ #include "def.h" @@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128) DEF_REDUC_PLUS (float, 256) DEF_REDUC_PLUS (float, 512) -/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */ +/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */ /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c index 6e9456b23209..2fad7ad4c995 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c @@ -1,9 +1,9 
@@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized" } */ #include "wred-2.c" -/* { dg-final { scan-assembler-times {vfwredosum\.vs} 17 } } */ +/* { dg-final { scan-assembler-times {vfwredosum\.vs} 19 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */ /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
