Add per-type reduction costs (i8/i16/i32/i64/f16/f32/f64) to the RISC-V
vector cost model, distinguishing between ordered (fold-left) and
unordered (tree) floating-point reductions. When a reduction is
detected, the per-type cost replaces the default vec_to_scalar_cost,
similar to AArch64; the vector-to-scalar register move cost is still
added on top. Because of the higher ordered-reduction cost, the
_Float16 n=4 ordered reductions in vls/reduc-19.c and vls/wred-3.c are
no longer vectorized in VLS mode.
gcc/ChangeLog:
* config/riscv/riscv-protos.h (common_vector_cost): Add per-type
reduction cost fields: reduc_i8_cost, reduc_i16_cost,
reduc_i32_cost, reduc_i64_cost, reduc_f16_cost, reduc_f32_cost,
reduc_f64_cost for unordered reductions, and reduc_f16_ordered_cost,
reduc_f32_ordered_cost, reduc_f64_ordered_cost for ordered
(fold-left) reductions.
* config/riscv/riscv.cc (rvv_vla_vector_cost): Initialize reduction
cost fields with default values.
(rvv_vls_vector_cost): Likewise.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost): Add
reduction detection in the vec_to_scalar case. When a reduction is
detected, replace the default vec_to_scalar_cost with the
appropriate per-type reduction cost based on element mode and
reduction kind (ordered vs unordered).
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c: New test for
VLA unordered reduction costs.
* gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c: New test for
VLA ordered reduction costs.
* gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c: New test for
VLS reduction costs.
* gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Update expected
vfredosum count from 9 to 8.
* gcc.target/riscv/rvv/autovec/vls/wred-3.c: Update expected
vfwredosum count from 17 to 16.
Signed-off-by: Wang Yaduo <[email protected]>
---
gcc/config/riscv/riscv-protos.h | 20 +++++-
gcc/config/riscv/riscv-vector-costs.cc | 68 ++++++++++++++++++-
gcc/config/riscv/riscv.cc | 20 ++++++
.../riscv/rvv/autovec/reduc/reduc_cost-1.c | 34 ++++++++++
.../riscv/rvv/autovec/reduc/reduc_cost-2.c | 34 ++++++++++
.../riscv/rvv/autovec/vls/reduc-19.c | 4 +-
.../riscv/rvv/autovec/vls/reduc_cost-1.c | 41 +++++++++++
.../gcc.target/riscv/rvv/autovec/vls/wred-3.c | 4 +-
8 files changed, 219 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index dd029c704..5da5a6a21 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -279,6 +279,24 @@ struct common_vector_cost
/* Cost of an unaligned vector store. */
const int unalign_store_cost;
+
+ /* Cost of vector reduction operations (unordered / tree reduction),
+ one field per element type. */
+ const int reduc_i8_cost;
+ const int reduc_i16_cost;
+ const int reduc_i32_cost;
+ const int reduc_i64_cost;
+ const int reduc_f16_cost;
+ const int reduc_f32_cost;
+ const int reduc_f64_cost;
+
+ /* Cost of ordered (fold-left / strict) floating-point reductions.
+ These are significantly more expensive than unordered (tree) reductions
+ because RVV ordered reduction instructions (e.g. vfredosum) process
+ elements sequentially. */
+ const int reduc_f16_ordered_cost;
+ const int reduc_f32_ordered_cost;
+ const int reduc_f64_ordered_cost;
};
/* scalable vectorization (VLA) specific cost. */
@@ -289,7 +307,7 @@ struct scalable_vector_cost : common_vector_cost
{}
/* TODO: We will need more other kinds of vector cost for VLA.
- E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */
+ E.g. lanes load/store cost, ..., etc. */
};
/* Additional costs for register copies. Cost is for one register. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index f582551eb..a837e4879 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1253,9 +1253,71 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
+= (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ());
break;
case vec_to_scalar:
- stmt_cost
- += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
- break;
+ {
+ /* Detect reduction operations and apply type-specific reduction
+ costs. The vec_to_scalar cost kind represents the reduction
+ operation itself (e.g. vredsum.vs, vfredosum.vs), so we replace
+ the default vec_to_scalar_cost with a more precise per-type cost.
+ For floating-point reductions, distinguish between ordered
+ (fold-left, e.g. vfredosum) and unordered (tree, e.g. vfredusum)
+ reductions since ordered reductions are significantly more
+ expensive due to sequential processing. */
+ if (stmt_info && vectype && vect_is_reduction (stmt_info))
+ {
+ const common_vector_cost *common_costs
+ = loop && riscv_vla_mode_p (loop->vector_mode)
+ ? costs->vla : costs->vls;
+
+ bool is_ordered = false;
+ if (FLOAT_TYPE_P (vectype) && loop && node)
+ {
+ int reduc_type = vect_reduc_type (m_vinfo, node);
+ is_ordered = (reduc_type == FOLD_LEFT_REDUCTION);
+ }
+
+ int reduc_cost = 0;
+ switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+ {
+ case E_QImode:
+ reduc_cost = common_costs->reduc_i8_cost;
+ break;
+ case E_HImode:
+ reduc_cost = common_costs->reduc_i16_cost;
+ break;
+ case E_SImode:
+ reduc_cost = common_costs->reduc_i32_cost;
+ break;
+ case E_DImode:
+ reduc_cost = common_costs->reduc_i64_cost;
+ break;
+ case E_HFmode:
+ case E_BFmode:
+ reduc_cost = is_ordered
+ ? common_costs->reduc_f16_ordered_cost
+ : common_costs->reduc_f16_cost;
+ break;
+ case E_SFmode:
+ reduc_cost = is_ordered
+ ? common_costs->reduc_f32_ordered_cost
+ : common_costs->reduc_f32_cost;
+ break;
+ case E_DFmode:
+ reduc_cost = is_ordered
+ ? common_costs->reduc_f64_ordered_cost
+ : common_costs->reduc_f64_cost;
+ break;
+ default:
+ break;
+ }
+
+ if (reduc_cost)
+ stmt_cost = reduc_cost;
+ }
+
+ stmt_cost
+ += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
+ break;
+ }
case vector_load:
case vector_store:
{
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 97272b434..50fa9bd96 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -415,6 +415,16 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* align_store_cost */
2, /* unalign_load_cost */
2, /* unalign_store_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 6, /* reduc_f16_ordered_cost */
+ 4, /* reduc_f32_ordered_cost */
+ 2, /* reduc_f64_ordered_cost */
};
/* RVV costs for VLA vector operations. */
@@ -438,6 +448,16 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* align_store_cost */
2, /* unalign_load_cost */
2, /* unalign_store_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 6, /* reduc_f16_ordered_cost */
+ 4, /* reduc_f32_ordered_cost */
+ 2, /* reduc_f64_ordered_cost */
},
};
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c
new file mode 100644
index 000000000..f567e0ce7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-1.c
@@ -0,0 +1,34 @@
+/* Verify that the vector cost model handles unordered (tree) reductions
+ for all integer and floating-point element types (VLA). */
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d -mrvv-vector-bits=scalable -ffast-math -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE) \
+TYPE __attribute__ ((noinline, noclone)) \
+reduc_plus_##TYPE (TYPE *restrict a, int n) \
+{ \
+ TYPE r = 0; \
+ for (int i = 0; i < n; ++i) \
+ r += a[i]; \
+ return r; \
+}
+
+DEF_REDUC_PLUS (int8_t)
+DEF_REDUC_PLUS (int16_t)
+DEF_REDUC_PLUS (int32_t)
+DEF_REDUC_PLUS (int64_t)
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* All loops should be vectorized with the cost model enabled. */
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 7 "vect" } } */
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+
+/* Verify the reduction cost is reflected in the cost model dump.
+ For unordered reductions: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4,
+ where reduc_*_cost replaces the default vec_to_scalar_cost. */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c
new file mode 100644
index 000000000..af9ffbcf5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_cost-2.c
@@ -0,0 +1,34 @@
+/* Verify that the vector cost model handles ordered (fold-left / strict)
+ floating-point reductions for all FP element types (VLA). */
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d -mrvv-vector-bits=scalable -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ reduc_plus_##TYPE (TYPE *restrict a, int n) \
+ { \
+ TYPE r = 0; \
+ for (int i = 0; i < n; ++i) \
+ r += a[i]; \
+ return r; \
+ }
+
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* Without -ffast-math, FP reductions use ordered (fold-left) mode.
+ The cost model should still allow vectorization. */
+/* { dg-final { scan-assembler {vfredosum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} } } */
+
+/* Verify ordered reduction costs are reflected in the cost model dump.
+ The reduc_f*_ordered_cost replaces the default vec_to_scalar_cost,
+ plus vr2fr cost (2):
+ f16: reduc_f16_ordered_cost (6) + vr2fr (2) = 8
+ f32: reduc_f32_ordered_cost (4) + vr2fr (2) = 6
+ f64: reduc_f64_ordered_cost (2) + vr2fr (2) = 4 */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 8" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 6" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
index 5a4df4824..3815bbadd 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -14,7 +14,9 @@ DEF_REDUC_PLUS (_Float16, 512)
DEF_REDUC_PLUS (_Float16, 1024)
DEF_REDUC_PLUS (_Float16, 2048)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* The _Float16 n=4 case is not vectorized because the ordered reduction
+ cost (reduc_f16_ordered_cost) makes it unprofitable for small trip counts. */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c
new file mode 100644
index 000000000..ed62ee230
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc_cost-1.c
@@ -0,0 +1,41 @@
+/* Verify that the vector cost model handles reductions for all element
+ types in VLS mode, including both unordered and ordered reductions. */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-vect-details" } */
+
+#include "def.h"
+
+/* Integer unordered reductions (VLS). */
+DEF_REDUC_PLUS (int8_t, 4)
+DEF_REDUC_PLUS (int8_t, 8)
+DEF_REDUC_PLUS (int16_t, 4)
+DEF_REDUC_PLUS (int16_t, 8)
+DEF_REDUC_PLUS (int32_t, 4)
+DEF_REDUC_PLUS (int32_t, 8)
+DEF_REDUC_PLUS (int64_t, 4)
+DEF_REDUC_PLUS (int64_t, 8)
+
+/* { dg-final { scan-assembler-times {vredsum\.vs} 8 } } */
+
+/* Floating-point ordered (strict) reductions (VLS).
+ Without -ffast-math, FP reductions default to ordered. */
+DEF_REDUC_PLUS (_Float16, 4)
+DEF_REDUC_PLUS (_Float16, 8)
+DEF_REDUC_PLUS (float, 4)
+DEF_REDUC_PLUS (float, 8)
+DEF_REDUC_PLUS (double, 4)
+DEF_REDUC_PLUS (double, 8)
+
+/* { dg-final { scan-assembler {vfredosum\.vs} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+
+/* Verify reduction costs in the cost model dump.
+ The reduc_*_cost replaces the default vec_to_scalar_cost,
+ plus vr2gr/vr2fr cost (2):
+ Integer unordered: reduc_i*_cost (2) + vr2gr (2) = 4
+ FP ordered f16: reduc_f16_ordered_cost (6) + vr2fr (2) = 8
+ FP ordered f32: reduc_f32_ordered_cost (4) + vr2fr (2) = 6
+ FP ordered f64: reduc_f64_ordered_cost (2) + vr2fr (2) = 4 */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 8" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 6" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
index 6e9456b23..0f08d50a5 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
@@ -3,7 +3,9 @@
#include "wred-2.c"
-/* { dg-final { scan-assembler-times {vfwredosum\.vs} 17 } } */
+/* The _Float16->float n=4 case is not vectorized because the ordered
+ reduction cost makes it unprofitable for small trip counts. */
+/* { dg-final { scan-assembler-times {vfwredosum\.vs} 16 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
--
2.47.1