Add type-specific reduction costs for integer (i8/i16/i32/i64) and
floating-point (f16/f32/f64) reductions. Ordered (fold-left) FP
reductions receive separate higher costs. Use helpers is_reduction()
and get_reduction_cost() for readability. Adjust affected tests to use
-mmax-vectorization.

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (common_vector_cost): Add
	reduc_i8_cost through reduc_f64_cost and
	reduc_f{16,32,64}_ordered_cost.
	(scalable_vector_cost): Remove stale TODO comment.
	* config/riscv/riscv.cc: Set costs in rvv_vls_vector_cost and
	rvv_vla_vector_cost.
	* config/riscv/riscv-vector-costs.cc (is_reduction): New helper.
	(get_reduction_cost): New helper.
	(costs::adjust_stmt_cost): Use them for vec_to_scalar kind.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c: New test.
	* gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: Use
	-mmax-vectorization.
	* gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Use
	-mmax-vectorization, adjust scan counts.
	* gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Likewise.
	* gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Likewise.
	* gcc.target/riscv/rvv/autovec/vls/wred-3.c: Likewise.
---
Hi Robin,
Changes in v4:
- Fix failure in gcc.target/riscv/rvv/autovec/cond/pr111401.c by
adding -mmax-vectorization.
Tested locally with qemu; all affected tests pass.
gcc/config/riscv/riscv-protos.h | 17 ++++-
gcc/config/riscv/riscv-vector-costs.cc | 67 ++++++++++++++++++-
gcc/config/riscv/riscv.cc | 20 ++++++
.../costmodel/riscv/rvv/reduc_vla_ordered.c | 23 +++++++
.../costmodel/riscv/rvv/reduc_vla_unordered.c | 29 ++++++++
.../costmodel/riscv/rvv/reduc_vls_ordered.c | 26 +++++++
.../costmodel/riscv/rvv/reduc_vls_unordered.c | 29 ++++++++
.../riscv/rvv/autovec/cond/pr111401.c | 2 +-
.../riscv/rvv/autovec/vls/reduc-19.c | 4 +-
.../riscv/rvv/autovec/vls/reduc-20.c | 4 +-
.../riscv/rvv/autovec/vls/reduc-21.c | 4 +-
.../gcc.target/riscv/rvv/autovec/vls/wred-3.c | 4 +-
12 files changed, 215 insertions(+), 14 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 8b362e323..5fdfb3141 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -279,6 +279,21 @@ struct common_vector_cost
/* Cost of an unaligned vector store. */
const int unalign_store_cost;
+
+ /* Cost of unordered (tree) vector reduction operations, with one field
+ per element type. */
+ const int reduc_i8_cost;
+ const int reduc_i16_cost;
+ const int reduc_i32_cost;
+ const int reduc_i64_cost;
+ const int reduc_f16_cost;
+ const int reduc_f32_cost;
+ const int reduc_f64_cost;
+
+ /* Cost of ordered (fold-left) floating-point reductions. */
+ const int reduc_f16_ordered_cost;
+ const int reduc_f32_ordered_cost;
+ const int reduc_f64_ordered_cost;
};
/* scalable vectorization (VLA) specific cost. */
@@ -288,8 +303,6 @@ struct scalable_vector_cost : common_vector_cost
: common_vector_cost (base)
{}
- /* TODO: We will need more other kinds of vector cost for VLA.
- E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */
};
/* Additional costs for register copies. Cost is for one register. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index e678e0de7..6d37519db 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1274,6 +1274,58 @@ get_lmul_cost_scaling (machine_mode mode)
}
}
+/* Return true if a non-null STMT_INFO or NODE is a reduction.  */
+
+static bool
+is_reduction (stmt_vec_info stmt_info, slp_tree node)
+{
+  bool stmt_is_reduc = stmt_info && vect_is_reduction (stmt_info);
+  return stmt_is_reduc || (node && vect_is_reduction (node));
+}
+
+/* Return the reduction cost for VECTYPE's element mode from the VLA or VLS
+   table of COSTS, or 0 if the mode has no entry.  */
+
+static int
+get_reduction_cost (vec_info *vinfo, const cpu_vector_cost *costs,
+		    loop_vec_info loop, slp_tree node, tree vectype)
+{
+  /* The VLA table is used only for a loop whose vector mode is VLA.  */
+  const common_vector_cost *table = costs->vls;
+  if (loop && riscv_vla_mode_p (loop->vector_mode))
+    table = costs->vla;
+
+  /* Only an FP VECTYPE with a non-null NODE is checked for fold-left.  */
+  const bool fold_left
+    = (FLOAT_TYPE_P (vectype) && node
+       && vect_reduc_type (vinfo, node) == FOLD_LEFT_REDUCTION);
+
+  switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+    {
+    case E_QImode:
+      return table->reduc_i8_cost;
+    case E_HImode:
+      return table->reduc_i16_cost;
+    case E_SImode:
+      return table->reduc_i32_cost;
+    case E_DImode:
+      return table->reduc_i64_cost;
+    /* BFmode shares the f16 cost entries.  */
+    case E_HFmode:
+    case E_BFmode:
+      return fold_left ? table->reduc_f16_ordered_cost
+		       : table->reduc_f16_cost;
+    case E_SFmode:
+      return fold_left ? table->reduc_f32_ordered_cost
+		       : table->reduc_f32_cost;
+    case E_DFmode:
+      return fold_left ? table->reduc_f64_ordered_cost
+		       : table->reduc_f64_cost;
+    default:
+      return 0;
+    }
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1292,9 +1344,18 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
+= (FLOAT_TYPE_P (vectype) ? get_fr2vr_cost () : get_gr2vr_cost ());
break;
case vec_to_scalar:
- stmt_cost
- += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
- break;
+ {
+ int reduc_cost = 0;
+ if (vectype && is_reduction (stmt_info, node))
+ reduc_cost = get_reduction_cost (m_vinfo, costs, loop, node, vectype);
+
+ if (reduc_cost)
+ stmt_cost = reduc_cost;
+
+ stmt_cost
+ += (FLOAT_TYPE_P (vectype) ? get_vr2fr_cost () : get_vr2gr_cost ());
+ break;
+ }
case vector_load:
case vector_store:
{
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 681b816d2..cb0c7ef3a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -396,6 +396,16 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* align_store_cost */
2, /* unalign_load_cost */
2, /* unalign_store_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 20, /* reduc_f16_ordered_cost */
+ 10, /* reduc_f32_ordered_cost */
+ 5, /* reduc_f64_ordered_cost */
};
/* RVV costs for VLA vector operations. */
@@ -419,6 +429,16 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* align_store_cost */
2, /* unalign_load_cost */
2, /* unalign_store_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 20, /* reduc_f16_ordered_cost */
+ 10, /* reduc_f32_ordered_cost */
+ 5, /* reduc_f64_ordered_cost */
},
};
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c
new file mode 100644
index 000000000..367a8016c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_ordered.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ reduc_plus_##TYPE (TYPE *restrict a, int n) \
+ { \
+ TYPE r = 0; \
+ for (int i = 0; i < n; ++i) \
+ r += a[i]; \
+ return r; \
+ }
+
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* Ordered reduction cost: reduc_f*_ordered_cost + vr2fr. */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c
new file mode 100644
index 000000000..b605eef2d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vla_unordered.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 -mrvv-vector-bits=scalable -ffast-math -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE) \
+TYPE __attribute__ ((noinline, noclone)) \
+reduc_plus_##TYPE (TYPE *restrict a, int n) \
+{ \
+ TYPE r = 0; \
+ for (int i = 0; i < n; ++i) \
+ r += a[i]; \
+ return r; \
+}
+
+DEF_REDUC_PLUS (int8_t)
+DEF_REDUC_PLUS (int16_t)
+DEF_REDUC_PLUS (int32_t)
+DEF_REDUC_PLUS (int64_t)
+DEF_REDUC_PLUS (_Float16)
+DEF_REDUC_PLUS (float)
+DEF_REDUC_PLUS (double)
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 7 "vect" } } */
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+
+/* Unordered reduction cost: reduc_*_cost + vr2gr/vr2fr. */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c
new file mode 100644
index 000000000..6d7a52cd1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_ordered.c
@@ -0,0 +1,26 @@
+/* Ordered FP reduction costs in VLS mode. */
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE, NUM) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ reduc_plus_##TYPE##_##NUM (TYPE *restrict a) \
+ { \
+ TYPE r = 0; \
+ for (int i = 0; i < NUM; ++i) \
+ r += a[i]; \
+ return r; \
+ }
+
+DEF_REDUC_PLUS (_Float16, 8)
+DEF_REDUC_PLUS (float, 8)
+DEF_REDUC_PLUS (double, 8)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+
+/* f16: 20+2=22, f32: 10+2=12, f64: 5+2=7. */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 22" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 12" "vect" } } */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 7" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c
new file mode 100644
index 000000000..fd3350a5b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/reduc_vls_unordered.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -ffast-math -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE, NUM) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ reduc_plus_##TYPE##_##NUM (TYPE *restrict a) \
+ { \
+ TYPE r = 0; \
+ for (int i = 0; i < NUM; ++i) \
+ r += a[i]; \
+ return r; \
+ }
+
+DEF_REDUC_PLUS (int8_t, 8)
+DEF_REDUC_PLUS (int16_t, 8)
+DEF_REDUC_PLUS (int32_t, 8)
+DEF_REDUC_PLUS (int64_t, 8)
+DEF_REDUC_PLUS (_Float16, 8)
+DEF_REDUC_PLUS (float, 8)
+DEF_REDUC_PLUS (double, 8)
+
+/* { dg-final { scan-assembler-times {vredsum\.vs} 4 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs} 3 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+
+/* Unordered reduction cost: reduc_*_cost (2) + vr2gr/vr2fr (2) = 4. */
+/* { dg-final { scan-tree-dump "vec_to_scalar costs 4" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
index 08d983997..23ceb66b4 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -1,5 +1,5 @@
/* { dg-do run { target { riscv_v } } } */
-/* { dg-additional-options "-mrvv-vector-bits=scalable -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mrvv-vector-bits=scalable -mmax-vectorization -fdump-tree-vect-details" } */
double
__attribute__ ((noipa))
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
index 5a4df4824..b09b38cce 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
#include "def.h"
@@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
DEF_REDUC_PLUS (_Float16, 1024)
DEF_REDUC_PLUS (_Float16, 2048)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 11 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
index daf9c8a32..f37ebd6ea 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
#include "def.h"
@@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
DEF_REDUC_PLUS (float, 512)
DEF_REDUC_PLUS (float, 1024)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
index d1b8c2535..a67dda5e6 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized-details" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized-details" } */
#include "def.h"
@@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
DEF_REDUC_PLUS (float, 256)
DEF_REDUC_PLUS (float, 512)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
index 6e9456b23..2fad7ad4c 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/wred-3.c
@@ -1,9 +1,9 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -fdump-tree-optimized" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -mrvv-max-lmul=m8 -mmax-vectorization -fdump-tree-optimized" } */
#include "wred-2.c"
-/* { dg-final { scan-assembler-times {vfwredosum\.vs} 17 } } */
+/* { dg-final { scan-assembler-times {vfwredosum\.vs} 19 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
--
2.47.1