This commit enables a quadratic LMUL cost for some targets when the loop's iteration count is unknown at compile time. Using higher LMUL values on targets whose LMUL latency is independent of VL can be especially harmful, so we would like to penalize higher LMULs for such targets.
My earlier patch https://gcc.gnu.org/pipermail/gcc-patches/2026-April/714635.html tried to use a new -mrvv-max-lmul=m1-dynamic value for such cases. Since Robin suggested using a cost model property instead, I wrote this patch. Ideally, I would like to penalize both VLA and VLS costs because the vectorizer sometimes fails to vectorize using VLA modes but succeeds using a VLS mode. The current cost comparison code focuses on the average cost per iteration, so with a linear LMUL cost, higher LMULs like M8 will still be chosen in many cases. I am not checking whether the type is VLA or VLS in this patch, so when Zhongyao's latest VLS LMUL cost scaling patch is merged, the quadratic LMUL cost will be applied to VLS modes as well. gcc/ChangeLog: * config/riscv/riscv-cores.def (RISCV_TUNE): Change xt-c908v, xt-c920, xt-c920v2 to generic_ooo_vl_indep_tune_info. * config/riscv/riscv-opts.h (enum rvv_lmul_scaling_mode_enum): New enum. * config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost): Use a quadratic LMUL cost when the LMUL cost scaling mode is quadratic and the loop iteration count is unknown. * config/riscv/riscv.cc (struct riscv_tune_param): Add new member. (riscv_option_override): Override the LMUL cost scaling mode option using the tune parameter if unset. * config/riscv/riscv.opt: Add new parameter. * doc/riscv-mtune.texi: Regenerated for generic-ooo-vl-indep. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-1.c: New test. * gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-2.c: New test. 
--- gcc/config/riscv/riscv-cores.def | 7 +-- gcc/config/riscv/riscv-opts.h | 8 +++ gcc/config/riscv/riscv-vector-costs.cc | 3 ++ gcc/config/riscv/riscv.cc | 50 +++++++++++++++++++ gcc/config/riscv/riscv.opt | 14 ++++++ gcc/doc/riscv-mtune.texi | 2 + .../rvv/autovec/lmul-scaling-quadratic-1.c | 20 ++++++++ .../rvv/autovec/lmul-scaling-quadratic-2.c | 20 ++++++++ 8 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-2.c diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index 257eae1168a..9d15ea2a956 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -43,16 +43,17 @@ RISCV_TUNE("sifive-p600-series", sifive_p600, sifive_p600_tune_info) RISCV_TUNE("tt-ascalon-d8", tt_ascalon_d8, tt_ascalon_d8_tune_info) RISCV_TUNE("thead-c906", generic, thead_c906_tune_info) RISCV_TUNE("xt-c908", generic, generic_ooo_tune_info) -RISCV_TUNE("xt-c908v", generic, generic_ooo_tune_info) +RISCV_TUNE("xt-c908v", generic, generic_ooo_vl_indep_tune_info) RISCV_TUNE("xt-c910", generic, generic_ooo_tune_info) RISCV_TUNE("xt-c910v2", generic, generic_ooo_tune_info) -RISCV_TUNE("xt-c920", generic, generic_ooo_tune_info) -RISCV_TUNE("xt-c920v2", generic, generic_ooo_tune_info) +RISCV_TUNE("xt-c920", generic, generic_ooo_vl_indep_tune_info) +RISCV_TUNE("xt-c920v2", generic, generic_ooo_vl_indep_tune_info) RISCV_TUNE("xiangshan-nanhu", xiangshan, xiangshan_nanhu_tune_info) RISCV_TUNE("xiangshan-kunminghu", xiangshan, generic_ooo_tune_info) RISCV_TUNE("spacemit-x60", spacemit_x60, spacemit_x60_tune_info) RISCV_TUNE("arc-v-rhx-100-series", arcv_rhx100, arcv_rhx100_tune_info) RISCV_TUNE("generic-ooo", generic_ooo, generic_ooo_tune_info) +RISCV_TUNE("generic-ooo-vl-indep", generic_ooo, generic_ooo_vl_indep_tune_info) RISCV_TUNE("size", generic, optimize_size_tune_info) 
RISCV_TUNE("mips-p8700", mips_p8700, mips_p8700_tune_info) RISCV_TUNE("andes-25-series", andes_25_series, andes_25_tune_info) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 88a4aa33926..2c7618ac209 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -141,6 +141,14 @@ enum vsetvl_strategy_enum { VSETVL_OPT_NO_FUSION, }; +/* RVV vector cost LMUL scaling mode. */ +enum rvv_lmul_scaling_mode_enum { + /* Linear scaling. */ + RVV_LMUL_SCALING_MODE_LINEAR, + /* Quadratic scaling. */ + RVV_LMUL_SCALING_MODE_QUADRATIC, +}; + /* RVV vector bits for option -mrvv-vector-bits, default is scalable. */ enum rvv_vector_bits_enum { /* scalable indicates taking the value of zvl*b as the minimal vlen. */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index e678e0de766..2cd6f4ae7d8 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1425,6 +1425,9 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop, if (vectype) { unsigned lmul_factor = get_lmul_cost_scaling (TYPE_MODE (vectype)); + if (rvv_lmul_scaling_mode == RVV_LMUL_SCALING_MODE_QUADRATIC + && loop && !LOOP_VINFO_NITERS_KNOWN_P (loop)) + lmul_factor *= lmul_factor; if (lmul_factor > 1) stmt_cost *= lmul_factor; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 681b816d248..caf2d3023c5 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -299,6 +299,7 @@ struct riscv_tune_param const char *jump_align; const char *loop_align; bool prefer_agnostic; + bool vl_dependent_lmul_scaling; }; @@ -467,6 +468,7 @@ static const struct riscv_tune_param generic_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ false, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for rocket. 
*/ @@ -492,6 +494,7 @@ static const struct riscv_tune_param rocket_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ false, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Sifive 7 Series. */ @@ -517,6 +520,7 @@ static const struct riscv_tune_param sifive_7_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ false, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Sifive p400 Series. */ @@ -542,6 +546,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Sifive p600 Series. */ @@ -567,6 +572,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for T-HEAD c906. */ @@ -592,6 +598,7 @@ static const struct riscv_tune_param thead_c906_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ false, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for xiangshan nanhu. */ @@ -617,6 +624,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for a generic ooo profile. */ @@ -642,6 +650,34 @@ static const struct riscv_tune_param generic_ooo_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ +}; + +/* Costs to use when optimizing for a generic ooo profile with + VL-independent LMUL cost scaling. 
*/ +static const struct riscv_tune_param generic_ooo_vl_indep_tune_info = { + {COSTS_N_INSNS (2), COSTS_N_INSNS (2)}, /* fp_add */ + {COSTS_N_INSNS (5), COSTS_N_INSNS (6)}, /* fp_mul */ + {COSTS_N_INSNS (7), COSTS_N_INSNS (8)}, /* fp_div */ + {COSTS_N_INSNS (2), COSTS_N_INSNS (2)}, /* int_mul */ + {COSTS_N_INSNS (6), COSTS_N_INSNS (6)}, /* int_div */ + 1, /* issue_rate */ + 3, /* branch_cost */ + 4, /* memory_cost */ + 4, /* fmv_cost */ + false, /* slow_unaligned_access */ + true, /* vector_unaligned_access */ + false, /* use_divmod_expansion */ + true, /* overlap_op_by_pieces */ + true, /* use_zero_stride_load */ + false, /* speculative_sched_vsetvl */ + RISCV_FUSE_NOTHING, /* fusible_ops */ + &generic_vector_cost, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ + true, /* prefer-agnostic. */ + false, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Tenstorrent Ascalon 8 wide. */ @@ -667,6 +703,7 @@ static const struct riscv_tune_param tt_ascalon_d8_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for size. */ @@ -692,6 +729,7 @@ static const struct riscv_tune_param optimize_size_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ false, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for MIPS P8700 */ @@ -717,6 +755,7 @@ static const struct riscv_tune_param mips_p8700_tune_info = { NULL, /* jump_align */ NULL, /* loop_align. */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Andes 25 series. */ @@ -742,6 +781,7 @@ static const struct riscv_tune_param andes_25_tune_info = { NULL, /* jump_align */ NULL, /* loop_align. */ true, /* prefer-agnostic. 
*/ + true, /* vl_dependent_lmul_scaling */ }; static const struct riscv_tune_param spacemit_x60_tune_info= { @@ -766,6 +806,7 @@ static const struct riscv_tune_param spacemit_x60_tune_info= { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Andes 23 series. */ @@ -791,6 +832,7 @@ static const struct riscv_tune_param andes_23_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Andes 45 series. */ @@ -816,6 +858,7 @@ static const struct riscv_tune_param andes_45_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; /* Costs to use when optimizing for Synopsys RHX-100. */ @@ -841,6 +884,7 @@ static const struct riscv_tune_param arcv_rhx100_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; @@ -867,6 +911,7 @@ static const struct riscv_tune_param arcv_rmx100_tune_info = { NULL, /* jump_align */ NULL, /* loop_align */ true, /* prefer-agnostic. */ + true, /* vl_dependent_lmul_scaling */ }; static bool riscv_avoid_shrink_wrapping_separate (); @@ -11971,6 +12016,11 @@ riscv_option_override (void) riscv_override_options_internal (&global_options); + if (!tune_param->vl_dependent_lmul_scaling) + SET_OPTION_IF_UNSET (&global_options, &global_options_set, + rvv_lmul_scaling_mode, + RVV_LMUL_SCALING_MODE_QUADRATIC); + /* Save these options as the default ones in case we push and pop them later while processing functions with potential target attributes. 
*/ target_option_default_node = target_option_current_node diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index c6e099eb47e..aeba47955b9 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -402,6 +402,20 @@ mrvv-vector-bits= Target RejectNegative Joined Enum(rvv_vector_bits) Var(rvv_vector_bits) Init(RVV_VECTOR_BITS_SCALABLE) -mrvv-vector-bits=<string> Set the kind of bits for an RVV vector register. +Enum +Name(rvv_lmul_scaling_mode) Type(enum rvv_lmul_scaling_mode_enum) +Supported vector cost LMUL scaling modes: + +EnumValue +Enum(rvv_lmul_scaling_mode) String(linear) Value(RVV_LMUL_SCALING_MODE_LINEAR) + +EnumValue +Enum(rvv_lmul_scaling_mode) String(quadratic) Value(RVV_LMUL_SCALING_MODE_QUADRATIC) + +-param=rvv-lmul-scaling-mode= +Target Undocumented RejectNegative Joined Enum(rvv_lmul_scaling_mode) Var(rvv_lmul_scaling_mode) Init(RVV_LMUL_SCALING_MODE_LINEAR) +-param=rvv-lmul-scaling-mode=<string> Set the RVV vector cost LMUL scaling mode. + Enum Name(tls_type) Type(enum riscv_tls_type) The possible TLS dialects: diff --git a/gcc/doc/riscv-mtune.texi b/gcc/doc/riscv-mtune.texi index 7ab6f634540..abcc00e0dd1 100644 --- a/gcc/doc/riscv-mtune.texi +++ b/gcc/doc/riscv-mtune.texi @@ -56,6 +56,8 @@ particular CPU name. 
Permissible values for this option are: @samp{generic-ooo}, +@samp{generic-ooo-vl-indep}, + @samp{size}, @samp{mips-p8700}, diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-1.c new file mode 100644 index 00000000000..46d0a5907b1 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-1.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m8 -mtune=generic-ooo-vl-indep" } */ + +void +pixel_avg (unsigned char *restrict dst, int i_dst_stride, + unsigned char *restrict src1, int i_src1_stride, + unsigned char *restrict src2, int i_src2_stride, int i_width, + int i_height) +{ + for (int y = 0; y < i_height; y++) + { + for (int x = 0; x < i_width; x++) + dst[x] = (src1[x] + src2[x] + 1) >> 1; + dst += i_dst_stride; + src1 += i_src1_stride; + src2 += i_src2_stride; + } +} + +/* { dg-final { scan-assembler-times ",m1," 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-2.c new file mode 100644 index 00000000000..a8450c77c37 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/lmul-scaling-quadratic-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -mrvv-max-lmul=m8 --param=rvv-lmul-scaling-mode=quadratic" } */ + +void +pixel_avg (unsigned char *restrict dst, int i_dst_stride, + unsigned char *restrict src1, int i_src1_stride, + unsigned char *restrict src2, int i_src2_stride, int i_width, + int i_height) +{ + for (int y = 0; y < i_height; y++) + { + for (int x = 0; x < i_width; x++) + dst[x] = (src1[x] + src2[x] + 1) >> 1; + dst += i_dst_stride; + src1 += i_src1_stride; + src2 += i_src2_stride; + } +} + +/* { dg-final { scan-assembler-times ",m1," 1 } } */ -- 2.46.0
